
    sgq#                     Z    d Z ddlmZ ddlmZ  ej
                  e      Z G d de      Zy)zM-CTC-T model configuration   )PretrainedConfig)loggingc                   \     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )MCTCTConfiga  
    This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
    M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the M-CTC-T
    [speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 8065):
            Vocabulary size of the M-CTC-T model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MCTCTModel`].
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimension of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 36):
            Number of hidden layers in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 4):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 384):
            Dimensions of each attention head for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 920):
            The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction).
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        layerdrop (`float`, *optional*, defaults to 0.3):
            The probability of dropping an encoder layer during training. The default 0.3 value is used in the original
            implementation.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.3):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3):
            The dropout ratio for the attention probabilities.
        pad_token_id (`int`, *optional*, defaults to 1):
            The tokenizer index of the pad token.
        bos_token_id (`int`, *optional*, defaults to 0):
            The tokenizer index of the bos token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The tokenizer index of the eos token.
        conv_glu_dim (`int`, *optional*, defaults to 1):
            The dimension of the output of the `Conv1dSubsampler` layer in which GLU is applied on. Though the original
            Flashlight code uses the value of 2, here it's adapted to 1 due to transposition differences.
        conv_dropout (`int`, *optional*, defaults to 0.3):
            The probability of randomly dropping the `Conv1dSubsampler` layer during training.
        num_conv_layers (`int`, *optional*, defaults to 1):
            Number of convolution layers before applying transformer encoder layers.
        conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`):
            The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal
            to `num_conv_layers`.
        conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`):
            The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal
            to `num_conv_layers`.
        input_feat_per_channel (`int`, *optional*, defaults to 80):
            Feature dimensions of the channels of the input to the Conv1D layer.
        input_channels (`int`, *optional*, defaults to 1):
            Number of input channels of the input to the Conv1D layer.
        conv_channels (`List[int]`, *optional*):
            Channel sizes of intermediate Conv1D layers.
        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
            instance of [`MCTCTForCTC`].
        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
            of [`MCTCTForCTC`].

    Example:

    ```python
    >>> from transformers import MCTCTConfig, MCTCTModel

    >>> # Initializing a M-CTC-T mctct-large style configuration
    >>> configuration = MCTCTConfig()

    >>> # Initializing a model (with random weights) from the mctct-large style configuration
    >>> model = MCTCTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mctctc                 b   t        |   di ||||d || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        t5        |      | _        t5        |      | _        t;        | j6                        | j(                  k7  r/t=        dt;        | j6                         d| j(                   d      y )N)pad_token_idbos_token_ideos_token_idzConfiguration for convolutional module is incorrect. It is required that `len(config.conv_kernel)` == `config.num_conv_layers` but is `len(config.conv_kernel) = z`, `config.num_conv_layers = z`. )super__init__
vocab_sizehidden_sizenum_hidden_layersintermediate_sizenum_attention_headsattention_head_dimmax_position_embeddingslayer_norm_eps	layerdrop
hidden_actinitializer_rangehidden_dropout_probattention_probs_dropout_probr	   r
   r   conv_glu_dimconv_dropoutnum_conv_layersinput_feat_per_channelinput_channelsconv_channelsctc_loss_reductionctc_zero_infinitylistconv_kernelconv_stridelen
ValueError)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r%   r&   r   r    r!   r"   r#   kwargs	__class__s                               k/var/www/html/venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.pyr   zMCTCTConfig.__init__s   sS   < 	s6s<frs$&!2!2#6 "4'>$,"$!2#6 ,H)(((((.&<#,*"4!2  ,,t D$8$885589I9I5J4K L--1-A-A,B"F  9    )i  i   $   i   r   i  i  gh㈵>333333?relug{Gz?r/   r/             r1   r/   r1   )   )   P   r1   NsumF)__name__
__module____qualname____doc__
model_typer   __classcell__)r+   s   @r,   r   r      sm    Vp J  #%(! 7B Br-   r   N)	r;   configuration_utilsr   utilsr   
get_loggerr8   loggerr   r   r-   r,   <module>rB      s3    " 4  
		H	%]" ]r-   