"""UMT5 model configuration"""

from typing import Mapping

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxSeq2SeqConfigWithPast
from ...utils import logging


logger = logging.get_logger(__name__)


class UMT5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`UMT5Model`]. It is used to instantiate a UMT5
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the UMT5
    [google/umt5-small](https://huggingface.co/google/umt5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the UMT5 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`UMT5Model`] or [`TFUMT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
            num_heads`.
        d_ff (`int`, *optional*, defaults to 1024):
            Size of the intermediate feed forward layer in each `UMT5Block`.
        num_layers (`int`, *optional*, defaults to 8):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
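
    Example (a minimal usage sketch; `UMT5Model` is the model class this configuration pairs with):

    ```python
    >>> from transformers import UMT5Config, UMT5Model

    >>> # Initializing a configuration with the google/umt5-small defaults
    >>> configuration = UMT5Config()

    >>> # Initializing a model (with random weights) from that configuration
    >>> model = UMT5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```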
    umt5past_key_valuesd_model	num_heads
num_layersd_kv)hidden_sizenum_attention_headsnum_hidden_layershead_dimc           
         || _         || _        || _        || _        || _        ||n| j                  | _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        | j                  j                  d      }|d   | _        |d   dk(  | _        t%        |      dkD  r|d   dk7  st%        |      dkD  rt'        d| d      |d	k(  rd
| _        t)        | T  d||||||d| y )N-r   gated      z`feed_forward_proj`: z is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'
gated-gelugelu_new)is_encoder_decodertokenizer_classtie_word_embeddingspad_token_ideos_token_iddecoder_start_token_id )
vocab_sizer   r   d_ffr   num_decoder_layersr   relative_attention_num_bucketsrelative_attention_max_distancedropout_rateclassifier_dropoutlayer_norm_epsiloninitializer_factorfeed_forward_proj	use_cachesplitdense_act_fnis_gated_actlen
ValueErrorsuper__init__)selfr#   r   r   r$   r   r%   r   r&   r'   r(   r*   r+   r,   r   r-   r   r   r   r    r!   r)   kwargsact_info	__class__s                           ^/var/www/html/venv/lib/python3.12/site-packages/transformers/models/umt5/configuration_umt5.pyr4   zUMT5Config.__init__R   s<   2 %		$"4"@doo 	 #.L+/N,("4"4"4!2"))//4$RL$QK72x=1!!73x=1;L'(9': ;) )  , *D 	
1+ 3%%#9	
 	
    )i  i   @   i      N          g?gư>g      ?r   TTT5TokenizerTr   r   r   g        )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr4   __classcell__)r8   s   @r9   r	   r	      sy    +Z J#4"5 *)	M ')(+&%  -A
 A
r:   r	   c                   b    e Zd Zedeeeeef   f   fd       Zedefd       Zede	fd       Z
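
# Illustration (a sketch, not part of the library): how `feed_forward_proj` is
# parsed by `UMT5Config.__init__` above. The hypothetical values below are for
# demonstration only.
#
#     UMT5Config(feed_forward_proj="relu")        # dense_act_fn="relu",  is_gated_act=False
#     UMT5Config(feed_forward_proj="gated-silu")  # dense_act_fn="silu",  is_gated_act=True
#     UMT5Config(feed_forward_proj="gated-gelu")  # dense_act_fn="gelu_new" (special-cased)
#     UMT5Config(feed_forward_proj="foo-bar")     # raises ValueError
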
y)UMT5OnnxConfigreturnc                     ddddddd}| j                   rd|d   d<   ddi|d	<   dd
d|d<   nddd|d	<   ddd|d<   | j                   r| j                  |d       |S )Nbatchencoder_sequence)r   r   )	input_idsattention_maskz past_encoder_sequence + sequencerP   r   r   decoder_input_idsz past_decoder_sequence + sequencedecoder_attention_maskdecoder_sequenceinputs)	direction)use_pastfill_with_past_key_values_)r5   common_inputss     r9   rT   zUMT5OnnxConfig.inputs   s     %);<").@A
 ==1SM*+A.23WM-.:AFh6iM235<AS1TM-.:AFX6YM23==++MX+Nr:   c                      y)N   r"   r5   s    r9   default_onnx_opsetz!UMT5OnnxConfig.default_onnx_opset   s     r:   c                      y)NgMb@?r"   r[   s    r9   atol_for_validationz"UMT5OnnxConfig.atol_for_validation   s    r:   N)rA   rB   rC   propertyr   strintrT   r\   floatr^   r"   r:   r9   rJ   rJ      sd    WS#X%6 67  $ C   U  r:   rJ   N)rD   typingr   configuration_utilsr   onnxr   utilsr   
get_loggerrA   loggerr	   rJ   r"   r:   r9   <module>ri      sG      3 -  
		H	%x
! x
v. r:   