
"""Idefics model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class IdeficsVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
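
    Example (a minimal sketch; importing the class directly from this module is an assumption,
    since only `IdeficsConfig` may be re-exported at the top level):

    ```python
    >>> from transformers.models.idefics.configuration_idefics import IdeficsVisionConfig

    >>> # Initializing a vision configuration with the Idefics-9B style defaults
    >>> vision_config = IdeficsVisionConfig()

    >>> # Overriding two of the documented arguments
    >>> vision_config = IdeficsVisionConfig(image_size=224, patch_size=14)
    ```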
    """

    model_type = "idefics_vision"
    attribute_map = {
        "hidden_size": "embed_dim",
    }

    def __init__(
        self,
        embed_dim=768,
        image_size=224,
        intermediate_size=5120,
        patch_size=14,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_channels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.intermediate_size = intermediate_size
        self.patch_size = patch_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act

        super().__init__(**kwargs)


class IdeficsPerceiverConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in the perceiver.
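
    Example (a minimal sketch; importing the class directly from this module is an assumption):

    ```python
    >>> from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig

    >>> # A Perceiver Resampler that "compresses" the image sequence down to 64 latents
    >>> perceiver_config = IdeficsPerceiverConfig(use_resampler=True, resampler_n_latents=64)
    ```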
    """

    model_type = "idefics_perciever"

    def __init__(
        self,
        use_resampler=False,
        resampler_n_latents=64,
        resampler_depth=6,
        resampler_n_heads=16,
        resampler_head_dim=96,
        qk_layer_norms_perceiver=False,
        **kwargs,
    ):
        self.use_resampler = use_resampler
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.resampler_n_heads = resampler_n_heads
        self.resampler_head_dim = resampler_head_dim
        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver

        super().__init__(**kwargs)


class IdeficsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        additional_vocab_size (`int`, *optional*, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`~IdeficsModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        cross_layer_interval (`int`, *optional*, defaults to 1):
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k.
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers.
        freeze_text_module_exceptions (`list`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`.
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze the lm head.
        freeze_vision_layers (`bool`, *optional*, defaults to `True`): Whether to freeze vision layers.
        freeze_vision_module_exceptions (`list`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`.
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler.
        vision_config (`IdeficsVisionConfig`, *optional*): Custom vision config or dict.
        perceiver_config (`IdeficsPerceiverConfig`, *optional*): Custom perceiver config or dict.

    Example:

    ```python
    >>> from transformers import IdeficsModel, IdeficsConfig

    >>> # Initializing an Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()

    >>> # Initializing a model from the idefics-9b style configuration
    >>> model = IdeficsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
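
    >>> # The vision/perceiver sub-configs may also be passed as plain dicts;
    >>> # __init__ converts them into IdeficsVisionConfig / IdeficsPerceiverConfig
    >>> # instances (see the isinstance checks in __init__ below)
    >>> configuration = IdeficsConfig(
    ...     vision_config={"embed_dim": 768, "patch_size": 14},
    ...     perceiver_config={"use_resampler": True, "resampler_n_latents": 64},
    ... )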
    ```"""

    model_type = "idefics"
    sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig}

    def __init__(
        self,
        vocab_size=32000,
        additional_vocab_size=0,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        dropout=0.0,
        hidden_act="silu",
        initializer_range=0.02,
        alpha_initializer="zeros",
        alphas_initializer_range=0.0,
        alpha_type="float",
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        cross_layer_interval=1,
        qk_layer_norms=False,
        freeze_text_layers=True,
        freeze_text_module_exceptions=[],
        freeze_lm_head=False,
        freeze_vision_layers=True,
        freeze_vision_module_exceptions=[],
        use_resampler=False,
        vision_config=None,
        perceiver_config=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.alpha_initializer = alpha_initializer
        self.alphas_initializer_range = alphas_initializer_range
        self.alpha_type = alpha_type
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        self.cross_layer_interval = cross_layer_interval
        self.qk_layer_norms = qk_layer_norms
        self.freeze_vision_layers = freeze_vision_layers

        self.freeze_text_layers = freeze_text_layers
        self.freeze_text_module_exceptions = freeze_text_module_exceptions
        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
        self.freeze_lm_head = freeze_lm_head

        self.use_resampler = use_resampler

        if perceiver_config is None:
            self.perceiver_config = IdeficsPerceiverConfig()
        elif isinstance(perceiver_config, dict):
            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
            self.perceiver_config = perceiver_config

        if vision_config is None:
            self.vision_config = IdeficsVisionConfig()
        elif isinstance(vision_config, dict):
            self.vision_config = IdeficsVisionConfig(**vision_config)
        elif isinstance(vision_config, IdeficsVisionConfig):
            self.vision_config = vision_config

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )