
"""PyTorch MMBT model."""

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
from ....modeling_utils import ModuleUtilsMixin
from ....utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MMBTConfig"


class ModalEmbeddings(nn.Module):
    """Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""

    def __init__(self, config, encoder, embeddings):
        super().__init__()
        self.config = config
        self.encoder = encoder
        # Project the modal encoder output into the transformer's hidden size.
        self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
        # Reuse the text transformer's embedding tables and normalization layers.
        self.position_embeddings = embeddings.position_embeddings
        self.token_type_embeddings = embeddings.token_type_embeddings
        self.word_embeddings = embeddings.word_embeddings
        self.LayerNorm = embeddings.LayerNorm
        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)

    def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
        token_embeddings = self.proj_embeddings(self.encoder(input_modal))
        seq_length = token_embeddings.size(1)

        if start_token is not None:
            # Prepend an optional start token (e.g. [CLS]) to the modal sequence.
            start_token_embeds = self.word_embeddings(start_token)
            seq_length += 1
            token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)

        if end_token is not None:
            # Append an optional end token (e.g. [SEP]) to the modal sequence.
            end_token_embeds = self.word_embeddings(end_token)
            seq_length += 1
            token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)

        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
            position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)

        if token_type_ids is None:
            token_type_ids = torch.zeros(
                (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
            )

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = token_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


MMBT_START_DOCSTRING = r"""
    MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
    Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
    It's a supervised multimodal bitransformer model that fuses information from text and other modality encoders
    (such as image encoders), and obtains state-of-the-art performance on various multimodal classification benchmark
    tasks.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration.
        transformer (`nn.Module`): A text transformer that is used by MMBT.
            It should have embeddings, encoder, and pooler attributes.
        encoder (`nn.Module`): Encoder for the second modality.
            It should take a batch of modal inputs and return embeddings of shape `(batch_size,
            modal_sequence_length, config.modal_hidden_size)` (see the sketch below).
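
    Example (a minimal sketch of an `encoder` satisfying this interface; the torchvision backbone and the
    `num_image_embeds` pooling scheme are illustrative assumptions, not part of this module):

    ```python
    import torch
    from torch import nn
    from torchvision.models import resnet152


    class ImageEncoder(nn.Module):
        def __init__(self, num_image_embeds=3):
            super().__init__()
            # Drop the classification head; keep only the convolutional feature extractor.
            self.backbone = nn.Sequential(*list(resnet152().children())[:-2])
            self.pool = nn.AdaptiveAvgPool2d((num_image_embeds, 1))

        def forward(self, input_modal):
            # (batch_size, 3, H, W) -> (batch_size, num_image_embeds, 2048)
            features = self.pool(self.backbone(input_modal))
            return torch.flatten(features, start_dim=2).transpose(1, 2)
    ```

    With such an encoder, `config.modal_hidden_size` would be 2048.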
"""

MMBT_INPUTS_DOCSTRING = r"""
    Args:
        input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`):
            The other modality data. Its shape is whatever the encoder for that modality expects, e.g. with an image
            encoder it would be `(batch_size, channels, height, width)`.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's
            appended to the end of other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Optional start token to be added to the other modality embedding. [CLS] is most commonly used for
            classification tasks.
        modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Optional end token to be added to the other modality embedding. [SEP] is most commonly used.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        modal_token_type_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
            Segment token indices to indicate different portions of the non-text modality. The embeddings from these
            tokens will be summed with the respective token embeddings for the non-text modality.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
            Selected in the range `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
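
    Example (a minimal sketch of how these inputs are typically prepared; the tokenizer checkpoint, the image tensor,
    and the `mmbt` instance below are illustrative assumptions):

    ```python
    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

    # The text goes in without a leading [CLS]; here [CLS]/[SEP] wrap the modal segment instead,
    # passed through `modal_start_tokens` / `modal_end_tokens`.
    input_ids = torch.tensor([tokenizer.encode("a dog playing fetch", add_special_tokens=False)])
    input_modal = torch.randn(1, 3, 224, 224)  # whatever shape the modal encoder expects
    modal_start_tokens = torch.tensor([tokenizer.cls_token_id])
    modal_end_tokens = torch.tensor([tokenizer.sep_token_id])

    # outputs = mmbt(
    #     input_modal,
    #     input_ids=input_ids,
    #     modal_start_tokens=modal_start_tokens,
    #     modal_end_tokens=modal_end_tokens,
    # )
    ```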
"""


@add_start_docstrings(
    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
    MMBT_START_DOCSTRING,
)
class MMBTModel(nn.Module, ModuleUtilsMixin):
    def __init__(self, config, transformer, encoder):
        super().__init__()
        self.config = config
        self.transformer = transformer
        self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)

    @add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_modal,
        input_ids=None,
        modal_start_tokens=None,
        modal_end_tokens=None,
        attention_mask=None,
        token_type_ids=None,
        modal_token_type_ids=None,
        position_ids=None,
        modal_position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        # For example purposes. Not runnable.
        transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
        encoder = ImageEncoder(args)
        mmbt = MMBTModel(config, transformer, encoder)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_txt_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_txt_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # Embed the non-text modality, optionally wrapping it with start/end tokens such as [CLS]/[SEP].
        modal_embeddings = self.modal_encoder(
            input_modal,
            start_token=modal_start_tokens,
            end_token=modal_end_tokens,
            position_ids=modal_position_ids,
            token_type_ids=modal_token_type_ids,
        )

        input_modal_shape = modal_embeddings.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)

        txt_embeddings = self.transformer.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # The modal embeddings come first, followed by the text embeddings.
        embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)

        input_shape = embedding_output.size()[:-1]

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        else:
            # Extend the text attention mask so the modal positions are always attended to.
            attention_mask = torch.cat(
                [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
            )
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(input_shape, device=device)
        else:
            encoder_attention_mask = torch.cat(
                [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
            )

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.transformer.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        pooled_output = self.transformer.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.transformer.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.transformer.embeddings.word_embeddings = value


@add_start_docstrings(
    """
    MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
    """,
    MMBT_START_DOCSTRING,
    MMBT_INPUTS_DOCSTRING,
)
class MMBTForClassification(nn.Module):
    r"""
    **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        `config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Squared loss); if
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        *Tuple* comprising various elements depending on the configuration (config) and inputs:

        **loss**: (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`:
            Classification (or regression if `config.num_labels == 1`) loss.
        **logits**: `torch.FloatTensor` of shape `(batch_size, config.num_labels)`:
            Classification (or regression if `config.num_labels == 1`) scores (before SoftMax).
        **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
            the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length,
            hidden_size)`: Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each
            layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`: Attention weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.

    Examples:

    ```python
    # For example purposes. Not runnable.
    transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
    encoder = ImageEncoder(args)
    model = MMBTForClassification(config, transformer, encoder)
    outputs = model(input_modal, input_ids, labels=labels)
    loss, logits = outputs[:2]
    ```"""

    def __init__(self, config, transformer, encoder):
        super().__init__()
        # Keep a reference to the config so `forward` can read `use_return_dict`.
        self.config = config
        self.num_labels = config.num_labels

        self.mmbt = MMBTModel(config, transformer, encoder)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_modal,
        input_ids=None,
        modal_start_tokens=None,
        modal_end_tokens=None,
        attention_mask=None,
        token_type_ids=None,
        modal_token_type_ids=None,
        position_ids=None,
        modal_position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mmbt(
            input_modal=input_modal,
            input_ids=input_ids,
            modal_start_tokens=modal_start_tokens,
            modal_end_tokens=modal_end_tokens,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            modal_token_type_ids=modal_token_type_ids,
            position_ids=position_ids,
            modal_position_ids=modal_position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            return_dict=return_dict,
        )

        # Classify from the pooled ([CLS]-style) representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # Regression task.
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )