
"""PyTorch ViLT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_vilt import ViltConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "ViltConfig"
_CHECKPOINT_FOR_DOC = "dandelin/vilt-b32-mlm"


@dataclass
class ViltForImagesAndTextClassificationOutput(ModelOutput):
    """
    Class for outputs of [`ViltForImagesAndTextClassification`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
            the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the attention
            weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attention weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    Nlosslogitshidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r    r   r   r!        Y/var/www/html/venv/lib/python3.12/site-packages/transformers/models/vilt/modeling_vilt.pyr   r   4   sl    $ )-D(5$$
%, $FE$>BM8Du'8'8!9:;B;?JeE$5$5678?r*   r   c                   4     e Zd ZdZ fdZddZ	 ddZ xZS )ViltEmbeddingsz
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
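
    During the forward pass, patch position embeddings are bilinearly interpolated to each image's actual resolution
    and at most `config.max_image_length` patches are kept per image (see `visual_embed` below).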
    """

    def __init__(self, config):
        super().__init__()

        # text embeddings
        self.text_embeddings = TextEmbeddings(config)
        # patch embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = ViltPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # modality type (text/patch) embeddings
        self.token_type_embeddings = nn.Embedding(config.modality_type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        _, _, ph, pw = self.patch_embeddings.projection.weight.shape

        x = self.patch_embeddings(pixel_values)
        x_mask = pixel_mask[:, None, :, :].float()
        x_mask = nn.functional.interpolate(x_mask, size=(x.shape[2], x.shape[3])).long()
        x_h = x_mask[:, 0].sum(dim=1)[:, 0]
        x_w = x_mask[:, 0].sum(dim=2)[:, 0]

        batch_size, num_channels, height, width = x.shape
        patch_dim = self.config.image_size // self.config.patch_size
        spatial_pos = self.position_embeddings[:, 1:, :].transpose(1, 2).view(1, num_channels, patch_dim, patch_dim)
        pos_embed = torch.cat(
            [
                nn.functional.pad(
                    nn.functional.interpolate(
                        spatial_pos,
                        size=(h, w),
                        mode="bilinear",
                        align_corners=True,
                    ),
                    (0, width - w, 0, height - h),
                )
                for h, w in zip(x_h, x_w)
            ],
            dim=0,
        )

        pos_embed = pos_embed.flatten(2).transpose(1, 2)
        x = x.flatten(2).transpose(1, 2)
        patch_index = torch.stack(
            meshgrid(torch.arange(x_mask.shape[-2]), torch.arange(x_mask.shape[-1]), indexing="ij"), dim=-1
        ).to(device=x_mask.device)
        patch_index = patch_index[None, None, :, :, :]
        patch_index = patch_index.expand(x_mask.shape[0], x_mask.shape[1], -1, -1, -1)
        patch_index = patch_index.flatten(1, 3)
        x_mask = x_mask.flatten(1)

        if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
            effective_resolution = x_h * x_w
            max_image_length = effective_resolution.max()
        else:
            effective_resolution = x_h * x_w
            max_image_length = min(effective_resolution.max(), max_image_length)

        valid_idx = x_mask.nonzero(as_tuple=False)
        non_valid_idx = (1 - x_mask).nonzero(as_tuple=False)
        unique_rows = valid_idx[:, 0].unique()
        valid_row_idx = [valid_idx[valid_idx[:, 0] == u] for u in unique_rows]
        non_valid_row_idx = [non_valid_idx[non_valid_idx[:, 0] == u] for u in unique_rows]

        valid_nums = [v.size(0) for v in valid_row_idx]
        non_valid_nums = [v.size(0) for v in non_valid_row_idx]
        pad_nums = [max_image_length - v for v in valid_nums]

        select = []
        for i, (v, nv, p) in enumerate(zip(valid_nums, non_valid_nums, pad_nums)):
            if p <= 0:
                valid_choice = torch.multinomial(torch.ones(v).float(), max_image_length)
                select.append(valid_row_idx[i][valid_choice])
            else:
                pad_choice = torch.multinomial(torch.ones(nv).float(), p, replacement=True)
                select.append(torch.cat([valid_row_idx[i], non_valid_row_idx[i][pad_choice]], dim=0))

        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        pos_embed = torch.cat(
            (self.position_embeddings[:, 0, :][:, None, :].expand(batch_size, -1, -1), pos_embed), dim=1
        )
        x = x + pos_embed
        x = self.dropout(x)

        x_mask = torch.cat([torch.ones(x_mask.shape[0], 1).to(x_mask), x_mask], dim=1)

        return x, x_mask, (patch_index, (height, width))

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        pixel_values,
        pixel_mask,
        inputs_embeds,
        image_embeds,
        image_token_type_idx=1,
    ):
        # PART 1: text embeddings
        text_embeds = self.text_embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # PART 2: patch embeddings (with interpolated position encodings)
        if image_embeds is None:
            image_embeds, image_masks, patch_index = self.visual_embed(
                pixel_values, pixel_mask, max_image_length=self.config.max_image_length
            )
        else:
            image_masks = pixel_mask.flatten(1)

        # PART 3: add modality type embeddings
        # 0 indicates text, 1 indicates image, 2 is optionally used when a second image is provided (NLVR2)
        if image_token_type_idx is None:
            image_token_type_idx = 1
        text_embeds = text_embeds + self.token_type_embeddings(
            torch.zeros_like(attention_mask, dtype=torch.long, device=text_embeds.device)
        )
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx, dtype=torch.long, device=text_embeds.device)
        )

        # PART 4: concatenate
        embeddings = torch.cat([text_embeds, image_embeds], dim=1)
        masks = torch.cat([attention_mask, image_masks], dim=1)

        return embeddings, masks


class TextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class ViltPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        target_dtype = self.projection.weight.dtype
        x = self.projection(pixel_values.to(dtype=target_dtype))
        return x


class ViltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the ViltModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViltSelfOutput(nn.Module):
    """
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViltAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = ViltSelfAttention(config)
        self.output = ViltSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViltIntermediate(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViltOutput(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class ViltLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViltAttention(config)
        self.intermediate = ViltIntermediate(config)
        self.output = ViltOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViLT, layernorm is applied before self-attention
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states.to(attention_output.device)

        # in ViLT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class ViltEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViltLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class ViltPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ViltConfig
    base_model_prefix = "vilt"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


VILT_START_DOCSTRING = r"""
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ subclass. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViltConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VILT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ViltImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            `What are attention masks? <../glossary.html#attention-mask>`__

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ViltImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            `What are attention masks? <../glossary.html#attention-mask>`__

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_images, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ViLT Model transformer outputting raw hidden-states without any specific head on top.",
    VILT_START_DOCSTRING,
)
class ViltModel(ViltPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = ViltEmbeddings(config)
        self.encoder = ViltEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViltPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.text_embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.text_embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        image_token_type_idx: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, Tuple[torch.FloatTensor]]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
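        >>> # (illustrative) the pooled [CLS] representation of the image-text pair is also available
        >>> pooled_output = outputs.pooler_output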
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@add_start_docstrings(
    """
    ViLT Model with a language modeling head on top as done during pretraining.
    """,
    VILT_START_DOCSTRING,
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings
        self.mlm_score.bias = new_embeddings.bias

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import requests
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable model parallelism
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
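
    The head is a two-layer MLP (Linear -> LayerNorm -> GELU -> Linear) applied to the pooled [CLS] representation.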
    """,
    VILT_START_DOCSTRING,
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
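
    The ranking head (`rank_output`) is a single linear layer producing one matching score per image-text pair.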
    """,
    VILT_START_DOCSTRING,
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
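        >>> # (illustrative) rank the candidate texts by their matching score
        >>> best_text = max(scores, key=scores.get)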
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            # the ranking objective used during fine-tuning is not implemented here
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        loss = None

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """,
    VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING,
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head: operates on the concatenation of one pooled [CLS] vector per image
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ViltForImagesAndTextClassificationOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[ViltForImagesAndTextClassificationOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import requests
        >>> from PIL import Image

        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image1, image2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image-text pair through the model, with a distinct image token type per image
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 2,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the same device as the logits before computing the loss
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@add_start_docstrings(
    """
    ViLT Model with a token classification head on top (a linear layer on top of the final hidden-states of the text
    tokens) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    VILT_START_DOCSTRING,
)
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # only the text positions of the multimodal sequence are classified
        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the same device as the logits before computing the loss
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
r*   r  )Dr%   collections.abcr   r   dataclassesr   typingr   r   r   r   r&   torch.utils.checkpointr   torch.nnr	   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_viltr   
get_loggerr"   loggerr|  _CHECKPOINT_FOR_DOCr   Moduler-   r2   r8   r   r   r  r  r  r#  r.  rG  VILT_START_DOCSTRINGr{  4VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRINGrX  r]  r  r  r  r  r  r  r  r)   r*   r+   <module>r     s      ! / /    % !  . 
 u t * 
		H	%-  @{ @ @2W!RYY W!t6RYY 6r")) >9		 9zRYY $BII Fryy " #		 #L2
")) 2
j*/ *8	 5 n58 4p dN
# N
	N
b   	C
) C
C
L")) "")) ,  g
2 g
g
T  Z
#6 Z
Z
z  9	D
)< D
D
N  L
!4 L
L
r*   