
"""PyTorch TVLT model."""

import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ....activations import ACT2FN
from ....modeling_outputs import BaseModelOutput, SequenceClassifierOutput
from ....modeling_utils import PreTrainedModel
from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ....utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_tvlt import TvltConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "TvltConfig"
_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"


@dataclass
class TvltModelOutput(ModelOutput):
    """
    Class for TvltModel's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
            Pixel sequence of hidden-states at the output of the last layer of the model.
        last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
            Audio sequence of hidden-states at the output of the last layer of the model.
        pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
            Tensor indicating which pixel patches are masked (1) and which are not (0).
        audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
            Tensor indicating which audio patches are masked (1) and which are not (0).
        pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
            Tensor containing the ids permutation of pixel masking.
        audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
            Tensor containing the ids permutation of audio masking.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    last_pixel_hidden_state: torch.FloatTensor = None
    last_audio_hidden_state: torch.FloatTensor = None
    pixel_label_masks: torch.LongTensor = None
    audio_label_masks: torch.LongTensor = None
    pixel_ids_restore: torch.LongTensor = None
    audio_ids_restore: torch.LongTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class TvltDecoderOutput(ModelOutput):
    """
    Class for TvltDecoder's outputs, with potential hidden states and attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class TvltForPreTrainingOutput(ModelOutput):
    """
    Class for TvltForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
            Matching objective logits.
        pixel_logits (`torch.FloatTensor` of shape
            `(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction
            logits.
        audio_logits (`torch.FloatTensor` of shape
            `(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction
            logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    matching_logits: torch.FloatTensor = None
    pixel_logits: torch.FloatTensor = None
    audio_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
    """Generate noise for pixel masking."""
    batch_size, seq_len = pixel_values.shape[:2]
    noise = torch.rand((batch_size, seq_len), device=pixel_values.device)  # noise in [0, 1]
    len_keep = int(seq_len * (1 - mask_ratio))
    return noise, len_keep


def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
    """Generate noise for audio masking."""
    batch_size, seq_len = audio_values.shape[:2]
    if mask_type == "frame-level":
        num_time_patches = seq_len // freq_len
        noise = (
            torch.rand(batch_size, num_time_patches, device=audio_values.device)
            .unsqueeze(-1)
            .repeat(1, 1, freq_len)
            .view(batch_size, seq_len)
        )  # noise in [0, 1]
    elif mask_type == "patch-level":
        noise = torch.rand(batch_size, seq_len, device=audio_values.device)  # noise in [0, 1]
    len_keep = int(seq_len * (1 - mask_ratio))
    return noise, len_keep


def random_masking(sequence, noise, len_keep, attention_masks=None):
    """
    Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random
    noise. sequence: [batch_size, seq_len, hidden_dim], sequence
    """
    batch_size, seq_len, hidden_dim = sequence.shape

    # sort noise for each sample
    ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
    ids_restore = torch.argsort(ids_shuffle, dim=1)

    # keep the first subset
    ids_keep = ids_shuffle[:, :len_keep]
    sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))

    # generate the binary mask: 0 is keep, 1 is remove
    label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
    label_masks[:, :len_keep] = 0
    # unshuffle to get the binary mask
    label_masks = torch.gather(label_masks, dim=1, index=ids_restore)

    if attention_masks is not None:
        label_masks *= attention_masks
        attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)

    return sequence_masked, attention_masks, label_masks, ids_restore


class TvltPixelEmbeddings(nn.Module):
    """Construct the patch and position embeddings."""

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = TvltPixelPatchEmbeddings(config)
        self.num_patches_per_image = self.patch_embeddings.num_patches_per_image

        self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
        self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))

        self.config = config

    def forward(self, pixel_values, attention_masks=None):
        # create patch embeddings
        batch_size, num_frames, num_channels, height, width = pixel_values.shape

        embeddings = self.patch_embeddings(pixel_values)
        embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
        embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
        embeddings += self.type_embed_v

        return embeddings, attention_masks


class TvltAudioEmbeddings(nn.Module):
    """Construct the patch and position embeddings."""

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = TvltAudioPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches

        self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
        self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
        self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))

        self.config = config

    def forward(self, audio_values, attention_masks=None):
        # create patch embeddings
        embeddings = self.patch_embeddings(audio_values)

        num_time_patches = embeddings.size(1) // self.num_freq_patches
        embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
        embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
        embeddings += self.type_embed_a

        return embeddings, attention_masks


class TvltPixelPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
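
    With the default configuration (224x224 frames split into 16x16 patches), each frame yields 14 * 14 = 196
    patches.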
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.image_patch_size
        num_channels, hidden_size = config.num_image_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches_per_image = num_patches_per_image
        self.hidden_size = hidden_size

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )

        pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)

        return embeddings


class TvltAudioPatchEmbeddings(nn.Module):
    """
    This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
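
    With the default configuration (a 2048x128 spectrogram split into 16x16 patches), this yields 128 * 8 = 1024
    patches.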
    """

    def __init__(self, config):
        super().__init__()
        spectrogram_length, frequency_length, patch_size = (
            config.spectrogram_length,
            config.frequency_length,
            config.audio_patch_size,
        )
        num_channels, hidden_size = config.num_audio_channels, config.hidden_size

        spectrogram_size = (spectrogram_length, frequency_length)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
        patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])

        self.spectrogram_size = spectrogram_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = audio_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the audio values match with the one set in the configuration."
            )
        if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
            raise ValueError(
                f"Input audio size ({height}*{width}) doesn't match model"
                f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
            )
        embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)

        return embeddings


class TvltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the TvltModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class TvltSelfOutput(nn.Module):
    """
    The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: TvltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class TvltAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = TvltSelfAttention(config)
        self.output = TvltSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class TvltIntermediate(nn.Module):
    def __init__(self, config: TvltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class TvltOutput(nn.Module):
    def __init__(self, config: TvltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class TvltLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = TvltAttention(config)
        self.intermediate = TvltIntermediate(config)
        self.output = TvltOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states.to(attention_output.device)

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class TvltEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class TvltPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = TvltConfig
    base_model_prefix = "tvlt"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


TVLT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`TvltConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

TVLT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
            Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
            Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
            be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.

        pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See
            [`TvltProcessor.__call__`] for details.

        mask_pixel (`bool`, *optional*):
            Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining.

        mask_audio (`bool`, *optional*):
            Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.",
    TVLT_START_DOCSTRING,
)
class TvltModel(TvltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.pixel_embeddings = TvltPixelEmbeddings(config)
        self.audio_embeddings = TvltAudioEmbeddings(config)
        self.encoder = TvltEncoder(config)

        self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        if config.use_mean_pooling:
            self.layernorm = None
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.pixel_embeddings.patch_embeddings, self.audio_embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TvltModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        audio_values: torch.FloatTensor,
        pixel_mask: Optional[torch.FloatTensor] = None,
        audio_mask: Optional[torch.FloatTensor] = None,
        mask_pixel: bool = False,
        mask_audio: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], TvltModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import TvltProcessor, TvltModel
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 8
        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
        >>> audio = list(np.random.randn(10000))

        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
        >>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

        >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")

        >>> outputs = model(**input_dict)
        >>> last_hidden_state = outputs.last_hidden_state
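        >>> # the modality-specific parts of the fused sequence are also exposed on the output
        >>> last_pixel_hidden_state = outputs.last_pixel_hidden_state
        >>> last_audio_hidden_state = outputs.last_audio_hidden_state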
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        pixel_embedding_output, pixel_mask = self.pixel_embeddings(pixel_values, pixel_mask)

        audio_embedding_output, audio_mask = self.audio_embeddings(audio_values, audio_mask)

        # Mask pixel if mask_pixel is True
        pixel_label_masks = None
        pixel_ids_restore = None
        if mask_pixel:
            pixel_mask_noise, pixel_len_keep = generate_pixel_mask_noise(
                pixel_embedding_output, pixel_mask=pixel_mask, mask_ratio=self.config.pixel_mask_ratio
            )
            pixel_embedding_output, pixel_mask, pixel_label_masks, pixel_ids_restore = random_masking(
                pixel_embedding_output,
                pixel_mask_noise,
                pixel_len_keep,
                attention_masks=pixel_mask,
            )

        # Mask audio if mask_audio is True
        audio_label_masks = None
        audio_ids_restore = None
        if mask_audio:
            num_freq_patches = self.config.frequency_length // self.config.audio_patch_size[1]
            audio_mask_noise, audio_len_keep = generate_audio_mask_noise(
                audio_embedding_output,
                audio_mask=audio_mask,
                mask_ratio=self.config.audio_mask_ratio,
                mask_type=self.config.audio_mask_type,
                freq_len=num_freq_patches,
            )
            audio_embedding_output, audio_mask, audio_label_masks, audio_ids_restore = random_masking(
                audio_embedding_output,
                audio_mask_noise,
                audio_len_keep,
                attention_masks=audio_mask,
            )

        # Prepare for encoder inputs and attention masks
        batch_size = pixel_values.size(0)
        embedding_output = torch.cat(
            [self.cls_embedding.repeat(batch_size, 1, 1), pixel_embedding_output, audio_embedding_output], 1
        )
        masked_pixel_len = pixel_embedding_output.size(1)

        attention_mask = None
        if pixel_mask is not None and audio_mask is not None:
            attention_mask = torch.cat([pixel_mask[:, :1], pixel_mask, audio_mask], 1)

        input_shape = embedding_output.size()
        extended_attention_mask = None
        if attention_mask is not None:
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        pixel_sequence_output = sequence_output[:, 1 : 1 + masked_pixel_len]
        audio_sequence_output = sequence_output[:, 1 + masked_pixel_len :]
        if not return_dict:
            return (
                sequence_output,
                pixel_sequence_output,
                audio_sequence_output,
                pixel_label_masks,
                audio_label_masks,
                pixel_ids_restore,
                audio_ids_restore,
            ) + encoder_outputs[1:]

        return TvltModelOutput(
            last_hidden_state=sequence_output,
            last_pixel_hidden_state=pixel_sequence_output,
            last_audio_hidden_state=audio_sequence_output,
            pixel_label_masks=pixel_label_masks,
            audio_label_masks=audio_label_masks,
            pixel_ids_restore=pixel_ids_restore,
            audio_ids_restore=audio_ids_restore,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class TvltDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)

        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    None,
                    None,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # predictor projection
        logits = self.layernorm(hidden_states)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)


@add_start_docstrings(
    "The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
    TVLT_START_DOCSTRING,
)
class TvltForPreTraining(TvltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.task_matching = config.task_matching
        self.task_mae = config.task_mae
        if not (self.task_matching or self.task_mae):
            raise ValueError("Must set at least one of matching task and MAE task to true")

        self.tvlt = TvltModel(config)

        if self.task_matching:
            self.matching_head = TvltMatchingHead(config)

        if self.task_mae:
            self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)

            self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
            self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))

            self.decoder = TvltDecoder(config)

            decoder_hidden_size = config.decoder_hidden_size

            num_frames = config.num_frames
            num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
            self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
            self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
            self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))

            num_audio_patches = self.tvlt.audio_embeddings.num_patches
            num_freq_patches = config.frequency_length // config.audio_patch_size[1]
            self.decoder_audio_pos_embed = nn.Parameter(
                torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
            )
            self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
            self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))

            pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
            self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
            audio_mae_output_dim = (
                self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
            )
            self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)

            self.num_frames = num_frames
            self.num_patches_per_image = num_patches_per_image
            self.num_freq_patches = num_freq_patches
            self.image_patch_size = config.image_patch_size
            self.audio_patch_size = config.audio_patch_size

        # Initialize weights and apply final processing
        self.post_init()

    def patchify_pixel(self, pixel_values):
        """
        pixel_values: [batch_size, num_frames, 3, height, width]
        """
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        num_patches_height = pixel_values.shape[3] // self.image_patch_size[0]
        num_patches_width = pixel_values.shape[4] // self.image_patch_size[1]
        patchified_pixel_values = pixel_values.reshape(
            shape=(
                batch_size,
                num_frames,
                num_channels,
                num_patches_height,
                self.image_patch_size[0],
                num_patches_width,
                self.image_patch_size[1],
            )
        )
        patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
        patchified_pixel_values = patchified_pixel_values.reshape(
            shape=(
                batch_size,
                num_patches_height * num_patches_width * num_frames,
                self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
            )
        )
        return patchified_pixel_values

    def patchify_audio(self, audio_values):
        """
        audio_values: [batch_size, 1, height, width]
        """
        batch_size, num_channels, height, width = audio_values.shape
        num_patches_height = height // self.audio_patch_size[0]
        num_patches_width = width // self.audio_patch_size[1]
        patchified_audio_values = audio_values.reshape(
            shape=(
                batch_size,
                num_channels,
                num_patches_height,
                self.audio_patch_size[0],
                num_patches_width,
                self.audio_patch_size[1],
            )
        )
        patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
        patchified_audio_values = patchified_audio_values.reshape(
            shape=(
                batch_size,
                num_patches_height * num_patches_width,
                self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
            )
        )
        return patchified_audio_values

    def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
        patchified_pixel_values = self.patchify_pixel(pixel_values)
        loss = (pixel_predictions - patchified_pixel_values) ** 2
        loss = loss.mean(dim=-1)  # [batch_size, pixel_patch_length], mean loss per patch
        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def audio_mae_loss(self, audio_values, audio_predictions, mask):
        patchified_audio_values = self.patchify_audio(audio_values)
        loss = (audio_predictions - patchified_audio_values) ** 2
        loss = loss.mean(dim=-1)  # [batch_size, audio_patch_length], mean loss per patch
        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss

    def concatenate_mask(self, mask_token, sequence, ids_restore):
        batch_size, seq_length, dim = sequence.shape
        mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
        padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
        padded_sequence = torch.gather(
            padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
        )  # unshuffle
        return padded_sequence

    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        audio_values: torch.FloatTensor,
        pixel_mask: Optional[torch.FloatTensor] = None,
        audio_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        pixel_values_mixed: Optional[torch.FloatTensor] = None,
        pixel_mask_mixed: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], TvltForPreTrainingOutput]:
        r"""
        pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Audio values can be
            obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.

        pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel masks of pixel_values_mixed. Pixel values mixed can be obtained using [`TvltProcessor`]. See
            [`TvltProcessor.__call__`] for details.

        labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the vision audio matching loss. Indices should be in `[0, 1]`. num_labels has to be 1.

        Return:

        Examples:

        ```python
        >>> from transformers import TvltProcessor, TvltForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 8
        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
        >>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224))
        >>> audio = list(np.random.randn(10000))
        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
        >>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base")
        >>> input_dict = processor(
        ...     images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt"
        ... )

        >>> outputs = model(**input_dict)
        >>> loss = outputs.loss
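        >>> # the per-objective outputs are exposed alongside the combined loss
        >>> pixel_logits, audio_logits = outputs.pixel_logits, outputs.audio_logits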
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        total_loss = 0.0

        matching_logits = None
        if self.task_matching:
            if labels is None:
                raise ValueError("Matching task requires labels")
            if pixel_values_mixed is None:
                raise ValueError("Matching task requires pixel_values_mixed")

            outputs = self.tvlt(
                pixel_values_mixed,
                audio_values,
                pixel_mask=pixel_mask_mixed,
                audio_mask=audio_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            sequence_output = outputs[0]

            matching_logits = self.matching_head(sequence_output)

            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(matching_logits.view(-1), labels.view(-1))
            total_loss += loss

        pixel_logits = None
        audio_logits = None
        if self.task_mae and self.training:
            outputs = self.tvlt(
                pixel_values,
                audio_values,
                pixel_mask=pixel_mask,
                audio_mask=audio_mask,
                mask_pixel=True,
                mask_audio=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            pixel_sequence_output = outputs.last_pixel_hidden_state if return_dict else outputs[1]
            audio_sequence_output = outputs.last_audio_hidden_state if return_dict else outputs[2]
            pixel_label_masks = outputs.pixel_label_masks if return_dict else outputs[3]
            audio_label_masks = outputs.audio_label_masks if return_dict else outputs[4]
            pixel_ids_restore = outputs.pixel_ids_restore if return_dict else outputs[5]
            audio_ids_restore = outputs.audio_ids_restore if return_dict else outputs[6]

            pixel_decoder_input = self.encoder_to_decoder(
                pixel_sequence_output
            )  # [batch_size, num_masked_pixel_patches, decoder_hidden_size]
            audio_decoder_input = self.encoder_to_decoder(
                audio_sequence_output
            )  # [batch_size, num_masked_audio_patches, decoder_hidden_size]
            num_frames = pixel_values.size(1)
            pixel_decoder_input = self.concatenate_mask(self.pixel_mask_token, pixel_decoder_input, pixel_ids_restore)
            pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_pos_embed.repeat(1, num_frames, 1)
            pixel_decoder_input = pixel_decoder_input + torch.repeat_interleave(
                self.decoder_temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1
            )
            pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_type_embed
            pixel_decoder_outputs = self.decoder(pixel_decoder_input)
            pixel_logits = self.pixel_mae_head(pixel_decoder_outputs.logits)

            audio_decoder_input = self.concatenate_mask(self.audio_mask_token, audio_decoder_input, audio_ids_restore)
            num_time_patches = audio_decoder_input.size(1) // self.num_freq_patches
            audio_decoder_input = audio_decoder_input + self.decoder_freq_embed.repeat(1, num_time_patches, 1)
            audio_decoder_input = audio_decoder_input + torch.repeat_interleave(
                self.decoder_audio_pos_embed[:, :num_time_patches], self.num_freq_patches, dim=1
            )
            audio_decoder_input = audio_decoder_input + self.decoder_audio_type_embed
            audio_decoder_outputs = self.decoder(audio_decoder_input)
            audio_logits = self.audio_mae_head(audio_decoder_outputs.logits)

            loss = self.pixel_mae_loss(pixel_values, pixel_logits, pixel_label_masks) + self.audio_mae_loss(
                audio_values, audio_logits, audio_label_masks
            )
            total_loss += loss

        if not return_dict:
            output = (matching_logits, pixel_logits, audio_logits) + outputs[7:]
            return ((total_loss,) + output) if total_loss is not None else output

        return TvltForPreTrainingOutput(
            loss=total_loss,
            matching_logits=matching_logits,
            pixel_logits=pixel_logits,
            audio_logits=audio_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class TvltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class TvltMatchingHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pooler = TvltPooler(config)
        self.fc = nn.Linear(config.hidden_size, 1)

    def forward(self, hidden_states):
        hidden_states = self.fc(self.pooler(hidden_states))
        return hidden_states


class TvltMAEHead(nn.Module):
    def __init__(self, config, output_dim=None):
        super().__init__()
        self.config = config
        self.decoder = nn.Linear(config.decoder_hidden_size, output_dim)

    def forward(self, hidden_states):
        hidden_states = self.decoder(hidden_states)
        return hidden_states


@add_start_docstrings(
    """
    Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
    for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
    """,
    TVLT_START_DOCSTRING,
)
class TvltForAudioVisualClassification(TvltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.tvlt = TvltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )
        self.config = config

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        audio_values: torch.FloatTensor,
        pixel_mask: Optional[torch.FloatTensor] = None,
        audio_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
            refers to the number of classes in audiovisual tasks.

        Return:

        Examples:
        ```python
        >>> from transformers import TvltProcessor, TvltForAudioVisualClassification
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 8
        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
        >>> audio = list(np.random.randn(10000))
        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
        >>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
        >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")

        >>> outputs = model(**input_dict)
        >>> loss = outputs.loss
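        >>> # without labels the loss is `None`; the logits are still available
        >>> predicted_class_id = outputs.logits.argmax(-1).item()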
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.tvlt(
            pixel_values,
            audio_values,
            pixel_mask=pixel_mask,
            audio_mask=audio_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0][:, 0]
        logits = self.classifier(sequence_output)  # rank value

        loss = None
        if labels is not None:
            if self.config.loss_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits, labels)
            elif self.config.loss_type == "classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[7:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
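

if __name__ == "__main__":
    # Minimal smoke-test sketch (an editorial addition, not part of the upstream file):
    # builds a randomly initialized TvltModel under the default TvltConfig and checks
    # that a forward pass runs. The tensor shapes below assume the default configuration
    # (8 frames of 224x224 RGB video and one 2048x128 single-channel spectrogram).
    config = TvltConfig()
    model = TvltModel(config).eval()
    pixel_values = torch.randn(
        1, config.num_frames, config.num_image_channels, config.image_size, config.image_size
    )
    audio_values = torch.randn(1, config.num_audio_channels, config.spectrogram_length, config.frequency_length)
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, audio_values=audio_values)
    print(outputs.last_hidden_state.shape)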