
"""PyTorch TVP Model"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import prune_linear_layer
from ...utils import logging
from ...utils.backbone_utils import load_backbone
from .configuration_tvp import TvpConfig


logger = logging.get_logger(__name__)


@dataclass
class TvpVideoGroundingOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Temporal-Distance IoU loss for video grounding.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
            input texts.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class TvpLoss(nn.Module):
    """
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`List[str]`):
            List of all the losses to be applied.
    """

    def __init__(self, losses):
        super().__init__()
        self.loss_map = {
            "iou": self.loss_iou,
            "distance": self.loss_distance,
            "duration": self.loss_duration,
        }
        for loss in losses:
            if loss not in self.loss_map:
                raise ValueError(f"Loss {loss} not supported")

        self.losses = losses

    def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the intersection over union.
        """
        inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
        union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
        iou = 1 - inter.clamp(min=0) / union

        return iou

    def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the distance of mid points.
        """
        mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
        mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
        distance_diff = torch.div(
            torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
        ).clamp(min=0.2)

        return distance_diff

    def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the difference of duration.
        """
        duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
        duration_groundtruth = torch.sub(end_time, start_time)
        duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
        duration_diff = duration_diff.clamp(min=0.4)

        return duration_diff

    def forward(self, logits, labels):
        """
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`List[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        """
        duration, start_time, end_time = labels
        candidates = torch.mul(logits, duration)
        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()

        losses_dict = {}
        for loss in self.losses:
            losses_dict.update(
                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
            )

        return losses_dict


class TvpVisionModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.backbone = load_backbone(config)

        if config.backbone_config is not None:
            in_channels = config.backbone_config.hidden_sizes[-1]
        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_sizes"):
            in_channels = self.backbone.config.hidden_sizes[-1]
        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_size"):
            in_channels = self.backbone.config.hidden_size
        else:
            raise ValueError("Backbone config not found")

        self.grid_encoder_conv = nn.Conv2d(
            in_channels,
            config.hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            bias=False,
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        # (batch_size * num_frames, num_channels, height, width)
        pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
        grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
        grid = self.grid_encoder_conv(grid_feat_outputs)
        grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
        grid = nn.functional.relu(grid, inplace=True)
        new_channel, new_height, new_width = grid.shape[-3:]
        # (batch_size, num_frames, num_channels, height, width)
        grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
        # (batch_size, num_frames, height, width, num_channels)
        grid = grid.permute(0, 1, 3, 4, 2)

        return grid


class TvpVisualInputEmbedding(nn.Module):
    """
    Takes input of both image and video (multi-frame)
    """

    def __init__(self, config):
        super().__init__()
        # sequence embedding
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.max_grid_row_position_embeddings = config.max_grid_row_position_embeddings
        self.max_grid_col_position_embeddings = config.max_grid_col_position_embeddings

    def interpolate_pos_encoding(self, embedding: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).
        """
        h0 = w0 = 1
        # if height dimension is to be interpolated
        if height > self.max_grid_row_position_embeddings:
            h0 = height / self.max_grid_row_position_embeddings
        # if width dimension is to be interpolated
        if width > self.max_grid_col_position_embeddings:
            w0 = width / self.max_grid_col_position_embeddings
        embedding = embedding.permute(0, 3, 1, 2)  # (batch_size, hidden_dim, height, width)
        embedding = nn.functional.interpolate(
            embedding,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        embedding = embedding.permute(0, 2, 3, 1)  # (batch_size, height, width, hidden_dim)
        return embedding

    def add_2d_positional_embeddings(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        """
        batch_size, height, width, hidden_dim = grid.shape

        # add row-wise position embeddings
        # (height, )
        row_height = min(self.max_grid_row_position_embeddings, height)
        row_position_ids = torch.arange(row_height, dtype=torch.long, device=grid.device)
        # (height, hidden_dim)
        row_position_embeddings = self.row_position_embeddings(row_position_ids)
        row_shape = (1,) * (len(grid.shape) - 3) + (row_height, 1, hidden_dim)
        # (1, height, 1, hidden_dim)
        row_position_embeddings = row_position_embeddings.view(*row_shape)

        # add column-wise position embeddings
        row_width = min(self.max_grid_col_position_embeddings, width)
        col_position_ids = torch.arange(row_width, dtype=torch.long, device=grid.device)
        # (width, hidden_dim)
        col_position_embeddings = self.col_position_embeddings(col_position_ids)
        col_shape = (batch_size, 1, row_width, hidden_dim)
        # (batch_size, 1, width, hidden_dim)
        col_position_embeddings = col_position_embeddings.view(*col_shape)
        positional_embeddings = row_position_embeddings + col_position_embeddings

        # This interpolation gets triggered ONLY when the input image dimensions are larger than the grid
        # the position embeddings were trained for.
        if interpolate_pos_encoding and (
            height > self.max_grid_row_position_embeddings or width > self.max_grid_col_position_embeddings
        ):
            grid = grid + self.interpolate_pos_encoding(positional_embeddings, height, width)
        else:
            grid = grid + positional_embeddings
        return grid

    def forward(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        """
        batch_size, num_frames, height, width, num_channels = grid.shape

        # temporal mean pooling, (batch_size, height, width, hidden_size)
        grid = grid.mean(1)
        grid = self.add_2d_positional_embeddings(grid, interpolate_pos_encoding=interpolate_pos_encoding)
        # image token sequence, (batch_size, height*width, num_channels)
        visual_tokens = grid.view(batch_size, -1, num_channels)
        visual_tokens_shape = visual_tokens.shape[:-1]
        device = visual_tokens.device

        # image token type embeddings.
        token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = visual_tokens + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


class TvpTextInputEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TvpAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
        for head in heads:
            # Compute how many pruned heads are before the head and move the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
        return (
            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        batch_size, sequence_length = hidden_states.shape[:2]
        mixed_query_layer = self.query(hidden_states)

        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attn_dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        attn_output = torch.matmul(attention_probs, value_layer)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)

        attn_output = self.dense(attn_output)
        attn_output = self.dropout(attn_output)
        attn_output = self.layer_norm(attn_output + hidden_states)
        # add attentions if we output them
        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)

        return outputs


class TvpIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class TvpOutputLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.layer_norm(hidden_states + input_tensor)
        return hidden_states


class TvpEncodeLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = TvpAttention(config)
        self.intermediate = TvpIntermediate(config)
        self.output = TvpOutputLayer(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        outputs = (layer_output,) + outputs
        return outputs


class TvpEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        all_hidden_states = ()
        all_attentions = ()

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    (head_mask[i] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, attention_mask, (head_mask[i] if head_mask is not None else None), output_attentions
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_attentions,)
            return outputs  # last-layer hidden state, (all hidden states), (all attentions)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states if output_hidden_states else None,
            attentions=all_attentions if output_attentions else None,
        )


class TvpPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class TvpPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = TvpConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
        if isinstance(module, nn.Conv2d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)


TVP_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

TVP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See [`TvpImageProcessor.__call__`]
            for details.

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained image pad prompter encodings and positional encodings.
"""


class TvpFrameDownPadPrompter(nn.Module):
    """
    Pad frames extracted from videos only at the bottom.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.visual_prompt_size = config.visual_prompt_size
        self.frame_num = config.frame_num
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply

        self.pad_down = nn.Parameter(
            torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
        )

    def forward(self, pixel_values):
        if self.visual_prompter_apply != "add":
            visual_prompt_mask = torch.ones(
                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
            )
            visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply != "remove":
            prompt = torch.zeros(
                [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
                device=pixel_values.device,
            )
            start_point = self.max_img_size - self.visual_prompt_size
            prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
            pixel_values += prompt.to(pixel_values.dtype)
        return pixel_values


class TvpFramePadPrompter(nn.Module):
    """
    Pad frames extracted from videos in the surroundings.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.num_frames = config.num_frames
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply
        self.base_size = config.max_img_size - config.visual_prompt_size * 2
        self.pad_up = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_down = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_left = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )
        self.pad_right = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )

    def interpolate_pad_encoding(self, prompt: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).
        """
        # creates scale factor from height and width of original image w.r.t. the config.max_img_size
        h0, w0 = height / self.max_img_size, width / self.max_img_size

        batch, num_frames, channels, prompt_height, prompt_width = prompt.shape

        # reshaping the batch and num_frames dimension into a single one (i.e. (b, frames, c, h, w) --> (b*frames, c, h, w)),
        # to apply bicubic interpolation
        prompt = prompt.reshape(batch * num_frames, channels, prompt_height, prompt_width)

        prompt = torch.nn.functional.interpolate(
            prompt,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        # reversing back to (batch, frames, channels, height, width), where height and width are the new
        # interpolated height and width
        prompt = prompt.reshape(batch, num_frames, channels, height, width)
        return prompt

    def forward(self, pixel_values, interpolate_pad_encoding: bool = False):
        height, width = (
            (pixel_values.shape[-2], pixel_values.shape[-1])
            if interpolate_pad_encoding
            else (self.max_img_size, self.max_img_size)
        )
        if self.visual_prompter_apply not in ("add", "remove", "replace"):
            raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
        if self.visual_prompter_apply in ("replace", "remove"):
            visual_prompt_mask = torch.ones([height, width], dtype=pixel_values.dtype, device=pixel_values.device)
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply in ("replace", "add"):
            base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
            prompt = torch.cat(pixel_values.size(0) * [prompt])
            if interpolate_pad_encoding:
                prompt = self.interpolate_pad_encoding(prompt, height, width)
            pixel_values = pixel_values + prompt.to(pixel_values.dtype)
        return pixel_values


TVP_PROMPTER_CLASSES_MAPPING = {
    "framedownpad": TvpFrameDownPadPrompter,
    "framepad": TvpFramePadPrompter,
}


@add_start_docstrings(
    "The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.",
    TVP_START_DOCSTRING,
)
class TvpModel(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.vision_model = TvpVisionModel(config)
        self.embeddings = TvpTextInputEmbeddings(config)
        self.visual_embeddings = TvpVisualInputEmbedding(config)
        self.encoder = TvpEncoder(config)
        self.pooler = TvpPooler(config)
        self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
            raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
        self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class="TvpConfig")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ):
        r"""
        Returns:

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Add visual prompt, it compensates for the spatiotemporal information loss in 2D visual features.
        pixel_values = self.vision_model(
            self.visual_prompter(pixel_values, interpolate_pad_encoding=interpolate_pos_encoding)
        )
        # (batch_size, sequence_length, hidden_size)
        text_embedding_output = self.embeddings(input_ids=input_ids)
        # (batch_size, visual_sequence_length, hidden_size)
        visual_embedding_output = self.visual_embeddings(
            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )
        if attention_mask is not None:
            # (batch_size, visual_sequence_length)
            visual_attention_mask = attention_mask.new_ones(visual_embedding_output.shape[:2])
            pt_mask = torch.ones(attention_mask.shape[0], 10).to(
                device=attention_mask.device, dtype=attention_mask.dtype
            )
            attention_mask = torch.cat([pt_mask, attention_mask, visual_attention_mask], dim=-1)
            attention_mask = self.get_extended_attention_mask(attention_mask, attention_mask.size()).to(
                input_ids.device
            )
        text_prompt = self.text_prompt.expand(text_embedding_output.shape[0], -1, -1)
        # (batch_size, sequence_length + visual_sequence_length, hidden_size)
        embedding_output = torch.cat([text_prompt, text_embedding_output, visual_embedding_output], dim=1)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=self.get_head_mask(head_mask, self.config.num_hidden_layers),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs.last_hidden_state if return_dict else encoder_outputs[0]

        pooled_output = self.pooler(last_hidden_state)
        last_hidden_state = self.dropout(last_hidden_state)
        pooled_output = self.dropout(pooled_output)
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class TvpVideoGroundingHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_0 = nn.Linear(config.hidden_size, config.hidden_size * 2)
        self.layer_1 = nn.Linear(config.hidden_size * 2, 2)
        self.activation_0 = nn.ReLU()
        self.activation_1 = nn.Sigmoid()

    def forward(self, pooler_output):
        logits = self.activation_0(self.layer_0(pooler_output))
        logits = self.activation_1(self.layer_1(logits))
        return logits


@add_start_docstrings(
    """
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    """,
    TVP_START_DOCSTRING,
)
class TvpForVideoGrounding(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model = TvpModel(config)
        self.video_grounding_head = TvpVideoGroundingHead(config)

        self.post_init()

    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class="TvpConfig")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        labels: Tuple[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ):
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.
        Returns:

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        outputs = self.model(
            input_ids,
            pixel_values,
            attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooler_output = outputs[1]

        logits = self.video_grounding_head(pooler_output)

        loss = None
        if labels is not None:
            criterion = TvpLoss(["iou", "distance", "duration"])
            criterion.to(self.device)
            loss_dict = criterion(logits, labels)
            loss = (
                loss_dict["iou"]
                + self.config.distance_loss_weight * loss_dict["distance"]
                + self.config.duration_loss_weight * loss_dict["duration"]
            )
        if not return_dict:
            outputs = (logits,) + outputs[2:]
            if loss is not None:
                outputs = (loss,) + outputs
            return outputs

        return TvpVideoGroundingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
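

# --- Illustrative sketch (not part of the original module) -------------------
# A minimal, hand-checkable walk through `TvpLoss`, assuming toy values: the
# prediction covers [0.2, 0.5] and the ground truth [0.3, 0.6] of a clip whose
# duration normalizes times to 1.0. The intersection is 0.5 - 0.3 = 0.2 and the
# union is 0.6 - 0.2 = 0.4, so the IoU term is 1 - 0.2/0.4 = 0.5.
def _demo_tvp_loss():
    criterion = TvpLoss(["iou", "distance", "duration"])
    duration = torch.tensor([1.0])
    start_time, end_time = torch.tensor([0.3]), torch.tensor([0.6])
    # `logits` holds (start/duration, end/duration) pairs, shape (batch_size, 2).
    logits = torch.tensor([[0.2, 0.5]])
    loss_dict = criterion(logits, (duration, start_time, end_time))
    # Expected: loss_dict["iou"] == 0.5; the distance and duration terms sit at
    # their clamp floors (0.2 and 0.4) because the toy span nearly matches.
    return loss_dict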
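

# --- Illustrative sketch (not part of the original module) -------------------
# Geometry check for `TvpFramePadPrompter`, assuming `TvpConfig` accepts the
# hypothetical values used here (max_img_size=448, visual_prompt_size=96,
# num_frames=1). The four learned pads frame a zero "base" region of side
# 448 - 2 * 96 = 256, so the assembled prompt tiles the full frame:
# 96 + 256 + 96 = 448 per side, and the output keeps the input shape.
def _demo_pad_prompter_geometry():
    config = TvpConfig(max_img_size=448, visual_prompt_size=96, num_frames=1)
    prompter = TvpFramePadPrompter(config)
    pixel_values = torch.rand(1, 1, 3, 448, 448)
    out = prompter(pixel_values)
    assert out.shape == pixel_values.shape  # prompt is added onto the frame border
    return out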
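

# --- Illustrative sketch (not part of the original module) -------------------
# End-to-end grounding with a loss, mirroring the docstring examples above. The
# checkpoint name comes from those examples; the concrete label numbers are
# placeholders, laid out as the (duration, start_time, end_time) triple that
# `TvpLoss.forward` unpacks.
def _demo_video_grounding():
    from transformers import AutoTokenizer

    model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")
    tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

    pixel_values = torch.rand(1, 1, 3, 448, 448)  # (batch, frames, channels, h, w)
    text_inputs = tokenizer("This is an example input", return_tensors="pt")
    labels = (torch.tensor([10.0]), torch.tensor([2.0]), torch.tensor([7.0]))
    output = model(
        text_inputs.input_ids,
        pixel_values,
        text_inputs.attention_mask,
        labels=labels,
    )
    # output.logits is (batch_size, 2): predicted (start, end) as fractions of
    # the duration; output.loss combines the IoU, distance, and duration terms.
    return output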