
# Text Transformer used by BLIP (transformers/models/blip/modeling_blip_text.py).
# The classes below implement a BERT-style text encoder/decoder with optional
# cross-attention over image features produced by the BLIP vision model.
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, device, nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import logging
from .configuration_blip import BlipTextConfig


logger = logging.get_logger(__name__)

class BlipTextEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            input_ids = input_ids.to(self.word_embeddings.weight.device)
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BlipTextSelfAttention(nn.Module):
    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            # keys/values are projected from the vision encoder's hidden states
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from the encoder;
        # the attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in BlipTextModel forward() function)
            attention_scores = attention_scores + attention_mask.to(attention_scores.device)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # Dropout on attention probabilities drops entire tokens to attend to, as in the original Transformer.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        outputs = outputs + (past_key_value,)
        return outputs


class BlipTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.self = BlipTextSelfAttention(config, is_cross_attention)
        self.output = BlipTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BlipTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BlipTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextLayer(nn.Module):
    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BlipTextAttention(config)
        self.layer_num = layer_num
        if self.config.is_decoder:
            self.crossattention = BlipTextAttention(config, is_cross_attention=self.config.is_decoder)
        self.intermediate = BlipTextIntermediate(config)
        self.output = BlipTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        outputs = self_attention_outputs[1:-1]
        present_key_value = self_attention_outputs[-1]

        if encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BlipTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.is_decoder else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BlipTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BlipTextPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BlipTextLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BlipTextPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BlipTextOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BlipTextLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores

class BlipTextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BlipTextConfig
    base_model_prefix = "bert"
    _no_split_modules = []

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

class BlipTextModel(BlipTextPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder the model needs to be
    initialized with the `is_decoder` argument set to `True`; an `encoder_hidden_states` is then expected as an input
    to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BlipTextEmbeddings(config)
        self.encoder = BlipTextEncoder(config)
        self.pooler = BlipTextPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]:
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to
            #   [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                # in case past_key_values are used we need to prepend a prefix of ones to the causal mask
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )

                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        is_decoder: Optional[bool] = False,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif encoder_embeds is not None:
            input_shape = encoder_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = encoder_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length))).to(device)

        # Make the self-attention mask broadcastable to all heads (and causal when decoding).
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if encoder_embeds is None:
            embedding_output = self.embeddings(
                input_ids=input_ids,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )
        else:
            embedding_output = encoder_embeds

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BlipTextModel(config, add_pooling_layer=False)
        self.cls = BlipTextOnlyMLMHead(config)
        self.label_smoothing = config.label_smoothing

    def get_input_embeddings(self):
        return self.bert.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.bert.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_logits: Optional[bool] = False,
        is_decoder: Optional[bool] = True,
        reduction: Optional[str] = "mean",
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            if reduction == "none":
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # if the model is used as a decoder in an encoder-decoder setup, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
            "is_decoder": True,
        }

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
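

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library module). It shows
# how the text decoder above can be exercised on its own with randomly
# initialised weights: the config keyword arguments and token ids below are
# placeholders, and in practice these classes are instantiated internally by
# BlipForConditionalGeneration / BlipForQuestionAnswering rather than used
# directly.
#
#     import torch
#     from transformers.models.blip.configuration_blip import BlipTextConfig
#     from transformers.models.blip.modeling_blip_text import BlipTextLMHeadModel
#
#     config = BlipTextConfig(is_decoder=True)   # decoder mode so cross-attention layers are built
#     model = BlipTextLMHeadModel(config).eval()  # random weights, no checkpoint download
#
#     input_ids = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])    # placeholder token ids
#     image_embeds = torch.randn(1, 577, config.encoder_hidden_size)    # stand-in for BlipVisionModel features
#
#     with torch.no_grad():
#         out = model(input_ids=input_ids, encoder_hidden_states=image_embeds, labels=input_ids)
#     print(out.loss, out.logits.shape)
# ---------------------------------------------------------------------------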