
""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...utils import ModelOutput, logging
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows interpolating the pre-trained position encodings so that the model can be used on
        higher-resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # add a small offset to avoid floating point errors in the interpolation scale (see the DINO reference above)
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1

        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in"
                " nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a"
                " slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the shape of"
                f" position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
                    " You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = (*, embed_dim, grid, grid)

        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # keep a (bsz, num_heads, tgt_len, src_len) view to return, and keep using the flattened view below
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
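

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the upstream module: the head bookkeeping
# used by IdeficsVisionAttention. `_shape` splits embed_dim into
# (num_heads, head_dim) and folds the heads into the batch dimension so a
# single batched matmul scores all heads at once. Toy sizes are assumptions.
def _example_attention_shapes():
    bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
    x = torch.randn(bsz, seq_len, num_heads * head_dim)

    # (bsz, seq, embed) -> (bsz, heads, seq, head_dim) -> (bsz * heads, seq, head_dim)
    q = x.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
    q = q.view(bsz * num_heads, seq_len, head_dim)
    k = q.clone()

    attn_weights = torch.bmm(q * head_dim**-0.5, k.transpose(1, 2))
    assert attn_weights.shape == (bsz * num_heads, seq_len, seq_len)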


class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
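

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the upstream module: the layer above is
# "pre-LN", meaning LayerNorm runs before each sub-block and the residual is
# added afterwards, which keeps an identity path through the whole layer.
# With both masks left at None, its dataflow reduces to:
def _example_pre_ln_dataflow(layer: IdeficsVisionEncoderLayer, x: torch.Tensor) -> torch.Tensor:
    x = x + layer.self_attn(layer.layer_norm1(x))[0]
    x = x + layer.mlp(layer.layer_norm2(x))
    return x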


class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = IdeficsVisionEmbeddings(config)
        # note: `pre_layrnorm` (sic) is the attribute name used in the original module, kept so checkpoints load
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
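

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the upstream module. To stay
# self-contained it fakes the config with a SimpleNamespace carrying exactly
# the attributes read by the classes above; a real IdeficsVisionConfig would
# normally be used instead, and its defaults and field spelling may differ.
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        num_channels=3,
        image_size=56,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        output_attentions=False,
        output_hidden_states=False,
        use_return_dict=True,
    )
    model = IdeficsVisionTransformer(config)
    pixel_values = torch.randn(1, 3, 56, 56)
    outputs = model(pixel_values, output_hidden_states=True)
    print(outputs.last_hidden_state.shape)  # (1, 17, 64): 1 CLS + (56 / 14) ** 2 = 16 patches
    print(outputs.pooler_output.shape)      # (1, 64): post-layernormed CLS state
    print(len(outputs.hidden_states))       # 3: embedding output + one entry per layer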