
"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional, Tuple

import torch
import torch.nn as nn

from .configuration_idefics import IdeficsConfig


class IdeficsPerceiverResampler(nn.Module):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings
        being fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could be e.g. the
        ViT embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
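
        Example (illustrative values only; the hyperparameters shown are hypothetical, not prescribed by this
        module, and `config` is any `IdeficsConfig`-style object):

        ```python
        resampler = IdeficsPerceiverResampler(config, embed_dim=1280, depth=6, n_heads=16, head_dim=96, n_latents=64)
        compressed = resampler(image_embeddings)  # [bsz, seq, 1280] -> [bsz, 64, 1280]
        ```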

        T)requires_gradr
      N)super__init__r
   r   r   r   perceiver_configqk_layer_norms_perceiverqk_layer_normsnn	Parametertorchrandnlatentshasattrvision_configintermediate_dim
ModuleListrangeIdeficsPerceiverAttention
IdeficsMLPblocks	LayerNorm
layer_norm)	selfr	   r
   r   r   r   r   _	__class__s	           X/var/www/html/venv/lib/python3.12/site-packages/transformers/models/idefics/perceiver.pyr   z"IdeficsPerceiverResampler.__init__1   s7   ( 	FOQXZbdmFmCdmT^$55NN ||EKK$O_cd 6//= NNQ%%//!3 	 mm u  1$..$,,PTP]P]_c_r_rs"4#8#8&A

 ,,t~~6s   -A&Fcontextc                     | j                   j                  |j                  d   dd      }| j                  D ]  \  }} |||      |z   } ||      |z   } | j	                  |      S )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r   repeatshaper$   r&   )r'   r+   r   attnffs        r*   forwardz!IdeficsPerceiverResampler.forward_   sn     ,,%%gmmA&61=  	,HD"7G,w6GkG+G	, w''    )

class IdeficsPerceiverAttention(nn.Module):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # Normalization & scaling
        self.context_layer_norm = nn.LayerNorm(self.embed_dim)
        self.latents_layer_norm = nn.LayerNorm(self.embed_dim)
        if self.qk_layer_norms:
            self.q_layer_norm = nn.LayerNorm(self.head_dim)
            self.k_layer_norm = nn.LayerNorm(self.head_dim)

        self.qk_scale = self.head_dim**-0.5

        # Q, K, V projections (no bias -- a detail from the Perceiver/Flamingo papers)
        self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.head_dim, bias=False)

        self.output_proj = nn.Linear(self.n_heads * self.head_dim, embed_dim, bias=False)

    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
        """
        Runs Perceiver cross-attention, with the (context, latents) pair concatenated along the `seq` dimension for
        the keys/values!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing the latents updated via
            cross-attention over the context.
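
        Shape walk-through (illustrative values, not prescribed by this module): with `bsz=2`, `seq=576`,
        `n_latents=64`, `n_heads=16`, `head_dim=96`:

        ```python
        q.shape       # [2, 16, 64, 96]   -- queries are computed from the latents only
        k.shape       # [2, 16, 640, 96]  -- keys/values span context + latents (576 + 64)
        scores.shape  # [2, 16, 64, 640]
        # resampled: [2, 16, 64, 96] -> transpose/flatten -> [2, 64, 1536] -> output_proj -> [2, 64, embed_dim]
        ```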
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = context.shape[:3]

        # Queries come from the latents; keys/values are computed over the *concatenation* of context and latents
        # (a Flamingo detail), so queries have seq = n_latents while keys/values have seq = seq_length + n_latents
        q = self.q_proj(latents)
        k = self.k_proj(torch.cat([context, latents], dim=-2))
        v = self.v_proj(torch.cat([context, latents], dim=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Attention scores with a numerically stable softmax (subtract the detached per-row max first)
        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        attn = stabilized_scores.softmax(dim=-1)

        # Attend, then merge heads back: [bsz, n_heads, n_latents, head_dim] -> [bsz, n_latents, n_heads * head_dim]
        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(resampled.transpose(1, 2).flatten(-2))

class IdeficsMLP(nn.Module):
    def __init__(self, intermediate_size, config: IdeficsConfig):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__()
        self.embed_dim = config.vision_config.embed_dim
        self.ln = nn.LayerNorm(self.embed_dim)
        self.fc = nn.Linear(self.embed_dim, intermediate_size, bias=False)
        self.act = nn.ReLU()
        self.c_proj = nn.Linear(intermediate_size, self.embed_dim, bias=False)

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states