
    sg'                     0   d dl mZmZ d dlZd dlmZ d dlmZmZmZ d dl	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZmZmZ dZ G d de      Z G d de      Zg dZdZ ede       G d dee             ZdZdZ ede       G d dee             Zg dZy)    )OptionalUnionN)BCEWithLogitsLossCrossEntropyLossMSELoss)IJepaConfig   )ImageClassifierOutput)PreTrainedModel)add_start_docstrings	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelzfacebook/ijepa_vith14_1kc            	            e Zd Zddededdf fdZdej                  dededej                  fd	Z		 	 dd
ej                  de
ej                     dedej                  fdZ xZS )IJepaEmbeddingsconfiguse_mask_tokenreturnNc                     t         |   ||       | `| j                  j                  }t        j                  t        j                  d||j                              | _
        y )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__s       Z/var/www/html/venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.pyr   zIJepaEmbeddings.__init__   sL    0N++77#%<<A{FL^L^0_#`     
embeddingsheightwidthc                 0   |j                   d   }| j                  j                   d   }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  }|j                   d   }|| j
                  z  }|| j
                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r	   r   bicubicF)sizemodealign_corners)shaper#   r    jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r$   r(   r)   r*   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r&   interpolate_pos_encodingz(IJepaEmbeddings.interpolate_pos_encoding!   s#    !&&q)0066q9 yy##%+*F6UZ?+++22r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr'   pixel_valuesbool_masked_posr@   c                 x   |j                   \  }}}}| j                  ||      }|Z|j                   d   }	| j                  j                  ||	d      }
|j	                  d      j                  |
      }|d|z
  z  |
|z  z   }|r|| j                  |||      z   }n|| j                  z   }| j                  |      }|S )N)r@   r   r,         ?)	r1   r   
mask_tokenexpand	unsqueezetype_asr@   r#   dropout)r$   rA   rB   r@   
batch_size_r)   r*   r(   
seq_lengthmask_tokensmasks               r&   forwardzIJepaEmbeddings.forwardH   s     (4'9'9$
Avu**<Rj*k
&#))!,J//00ZLK",,R088ED#sTz2[45GGJ $#d&C&CJPVX]&^^J#d&>&>>J\\*-
r'   )F)NF)__name__
__module____qualname__r   boolr   r    Tensorintr@   r   
BoolTensorrO   __classcell__r%   s   @r&   r   r      s    a{ aD aT a%5<< % %UX %]b]i]i %T 7;).	ll "%"2"23 #'	
 
r'   r   c                       e Zd ZdZeZdZdZdZddgZ	dZ
deej                  ej                  ej                  f   dd	fd
Zy	)IJepaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    ijeparA   Tr   
IJepaLayermoduler   Nc                 
   t        |t        j                  t        j                  f      rt        j                  j                  |j                  j                  j                  t        j                        d| j                  j                        j                  |j                  j                        |j                  _        |j                  %|j                  j                  j                          yyt        |t        j                         rJ|j                  j                  j                          |j                  j                  j#                  d       yt        |t$              rt        j                  j                  |j&                  j                  j                  t        j                        d| j                  j                        j                  |j&                  j                        |j&                  _        yy)zInitialize the weightsg        )meanstdNrD   )
isinstancer   LinearConv2dinittrunc_normal_weightdatator    float32r   initializer_rangedtypebiaszero_	LayerNormfill_r   r#   )r$   r]   s     r&   _init_weightsz"IJepaPreTrainedModel._init_weightsp   sZ   fryy"))45 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( '-KK""$MM$$S)0.0gg.C.C**//225==AKK11 /D / b++112	 &&+ 1r'   )rP   rQ   rR   __doc__r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   rb   rc   rn   rp    r'   r&   rZ   rZ   c   s[    
 L$O&*#*L9N3E"))RYY*L$M 3RV 3r'   rZ   )r      i   aG  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`IJepaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
z_The bare IJepa Model transformer outputting raw hidden-states without any specific head on top.c                   .     e Zd Zddededef fdZ xZS )
IJepaModelr   add_pooling_layerr   c                 V    t         |   |       || _        t        ||      | _        y )N)r   )r   r   r   r   r(   )r$   r   r|   r   r%   s       r&   r   zIJepaModel.__init__   s%     )&Pr'   )FF)rP   rQ   rR   r   rS   r   rW   rX   s   @r&   r{   r{      s(    
Q{ Qt Q]a Q Qr'   r{   zjmtzt/ijepa_vith14_1kzEgyptian cata  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Zdef fdZ	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee   dee   dee   d	ee   d
e	e
ef   fdZ xZS )IJepaForImageClassificationr   c                 h    t         |   |       t        |d      | _        | j	                          y )NF)r|   )r   r   r{   r[   	post_init)r$   r   r%   s     r&   r   z$IJepaForImageClassification.__init__   s(     %@
r'   rA   	head_masklabelsoutput_attentionsoutput_hidden_statesr@   return_dictr   c                 n   ||n| j                   j                  }| j                  ||||||      }|d   }	| j                  |	j	                  d            }
d}||j                  |
j                        }| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                   j                  dk(  r=t               } ||
j!                  d	| j                        |j!                  d	            }n,| j                   j                  dk(  rt#               } ||
|      }|s|
f|dd z   }||f|z   S |S t%        ||
|j&                  |j(                  
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r@   r   r   r   )r<   
regressionsingle_label_classificationmulti_label_classificationr,   )losslogitshidden_states
attentions)r   use_return_dictr[   
classifierr_   rh   deviceproblem_type
num_labelsrk   r    longrU   r   squeezer   r9   r   r
   r   r   )r$   rA   r   r   r   r   r@   r   outputssequence_outputr   r   loss_fctoutputs                 r&   rO   z#IJepaForImageClassification.forward   s     &1%<k$++B]B]**/!5%=#  
 "!*!5!5!!5!<=YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE$!//))	
 	
r'   )NNNNNNN)rP   rQ   rR   r   r   r   r    rT   rS   r   tupler
   rO   rW   rX   s   @r&   r   r      s     {  04,0)-,0/337&*A
u||,A
 ELL)A
 &	A

 $D>A
 'tnA
 #+4.A
 d^A
 
u++	,A
r'   r   )rZ   r{   r   ) typingr   r   r    torch.nnr   r   r   r   -transformers.models.ijepa.configuration_ijepar   modeling_outputsr
   modeling_utilsr   utilsr   r   vit.modeling_vitr   r   r   _CHECKPOINT_FOR_DOCr   rZ   _EXPECTED_OUTPUT_SHAPEIJEPA_START_DOCSTRINGr{   _IMAGE_CLASS_CHECKPOINT_IMAGE_CLASS_EXPECTED_OUTPUTr   __all__rx   r'   r&   <module>r      s    "   A A E 5 -  1 Gm GT3? 3D ( 	  eQ%x Q	Q 2 -   G
"68Q G
G
Tr'   