
    sgv              	          d Z ddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddl m!Z!  ejD                  e#      Z$dZ%dZ&g dZ'dZ(dZ)e G d de             Z*e G d de             Z+e G d de             Z,e G d de             Z- G d dej\                        Z/ G d dej\                        Z0d@de
jb                  d e2d!e3d"e
jb                  fd#Z4 G d$ d%ej\                        Z5 G d& d'ej\                        Z6 G d( d)ej\                        Z7 G d* d+ej\                        Z8 G d, d-ej\                        Z9 G d. d/ej\                        Z: G d0 d1e      Z;d2Z<d3Z= ed4e<       G d5 d6e;             Z> ed7e<       G d8 d9e;             Z? ed:e<       G d; d<e;             Z@ ed=e<       G d> d?e;e             ZAy)AzPyTorch FocalNet model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutput)PreTrainedModel)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings)BackboneMixin   )FocalNetConfigr   zmicrosoft/focalnet-tiny)r   1   i   ztabby, tabby catc                       e Zd ZU dZdZej                  ed<   dZe	e
ej                        ed<   dZe	e
ej                        ed<   y)FocalNetEncoderOutputa  
    FocalNet encoder's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_statehidden_statesreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r        a/var/www/html/venv/lib/python3.12/site-packages/transformers/models/focalnet/modeling_focalnet.pyr   r   8   sO    ( ,0u((/8<M8E%"3"345<AEHU5+<+<%=>Er&   r   c                       e Zd ZU dZdZej                  ed<   dZe	ej                     ed<   dZ
e	eej                        ed<   dZe	eej                        ed<   y)FocalNetModelOutputa  
    FocalNet model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   pooler_outputr   r   )r   r   r    r!   r   r"   r#   r$   r*   r   r   r   r   r%   r&   r'   r)   r)   S   sd    * ,0u((/15M8E--.58<M8E%"3"345<AEHU5+<+<%=>Er&   r)   c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)!FocalNetMaskedImageModelingOutputa  
    FocalNet masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlossreconstructionr   r   )r   r   r    r!   r-   r   r"   r#   r$   r.   r   r   r   r%   r&   r'   r,   r,   p   sc    * )-D(5$$
%,(,NE%%,8<M8E%"3"345<AEHU5+<+<%=>Er&   r,   c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)FocalNetImageClassifierOutputaS  
    FocalNet outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr-   logitsr   r   )r   r   r    r!   r-   r   r"   r#   r$   r1   r   r   r   r%   r&   r'   r0   r0      sc    * )-D(5$$
%, $FE$8<M8E%"3"345<AEHU5+<+<%=>Er&   r0   c                        e Zd ZdZd fd	Z	 ddeej                     deej                     de	ej                     fdZ xZS )	FocalNetEmbeddingszX
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    c           	         t         |           t        ||j                  |j                  |j
                  |j                  |j                  d      | _        | j                  j                  | _
        |r4t        j                  t        j                  dd|j                              nd | _        t        j                   |j                  |j"                        | _        t        j&                  |j(                        | _        y )NT)config
image_size
patch_sizenum_channels	embed_dimuse_conv_embedis_stemr   eps)super__init__FocalNetPatchEmbeddingsr6   r7   r8   r9   r:   patch_embeddings	grid_size
patch_gridr   	Parameterr"   zeros
mask_token	LayerNormlayer_norm_epsnormDropouthidden_dropout_probdropout)selfr5   use_mask_token	__class__s      r'   r?   zFocalNetEmbeddings.__init__   s     7((((,,&&!00!
 //99O]",,u{{1a9I9I'JKcgLL!1!1v7L7LM	zz&"<"<=r&   pixel_valuesbool_masked_posreturnc                 8   | j                  |      \  }}| j                  |      }|j                         \  }}}|K| j                  j	                  ||d      }|j                  d      j                  |      }	|d|	z
  z  ||	z  z   }| j                  |      }||fS )N      ?)rA   rI   sizerF   expand	unsqueezetype_asrL   )
rM   rP   rQ   
embeddingsoutput_dimensions
batch_sizeseq_len_mask_tokensmasks
             r'   forwardzFocalNetEmbeddings.forward   s     )-(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ\\*-
,,,r&   )FN)r   r   r    r!   r?   r   r"   r#   
BoolTensorr   Tensorra   __classcell__rO   s   @r'   r3   r3      sQ    >& hl-$U%6%67-JRSXScScJd-	u||	-r&   r3   c                   z     e Zd Z	 	 	 d fd	Zd Zdeej                     deej                  ee
   f   fdZ xZS )r@   c	                 d   t         |           t        |t        j                  j
                        r|n||f}t        |t        j                  j
                        r|n||f}|d   |d   z  |d   |d   z  z  }	|| _        || _        || _        |	| _	        |d   |d   z  |d   |d   z  f| _
        |r/|rd}
d}d}nd}
d}d}t        j                  |||
||      | _        nt        j                  ||||      | _        |r't        j                  ||j                  	      | _        y d | _        y )
Nr   r            r   )kernel_sizestridepadding)rl   rm   r<   )r>   r?   
isinstancecollectionsabcIterabler6   r7   r8   num_patchesrB   r   Conv2d
projectionrG   rH   rI   )rM   r5   r6   r7   r8   r9   add_normr:   r;   rs   rl   rn   rm   rO   s                r'   r?   z FocalNetPatchEmbeddings.__init__   s7    	#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY iii[Y`DO !iiiZ`jkDOYF4I4IJDIDIr&   c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )r7   r   
functionalpad)rM   rP   heightwidth
pad_valuess        r'   	maybe_padz!FocalNetPatchEmbeddings.maybe_pad   s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr&   rP   rR   c                 N   |j                   \  }}}}|| j                  k7  rt        d      | j                  |||      }| j	                  |      }|j                   \  }}}}||f}|j                  d      j                  dd      }| j                  | j                  |      }||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rj   r   )shaper8   
ValueErrorr}   ru   flatten	transposerI   )rM   rP   r^   r8   rz   r{   rZ   r[   s           r'   ra   zFocalNetPatchEmbeddings.forward  s    )5););&<4,,,w  ~~lFEB__\2
(..1fe#UO''*44Q:
99 :.J,,,r&   )FFF)r   r   r    r?   r}   r   r"   r#   r   rd   intra   re   rf   s   @r'   r@   r@      sL     (T-HU->->$? -E%,,X]^aXbJbDc -r&   r@   input	drop_probtrainingrR   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   )dtypedevice)r   ndimr"   randr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          r'   	drop_pathr     s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr&   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
FocalNetDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rR   c                 0    t         |           || _        y rb   )r>   r?   r   )rM   r   rO   s     r'   r?   zFocalNetDropPath.__init__2  s    "r&   r   c                 D    t        || j                  | j                        S rb   )r   r   r   )rM   r   s     r'   ra   zFocalNetDropPath.forward6  s    FFr&   c                 8    dj                  | j                        S )Nzp={})formatr   rM   s    r'   
extra_reprzFocalNetDropPath.extra_repr9  s    }}T^^,,r&   rb   )r   r   r    r!   r   floatr?   r"   rd   ra   strr   re   rf   s   @r'   r   r   /  sG    b#(5/ #T #GU\\ Gell G-C -r&   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetModulationc                    t         	|           || _        |j                  |   | _        |j
                  |   | _        || _        |j                  | _        |j                  | _	        t        j                  |d|z  | j                  dz   z   |      | _        t        j                  ||dd|      | _        t        j                         | _        t        j                  ||      | _        t        j$                  |      | _        t        j(                         | _        g | _        t/        | j                        D ]  }| j                  |z  | j                  z   }| j*                  j1                  t        j2                  t        j                  |||d||dz  d      t        j                                      | j,                  j1                  |        | j                  r't        j4                  ||j6                        | _        y y )Nrj   r   )bias)rl   rm   r   F)rl   rm   groupsrn   r   r<   )r>   r?   dimfocal_windowsfocal_windowfocal_levelsfocal_levelfocal_factor use_post_layernorm_in_modulationnormalize_modulatorr   Linearprojection_inrt   projection_contextGELU
activationprojection_outrJ   projection_dropout
ModuleListfocal_layerskernel_sizesrangeappend
SequentialrG   rH   	layernorm)
rM   r5   indexr   r   r   r   krl   rO   s
            r'   r?   zFocalNetModulation.__init__>  s   "007!..u5(060W0W-#)#=#= YYsAGt7G7G!7K,LSWX"$))C!ATX"Y'') iiS1"$**-?"@MMOt''( 
	2A++a/$2C2CCK$$IISk!CYdhiYipu GGI	 $$[1
	2 00\\#63H3HIDN 1r&   c                 |   |j                   d   }| j                  |      j                  dddd      j                         }t	        j
                  |||| j                  dz   fd      \  }}| _        d}t        | j                        D ]5  } | j                  |   |      }||| j                  dd||dz   f   z  z   }7 | j                  |j                  dd      j                  dd            }||| j                  dd| j                  df   z  z   }| j                  r|| j                  dz   z  }| j                  |      | _        || j                  z  }	|	j                  dddd      j                         }	| j                  r| j!                  |	      }	| j#                  |	      }	| j%                  |	      }	|	S )	z
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        rT   r   r   r   rj   NT)keepdim)r   r   permute
contiguousr"   splitr   gatesr   r   r   meanr   r   	modulatorr   r   r   r   )
rM   hidden_stater8   xqctxctx_alllevel
ctx_globalx_outs
             r'   ra   zFocalNetModulation.forward_  s    $))"- |,44Q1a@KKM"[[\<IYIY\]I],^`ab3
 4++, 	GE*$##E*3/CdjjEEAI4E1E&F FFG	G __SXXaX%>%C%CAt%C%TU
JAt7G7G7I4I)JJJ ##!1!1A!56G 009DNN"aAq)44600NN5)E ##E*''.r&   )rj   Tr   r   r   r    r?   ra   re   rf   s   @r'   r   r   =  s    JB"r&   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetMlpc                 
   t         |           |xs |}|xs |}t        j                  ||      | _        t
        |j                     | _        t        j                  ||      | _        t        j                  |      | _
        y rb   )r>   r?   r   r   fc1r   
hidden_actr   fc2rJ   drop)rM   r5   in_featureshidden_featuresout_featuresr   rO   s         r'   r?   zFocalNetMlp.__init__  sh    #2{)8[99[/: !2!2399_l;JJt$	r&   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rb   )r   r   r   r   )rM   r   s     r'   ra   zFocalNetMlp.forward  sN    xx-|4yy.xx-yy.r&   )NNr   r   rf   s   @r'   r   r     s    %r&   r   c                   *     e Zd ZdZd fd	Zd Z xZS )FocalNetLayera  Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`Tuple[int]`):
            Input resulotion.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    c                 H   t         |           || _        || _        || _        |j
                  | _        |j                  | _        t        j                  ||j                        | _        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t%        ||j&                  z        }t)        |||| j                        | _        d| _        d| _        |j0                  ryt        j2                  |j4                  t7        j8                  |      z  d      | _        t        j2                  |j4                  t7        j8                  |      z  d      | _        y y )Nr<   )r5   r   r   r   r   )r5   r   r   r   rU   T)requires_grad)r>   r?   r5   r   input_resolutionrK   r   use_post_layernormr   rG   rH   norm1r   
modulationr   Identityr   norm2r   	mlp_ratior   mlpgamma_1gamma_2use_layerscalerD   layerscale_valuer"   ones)rM   r5   r   r   r   r   mlp_hidden_dimrO   s          r'   r?   zFocalNetLayer.__init__  sE     0 ..	"(";";\\#6+@+@A
,#yy	
 9BC))4R[[]\\#6+@+@A
S6#3#334f#~dhdmdmn  <<(?(?%**cBS(ScghDL<<(?(?%**cBS(ScghDL !r&   c           	      :   |\  }}|j                   \  }}}|}| j                  r|n| j                  |      }|j                  ||||      }| j	                  |      j                  |||z  |      }| j                  s|n| j                  |      }|| j                  | j                  |z        z   }|| j                  | j                  | j                  r | j                  | j                  |            n| j                  | j                  |            z        z   }|S rb   )
r   r   r   viewr   r   r   r   r   r   )	rM   r   input_dimensionsrz   r{   r\   r^   r8   shortcuts	            r'   ra   zFocalNetLayer.forward  s   (&2&8&8#
A| (,'>'>|DJJ|D\#((VULQ|499*funVbc+/+B+B|

S_H`  $..1L"MM#dnnLL595L5Ltzz$((<01RVRZRZ[_[e[efr[sRtv'
 

 r&   )r   )r   r   r    r!   r?   ra   re   rf   s   @r'   r   r     s    i@r&   r   c                   j     e Zd Z fdZdej
                  deeef   deej
                     fdZ xZ	S )FocalNetStagec                    t         |           || _        t        |j                        | _        t        | j
                        D cg c]  }|j                  d|z  z   }}||   }|| j
                  dz
  k  r||dz      nd }|| j
                  dz
  k  rt        nd }t        j                  d|j                  t        |j                              D 	cg c]  }	|	j                          }
}	|
t        |j                  d |       t        |j                  d |dz           }t        j                  t        |j                  |         D cg c]'  }t!        ||||t#        |t$              r||   n|      ) c}      | _        |' |||d||d|j(                  d      | _        d| _        y d | _        d| _        y c c}w c c}	w c c}w )Nrj   r   r   )r5   r   r   r   r   TF)r5   r6   r7   r8   r9   rv   r:   r;   )r>   r?   r5   lendepths
num_stagesr   r9   r@   r"   linspacedrop_path_ratesumitemr   r   r   ro   listlayersr:   
downsamplepointing)rM   r5   r   r   ir9   r   out_dimr   r   dprr   rO   s               r'   r?   zFocalNetStage.__init__  s   fmm,8=doo8NO1V%%A.O	O+04??Q3F+F)EAI&T1619L1L,SW
 "'63H3H#fmmJ\!]^Aqvvx^^FMM&512S{QR9S5TU	mm v}}U34	  !%5.8D.Iily	
 !(+ !%44	DO  #DOI P _	s   F:F?
,Gr   r   rR   c                    |\  }}| j                   D ]  } |||      } |}| j                  K|\  }}|j                  dd      j                  |j                  d   d||      }| j                  |      \  }}n||||f}|||f}|S )Nr   rj   r   rT   )r   r   r   reshaper   )	rM   r   r   rz   r{   layer_module!hidden_states_before_downsamplingr[   stage_outputss	            r'   ra   zFocalNetStage.forward	  s    ( KK 	JL(8HIM	J -:)??&,MFE)33Aq9AA177:BM 04}/M,M, "( >&(IK\]r&   )
r   r   r    r?   r"   rd   r   r   ra   re   rf   s   @r'   r   r     s=    *XU\\ U3PS8_ Y^_d_k_kYl r&   r   c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   dee	   dee	   dee	   de
eef   fdZ xZS )
FocalNetEncoderc                 2   t         |           t        |j                        | _        || _        t        j                  t        | j                        D cg c]$  }t        |||d   d|z  z  |d   d|z  z  f      & c}      | _
        d| _        y c c}w )Nr   rj   r   )r5   r   r   F)r>   r?   r   r   r   r5   r   r   r   r   stagesgradient_checkpointing)rM   r5   rB   i_layerrO   s       r'   r?   zFocalNetEncoder.__init__  s    fmm,mm  %T__5  !!&/lq'z&BIaLUVX_U_D`%a	
 ',#s   )Br   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrR   c                    |rdnd }|rdnd }|rE|j                   \  }}	}
 |j                  |g||
 }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }}| j
                  r*| j                  r| j                  |j                  ||      }n	 |||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }}	}
 |j                  |g|d   |d   f|
 }|j                  dddd      }||fz  }||fz  }|s|r|j                   \  }}	}
 |j                  |g||
 }|j                  dddd      }||fz  }||fz  } |st        d ||fD              S t        |||	      S )
Nr%   r   r   r   rj   rT   c              3   &   K   | ]	  }||  y wrb   r%   ).0vs     r'   	<genexpr>z*FocalNetEncoder.forward.<locals>.<genexpr>g  s     Xq!-Xs   )r   r   r   )r   r   r   	enumerater  r  r   _gradient_checkpointing_func__call__tupler   )rM   r   r   r  r	  r
  all_hidden_statesall_reshaped_hidden_statesr\   r^   hidden_sizereshaped_hidden_stater   stage_moduler  r   r[   s                    r'   ra   zFocalNetEncoder.forward1  s9    #7BD+?RT")6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5  	GOA|**t}} $ A A ))!$! !-]<L M)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF*A 	GD X]4E$FXXX$++#=
 	
r&   )FFT)r   r   r    r?   r"   rd   r   r   r   boolr   r   ra   re   rf   s   @r'   r  r    su    ,, 05CH&*<
||<
  S/<
 'tn	<

 3;4.<
 d^<
 
u++	,<
r&   r  c                   ,    e Zd ZdZeZdZdZdZdgZ	d Z
y)FocalNetPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    focalnetrP   Tr   c                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsr   )r   stdNrU   )ro   r   r   rt   weightdatanormal_r5   initializer_ranger   zero_rG   fill_)rM   modules     r'   _init_weightsz%FocalNetPreTrainedModel._init_weights}  s    fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r&   N)r   r   r    r!   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr'  r%   r&   r'   r  r  q  s-    
 "L"$O&*#()
*r&   r  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`FocalNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aB  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zVThe bare FocalNet Model outputting raw hidden-states without any specific head on top.c                        e Zd Zd fd	Zd Z ee       eee	e
de      	 	 	 	 ddeej                     deej                     dee   dee   d	eee	f   f
d
              Z xZS )FocalNetModelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        t        j                  | j                  |j                         | _        |rt        j$                  d      nd | _        | j)                          y )Nrj   r   )rN   r<   )r>   r?   r5   r   r   r   r   r9   num_featuresr3   rZ   r  rC   encoderr   rG   rH   r   AdaptiveAvgPool1dpooler	post_init)rM   r5   add_pooling_layerrN   rO   s       r'   r?   zFocalNetModel.__init__  s     fmm, 0 0119L3M MN,VNS&vt/I/IJd&7&7V=R=RS1Bb**1- 	r&   c                 .    | j                   j                  S rb   )rZ   rA   r   s    r'   get_input_embeddingsz"FocalNetModel.get_input_embeddings  s    ///r&   vision)
checkpointoutput_typer(  modalityexpected_outputrP   rQ   r  r
  rR   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  ||      \  }}| j                  ||||      }|d   }| j                  |      }d}	| j                  7| j                  |j                  dd            }	t        j                  |	d      }	|s||	f|dd z   }
|
S t        ||	|j                  |j                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rQ   r  r
  r   r   rj   )r   r*   r   r   )r5   r  use_return_dictr   rZ   r1  r   r3  r   r"   r   r)   r   r   )rM   rP   rQ   r  r
  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s              r'   ra   zFocalNetModel.forward  s   ( %9$D $++JjJj 	 &1%<k$++B]B]?@@-1__\[j_-k**,,!5#	 ' 
 *!,..9;;" KK(A(A!Q(GHM!MM-;M%}58KKFM"-')77#2#I#I	
 	
r&   )TFNNNN)r   r   r    r?   r7  r   FOCALNET_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr)   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r"   r#   rc   r  r   r   ra   re   rf   s   @r'   r.  r.    s    
0 ++DE&'$. 596:/3&*.
u001.
 "%"2"23.
 'tn	.

 d^.
 
u))	*.
 F.
r&   r.  a|  FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                        e Zd Z fdZ ee       eee      	 	 	 	 d	de	e
j                     de	e
j                     de	e   de	e   deeef   f
d              Z xZS )
FocalNetForMaskedImageModelingc                    t         |   |       t        |dd      | _        t	        |j
                        | _        t        |j                  d| j                  dz
  z  z        }t        j                  t        j                  ||j                  dz  |j                  z  d      t        j                  |j                              | _        | j!                          y )NFT)r5  rN   rj   r   )in_channelsout_channelsrl   )r>   r?   r.  r  r   r   r   r   r9   r   r   rt   encoder_strider8   PixelShuffledecoderr4  )rM   r5   r0  rO   s      r'   r?   z'FocalNetForMaskedImageModeling.__init__  s     %fVZ[fmm,6++aDOOa4G.HHI}}II(v7L7La7ORXReRe7est OOF112	
 	r&   r:  r(  rP   rQ   r  r
  rR   c                    ||n| j                   j                  }| j                  ||||      }|d   }|j                  dd      }|j                  \  }}}	t        j                  |	dz        x}
}|j                  |||
|      }| j                  |      }d}|| j                   j                  | j                   j                  z  }|j                  d||      }|j                  | j                   j                  d      j                  | j                   j                  d      j                  d      j                         }t        j                  j!                  ||d	      }||z  j#                         |j#                         d
z   z  | j                   j$                  z  }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                        S )aQ  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rQ   r  r
  r   r   rj   g      ?rT   none)	reductiongh㈵>)r-   r.   r   r   )r5   r?  r  r   r   mathfloorr   rP  r6   r7   repeat_interleaverX   r   r   rx   l1_lossr   r8   r,   r   r   )rM   rP   rQ   r  r
  outputsrB  r\   r8   sequence_lengthrz   r{   reconstructed_pixel_valuesmasked_im_lossrV   r`   reconstruction_lossr   s                     r'   ra   z&FocalNetForMaskedImageModeling.forward  s   N &1%<k$++B]B]--+!5#	   
 "!*)33Aq94C4I4I1
L/OS$899)11*lFTYZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7F`lr"7"s1D8==?488:PTCTUX\XcXcXpXppN02WQR[@F3A3M^%.YSYY05!//#*#A#A	
 	
r&   rD  )r   r   r    r?   r   rE  r   r,   rG  r   r"   r#   rc   r  r   r   ra   re   rf   s   @r'   rJ  rJ    s    " ++DE+L[jk 596:/3&*N
u001N
 "%"2"23N
 'tn	N

 d^N
 
u77	8N
 l FN
r&   rJ  z
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    c                        e Zd Z fdZ ee       eeee	e
      	 	 	 	 d	deej                     deej                     dee   dee   deeef   f
d              Z xZS )
FocalNetForImageClassificationc                 >   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r4t        j                  | j                  j                  |j                        nt        j                         | _	        | j                          y )Nr   )r>   r?   
num_labelsr.  r  r   r   r0  r   
classifierr4  rM   r5   rO   s     r'   r?   z'FocalNetForImageClassification.__init__o  sx      ++%f- IOHYHY\]H]BIIdmm00&2C2CDcecncncp 	
 	r&   )r9  r:  r(  r<  rP   labelsr  r
  rR   c                    ||n| j                   j                  }| j                  |||      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }	| j
                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j                  dk(  r=t               }	 |	|j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr>  r   
regressionsingle_label_classificationmulti_label_classificationrT   rj   )r-   r1   r   r   )r5   r?  r  rb  problem_typera  r   r"   longr   r
   squeezer	   r   r   r0   r   r   )rM   rP   rd  r  r
  rY  rC  r1   r-   loss_fctr   s              r'   ra   z&FocalNetForImageClassification.forward}  s   ( &1%<k$++B]B]--!5#   
  
/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE,!//#*#A#A	
 	
r&   rD  )r   r   r    r?   r   rE  r   _IMAGE_CLASS_CHECKPOINTr0   rG  _IMAGE_CLASS_EXPECTED_OUTPUTr   r"   r#   
LongTensorr  r   r   ra   re   rf   s   @r'   r_  r_  f  s     ++DE*1$4	 59-1/3&*9
u0019
 ))*9
 'tn	9

 d^9
 
u33	49
 F9
r&   r_  zG
    FocalNet backbone, to be used with frameworks like X-Decoder.
    c                        e Zd Zdef fdZ ee       eee	      	 	 d	de
j                  dee   dee   defd              Z xZS )
FocalNetBackboner5   c                     t         |   |       t         | 	  |       |j                  g|j                  z   | _        t        |      | _        | j                          y rb   )	r>   r?   _init_backboner9   hidden_sizesr0  r.  r  r4  rc  s     r'   r?   zFocalNetBackbone.__init__  sQ     v&#--.1D1DD%f- 	r&   rQ  rP   r  r
  rR   c                    ||n| j                   j                  }||n| j                   j                  }| j                  |dd      }|j                  }d}t        | j                        D ]  \  }}|| j                  v s|||   fz  } |s|f}	|r|	|j                  fz  }	|	S t        ||r|j                  d      S dd      S )a|  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr>  r%   )feature_mapsr   
attentions)
r5   r?  r  r  r   r  stage_namesr   r   r   )
rM   rP   r  r
  rY  r   rv  idxstager   s
             r'   ra   zFocalNetBackbone.forward  s    8 &1%<k$++B]B]$8$D $++JjJj 	 --4UY-Z66#D$4$45 	6JC)))s!3 55	6 "_F#70022M%3G'//
 	
MQ
 	
r&   )NN)r   r   r    r   r?   r   rE  r   r   rG  r"   rd   r   r  ra   re   rf   s   @r'   rq  rq    st    ~  ++DE>X 04&*	2
ll2
 'tn2
 d^	2

 
2
 Y F2
r&   rq  )r   F)Br!   collections.abcrp   rU  dataclassesr   typingr   r   r   r"   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   modeling_utilsr   utilsr   r   r   r   r   r   utils.backbone_utilsr   configuration_focalnetr   
get_loggerr   loggerrG  rF  rH  rm  rn  r   r)   r,   r0   Moduler3   r@   rd   r   r  r   r   r   r   r   r   r  r  FOCALNET_START_DOCSTRINGrE  r.  rJ  r_  rq  r%   r&   r'   <module>r     s~      ! ) )    A A ! . -  2 2 
		H	% # 0 %  4 1  FK F F4 F+ F F8 F F F8 FK F F8%- %-PD-bii D-PU\\ e T V[VbVb *-ryy -D DN")) &BBII BJ?BII ?DO
bii O
f*o *2	   \I
+ I
	I
X 
 b
%< b
b
J  P
%< P
P
f  	?
. ?
?
r&   