
    sgD              	          d Z ddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZmZmZ ddlmZ  ej<                  e      Z dZ!dZ"g dZ#e G d de             Z$e G d de             Z%d Z&d Z' G d dejP                        Z) G d dejP                        Z* G d dejP                        Z+d:de
jX                  de-de.de
jX                  fd Z/ G d! d"ejP                        Z0 G d# d$ejP                        Z1 G d% d&ejP                        Z2 G d' d(ejP                        Z3 G d) d*ejP                        Z4 G d+ d,ejP                        Z5 G d- d.ejP                        Z6 G d/ d0ejP                        Z7 G d1 d2ejP                        Z8 G d3 d4e      Z9d5Z:d6Z; ed7e:       G d8 d9e9             Z<y);zPyTorch Donut Swin Transformer model.

This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden
states.    N)	dataclass)OptionalTupleUnion)nn   )ACT2FN)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging	torch_int   )DonutSwinConfigr   z0https://huggingface.co/naver-clova-ix/donut-base)r   1   i   c                       e Zd ZU dZdZej                  ed<   dZe	e
ej                  df      ed<   dZe	e
ej                  df      ed<   dZe	e
ej                  df      ed<   y)DonutSwinEncoderOutputa  
    DonutSwin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r   r        `/var/www/html/venv/lib/python3.12/site-packages/transformers/models/donut/modeling_donut_swin.pyr   r   5   sx    2 ,0u((/=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr%   r   c                       e Zd ZU dZdZej                  ed<   dZe	ej                     ed<   dZ
e	eej                  df      ed<   dZe	eej                  df      ed<   dZe	eej                  df      ed<   y)	DonutSwinModelOutputaY  
    DonutSwin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r    r   r!   r"   r#   r)   r   r   r   r   r   r$   r%   r&   r(   r(   W   s    6 ,0u((/15M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr%   r(   c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowss          r&   window_partitionr;   }   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr%   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )z?
    Merges windows to produce higher resolution features.
    r.   r   r   r   r+   r,   r-   r/   )r:   r5   r7   r8   r9   s        r&   window_reverser=      sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr%   c            
            e Zd ZdZd fd	Zdej                  dededej                  fdZ	 	 dde	ej                     d	e	ej                     d
edeej                     fdZ xZS )DonutSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    c                 ~   t         |           t        |      | _        | j                  j                  }| j                  j
                  | _        |r4t        j                  t        j                  dd|j                              nd | _        |j                  r=t        j                  t        j                  d|dz   |j                              | _        nd | _        t        j                  |j                        | _        t        j"                  |j$                        | _        |j(                  | _        || _        y )Nr   )super__init__DonutSwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr!   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrT   use_mask_tokenrE   	__class__s       r&   rB   zDonutSwinEmbeddings.__init__   s     8 @++77//99O]",,u{{1a9I9I'JKcg))')||EKK;QR?TZTdTd4e'fD$'+D$LL!1!12	zz&"<"<= ++r%   
embeddingsr7   r8   returnc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr.   g      ?r   r   r+   bicubicF)sizemodealign_cornersdim)r0   rM   r!   jit
is_tracingrS   r   reshaper2   r   
functionalinterpolater1   cat)rU   rX   r7   r8   rE   num_positionsclass_pos_embedpatch_pos_embedr`   
new_height	new_widthsqrt_num_positionss               r&   interpolate_pos_encodingz,DonutSwinEmbeddings.interpolate_pos_encoding   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr%   pixel_valuesbool_masked_posrm   c                    |j                   \  }}}}| j                  |      \  }}	| j                  |      }|j                         \  }
}}|K| j                  j                  |
|d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  (|r|| j                  |||      z   }n|| j                  z   }| j                  |      }||	fS )Nr.         ?)r0   rD   rO   r\   rK   expand	unsqueezetype_asrM   rm   rR   )rU   rn   ro   rm   _r9   r7   r8   rX   output_dimensionsr6   seq_lenmask_tokensmasks                 r&   forwardzDonutSwinEmbeddings.forward   s     *6););&<(,(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ##/''$*G*G
TZ\a*bb
'$*B*BB
\\*-
,,,r%   )F)NF)r   r   r   r    rB   r!   Tensorintrm   r   r"   
BoolTensorboolr   rz   __classcell__rW   s   @r&   r?   r?      s    &&D5<< &D &DUX &D]b]i]i &DV 7;).	-u001- "%"2"23- #'	-
 
u||	-r%   r?   c                   v     e Zd ZdZ fdZd Zdeej                     de	ej                  e	e   f   fdZ xZS )rC   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        |d   |d   z  |d   |d   z  f| _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)rA   rB   
image_sizerS   r9   rJ   
isinstancecollectionsabcIterablerE   rF   r   Conv2d
projection)rU   rT   r   rS   r9   hidden_sizerE   rW   s          r&   rB   z!DonutSwinPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79I9Ik#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY))L+:^hir%   c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )rS   r   rd   pad)rU   rn   r7   r8   
pad_valuess        r&   	maybe_padz"DonutSwinPatchEmbeddings.maybe_pad  s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr%   rn   rY   c                     |j                   \  }}}}| j                  |||      }| j                  |      }|j                   \  }}}}||f}|j                  d      j	                  dd      }||fS )Nr+   r   )r0   r   r   flatten	transpose)rU   rn   ru   r9   r7   r8   rX   rv   s           r&   rz   z DonutSwinPatchEmbeddings.forward  s}    )5););&<~~lFEB__\2
(..1fe#UO''*44Q:
,,,r%   )r   r   r   r    rB   r   r   r!   r"   r   r{   r|   rz   r   r   s   @r&   rC   rC      sF    j	-HU->->$? 	-E%,,X]^aXbJbDc 	-r%   rC   c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )DonutSwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionr`   
norm_layerrY   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr,   r+   Fbias)rA   rB   r   r`   r   Linear	reductionrO   )rU   r   r`   r   rW   s       r&   rB   zDonutSwinPatchMerging.__init__+  sI     01s7AG%@q3w'	r%   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr+   r   r   )r   rd   r   )rU   r4   r7   r8   
should_padr   s         r&   r   zDonutSwinPatchMerging.maybe_pad2  sU    qjAo:519>
Q519a!<JMM--mZHMr%   r4   input_dimensionsc                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r+   r   r.   r,   )r0   r1   r   r!   rf   rO   r   )rU   r4   r   r7   r8   r6   r`   r9   input_feature_0input_feature_1input_feature_2input_feature_3s               r&   rz   zDonutSwinPatchMerging.forward:  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r%   )r   r   r   r    r   rN   r   r|   ModulerB   r   r!   r{   rz   r   r   s   @r&   r   r     sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r%   r   input	drop_probtrainingrY   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   dtypedevice)r0   ndimr!   randr   r   floor_div)r   r   r   	keep_probr0   random_tensoroutputs          r&   	drop_pathr   U  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr%   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
DonutSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rY   c                 0    t         |           || _        y N)rA   rB   r   )rU   r   rW   s     r&   rB   zDonutSwinDropPath.__init__m  s    "r%   r   c                 D    t        || j                  | j                        S r   )r   r   r   rU   r   s     r&   rz   zDonutSwinDropPath.forwardq  s    FFr%   c                 8    dj                  | j                        S )Nzp={})formatr   rU   s    r&   
extra_reprzDonutSwinDropPath.extra_reprt  s    }}T^^,,r%   r   )r   r   r   r    r   floatrB   r!   r{   rz   strr   r   r   s   @r&   r   r   j  sG    b#(5/ #T #GU\\ Gell G-C -r%   r   c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
DonutSwinSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        t        j"                  | j                  d         }t        j"                  | j                  d         }t        j$                  t'        ||gd            }t        j(                  |d      }|d d d d d f   |d d d d d f   z
  }	|	j+                  ddd      j-                         }	|	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   d| j                  d   z  dz
  z  cc<   |	j/                  d	      }
| j1                  d
|
       t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j<                  |j>                        | _         y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r+   r   ij)indexingr.   relative_position_indexr   )!rA   rB   
ValueErrornum_attention_headsr|   attention_head_sizeall_head_sizer   r   r   r   r5   r   rH   r!   rI   relative_position_bias_tablearangestackr   r   r2   r3   sumregister_bufferr   qkv_biasquerykeyvaluerP   attention_probs_dropout_probrR   )rU   rT   r`   	num_headsr5   coords_hcoords_wcoordscoords_flattenrelative_coordsr   rW   s              r&   rB   zDonutSwinSelfAttention.__init__z  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
)
 << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OPYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr%   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr.   r   r+   r   r   )r\   r   r   r1   r2   )rU   xnew_x_shapes      r&   transpose_for_scoresz+DonutSwinSelfAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r%   r   attention_mask	head_maskoutput_attentionsrY   c                    |j                   \  }}}| j                  |      }| j                  | j                  |            }	| j                  | j	                  |            }
| j                  |      }t        j                  ||	j                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j!                         }||j#                  d      z   }|r|j                   d   }|j                  ||z  || j$                  ||      }||j#                  d      j#                  d      z   }|j                  d| j$                  ||      }t&        j(                  j+                  |d      }| j-                  |      }|||z  }t        j                  ||
      }|j                  dddd      j!                         }|j/                         d d | j0                  fz   }|j                  |      }|r||f}|S |f}|S )Nr.   r   r   r+   r_   r   )r0   r   r   r   r   r!   matmulr   mathsqrtr   r   r   r1   r5   r2   r3   rs   r   r   rd   softmaxrR   r\   r   )rU   r   r   r   r   r6   r`   r9   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r&   rz   zDonutSwinSelfAttention.forward  s    )6(;(;%
C JJ}5--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r%   NNF)r   r   r   rB   r   r!   r{   r   r"   r~   r   rz   r   r   s   @r&   r   r   y  sv    #GJ% 7;15,16||6 !!2!236 E--.	6
 $D>6 
u||	6r%   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )DonutSwinSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r   )rA   rB   r   r   denserP   r   rR   rU   rT   r`   rW   s      r&   rB   zDonutSwinSelfOutput.__init__  s6    YYsC(
zz&"E"EFr%   r   input_tensorrY   c                 J    | j                  |      }| j                  |      }|S r   r   rR   )rU   r   r   s      r&   rz   zDonutSwinSelfOutput.forward  s$    

=1]3r%   r   r   r   rB   r!   r{   rz   r   r   s   @r&   r   r     s2    G
U\\  RWR^R^ r%   r   c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
DonutSwinAttentionc                     t         |           t        ||||      | _        t	        ||      | _        t               | _        y r   )rA   rB   r   rU   r   r   setpruned_heads)rU   rT   r`   r   r5   rW   s        r&   rB   zDonutSwinAttention.__init__  s8    *63	;O	)&#6Er%   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r_   )lenr   rU   r   r   r  r   r   r   r   r   r   r   union)rU   headsindexs      r&   prune_headszDonutSwinAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r%   r   r   r   r   rY   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )rU   r   )rU   r   r   r   r   self_outputsattention_outputr   s           r&   rz   zDonutSwinAttention.forward  sG     yy	K\];;|AF#%QR(88r%   r   )r   r   r   rB   r	  r!   r{   r   r"   r~   r   rz   r   r   s   @r&   r   r     st    ";* 7;15,1
||
 !!2!23
 E--.	

 $D>
 
u||	
r%   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DonutSwinIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rA   rB   r   r   r|   	mlp_ratior   r   
hidden_actr   r	   intermediate_act_fnr   s      r&   rB   zDonutSwinIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r%   r   rY   c                 J    | j                  |      }| j                  |      }|S r   )r   r  r   s     r&   rz   zDonutSwinIntermediate.forward  s&    

=100?r%   r   r   s   @r&   r  r    s#    9U\\ ell r%   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DonutSwinOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y r   )
rA   rB   r   r   r|   r  r   rP   rQ   rR   r   s      r&   rB   zDonutSwinOutput.__init__$  sF    YYs6#3#3c#9:C@
zz&"<"<=r%   r   rY   c                 J    | j                  |      }| j                  |      }|S r   r   r   s     r&   rz   zDonutSwinOutput.forward)  s$    

=1]3r%   r   r   s   @r&   r  r  #  s#    >
U\\ ell r%   r  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 ddej                  de	e
e
f   deej                     dee   d	ee   d
e	ej                  ej                  f   fdZ xZS )DonutSwinLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )N)eps)r5   r   )rA   rB   chunk_size_feed_forward
shift_sizer5   r   r   rN   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr  intermediater  r   )rU   rT   r`   r   r   drop_path_rater  rW   s          r&   rB   zDonutSwinLayer.__init__1  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a>Ls>R*>:XZXcXcXe!||CV5J5JK1&#>%fc2r%   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr5   r   r  r!   ra   rb   tensor)rU   r   s     r&   set_shift_and_window_sizez(DonutSwinLayer.set_shift_and_window_size>  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r%   c           	         | j                   dkD  rzt        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  t        d            j                  |dk(  t        d            }|S d }|S )Nr   r   r   r.   r+   g      Yr   )
r  r!   rI   slicer5   r;   r1   rs   masked_fillr   )rU   r7   r8   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r&   get_attn_maskzDonutSwinLayer.get_attn_maskF  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1neFmLXXYbfgYginoristI  Ir%   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r&  )r5   r   rd   r   )rU   r   r7   r8   	pad_right
pad_bottomr   s          r&   r   zDonutSwinLayer.maybe_padb  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r%   r   r   r   r   always_partitionrY   c                    |s| j                  |       n	 |\  }}|j                         \  }}	}
|}| j                  |      }|j                  ||||
      }| j	                  |||      \  }}|j
                  \  }	}}}	| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |
      }| j                  |||j                  |j                        }| j                  ||||      }|d   }|j                  d| j                  | j                  |
      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |
      }|| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r+   )shiftsdimsr.   r   )r   r   r-   r   )r)  r\   r  r1   r   r0   r  r!   rollr;   r5   r5  r   r   r   r=   r3   r   r"  r#  r   )rU   r   r   r   r   r9  r7   r8   r6   ru   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr4  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r&   rz   zDonutSwinLayer.forwardi  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN!9iK\ + 
 -Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr%   )r   r   NFF)r   r   r   rB   r)  r5  r   r!   r{   r   r|   r   r"   r~   rz   r   r   s   @r&   r  r  0  s    38) 26,1+0A||A  S/A E--.	A
 $D>A #4.A 
u||U\\)	*Ar%   r  c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   deej                     dee
   dee
   deej
                     fdZ xZS )
DonutSwinStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr+   r   )rT   r`   r   r   r$  r  )r`   r   F)rA   rB   rT   r`   r   
ModuleListranger  r5   blocksrN   
downsamplepointing)
rU   rT   r`   r   depthr   r   rQ  irW   s
            r&   rB   zDonutSwinStage.__init__  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r   r   r   r   r9  rY   c                    |\  }}t        | j                        D ]  \  }}	|||   nd }
 |	|||
||      }|d   }! |}| j                  )|dz   dz  |dz   dz  }}||||f}| j                  ||      }n||||f}|||f}|r|dd  z  }|S )Nr   r   r+   )	enumeraterP  rQ  )rU   r   r   r   r   r9  r7   r8   rT  layer_modulelayer_head_maskrI  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledrv   stage_outputss                    r&   rz   zDonutSwinStage.forward  s     )(5 	-OA|.7.CilO(/BSUeM *!,M	- -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr%   rJ  )r   r   r   rB   r!   r{   r   r|   r   r"   r~   rz   r   r   s   @r&   rL  rL    sz    < 26,1+0||  S/ E--.	
 $D> #4. 
u||	r%   rL  c                        e Zd Z fdZ	 	 	 	 	 	 ddej
                  deeef   deej                     dee
   dee
   dee
   dee
   d	ee
   d
eeef   fdZ xZS )DonutSwinEncoderc                    t         |           t        |j                        | _        || _        t        j                  d|j                  t        |j                              D cg c]  }|j                          }}t        j                  t        | j                        D cg c]  }t        |t        |j                   d|z  z        |d   d|z  z  |d   d|z  z  f|j                  |   |j"                  |   |t        |j                  d |       t        |j                  d |dz           || j                  dz
  k  rt$        nd        c}      | _        d| _        y c c}w c c}w )Nr   r+   r   )rT   r`   r   rS  r   r   rQ  F)rA   rB   r  depths
num_layersrT   r!   linspacer$  r   itemr   rN  rO  rL  r|   rJ   r   r   layersgradient_checkpointing)rU   rT   rF   r   dpri_layerrW   s         r&   rB   zDonutSwinEncoder.__init__  sJ   fmm,!&63H3H#fmmJ\!]^Aqvvx^^mm  %T__5  !F,,q'z9:&/lq'z&BIaLUVX_U_D`%a --0$..w7!#fmmHW&=">V]]S`U\_`U`EaAbc9@4??UVCV9V4]a
 ',#! _s   'E$&B*E)r   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingr9  return_dictrY   c	           	      Z   |rdnd }	|rdnd }
|rdnd }|rE|j                   \  }}} |j                  |g|| }|j                  dddd      }|	|fz  }	|
|fz  }
t        | j                        D ]  \  }}|||   nd }| j
                  r-| j                  r!| j                  |j                  |||||      }n ||||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }}} |j                  |g|d   |d   f| }|j                  dddd      }|	|fz  }	|
|fz  }
nI|rG|sE|j                   \  }}} |j                  |g|| }|j                  dddd      }|	|fz  }	|
|fz  }
|s||dd  z  } |st        d ||	|fD              S t        ||	||
	      S )
Nr$   r   r   r   r+   r   r.   c              3   &   K   | ]	  }||  y wr   r$   ).0vs     r&   	<genexpr>z+DonutSwinEncoder.forward.<locals>.<genexpr>F  s     mq_`_lms   )r   r   r   r   )r0   r1   r2   rV  rd  re  r   _gradient_checkpointing_func__call__tupler   )rU   r   r   r   r   rh  ri  r9  rj  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr6   ru   r   reshaped_hidden_staterT  rW  rX  rI  rY  rv   s                         r&   rz   zDonutSwinEncoder.forward  s    #7BD+?RT"$5b4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 *	9OA|.7.CilO**t}} $ A A ))!$#%$! !-!#3_FWYi! *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#U*	9X m]4EGZ$[mmm%++*#=	
 	
r%   )NFFFFT)r   r   r   rB   r!   r{   r   r|   r   r"   r~   r   r   rz   r   r   s   @r&   r^  r^    s    ,4 26,1/4CH+0&*K
||K
  S/K
 E--.	K

 $D>K
 'tnK
 3;4.K
 #4.K
 d^K
 
u,,	-K
r%   r^  c                   ,    e Zd ZdZeZdZdZdZdgZ	d Z
y)DonutSwinPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    swinrn   TrL  c                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsr   )meanstdNrq   )r   r   r   r   weightdatanormal_rT   initializer_ranger   zero_rN   fill_)rU   modules     r&   _init_weightsz&DonutSwinPreTrainedModel._init_weights]  s    fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r%   N)r   r   r   r    r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr  r$   r%   r&   rx  rx  Q  s-    
 #L$O&*#)*
*r%   rx  aL  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DonutSwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DonutImageProcessor.__call__`] for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zdThe bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fd	Zd Zd Z ee       ee	e
ede      	 	 	 	 	 	 	 ddeej                     deej                      deej                     d	ee   d
ee   dedee   deee
f   fd              Z xZS )DonutSwinModelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        |rt        j                  d      nd | _        | j#                          y )Nr+   r   )rV   )rA   rB   rT   r  r`  ra  r|   rJ   num_featuresr?   rX   r^  rG   encoderr   AdaptiveAvgPool1dpooler	post_init)rU   rT   add_pooling_layerrV   rW   s       r&   rB   zDonutSwinModel.__init__  s     fmm, 0 0119L3M MN-f^T'0J0JK1Bb**1- 	r%   c                 .    | j                   j                  S r   )rX   rD   r   s    r&   get_input_embeddingsz#DonutSwinModel.get_input_embeddings  s    ///r%   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  layerr   r	  )rU   heads_to_pruner  r  s       r&   _prune_headszDonutSwinModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr%   vision)
checkpointoutput_typer  modalityexpected_outputrn   ro   r   r   rh  rm   rj  rY   c                 ~   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |t        | j                   j                              }| j                  |||      \  }}	| j                  ||	||||      }
|
d   }d}| j                  7| j                  |j                  dd            }t        j                  |d      }|s||f|
dd z   }|S t        |||
j                  |
j                   |
j"                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)ro   rm   )r   r   rh  rj  r   r   r+   )r   r)   r   r   r   )rT   r   rh  use_return_dictr   get_head_maskr  r`  rX   r  r  r   r!   r   r(   r   r   r   )rU   rn   ro   r   r   rh  rm   rj  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s                 r&   rz   zDonutSwinModel.forward  sb   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@ &&y#dkk6H6H2IJ	-1__/Tl .= .
** ,,/!5# ' 
 *!,;;" KK(A(A!Q(GHM!MM-;M%}58KKFM#-')77&11#2#I#I
 	
r%   )TF)NNNNNFN)r   r   r   rB   r  r  r   SWIN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr(   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r!   r"   r}   r~   r   r   rz   r   r   s   @r&   r  r    s    
0C ++@A&($. 596:15,0/3).&*=
u001=
 "%"2"23=
 E--.	=

 $D>=
 'tn=
 #'=
 d^=
 
u**	+=
 B=
r%   r  )r   F)=r    collections.abcr   r   dataclassesr   typingr   r   r   r!   torch.utils.checkpointr   activationsr	   modeling_utilsr
   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   configuration_donut_swinr   
get_loggerr   loggerr  r  r  r   r(   r;   r=   r   r?   rC   r   r{   r   r~   r   r   r   r   r   r  r  r  rL  r^  rx  SWIN_START_DOCSTRINGr  r  r$   r%   r&   <module>r     s  
   ! ) )    ! - [ [  6 
		H	% $ I %  K[ K K@  K;  K  KH	Y-")) Y-z(-ryy (-X3BII 3nU\\ e T V[VbVb *-		 -aRYY aJ
")) 
# #NBII  	bii 	zRYY z|9RYY 9zb
ryy b
L* *2	  0 j_
- _
	_
r%   