"""PyTorch YOLOS model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_yolos import YolosConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "YolosConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "hustvl/yolos-small"
_EXPECTED_OUTPUT_SHAPE = [1, 3401, 384]


@dataclass
class YolosObjectDetectionOutput(ModelOutput):
    """
    Output type of [`YolosForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~YolosImageProcessor.post_process`] to retrieve the unnormalized bounding
            boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    auxiliary_outputs: Optional[List[Dict]] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class YolosEmbeddings(nn.Module):
    """
    Construct the CLS token, detection tokens, position and patch embeddings.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.detection_tokens = nn.Parameter(torch.zeros(1, config.num_detection_tokens, config.hidden_size))
        self.patch_embeddings = YolosPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(
            torch.zeros(1, num_patches + config.num_detection_tokens + 1, config.hidden_size)
        )

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.interpolation = InterpolateInitialPositionEmbeddings(config)
        self.config = config

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # add the [CLS] and detection tokens to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        detection_tokens = self.detection_tokens.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings, detection_tokens), dim=1)

        # add positional encoding to each token
        # this might require interpolation of the existing position embeddings
        position_embeddings = self.interpolation(self.position_embeddings, (height, width))

        embeddings = embeddings + position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class InterpolateInitialPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(1, 2)
        batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(batch_size, hidden_size, patch_height, patch_width)

        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.flatten(2).transpose(1, 2)
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=1)
        return scale_pos_embed


class InterpolateMidPositionEmbeddings(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.config = config

    def forward(self, pos_embed, img_size=(800, 1344)) -> torch.Tensor:
        cls_pos_embed = pos_embed[:, :, 0, :]
        cls_pos_embed = cls_pos_embed[:, None]
        det_pos_embed = pos_embed[:, :, -self.config.num_detection_tokens :, :]
        patch_pos_embed = pos_embed[:, :, 1 : -self.config.num_detection_tokens, :]
        patch_pos_embed = patch_pos_embed.transpose(2, 3)
        depth, batch_size, hidden_size, seq_len = patch_pos_embed.shape

        patch_height, patch_width = (
            self.config.image_size[0] // self.config.patch_size,
            self.config.image_size[1] // self.config.patch_size,
        )
        patch_pos_embed = patch_pos_embed.view(depth * batch_size, hidden_size, patch_height, patch_width)
        height, width = img_size
        new_patch_height, new_patch_width = height // self.config.patch_size, width // self.config.patch_size
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_patch_height, new_patch_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = (
            patch_pos_embed.flatten(2)
            .transpose(1, 2)
            .contiguous()
            .view(depth, batch_size, new_patch_height * new_patch_width, hidden_size)
        )
        scale_pos_embed = torch.cat((cls_pos_embed, patch_pos_embed, det_pos_embed), dim=2)
        return scale_pos_embed


class YolosPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values matches with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class YolosSelfAttention(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class YolosSdpaSelfAttention(YolosSelfAttention):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__(config)
        self.attention_probs_dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        if output_attentions or head_mask is not None:
            logger.warning_once(
                "`YolosSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but "
                "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
                'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                head_mask=head_mask,
                output_attentions=output_attentions,
            )

        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        context_layer = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            self.attention_probs_dropout_prob if self.training else 0.0,
            is_causal=False,
            scale=None,
        )

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, None


class YolosSelfOutput(nn.Module):
    """
    The residual connection is defined in YolosLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class YolosAttention(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.attention = YolosSelfAttention(config)
        self.output = YolosSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class YolosSdpaAttention(YolosAttention):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__(config)
        self.attention = YolosSdpaSelfAttention(config)


class YolosIntermediate(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class YolosOutput(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


YOLOS_ATTENTION_CLASSES = {
    "eager": YolosAttention,
    "sdpa": YolosSdpaAttention,
}


class YolosLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = YOLOS_ATTENTION_CLASSES[config._attn_implementation](config)
        self.intermediate = YolosIntermediate(config)
        self.output = YolosOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in Yolos, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in Yolos, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class YolosEncoder(nn.Module):
    def __init__(self, config: YolosConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([YolosLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        seq_length = (
            1 + (config.image_size[0] * config.image_size[1] // config.patch_size**2) + config.num_detection_tokens
        )
        self.mid_position_embeddings = (
            nn.Parameter(
                torch.zeros(
                    config.num_hidden_layers - 1,
                    1,
                    seq_length,
                    config.hidden_size,
                )
            )
            if config.use_mid_position_embeddings
            else None
        )

        self.interpolation = InterpolateMidPositionEmbeddings(config) if config.use_mid_position_embeddings else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        height,
        width,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if self.config.use_mid_position_embeddings:
            interpolated_mid_position_embeddings = self.interpolation(self.mid_position_embeddings, (height, width))

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if self.config.use_mid_position_embeddings:
                if i < (self.config.num_hidden_layers - 1):
                    hidden_states = hidden_states + interpolated_mid_position_embeddings[i]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class YolosPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = YolosConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []
    _supports_sdpa = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


YOLOS_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`YolosConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

YOLOS_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`YolosImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare YOLOS Model transformer outputting raw hidden-states without any specific head on top.",
    YOLOS_START_DOCSTRING,
)
class YolosModel(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig, add_pooling_layer: bool = True):
        super().__init__(config)
        self.config = config

        self.embeddings = YolosEmbeddings(config)
        self.encoder = YolosEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = YolosPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> YolosPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model.

        Args:
            heads_to_prune (`dict`):
                See base class `PreTrainedModel`. The input dictionary must have the following format: {layer_num:
                list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            height=pixel_values.shape[-2],
            width=pixel_values.shape[-1],
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class YolosPooler(nn.Module):
    def __init__(self, config: YolosConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class YolosMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


@add_start_docstrings(
    """
    YOLOS Model (consisting of a ViT encoder) with object detection heads on top, for tasks such as COCO detection.
    """,
    YOLOS_START_DOCSTRING,
)
class YolosForObjectDetection(YolosPreTrainedModel):
    def __init__(self, config: YolosConfig):
        super().__init__(config)

        # YOLOS (ViT) encoder model
        self.vit = YolosModel(config, add_pooling_layer=False)

        # Object detection heads
        # We add one to the number of labels for the "no object" class
        self.class_labels_classifier = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=config.num_labels + 1, num_layers=3
        )
        self.bbox_predictor = YolosMLPPredictionHead(
            input_dim=config.hidden_size, hidden_dim=config.hidden_size, output_dim=4, num_layers=3
        )

        # Initialize weights and apply final processing
        self.post_init()

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionaries with non-homogeneous values, such
        # as a dict having both a Tensor and a list
        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]

    @add_start_docstrings_to_model_forward(YOLOS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=YolosObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: Optional[List[Dict]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, YolosObjectDetectionOutput]:
        r"""
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: `'class_labels'` and `'boxes'` (the class labels and bounding boxes of an image in the
            batch respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding
            boxes in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image,
            4)`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
        >>> model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-tiny")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3]
        Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36]
        Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09]
        Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67]
        Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # First, send images through the YOLOS base model to obtain hidden states
        outputs = self.vit(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Take the final hidden states of the detection tokens
        sequence_output = sequence_output[:, -self.config.num_detection_tokens :, :]

        # Class logits + predicted bounding boxes
        logits = self.class_labels_classifier(sequence_output)
        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()

        loss, loss_dict, auxiliary_outputs = None, None, None
        if labels is not None:
            outputs_class, outputs_coord = None, None
            if self.config.auxiliary_loss:
                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
                outputs_class = self.class_labels_classifier(intermediate)
                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
            loss, loss_dict, auxiliary_outputs = self.loss_function(
                logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
            )

        if not return_dict:
            if auxiliary_outputs is not None:
                output = (logits, pred_boxes) + auxiliary_outputs + outputs
            else:
                output = (logits, pred_boxes) + outputs
            return ((loss, loss_dict) + output) if loss is not None else output

        return YolosObjectDetectionOutput(
            loss=loss,
            loss_dict=loss_dict,
            logits=logits,
            pred_boxes=pred_boxes,
            auxiliary_outputs=auxiliary_outputs,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
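
# ---------------------------------------------------------------------------
# Usage sketch (editorial addition, not part of the upstream module): a minimal
# smoke test of the bare `YolosModel`, assuming the public `hustvl/yolos-small`
# checkpoint and the COCO sample image used in the docstrings above are
# reachable. It illustrates the sequence layout built by `YolosEmbeddings`:
# one [CLS] token, then the patch tokens, then `config.num_detection_tokens`
# detection tokens at the *end* of the sequence, which is why
# `YolosForObjectDetection` slices `[:, -num_detection_tokens:, :]`. Because
# this file uses relative imports, run it as a module, e.g.
# `python -m transformers.models.yolos.modeling_yolos`.

if __name__ == "__main__":
    import requests
    from PIL import Image

    from transformers import AutoImageProcessor

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-small")
    model = YolosModel.from_pretrained("hustvl/yolos-small")

    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # (batch_size, 1 + num_patches + num_detection_tokens, hidden_size);
    # for this image and checkpoint this matches _EXPECTED_OUTPUT_SHAPE,
    # i.e. torch.Size([1, 3401, 384])
    print(outputs.last_hidden_state.shape)

    # the trailing detection-token states are what the detection heads consume
    detection_states = outputs.last_hidden_state[:, -model.config.num_detection_tokens :, :]
    print(detection_states.shape)  # torch.Size([1, 100, 384])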