
    sgX                    L   d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddl
mZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZmZmZmZmZ ddlm Z  ddl!m"Z"  e       rddl#m$Z$  ejJ                  e&      Z'dZ(dZ)e G d de             Z*e G d de             Z+e G d de             Z,e G d de             Z- G d dej\                        Z/d Z0 G d dej\                        Z1 G d d ej\                        Z2 G d! d"ej\                        Z3 G d# d$ej\                        Z4d% Z5 G d& d'ej\                        Z6 G d( d)ej\                        Z7 G d* d+ej\                        Z8 G d, d-e      Z9d.Z:d/Z; G d0 d1e9      Z< G d2 d3e9      Z= ed4e:       G d5 d6e9             Z> G d7 d8ej\                        Z? ed9e:       G d: d;e9             Z@ ed<e:       G d= d>e9             ZAd?eBfd@ZC G dA dBej\                        ZD G dC dDej\                        ZEg dEZFy)FzPyTorch DETR model.    N)	dataclass)DictListOptionalTupleUnion)Tensornn   )ACT2FN)_prepare_4d_attention_mask)BaseModelOutput"BaseModelOutputWithCrossAttentionsSeq2SeqModelOutput)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardis_timm_availableloggingreplace_return_docstringsrequires_backends)load_backbone   )
DetrConfig)create_modelr   zfacebook/detr-resnet-50c                   :    e Zd ZU dZdZeej                     ed<   y)DetrDecoderOutputa  
    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
    Nintermediate_hidden_states	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     Y/var/www/html/venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.pyr   r   3   s     2 ?C):): ;Br)   r   c                   :    e Zd ZU dZdZeej                     ed<   y)DetrModelOutputa)  
    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
    Nr   r    r(   r)   r*   r,   r,   Q   s!    !F ?C):): ;Br)   r,   c                      e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZej                  ed<   dZej                  ed<   dZeee
      ed<   dZeej                     ed<   dZeeej                        ed	<   dZeeej                        ed
<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)DetrObjectDetectionOutputa  
    Output type of [`DetrForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    Nloss	loss_dictlogits
pred_boxesauxiliary_outputslast_hidden_statedecoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentions)r!   r"   r#   r$   r/   r   r%   r&   r'   r0   r   r1   r2   r3   r   r4   r5   r   r6   r7   r8   r9   r:   r(   r)   r*   r.   r.   y   s   -^ )-D(5$$
%, $Ix~$ $FE$$(J!!(.2xT
+259x 1 129@D8E%*;*;$<=D=Au'8'8!9:A;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:Ar)   r.   c                      e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZej                  ed<   dZej                  ed<   dZej                  ed<   dZeee
      ed<   dZeej                     ed	<   dZeeej                        ed
<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)DetrSegmentationOutputaP  
    Output type of [`DetrForSegmentation`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
            Segmentation masks logits for all queries. See also
            [`~DetrImageProcessor.post_process_semantic_segmentation`] or
            [`~DetrImageProcessor.post_process_instance_segmentation`]
            [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
            segmentation masks respectively.
        auxiliary_outputs (`list[Dict]`, *optional*):
            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
            `pred_boxes`) for each decoder layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of each
            layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
            layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
    Nr/   r0   r1   r2   
pred_masksr3   r4   r5   r6   r7   r8   r9   r:   )r!   r"   r#   r$   r/   r   r%   r&   r'   r0   r   r1   r2   r=   r3   r   r4   r5   r   r6   r7   r8   r9   r:   r(   r)   r*   r<   r<      s$   3j )-D(5$$
%, $Ix~$ $FE$$(J!!($(J!!(.2xT
+259x 1 129@D8E%*;*;$<=D=Au'8'8!9:A;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:Ar)   r<   c                   2     e Zd ZdZ fdZ fdZd Z xZS )DetrFrozenBatchNorm2dz
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
    torchvision.models.resnet[18,34,50,101] produce nans.
    c                 J   t         |           | j                  dt        j                  |             | j                  dt        j
                  |             | j                  dt        j
                  |             | j                  dt        j                  |             y )Nweightbiasrunning_meanrunning_var)super__init__register_bufferr%   oneszeros)selfn	__class__s     r*   rF   zDetrFrozenBatchNorm2d.__init__  sn    Xuzz!}5VU[[^4^U[[^<]EJJqM:r)   c           	      H    |dz   }||v r||= t         	|   |||||||       y )Nnum_batches_tracked)rE   _load_from_state_dict)
rJ   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsnum_batches_tracked_keyrL   s
            r*   rO   z+DetrFrozenBatchNorm2d._load_from_state_dict  s?     #)+@"@"j023%oWa	
r)   c                 B   | j                   j                  dddd      }| j                  j                  dddd      }| j                  j                  dddd      }| j                  j                  dddd      }d}|||z   j                         z  }|||z  z
  }||z  |z   S )Nr   gh㈵>)rA   reshaperB   rD   rC   rsqrt)rJ   xrA   rB   rD   rC   epsilonscales           r*   forwardzDetrFrozenBatchNorm2d.forward  s     $$QAq1yy  B1-&&..q"a;((00B1=+/6688lU**5y4r)   )r!   r"   r#   r$   rF   rO   r_   __classcell__rL   s   @r*   r?   r?      s    ;	

 r)   r?   c                    | j                         D ]_  \  }}t        |t        j                        rt	        |j
                        }|j                  j                  t        j                  d      k(  s|j                  j                  j                  |j                         |j                  j                  j                  |j                         |j                  j                  j                  |j                         |j                  j                  j                  |j                         || j                  |<   t        t!        |j#                                     dkD  sUt%        |       b y)z
    Recursively replace all `torch.nn.BatchNorm2d` with `DetrFrozenBatchNorm2d`.

    Args:
        model (torch.nn.Module):
            input model
    metar   N)named_children
isinstancer
   BatchNorm2dr?   num_featuresrA   devicer%   datacopy_rB   rC   rD   _moduleslenlistchildrenreplace_batch_norm)modelnamemodule
new_modules       r*   ro   ro   '  s    ,,. 'ffbnn-.v/B/BCJ==''5<<+??!!&&,,V]];$$**6;;7'',,2263F3FG&&++11&2D2DE#-ENN4 tFOO%&'!+v&'r)   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )DetrConvEncoderz
    Convolutional backbone, using either the AutoBackbone API or one from the timm library.

    nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above.

    c                 ,   t         	|           || _        |j                  rt	        | dg       t        |di       }|i n|j                         }|j                  dd      }|j                  d|j                        }|j                  r|j                  dd      |d<   t        |j                  f|j                  d||d	|}nt        |      }t        j                          5  t#        |       d d d        || _        |j                  r$| j$                  j&                  j)                         n| j$                  j(                  | _        d }|j                  |j                  }n.|j,                  |j,                  j.                  }nt1        d
      d|v rp| j$                  j3                         D ]R  \  }}|j                  r!d|vsd|vsd|vs!|j5                  d       3d|vs8d|vs=d|vsB|j5                  d       T y y # 1 sw Y   xY w)Ntimmbackbone_kwargsout_indices)r      r      in_chansoutput_stride   T)
pretrainedfeatures_onlyry   r|   zGEither `backbone` or `backbone_config` should be provided in the configresnetlayer2layer3layer4Fzstage.1zstage.2zstage.3)rE   rF   configuse_timm_backboner   getattrcopypopnum_channelsdilationgetr   backboneuse_pretrained_backboner   r%   no_gradro   rp   feature_infochannelsintermediate_channel_sizesbackbone_config
model_type
ValueErrornamed_parametersrequires_grad_)
rJ   r   kwargsry   r   r   backbone_model_typerq   	parameterrL   s
            r*   rF   zDetrConvEncoder.__init__G  s    ## dVH-V%6;F!>Rv{{}F **]LAK!::j&2E2EFL*0**_b*I'#!99"'% H %V,H ]]_ 	)x(	)
282J2JDJJ##,,.PTPZPZPcPc 	' #??&"(//##/"("8"8"C"Cfgg**#'::#>#>#@ 8i++t+0DY]I]!007 ,$1F9\`K`!0078 +	) 	)s   %H		Hpixel_values
pixel_maskc                    | j                   j                  r| j                  |      n| j                  |      j                  }g }|D ]t  }t        j
                  j                  |d    j                         |j                  dd        j                  t        j                        d   }|j                  ||f       v |S )N)sizer   )r   r   rp   feature_mapsr
   
functionalinterpolatefloatshapetor%   boolappend)rJ   r   r   featuresoutfeature_mapmasks          r*   r_   zDetrConvEncoder.forward{  s    /3{{/L/L4::l+RVR\R\]iRjRwRw# 	,K==,,Z-=-C-C-EKL]L]^`^aLb,cffglgqgqrstuDJJT*+	, 
r)   )	r!   r"   r#   r$   rF   r%   r	   r_   r`   ra   s   @r*   ru   ru   ?  s)    28h	ELL 	ell 	r)   ru   c                   (     e Zd ZdZ fdZd Z xZS )DetrConvModelzp
    This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder.
    c                 >    t         |           || _        || _        y N)rE   rF   conv_encoderposition_embedding)rJ   r   r   rL   s      r*   rF   zDetrConvModel.__init__  s    ("4r)   c                     | j                  ||      }g }|D ]?  \  }}|j                  | j                  ||      j                  |j                               A ||fS r   )r   r   r   r   dtype)rJ   r   r   r   posr   r   s          r*   r_   zDetrConvModel.forward  sd    j9!$ 	YKJJt..{DADD[EVEVWX	Y Cxr)   r!   r"   r#   r$   rF   r_   r`   ra   s   @r*   r   r     s    5
r)   r   c                   *     e Zd ZdZd fd	Zd Z xZS )DetrSinePositionEmbeddingz
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    c                     t         |           || _        || _        || _        ||du rt        d      |dt        j                  z  }|| _        y )NFz+normalize should be True if scale is passedrz   )	rE   rF   embedding_dimtemperature	normalizer   mathpir^   )rJ   r   r   r   r^   rL   s        r*   rF   z"DetrSinePositionEmbedding.__init__  sW    *&"e!3JKK=KE
r)   c           
      h   |t        d      |j                  dt        j                        }|j                  dt        j                        }| j                  rB||d d dd d d f   dz   z  | j
                  z  }||d d d d dd f   dz   z  | j
                  z  }t        j                  | j                  t        j                  |j                        j                         }| j                  dt        j                  |dd	      z  | j                  z  z  }|d d d d d d d f   |z  }|d d d d d d d f   |z  }t        j                  |d d d d d d d
d df   j                         |d d d d d d dd df   j                         fd      j!                  d      }t        j                  |d d d d d d d
d df   j                         |d d d d d d dd df   j                         fd      j!                  d      }t        j"                  ||fd      j%                  d
ddd      }|S )NzNo pixel mask providedr   )r   rz   rY   gư>)r   rh   floor)rounding_moder   r{   dimr   )r   cumsumr%   float32r   r^   aranger   int64rh   r   r   divstacksincosflattencatpermute)	rJ   r   r   y_embedx_embeddim_tpos_xpos_yr   s	            r*   r_   z!DetrSinePositionEmbedding.forward  s   566##AU]]#;##AU]]#;>>BC!3d!:;djjHGArs!3d!:;djjHGT//u{{<K^K^_eeg  Q5!7)S%SVZVhVh%hi1a&.1a&.U1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgU1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgiiA.66q!QB
r)   )@   i'  FNr   ra   s   @r*   r   r     s    
	r)   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )DetrLearnedPositionEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    c                     t         |           t        j                  d|      | _        t        j                  d|      | _        y )N2   )rE   rF   r
   	Embeddingrow_embeddingscolumn_embeddings)rJ   r   rL   s     r*   rF   z%DetrLearnedPositionEmbedding.__init__  s4     ll2}=!#b-!@r)   c                 .   |j                   dd  \  }}t        j                  ||j                        }t        j                  ||j                        }| j	                  |      }| j                  |      }t        j                  |j                  d      j                  |dd      |j                  d      j                  d|d      gd      }	|	j                  ddd      }	|	j                  d      }	|	j                  |j                   d   ddd      }	|	S )Nr   rh   r   r   rY   r   rz   )
r   r%   r   rh   r   r   r   	unsqueezerepeatr   )
rJ   r   r   heightwidthwidth_valuesheight_valuesx_emby_embr   s
             r*   r_   z$DetrLearnedPositionEmbedding.forward  s    $**23/||E,2E2EFVL4G4GH&&|4##M2ii+2261a@%//RSBTB[B[\]_dfgBhioqrkk!Q"mmAjj++A.1a8
r)   )   r   r   ra   s   @r*   r   r     s    A

r)   r   c                     | j                   dz  }| j                  dk(  rt        |d      }|S | j                  dk(  rt        |      }|S t	        d| j                         )Nrz   sineT)r   learnedzNot supported )d_modelposition_embedding_typer   r   r   )r   n_stepsr   s      r*   build_position_encodingr     sq    nn!G%%/6w$O  
	'	'9	49'B  >&*H*H)IJKKr)   c                       e Zd ZdZ	 	 ddedededef fdZdej                  ded	efd
Z
dej                  dee	   fdZ	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dedeej                  eej                     eeej                        f   fdZ xZS )DetrAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper.

    Here, we add position embeddings to the queries and keys (as explained in the DETR paper).
    	embed_dim	num_headsdropoutrB   c                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        t        j                  |||      | _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      ࿩rB   )rE   rF   r   r   r   head_dimr   scalingr
   Lineark_projv_projq_projout_proj)rJ   r   r   r   rB   rL   s        r*   rF   zDetrAttention.__init__  s     	""!Y.==9$6MdnnM] ^;b"  }}d*ii	94@ii	94@ii	94@		)YTBr)   tensorseq_len
batch_sizec                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   rz   )viewr   r   	transpose
contiguous)rJ   r   r   r   s       r*   _shapezDetrAttention._shape  s7    {{:wNXXYZ\]^iikkr)   object_queriesc                     ||S ||z   S r   r(   )rJ   r   r  s      r*   with_pos_embedzDetrAttention.with_pos_embed  s    '/vLVn5LLr)   hidden_statesattention_maskkey_value_statesspatial_position_embeddingsoutput_attentionsreturnc                    |du}|j                         \  }}	}
||}| j                  ||      }||}| j                  ||      }| j                  |      | j                  z  }|rE| j	                  | j                  |      d|      }| j	                  | j                        d|      }nD| j	                  | j                  |      d|      }| j	                  | j                        d|      }|| j                  z  d| j                  f} | j	                  ||	|      j                  | } |j                  | } |j                  | }|j                  d      }t        j                  ||j                  dd            }|j                         || j                  z  |	|fk7  r/t        d|| j                  z  |	|f d|j                                |{|j                         |d|	|fk7  r#t        d|d|	|f d|j                                |j                  || j                  |	|      |z   }|j                  || j                  z  |	|      }t        j                  j!                  |d      }|r?|j                  || j                  |	|      }|j                  || j                  z  |	|      }nd}t        j                  j#                  || j"                  | j$                  	      }t        j                  ||      }|j                         || j                  z  |	| j                  fk7  r7t        d
|| j                  |	| j                  f d|j                                |j                  || j                  |	| j                        }|j                  dd      }|j'                  ||	|
      }| j)                  |      }||fS )z#Input shape: Batch x Time x ChannelNrY   r   rz   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   ptrainingz `attn_output` should be of size )r   r  r   r   r  r   r   r   r   r   r%   bmmr  r   r
   r   softmaxr   r  rZ   r   )rJ   r  r  r  r	  r
  r  is_cross_attentionr   
target_lenr   hidden_states_originalkey_value_states_originalquery_states
key_statesvalue_states
proj_shape
source_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                         r*   r_   zDetrAttention.forward
  s    .T9,9,>,>,@)
J	 %%2" //~NM '2(8%#223CE`a {{=1DLL@T[[1A%BB
SJ;;t{{3L'MrS]^L T[[%?ZPJ;;t{{3I'JBPZ[L 4>>12t}}E
Mt{{<ZHMMzZ$Z__j1
(|((*5__Q'
yyz/C/CAq/IJ:#>
J"WW6
T^^8SU_ak7l6m n %%'(* 
 %""$Q
J(OO 7Q
T^8_7` a&++-.0  (,,ZU_`cqqL',,Z$..-H*V`aL}},,\r,B
 %1$5$5j$..R\^h$i!055j4>>6QS]_ijL$(!]]**<4<<RVR_R_*`
ii
L9*t~~"=z4==!YY2JPZ\`\i\i3j2k l$$&') 
 "&&z4>>:t}}]!++Aq1!))*j)LmmK0111r)   )        T)NNNNF)r!   r"   r#   r$   intr   r   rF   r%   r	   r  r   r  r   r_   r`   ra   s   @r*   r   r     s2    CC C 	C
 C0lU\\ lC lS lMU\\ M8FCS M 261537>B"'Y2||Y2 !.Y2 !.	Y2
 #5<<0Y2 &.ell%;Y2  Y2 
u||Xell3XeELL>Q5RR	SY2r)   r   c            	       ~     e Zd Zdef fdZ	 	 ddej                  dej                  dej                  defdZ xZ	S )	DetrEncoderLayerr   c                 f   t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _
        |j                  | _        t        |j                     | _        |j                  | _        t        j                   | j                  |j"                        | _        t        j                   |j"                  | j                        | _        t        j                  | j                        | _        y )Nr   r   r   )rE   rF   r   r   r   encoder_attention_headsattention_dropout	self_attnr
   	LayerNormself_attn_layer_normr   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2final_layer_normrJ   r   rL   s     r*   rF   zDetrEncoderLayer.__init__g  s    &nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r)   r  r  r  r  c                    |}| j                  ||||      \  }}t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }| j                  rt        j                  |      j                         s#t        j                  |      j                         rEt        j                  |j                         j"                  dz
  }t        j$                  || |      }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            object_queries (`torch.FloatTensor`, *optional*):
                Object queries (also called content embeddings), to be added to the hidden states.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r  r  r  r  r  i  )minmax)r(  r
   r   r   r  r*  r,  r/  r-  r0  r1  r%   isinfanyisnanfinfor   r5  clamp)	rJ   r  r  r  r  residualr  clamp_valueoutputss	            r*   r_   zDetrEncoderLayer.forwardw  s   & !&*nn'))/	 '5 '
#| --mt||VZVcVc-d =011-@ **488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m<=={{=)--/5;;}3M3Q3Q3S#kk-*=*=>BBTI %M|Q\ ] "&Gr)   NF)
r!   r"   r#   r   rF   r%   r	   r   r_   r`   ra   s   @r*   r#  r#  f  sN    =z =( (,"'3||3 3 	3
  3r)   r#  c                        e Zd Zdef fdZ	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	ee   fd
Z	 xZ
S )DetrDecoderLayerr   c                    t         |           |j                  | _        t	        | j                  |j
                  |j                        | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        t	        | j                  |j
                  |j                        | _        t        j                  | j                        | _        t        j$                  | j                  |j&                        | _        t        j$                  |j&                  | j                        | _        t        j                  | j                        | _        y )Nr%  )r   )rE   rF   r   r   r   decoder_attention_headsr'  r(  r   r   r+  r,  r-  r
   r)  r*  encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr/  r0  r1  r2  s     r*   rF   zDetrDecoderLayer.__init__  s   &nn44,,

 ~~#F$>$>?"(";";$&LL$@!)NN**,,

 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r)   r  r  r  query_position_embeddingsr9   encoder_attention_maskr  c                 ,   |}| j                  ||||      \  }}	t        j                  j                  || j                  | j                        }||z   }| j                  |      }d}
|h|}| j                  ||||||      \  }}
t        j                  j                  || j                  | j                        }||z   }| j                  |      }|}| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }||z   }| j                  |      }|f}|r||	|
fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            object_queries (`torch.FloatTensor`, *optional*):
                object_queries that are added to the hidden states
            in the cross-attention layer.
            query_position_embeddings (`torch.FloatTensor`, *optional*):
                position embeddings that are added to the queries and keys
            in the self-attention layer.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
                values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r  r  r  r  r  N)r  r  r	  r  r
  r  )r(  r
   r   r   r  r*  rC  rD  r,  r/  r-  r0  r1  )rJ   r  r  r  rF  r9   rG  r  r;  self_attn_weightscross_attn_weightsr=  s               r*   r_   zDetrDecoderLayer.forward  s   > ! ,0>>'4)/	 ,: ,
(( --mt||VZVcVc-d =011-@ " ,$H040A0A+8!65,:"3 1B 1-M- MM11-4<<Z^ZgZg1hM$}4M 88GM !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>Gr)   )NNNNNF)r!   r"   r#   r   rF   r%   r	   r   r   r_   r`   ra   s   @r*   r@  r@    s    =z =6 2615<@8<9=,1M||M !.M !.	M
 $,ELL#9M  (5M !) 6M $D>Mr)   r@  c                   &    e Zd ZeZdZdZg dZd Zy)DetrPreTrainedModelrp   r   )ru   r#  r@  c                 .   | j                   j                  }| j                   j                  }t        |t              rt
        j                  j                  |j                  j                         t
        j                  j                  |j                  j                         t
        j                  j                  |j                  j                  |       t
        j                  j                  |j                  j                  |       nvt        |t              rft
        j                  j                  |j                  j                         t
        j                  j                  |j                   j                         t        |t
        j"                  t
        j$                  t
        j&                  f      rY|j                  j(                  j+                  d|       |j                  %|j                  j(                  j-                          y y t        |t
        j.                        rf|j                  j(                  j+                  d|       |j0                  2|j                  j(                  |j0                     j-                          y y y )N)gainr   )meanstd)r   init_stdinit_xavier_stdre   DetrMHAttentionMapr
   initzeros_k_linearrB   q_linearxavier_uniform_rA   r   uniform_r   r   r   Conv2drf   ri   normal_zero_r   padding_idx)rJ   rr   rP  
xavier_stds       r*   _init_weightsz!DetrPreTrainedModel._init_weights  s   kk""[[00
f01GGNN6??//0GGNN6??//0GG##FOO$:$:#LGG##FOO$:$:#L <=GGV2299:GGV55<<=fryy"))R^^DE MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r)   N)	r!   r"   r#   r   config_classbase_model_prefixmain_input_name_no_split_modulesr_  r(   r)   r*   rL  rL    s    L$OV?r)   rL  aI  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DetrConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a`	  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it.

            Pixel values can be obtained using [`AutoImageProcessor`]. See [`DetrImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

            [What are attention masks?](../glossary#attention-mask)

        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
            Not used by default. Can be used to mask object queries.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
            can choose to directly pass a flattened representation of an image.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
            embedded representation.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   <     e Zd ZdZdef fdZ	 	 	 	 	 	 ddZ xZS )DetrEncoderaU  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`DetrEncoderLayer`].

    The encoder updates the flattened feature map through multiple self-attention layers.

    Small tweak for DETR:

    - object_queries are added to the forward pass.

    Args:
        config: DetrConfig
    r   c                    t         |   |       |j                  | _        |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        | j                          y c c}w r   )rE   rF   r   encoder_layerdrop	layerdropr
   
ModuleListrangeencoder_layersr#  layers	post_initrJ   r   _rL   s      r*   rF   zDetrEncoder.__init__y  sf     ~~11mmuVMbMbGc$d!%5f%=$de
 	 %es   Bc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|}t        j
                  j                  || j                  | j                        }|t        ||j                        }|rdnd}|rdnd}	t        | j                        D ]c  \  }
}|r||fz   }d}| j                  r&t        j                  g       }|| j                  k  rd}|rd}n |||||      }|d   }|s[|	|d	   fz   }	e |r||fz   }|st        d
 |||	fD              S t!        |||	      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.

            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:

                - 1 for pixel features that are real (i.e. **not masked**),
                - 0 for pixel features that are padding (i.e. **masked**).

                [What are attention masks?](../glossary#attention-mask)

            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Object queries that are added to the queries in each self-attention layer.

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr  r(   FTNN)r  r  r   r   c              3   &   K   | ]	  }||  y wr   r(   .0vs     r*   	<genexpr>z&DetrEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   r4   r  
attentions)r   r  output_hidden_statesuse_return_dictr
   r   r   r  r   r   	enumeraterl  r%   randrh  tupler   )rJ   inputs_embedsr  r  r  ry  return_dictr  encoder_statesall_attentionsiencoder_layerto_dropdropout_probabilitylayer_outputss                  r*   r_   zDetrEncoder.forward  s   D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%--mt||VZVcVc-d %7H[H[\N30d )$++ 6 	FA}#!/=2B!BG}}&+jjn#&7"G , !.!"#1&7	! !.a 0 !/=3C2E!E1	F4  +}.>>Ne]NN$Seee+>Vd
 	
r)   )NNNNNNr!   r"   r#   r$   r   rF   r_   r`   ra   s   @r*   re  re  j  s.    z  !S
r)   re  c                   B     e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 ddZ xZS )DetrDecodera  
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].

    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.

    Some small tweaks for DETR:

    - object_queries and query_position_embeddings are added to the forward pass.
    - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.

    Args:
        config: DetrConfig
    r   c                 p   t         |   |       |j                  | _        |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        t        j                  |j                        | _        d| _        | j                          y c c}w r>  )rE   rF   r   decoder_layerdroprh  r
   ri  rj  decoder_layersr@  rl  r)  r   	layernormgradient_checkpointingrm  rn  s      r*   rF   zDetrDecoder.__init__  s     ~~11mmuVMbMbGc$d!%5f%=$defnn5&+# %es   B3c
           
      X   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||}
|j	                         dd }d}| ||t        ||j                  d         z   }||t        ||j                  d         }| j                   j                  rdnd}|rdnd}|rdnd}|r|dnd}t        | j                        D ]  \  }}|r|
fz  }| j                  r%t        j                  g       }|| j                  k  r?| j                  r-| j                  r!| j                  |j                   
|||d      }n |
||||||      }|d   }
| j                   j                  r| j#                  |
      }
||
fz  }|s||d   fz  }|||d   fz  } | j#                  
      }
|r||
fz  }| j                   j                  rt        j$                  |      }|	st'        d	 |
||||fD              S t)        |
||||
      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The query embeddings that are passed into the decoder.

            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:

                - 1 for queries that are **not masked**,
                - 0 for queries that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
                in `[0, 1]`:

                - 1 for pixels that are real (i.e. **not masked**),
                - 0 for pixels that are padding (i.e. **masked**).

            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Object queries that are added to the queries and keys in each cross-attention layer.
            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
                , *optional*): Position embeddings that are added to the values and keys in each self-attention layer.

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrY   )tgt_lenr(   )r  r  rF  r9   rG  r  r   r   rz   c              3   $   K   | ]  }|| 
 y wr   r(   rs  s     r*   rv  z&DetrDecoder.forward.<locals>.<genexpr>~  s      = s   )r4   r  rx  r7   r   )r   r  ry  rz  r   r   r   auxiliary_lossr{  rl  r  r%   r|  rh  r  _gradient_checkpointing_func__call__r  r   r}  r   )rJ   r~  r  r9   rG  r  rF  r  ry  r  r  input_shapecombined_attention_maskintermediateall_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerr  r  s                        r*   r_   zDetrDecoder.forward  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$)M',,.s3K"&%*A*M&=@Z 3 3[_A '#
 !,1G1S%?&(;(;[QS_&"
 "[[77rT #7BD0d&7<Q<]rdh"+DKK"8 '	@C#!m%55!}}&+jjn#&7**t}} $ A A!**!+)*! !.!#:#1.G*?+A&7! *!,M{{)) $} = 00 =#3"55(4(]1-=,??(O'	@T }5  -!11 ;;%% ;;|4L '):NL`bno  
 !++%1'3
 	
r)   	NNNNNNNNNr  ra   s   @r*   r  r    s7    z  "#"&!Q
r)   r  z
    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
    any specific head on top.
    c                   z    e Zd Zdef fdZd Zd Zd Zd Z e	e
       eee      	 	 	 	 	 	 	 	 ddej                  d	eej"                     d
eej                     deej                     deej                     deej                     dee   dee   dee   deeej                     ef   fd              Z xZS )	DetrModelr   c                    t         |   |       t        |      }t        |      }t	        ||      | _        t        j                  |j                  d   |j                  d      | _
        t        j                  |j                  |j                        | _        t        |      | _        t!        |      | _        | j%                          y )NrY   r   )kernel_size)rE   rF   ru   r   r   r   r
   rZ  r   r   input_projectionr   num_queriesrF  re  encoderr  decoderrm  )rJ   r   r   r  rL   s       r*   rF   zDetrModel.__init__  s      #6*08%h? !#		(*M*Mb*QSYSaSaop q)+f6H6H&..)Y&"6*"6* 	r)   c                     | j                   S r   )r  rJ   s    r*   get_encoderzDetrModel.get_encoder      ||r)   c                     | j                   S r   )r  r  s    r*   get_decoderzDetrModel.get_decoder  r  r)   c                     | j                   j                  j                  j                         D ]  \  }}|j	                  d        y r>  r   r   rp   r   r   rJ   rq   params      r*   freeze_backbonezDetrModel.freeze_backbone  s<    ==55;;LLN 	(KD%  '	(r)   c                     | j                   j                  j                  j                         D ]  \  }}|j	                  d        y )NTr  r  s      r*   unfreeze_backbonezDetrModel.unfreeze_backbone  s<    ==55;;LLN 	'KD%  &	'r)   output_typer`  r   r   decoder_attention_maskencoder_outputsr~  decoder_inputs_embedsr  ry  r  r  c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|j                  \  }
}}}|j
                  }|t        j                  |
||f|      }| j                  ||      \  }}|d   \  }}|t        d      | j                  |      }|j                  d      j                  ddd      }|d   j                  d      j                  ddd      }|j                  d      }|| j                  ||||||	      }nI|	rGt        |t              s7t        |d   t!        |      dkD  r|d   ndt!        |      dkD  r|d   nd	      }| j"                  j$                  j'                  d      j)                  |
dd      }t        j*                  |      }| j-                  |d|||d   ||||	
	      }|	s||z   S t/        |j0                  |j2                  |j4                  |j6                  |j0                  |j2                  |j4                  |j8                        S )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetrModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
        >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)

        >>> # the last hidden states are the final query embeddings of the Transformer decoder
        >>> # these are of shape (batch_size, num_queries, hidden_size)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 100, 256]
        ```Nr   rY   z/Backbone does not return downsampled pixel maskrz   r   r   r~  r  r  r  ry  r  rw  	r~  r  r  rF  r9   rG  r  ry  r  )r4   r5   r6   r7   r8   r9   r:   r   )r   r  ry  rz  r   rh   r%   rH   r   r   r  r   r   r  re   r   rl   rF  rA   r   r   
zeros_liker  r,   r4   r  rx  r7   r   )rJ   r   r   r  r  r~  r  r  ry  r  r   r   r   r   rh   r   object_queries_listr   r   projected_feature_mapflattened_featuresr  flattened_maskrF  queriesdecoder_outputss                             r*   r_   zDetrModel.forward  s   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]2>2D2D/
L&%$$j&%%@&QJ
 )-lJ(O%% %RLT<NOO !% 5 5k B 3::1=EEaAN,R088;CCAq!La
 ""ll0--"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO %)$B$B$I$I$S$STU$V$]$]^hjkmn$o!""#<= ,,!)&?"1!"4#1/!5# ' 

 "_44-??"1"?"?.99,==&5&G&G"1"?"?.99'6'Q'Q	
 		
r)   )NNNNNNNN)r!   r"   r#   r   rF   r  r  r  r  r   DETR_INPUTS_DOCSTRINGr   r,   _CONFIG_FOR_DOCr%   r&   r   
LongTensorr   r   r   r_   r`   ra   s   @r*   r  r    s4   z &(' ++@A?Y 26>B7;59=A,0/3&*z
''z
 U--.z
 !)):): ;	z

 "%"3"34z
   1 12z
  ((9(9:z
 $D>z
 'tnz
 d^z
 
uU&&'8	9z
 Z Bz
r)   r  c                   (     e Zd ZdZ fdZd Z xZS )DetrMLPPredictionHeada  
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
    height and width of a bounding box w.r.t. an image.

    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py

    c                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        y )Nr   c              3   N   K   | ]  \  }}t        j                  ||        y wr   )r
   r   )rt  rK   ks      r*   rv  z1DetrMLPPredictionHead.__init__.<locals>.<genexpr>B  s     #g1BIIaO#gs   #%)rE   rF   
num_layersr
   ri  ziprl  )rJ   	input_dim
hidden_dim
output_dimr  hrL   s         r*   rF   zDetrMLPPredictionHead.__init__>  sS    $LJN+mm#gYKRSOUVZdYeUe@f#ggr)   c                     t        | j                        D ]D  \  }}|| j                  dz
  k  r%t        j                  j                   ||            n ||      }F |S )Nr   )r{  rl  r  r
   r   relu)rJ   r\   r  layers       r*   r_   zDetrMLPPredictionHead.forwardD  sT    !$++. 	VHAu01DOOa4G0G""58,USTXA	Vr)   r   ra   s   @r*   r  r  5  s    hr)   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
    such as COCO detection.
    c                   t    e Zd Zdef fdZ ee       eee	      	 	 	 	 	 	 	 	 	 dde
j                  dee
j                     dee
j                     dee
j                     dee
j                     d	ee
j                     d
eee      dee   dee   dee   deee
j                     ef   fd              Z xZS )DetrForObjectDetectionr   c                    t         |   |       t        |      | _        t	        j
                  |j                  |j                  dz         | _        t        |j                  |j                  dd      | _
        | j                          y )Nr   r{   r   )r  r  r  r  )rE   rF   r  rp   r
   r   r   
num_labelsclass_labels_classifierr  bbox_predictorrm  r2  s     r*   rF   zDetrForObjectDetection.__init__R  sr      v&
 (*yyNNF--1(
$ 4nnAZ[

 	r)   r  r   r   r  r  r~  r  labelsr  ry  r  r  c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      j                         }d\  }}}|d\  }}| j                   j                  rC|
r|j                  n|d   }| j                  |      }| j	                  |      j                         }| j                  ||| j                  || j                   ||      \  }}}|
s|||f|z   |z   }n||f|z   }|||f|z   S |S t        ||||||j                  |j                  |j                  |j                  |j                  |j                   |j"                        S )a	  
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DetrForObjectDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
        >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> target_sizes = torch.tensor([image.size[::-1]])
        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
        ...     0
        ... ]

        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(
        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
        ...         f"{round(score.item(), 3)} at location {box}"
        ...     )
        Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
        Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
        Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
        Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
        Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
        ```)r   r  r  r~  r  r  ry  r  r   NNNrq  r{   )r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   )r   rz  rp   r  r  sigmoidr  r   loss_functionrh   r.   r4   r5   r6   r7   r8   r9   r:   )rJ   r   r   r  r  r~  r  r  r  ry  r  r=  sequence_outputr1   r2   r/   r0   r3   outputs_classoutputs_coordr  outputs                         r*   r_   zDetrForObjectDetection.forwardc  s   v &1%<k$++B]B] **!#9+'"7/!5#  

 "!* --o>((9AAC
-=*i*+5(M={{))EPwAAV]^_V` $ < <\ J $ 3 3L A I I K151C1CZmUb2.D).  , *-0AAGK *-7373CT9%.OO(!/%77")"?"?&99$55&-&G&G")"?"?&99
 	
r)   r  )r!   r"   r#   r   rF   r   r  r   r.   r  r%   r&   r   r  r   dictr   r   r   r_   r`   ra   s   @r*   r  r  J  s:   z " ++@A+DSbc 26>B7;59=A'+,0/3&*m
''m
 U--.m
 !)):): ;	m

 "%"3"34m
   1 12m
  ((9(9:m
 d$m
 $D>m
 'tnm
 d^m
 
uU&&')BB	Cm
 d Bm
r)   r  z
    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
    such as COCO panoptic.

    c                   t    e Zd Zdef fdZ ee       eee	      	 	 	 	 	 	 	 	 	 dde
j                  dee
j                     dee
j                     dee
j                     dee
j                     d	ee
j                     d
eee      dee   dee   dee   deee
j                     ef   fd              Z xZS )DetrForSegmentationr   c                 v   t         |   |       t        |      | _        |j                  |j
                  }}| j                  j                  j                  j                  j                  }t        ||z   |d d d   dd  |      | _        t        |||d|j                        | _        | j                          y )NrY   r   )r   rP  )rE   rF   r  detrr   r&  rp   r   r   r   DetrMaskHeadSmallConv	mask_headrS  rR  bbox_attentionrm  )rJ   r   hidden_sizenumber_of_headsr   rL   s        r*   rF   zDetrForSegmentation.__init__  s      +62	 (.~~v7U7U_%)YY__%=%=%J%J%e%e"./)+Edd+KBC+PR]
 1osH^H^
 	r)   r  r   r   r  r  r~  r  r  r  ry  r  r  c                    |
|
n| j                   j                  }
|j                  \  }}}}|j                  }|t	        j
                  |||f|      }| j                  j                  j                  ||      \  }}|d   \  }}|j                  \  }}}}| j                  j                  j                  |      }|j                  d      j                  ddd      }|d   j                  d      j                  ddd      }|j                  d      }|,| j                  j                  j                  |||||	|
      }nI|
rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd	      }| j                  j                  j                   j"                  j%                  d      j'                  |dd      }t	        j(                  |      }| j                  j                  j+                  |d|||d   |||	|

	      }|d   }| j                  j-                  |      }| j                  j/                  |      j1                         }|d   j                  ddd      j3                  || j                   j4                  ||      }|j3                  |||      }| j7                  |||       }| j9                  |||d   d   |d   d   |d   d   g      } | j3                  || j                  j                   j:                  | j                  d   | j                  d         }!d\  }"}#}$|d\  }%}&| j                   j<                  rW|
r|j>                  n|d   }'| j                  j-                  |'      }%| j                  j/                  |'      j1                         }&| jA                  |||||!| j                   |%|&      \  }"}#}$|
s'|$|||!f|$z   |z   |z   }(n|||!f|z   |z   }(|"|"|#f|(z   S |(S tC        |"|#|||!|$|jD                  |jF                  |jH                  |jJ                  |jD                  |jF                  |jH                        S )a  
        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
            Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
            dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
            bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
            should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
            `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
            `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.

        Returns:

        Examples:

        ```python
        >>> import io
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> import numpy

        >>> from transformers import AutoImageProcessor, DetrForSegmentation
        >>> from transformers.image_transforms import rgb_to_id

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
        >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)

        >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
        >>> # Segmentation results are returned as a list of dictionaries
        >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])

        >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
        >>> panoptic_seg = result[0]["segmentation"]
        >>> # Get prediction score and segment_id to class_id mapping of each segment
        >>> panoptic_segments_info = result[0]["segments_info"]
        ```Nr   )r   rY   rz   r   r   r  rw  r  )r   r   r  rq  )r/   r0   r1   r2   r=   r3   r4   r5   r6   r7   r8   r9   r:   )&r   rz  r   rh   r%   rH   r  rp   r   r  r   r   r  re   r   rl   rF  rA   r   r   r  r  r  r  r  r   r   r  r  r  r  r   r  r<   r4   r  rx  r7   ))rJ   r   r   r  r  r~  r  r  r  ry  r  r   r   r   r   rh   r   r  r   r   r  r  r  r  rF  r  r  r  r1   r2   memory	bbox_mask	seg_masksr=   r/   r0   r3   r  r  r  r  s)                                            r*   r_   zDetrForSegmentation.forward  s   x &1%<k$++B]B]2>2D2D/
L&%$$Z$?OJ )-		(@(@Zd(@(e%% %RLT2=2C2C/
L&% $		 @ @ M 3::1=EEaAN,R088;CCAq!La
 ""iioo550--"3%9' 6 O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO %)IIOO$M$M$T$T$^$^_`$a$h$h1%
! ""#<= ))//11!)&?"1!"4#1/!5# 2 

 *!, 22?CYY--o>FFH
 #++Aq!499*dkkFYFY[achi"":vu=
 ''te'L	NN#8)hqkRSnV^_`VabcVdfnopfqrsftEuv	^^J		0@0@0L0Lioo^`Naclcrcrsucvw
-=*i*+5(M={{))MXII^mnp^q $		 A A, O $		 8 8 F N N P151C1C
J]\i2.D).  , *j9<MMP__bqq *j9OKo]373CT9%.OO%!!/-??"1"?"?.99,==&5&G&G"1"?"?.99
 	
r)   r  )r!   r"   r#   r   rF   r   r  r   r<   r  r%   r&   r   r  r   r  r   r   r   r_   r`   ra   s   @r*   r  r    s:   z ( ++@A+AP_` 26>B7;59=A'+,0/3&*j
''j
 U--.j
 !)):): ;	j

 "%"3"34j
   1 12j
  ((9(9:j
 d$j
 $D>j
 'tnj
 d^j
 
uU&&')??	@j
 a Bj
r)   r  lengthc                 |    | j                  d      j                  dt        |      ddd      j                  dd      S )Nr   r   )r   r   r!  r   )r   r  s     r*   _expandr    s7    A%%aVaA>FFq!LLr)   c                   <     e Zd ZdZ fdZdededee   fdZ xZS )r  z^
    Simple convolutional head, using group norm. Upsampling is done using a FPN approach
    c                    t         |           |dz  dk7  rt        d      ||dz  |dz  |dz  |dz  |dz  g}t        j                  ||dd	
      | _        t        j                  d|      | _        t        j                  ||d	   dd	
      | _        t        j                  t        d|d	         |d	         | _
        t        j                  |d	   |d   dd	
      | _        t        j                  t        d|d         |d         | _        t        j                  |d   |d   dd	
      | _        t        j                  t        d|d         |d         | _        t        j                  |d   |d   dd	
      | _        t        j                  t        d|d         |d         | _        t        j                  |d   d	dd	
      | _        || _        t        j                  |d   |d	   d	      | _        t        j                  |d	   |d   d	      | _        t        j                  |d   |d   d	      | _        | j-                         D ]r  }t/        |t        j                        st        j0                  j3                  |j4                  d	       t        j0                  j7                  |j8                  d       t y )N   r   zsThe hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8rz   r{   r~   r   r   r   )padding)a)rE   rF   r   r
   rZ  lay1	GroupNormgn1lay2r4  gn2lay3gn3lay4gn4lay5gn5out_layr   adapter1adapter2adapter3modulesre   rT  kaiming_uniform_rA   	constant_rB   )rJ   r   fpn_dimscontext_dim
inter_dimsmrL   s         r*   rF   zDetrMaskHeadSmallConv.__init__  sG   7a<) 
 ;!+[A-={a?OQ\`bQbdosuduv
IIc315	<<3'IIc:a=!Q?	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EIIjmZ]AqI	<<Az!} 5z!}EyyA1a@		(1+z!}a@		(1+z!}a@		(1+z!}a@ 	-A!RYY'((Q(7!!!&&!,	-r)   r\   r  fpnsc                    t        j                  t        ||j                  d         |j	                  dd      gd      }| j                  |      }| j                  |      }t        j                  j                  |      }| j                  |      }| j                  |      }t        j                  j                  |      }| j                  |d         }|j                  d      |j                  d      k7  r-t        ||j                  d      |j                  d      z        }|t        j                  j                  ||j                  dd  d      z   }| j                  |      }| j!                  |      }t        j                  j                  |      }| j#                  |d         }|j                  d      |j                  d      k7  r-t        ||j                  d      |j                  d      z        }|t        j                  j                  ||j                  dd  d      z   }| j%                  |      }| j'                  |      }t        j                  j                  |      }| j)                  |d         }|j                  d      |j                  d      k7  r-t        ||j                  d      |j                  d      z        }|t        j                  j                  ||j                  dd  d      z   }| j+                  |      }| j-                  |      }t        j                  j                  |      }| j/                  |      }|S )Nr   r   r   nearest)r   moderz   )r%   r   r  r   r   r  r  r
   r   r  r  r  r  r   r   r  r  r  r   r  r  r  r  r  )rJ   r\   r  r  cur_fpns        r*   r_   zDetrMaskHeadSmallConv.forward  s    IIwq)//!"45y7H7HA7NOQRSIIaLHHQKMMq!IIaLHHQKMMq!--Q(<<?affQi'gqvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!--Q(<<?affQi'gqvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!--Q(<<?affQi'gqvvayGLLO'CDGbmm//bc8JQZ/[[IIaLHHQKMMq!LLOr)   )	r!   r"   r#   r$   rF   r	   r   r_   r`   ra   s   @r*   r  r    s,     -D& &F &$v, &r)   r  c                   8     e Zd ZdZd fd	Zddee   fdZ xZS )rS  zdThis is a 2D attention module, which only returns the attention softmax (no multiplication by value)c                 &   t         |           || _        || _        t	        j
                  |      | _        t	        j                  |||      | _        t	        j                  |||      | _	        t        || j                  z        dz  | _        y )Nr   r   )rE   rF   r   r  r
   Dropoutr   r   rW  rV  r   normalize_fact)rJ   	query_dimr  r   r   rB   rP  rL   s          r*   rF   zDetrMHAttentionMap.__init__  so    "$zz'*		)ZdC		)ZdC#J$?@DHr)   r   c                    | j                  |      }t        j                  j                  || j                  j
                  j                  d      j                  d      | j                  j                        }|j                  |j                  d   |j                  d   | j                  | j                  | j                  z        }|j                  |j                  d   | j                  | j                  | j                  z  |j                  d   |j                  d         }t        j                  d|| j                  z  |      }|W|j                  |j                  d      j                  d      t        j                   |j"                        j$                         t        j                  j'                  |j)                  d      d      j                  |j+                               }| j-                  |      }|S )NrY   r   r   r   zbqnc,bnchw->bqnhwrz   r   )rW  r
   r   conv2drV  rA   r   rB   r   r   r   r  r%   einsumr  masked_fill_r9  r   r4  r  r   r   r   )rJ   qr  r   queries_per_headkeys_per_headweightss          r*   r_   zDetrMHAttentionMap.forward  s{   MM!MM  DMM$8$8$B$B2$F$P$PQS$TVZVcVcVhVhi66!''!*aggaj$..$//]a]k]kJklqwwqz4>>4??dnn;\^_^e^efh^iklkrkrsukvw,,24DtGZGZ4Z\ij  !2!<!<Q!?W]]A[A_A_`--''(:'CHHX,,w'r)   )r   TNr   )	r!   r"   r#   r$   rF   r   r	   r_   r`   ra   s   @r*   rS  rS    s    n	I(6"2 r)   rS  )r  r  r  rL  )Gr$   r   dataclassesr   typingr   r   r   r   r   r%   r	   r
   activationsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.backbone_utilsr   configuration_detrr   rw   r   
get_loggerr!   loggerr  _CHECKPOINT_FOR_DOCr   r,   r.   r<   Moduler?   ro   ru   r   r   r   r   r   r#  r@  rL  DETR_START_DOCSTRINGr  re  r  r  r  r  r  r!  r  r  rS  __all__r(   r)   r*   <module>r0     s     ! 5 5   ! B g g -   2 * ! 
		H	%/  C: C C: $C( $C $CN ;B ;B ;B| BB[ BB BBN$ BII $ N'0Ebii EPBII *"		 "J299 .
~2BII ~2BDryy DNfryy fR?/ ?<  # Lo
% o
dm
% m
`  ^
# ^
^
DBII *  A
0 A
A
H 
 A
- A
A
HMC M
MBII M` 8r)   