
    sg}              	          d Z ddlZddlZddlmZmZ ddlZddlZddl	Zddlm
Z
mZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlm Z  ddl!m"Z"  ejF                  e$      Z%dZ&dZ'g dZ(dZ)dZ*d<deee+f   fdZ, G d dejZ                        Z. G d dej^                        Z0 G d dejb                        Z2 G d dejf                        Z4 G d dejb                        Z5d=dej                  de6d e+dej                  fd!Z7 G d" d#ejb                        Z8d>d$Z9 G d% d&ejb                        Z: G d' d(ejb                        Z; G d) d*ejb                        Z< G d+ d,ejb                        Z= G d- d.ejb                        Z> G d/ d0e      Z?d1Z@d2ZA ed3e@       G d4 d5e?             ZB ed6e@       G d7 d8e?             ZC ed9e@       G d: d;e?e              ZDy)?z9PyTorch BiT model. Also supports backbone for ViT hybrid.    N)OptionalTuple)Tensornn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings)BackboneMixin   )	BitConfigr   zgoogle/bit-50)r   i      r   z	tiger catreturnc                    d}| |dz
  ||dz
  z  z   dz  } | |fS t        | t              ra| j                         } | dk(  r0|dk(  r#||dz
  z  dz  dk(  r|dz
  ||dz
  z  z   dz  } | |fS d} d}| |fS | dk(  rd} | |fS |dz
  ||dz
  z  z   dz  } | |fS )al  
    Utility function to get the tuple padding value given the kernel_size and padding.

    Args:
        padding (Union[`str`, `int`], *optional*):
            Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from
            PyTorch is used.
        kernel_size (`int`, *optional*, defaults to 7):
            Kernel size of the convolution layers.
        stride (`int`, *optional*, defaults to 1):
            Stride value of the convolution layers.
        dilation (`int`, *optional*, defaults to 1):
            Dilation value of the convolution layers.
    Fr      samer   Tvalid)
isinstancestrlower)paddingkernel_sizestridedilationdynamics        W/var/www/html/venv/lib/python3.12/site-packages/transformers/models/bit/modeling_bit.pyget_padding_valuer(   <   s     GQJ(kAo">>1D'3--/f{K!O <AQF"QJ(kAo*FF1L G  G G G 
h+/&BBqHGG    c                   6     e Zd ZdZ	 	 	 	 	 	 d fd	Zd Z xZS )WeightStandardizedConv2dzConv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model.

    Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
    Standardization](https://arxiv.org/abs/1903.10520v2)
    c
           
          t        ||||      \  }}
t        | 	  ||||||||       |
rt        |||      | _        |	| _        y d | _        |	| _        y )N)r$   r%   )r$   r"   r%   groupsbias)r(   super__init__DynamicPad2dpadeps)self
in_channelout_channelsr#   r$   r"   r%   r-   r.   r3   
is_dynamic	__class__s              r'   r0   z!WeightStandardizedConv2d.__init__l   ss     0V^fg 	 		
 #KBDH  DHr)   c           	         | j                   | j                  |      }t        j                  j                  | j                  j                  d| j                  d      d d dd| j                        j                  | j                        }t        j                  j                  ||| j                  | j                  | j                  | j                  | j                        }|S )Nr   T        )trainingmomentumr3   )r2   r   
functional
batch_normweightreshaper6   r3   
reshape_asconv2dr.   r$   r"   r%   r-   )r4   hidden_stater@   s      r'   forwardz WeightStandardizedConv2d.forward   s    8888L1L))KK4#4#4b94PT_bhlhphp * 

*T[[
! 	 }}++&$))T[[$,,W[WbWb
 r)   )r   SAMEr   r   Fgư>__name__
__module____qualname____doc__r0   rE   __classcell__r8   s   @r'   r+   r+   e   s&     :	r)   r+   c                   *     e Zd ZdZd fd	Zd Z xZS )BitGroupNormActivationzQ
    A module that combines group normalization with an activation function.
    c                     t         t        |   |j                  |||       |rt        |j
                     | _        y t        j                         | _        y )N)r3   affine)	r/   rO   r0   
num_groupsr   
hidden_act
activationr   Identity)r4   confignum_channelsr3   rQ   apply_activationr8   s         r'   r0   zBitGroupNormActivation.__init__   sF    $d4V5F5FZ]fl4m$V%6%67DO kkmDOr)   c                     t         j                  j                  || j                  | j                  | j
                  | j                        }| j                  |      }|S N)r   r>   
group_normrR   r@   r.   r3   rT   )r4   rD   s     r'   rE   zBitGroupNormActivation.forward   sH    }}//doot{{\`\e\egkgogop|4r)   )gh㈵>TTrG   rM   s   @r'   rO   rO      s    ,r)   rO   c                   *     e Zd ZdZd fd	Zd Z xZS )r1   z
    A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input
    hidden states.
    c                     t         |           t        |t              r||f}t        |t              r||f}t        |t              r||f}|| _        || _        || _        || _        d }|| _        y )Nc                 p    t        t        j                  | |z        dz
  |z  |dz
  |z  z   dz   | z
  d      S )Nr   r   )maxmathceil)xr#   r$   r%   s       r'   compute_paddingz.DynamicPad2d.__init__.<locals>.compute_padding   sB    		!f*-1V;{QRZ>ZZ]^^abbdeffr)   )	r/   r0   r   intr#   r$   r%   valuerc   )r4   r#   r$   r%   re   rc   r8   s         r'   r0   zDynamicPad2d.__init__   sw    k3'&4Kfc"f%Fh$ (+H& 
	g  /r)   c           	         |j                         dd  \  }}| j                  || j                  d   | j                  d   | j                  d         }| j                  || j                  d   | j                  d   | j                  d         }|dkD  s|dkD  rBt
        j                  j                  ||dz  ||dz  z
  |dz  ||dz  z
  g| j                        }|S )Nr   r   r   )re   )	sizerc   r#   r$   r%   r   r>   r2   re   )r4   inputinput_heightinput_widthpadding_heightpadding_widths         r'   __call__zDynamicPad2d.__call__   s    $)JJL$5!k --lD<L<LQ<OQUQ\Q\]^Q_aeananopaqr,,[$:J:J1:Mt{{[\~_c_l_lmn_op A!2MM%%!Q&!MQ$66"a'"^q%88	 jj & 	E r)   )r   )rH   rI   rJ   rK   r0   rn   rL   rM   s   @r'   r1   r1      s    
/,r)   r1   c                   <     e Zd ZdZ	 	 	 	 	 	 ddef fdZd Z xZS )BitMaxPool2dz1Tensorflow like 'SAME' wrapper for 2D max poolingr#   c                    t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}t        |   |||||       |rt        ||||      | _        y t        j                         | _        y rZ   )
r   collectionsabcIterabler/   r0   r1   r2   r   rU   )	r4   r#   r$   r%   	ceil_moder"   padding_valueuse_dynamic_paddingr8   s	           r'   r0   zBitMaxPool2d.__init__   s     &0[__=U=U%Vk]hju\v%fkoo.F.FGfV\M])(KOO4L4LM8T\^fSgfgxK#K=QDH{{}DHr)   c                     | j                  |      }t        j                  j                  || j                  | j
                  | j                  | j                  | j                        S rZ   )	r2   r   r>   
max_pool2dr#   r$   r"   r%   ru   r4   hidden_statess     r'   rE   zBitMaxPool2d.forward   sM    /}}''4++T[[$,,W[WeWe
 	
r)   )Nr   F)r   r   r   T)rH   rI   rJ   rK   rd   r0   rE   rL   rM   s   @r'   rp   rp      s,    ;
  %%&
r)   rp   c                   8     e Zd ZdZdef fdZdedefdZ xZS )BitEmbeddingszL
    BiT Embeddings (stem) composed of a single aggressive convolution.
    rV   c                 .   t         |           t        |j                  |j                  ddd|j
                        | _        t        dd|j                        | _	        |j
                  7|j
                  j                         dk(  rt        j                         | _        nt        j                  dd	
      | _        |j                  dk(  st!        ||j                        | _        nt        j                         | _        |j                  | _        y )Nr   r   :0yE>)r#   r$   r3   r"   r
   )r#   r$   rw   rF   )r   r   r   r   r;   )r"   re   preactivationrW   )r/   r0   r+   rW   embedding_sizeglobal_paddingconvolutionrp   embedding_dynamic_paddingpoolerupperr   rU   r2   ConstantPad2d
layer_typerO   normr4   rV   r8   s     r'   r0   zBitEmbeddings.__init__   s    3!!))
 #qPVPpPpq   ,1F1F1L1L1NRX1X{{}DH''CHDH  O3.vFDYDYZDIDI"//r)   pixel_valuesr   c                     |j                   d   }|| j                  k7  rt        d      | j                  |      }| j	                  |      }| j                  |      }| j                  |      }|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)shaperW   
ValueErrorr   r2   r   r   )r4   r   rW   	embeddings       r'   rE   zBitEmbeddings.forward  sr    #))!,4,,,w  $$\2	HHY'	IIi(	KK	*	r)   )	rH   rI   rJ   rK   r   r0   r   rE   rL   rM   s   @r'   r}   r}      s'    0y 06F v r)   r}   ri   	drop_probr<   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r;   r   r   )r   )dtypedevice)r   ndimtorchrandr   r   floor_div)ri   r   r<   	keep_probr   random_tensoroutputs          r'   	drop_pathr   *  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr)   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
BitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 0    t         |           || _        y rZ   )r/   r0   r   )r4   r   r8   s     r'   r0   zBitDropPath.__init__B  s    "r)   r{   c                 D    t        || j                  | j                        S rZ   )r   r   r<   rz   s     r'   rE   zBitDropPath.forwardF  s    FFr)   c                 8    dj                  | j                        S )Nzp={})formatr   )r4   s    r'   
extra_reprzBitDropPath.extra_reprI  s    }}T^^,,r)   rZ   )rH   rI   rJ   rK   r   floatr0   r   r   rE   r    r   rL   rM   s   @r'   r   r   ?  sG    b#(5/ #T #GU\\ Gell G-C -r)   r   c                 f    |}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }|S )Nr   g?)r_   rd   )re   divisor	min_value	new_values       r'   make_divr   M  sG    IIs57Q;#677BWLMI3;W	r)   c                   :     e Zd ZdZ	 	 	 	 	 	 	 	 d fd	Zd Z xZS )BitPreActivationBottleneckLayera  Pre-activation (v2) bottleneck block.
    Follows the implementation of "Identity Mappings in Deep Residual Networks":
    https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua

    Except it puts the stride on 3x3 conv when available.
    c           	         t         |           |xs |}|xs |}t        ||z        }|
rt        ||||d      | _        nd | _        t        ||      | _        t        ||dd|j                        | _	        t        ||      | _
        t        ||d||d|j                        | _        t        ||      | _        t        ||dd|j                        | _        |	d	kD  rt        |	      | _        y t        j                          | _        y )
NTr$   preactr   r   r3   r"   r   r
   )r$   r-   r3   r"   r   )r/   r0   r   BitDownsampleConv
downsamplerO   norm1r+   r   conv1norm2conv2norm3conv3r   r   rU   r   )r4   rV   in_channelsr6   bottle_ratior$   r%   first_dilationr-   drop_path_rateis_first_layermid_channelsr8   s               r'   r0   z(BitPreActivationBottleneckLayer.__init__]  s    	'38#2{| ;</DO #DO+FK@
-k<PT^d^s^st
+FN
-,&T[a[p[p

 ,FLA
-lL!QU_e_t_tu
8F8J^4PRP[P[P]r)   c                 0   | j                  |      }|}| j                  | j                  |      }| j                  |      }| j                  | j	                  |            }| j                  | j                  |            }| j                  |      }||z   S rZ   )r   r   r   r   r   r   r   r   )r4   r{   hidden_states_preactshortcuts       r'   rE   z'BitPreActivationBottleneckLayer.forward  s    #zz-8 !??&';<H 

#78

4::m#<=

4::m#<=}5x''r)   N      ?r   r   Nr   r;   FrG   rM   s   @r'   r   r   U  s.     *^X(r)   r   c                   :     e Zd ZdZ	 	 	 	 	 	 	 	 d fd	Zd Z xZS )BitBottleneckLayerz\Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid.c           
      D   t         |           |xs |}|xs |}t        ||z        }|
rt        ||||d      | _        nd | _        t        ||dd|j                        | _        t        ||      | _	        t        ||d|||d|j                        | _
        t        ||      | _        t        ||dd|j                        | _        t        ||d	      | _        |	d
kD  rt        |	      nt        j                          | _        t$        |j&                     | _        y )NFr   r   r   r   r   r
   )r$   r%   r-   r3   r"   rW   rX   r   )r/   r0   r   r   r   r+   r   r   rO   r   r   r   r   r   r   r   rU   r   r   rS   rT   )r4   rV   r   r6   r   r$   r%   r   r-   r   r   mid_chsr8   s               r'   r0   zBitBottleneckLayer.__init__  s    	'38#2{<,67/DO #DO-k7A4Y_YnYno
+FI
-#))	

 ,FI
-g|QDZ`ZoZop
+F`ef
8F8J^4PRP[P[P] !2!23r)   c                 Z   |}| j                   | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  ||z         }|S rZ   )	r   r   r   r   r   r   r   r   rT   )r4   r{   r   s      r'   rE   zBitBottleneckLayer.forward  s     ??&}5H 

=1

=1

=1

=1

=1

=1}5(@Ar)   r   rG   rM   s   @r'   r   r     s+    f /4br)   r   c                   *     e Zd Z	 	 d fd	Zd Z xZS )r   c                     t         |           t        ||d|d|j                        | _        |rt        j                         | _        y t        ||d      | _        y )Nr   r   )r$   r3   r"   Fr   )	r/   r0   r+   r   convr   rU   rO   r   )r4   rV   r   r6   r$   r   r8   s         r'   r0   zBitDownsampleConv.__init__  s\     	,qT6K`K`
	
  KKM 		 (\\ab 		r)   c                 B    | j                  | j                  |            S rZ   )r   r   )r4   rb   s     r'   rE   zBitDownsampleConv.forward  s    yy1&&r)   )r   T)rH   rI   rJ   r0   rE   rL   rM   s   @r'   r   r     s     
$'r)   r   c                   >     e Zd ZdZ	 	 d fd	Zd ZdedefdZ xZS )BitStagez7
    A ResNet v2 stage composed by stacked layers.
    c	                 ^   t         |           |dv rdnd}	|j                  dk(  rt        }
nt        }
|}t        j                         | _        t        |      D ]Q  }| j                  |||      \  }}}| j                  j                  t        |       |
|||||||	||	             |}|}	S y )N)r   r   r   r   
bottleneck)r$   r%   r   r   r   r   )r/   r0   r   r   r   r   
Sequentiallayersrange_get_updated_hyperparameters
add_moduler    )r4   rV   r   r6   r$   r%   depthr   layer_dropoutr   	layer_clsprev_chs	layer_idxr   r   r8   s                  r'   r0   zBitStage.__init__  s     	&&0a ,*I7Immou 	&I595V5V6=62FNN KK""I !%!-#1#1#1
 $H%N+	&r)   c                 8    |r||   }nd}|dk7  rd}|dk(  }|||fS )zt
        Get the new hyper-parameters with respect to the previous ones and the index of the current layer.
        r;   r   r    )r4   r   r$   r   r   r   s         r'   r   z%BitStage._get_updated_hyperparameters,  s8     *95N N>F"a~~55r)   ri   r   c                 T    |}t        | j                        D ]  \  }} ||      } |S rZ   )	enumerater   )r4   ri   rD   _layers        r'   rE   zBitStage.forward<  s3    !$++. 	/HAu .L	/r)   )r   N)	rH   rI   rJ   rK   r0   r   r   rE   rL   rM   s   @r'   r   r     s.     ,&\6 V  r)   r   c            	       F     e Zd Zdef fdZd Z	 d	dedededefdZ	 xZ
S )

BitEncoderrV   c           
         t         |           t        j                  g       | _        |j
                  }d}d}t        j                  t        j                  d|j                  t        |j                                    j                  |j                        D cg c]  }|j                          }}t        t!        |j                  |j"                  |            D ]`  \  }\  }}	}
| j%                  |||	||      \  }}}t'        |||||||
      }|}||z  }| j                  j)                  t+        |      |       b y c c}w )N   r   r   )r$   r%   r   r   )r/   r0   r   
ModuleListstagesr   r   r   nplinspacer   sumdepthssplittolistr   ziphidden_sizesr   r   r   r    )r4   rV   r   current_strider%   rb   layer_dropouts	stage_idxcurrent_depthcurrent_hidden_sizer   r6   r$   stager8   s                 r'   r0   zBitEncoder.__init__D  sA   mmB'((  \\"++a1F1FFMMHZ"[\bbcicpcpq
 HHJ
 

 OXv22NCO
 	:JIJ':M .2-N-N>+>&.*L&( !#+E $Hf$NKK""3y>59+	:
s   Ec                 z    t        ||j                  z        }|dk(  rdnd}||j                  k\  r||z  }d}|||fS )Nr   r   r   )r   width_factoroutput_stride)r4   r   r   r   r%   rV   r6   r$   s           r'   r   z'BitEncoder._get_updated_hyperparametersj  sO     3f6I6I IJ1n!V111HFVX--r)   rD   output_hidden_statesreturn_dictr   c                     |rdnd }| j                   D ]  }|r||fz   } ||      } |r||fz   }|st        d ||fD              S t        ||      S )Nr   c              3   &   K   | ]	  }||  y wrZ   r   ).0vs     r'   	<genexpr>z%BitEncoder.forward.<locals>.<genexpr>  s     SqQ]Ss   )last_hidden_stater{   )r   tupler   )r4   rD   r   r   r{   stage_modules         r'   rE   zBitEncoder.forwardr  sv     3 KK 	6L# - ?'5L		6  )\O;MS\=$ASSS-*'
 	
r)   )FT)rH   rI   rJ   r   r0   r   r   boolr   rE   rL   rM   s   @r'   r   r   C  sA    $:y $:L. ]a
"
:>
UY
	'
r)   r   c                   (    e Zd ZdZeZdZdZdgZd Z	y)BitPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bitr   r}   c                 J   t        |t        j                        r-t        j                  j	                  |j
                  dd       y t        |t        j                        rt        j                  j                  |j
                  t        j                  d             |j                  xt        j                  j                  |j
                        \  }}|dkD  rdt        j                  |      z  nd}t        j                  j                  |j                  | |       y y t        |t        j                  t        j                  f      rUt        j                  j                  |j
                  d       t        j                  j                  |j                  d       y y )Nfan_outrelu)modenonlinearity   )ar   r   )r   r   Conv2dinitkaiming_normal_r@   Linearkaiming_uniform_r`   sqrtr.   _calculate_fan_in_and_fan_outuniform_BatchNorm2d	GroupNorm	constant_)r4   modulefan_inr   bounds        r'   _init_weightsz BitPreTrainedModel._init_weights  s   fbii(GG##FMM	PV#W		*GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  >?GGfmmQ/GGfkk1- @r)   N)
rH   rI   rJ   rK   r   config_classbase_model_prefixmain_input_name_no_split_modulesr  r   r)   r'   r  r    s'    
 L$O().r)   r  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BitConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aA  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zLThe bare BiT model outputting raw features without any specific head on top.c                   |     e Zd Z fdZ ee       eeee	de
      	 d	dedee   dee   defd              Z xZS )
BitModelc                 J   t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  rt        ||j                  d         nt        j                         | _        t        j                  d      | _        | j                          y )Nr   r:   r   )r   r   )r/   r0   rV   r}   embedderr   encoderr   rO   r   r   rU   r   AdaptiveAvgPool2dr   	post_initr   s     r'   r0   zBitModel.__init__  s     %f-!&)   O3 #68K8KB8OP 		 **62r)   vision)
checkpointoutput_typer  modalityexpected_outputr   r   r   r   c                 J   ||n| j                   j                  }||n| j                   j                  }| j                  |      }| j	                  |||      }|d   }| j                  |      }| j                  |      }|s
||f|dd  z   S t        |||j                        S )Nr   r   r   r   )r   pooler_outputr{   )	rV   r   use_return_dictr$  r%  r   r   r   r{   )r4   r   r   r   embedding_outputencoder_outputsr   pooled_outputs           r'   rE   zBitModel.forward  s     %9$D $++JjJj 	 &1%<k$++B]B]==6,,3GU` ' 
 ,A. II&78$56%}58KKK7/')77
 	
r)   NN)rH   rI   rJ   r0   r   BIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   r  rE   rL   rM   s   @r'   r"  r"    sp    
" ++?@&<$. pt
"
:B4.
^fgk^l
	1
 A
r)   r"  z
    BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Z fdZ ee       eeee	e
      	 	 	 	 d	deej                     deej                     dee   dee   def
d              Z xZS )
BitForImageClassificationc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  t        j                         |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                               | _        | j                          y )Nr   r:   )r/   r0   
num_labelsr"  r  r   r   Flattenr  r   rU   
classifierr'  r   s     r'   r0   z"BitForImageClassification.__init__   s      ++F#--JJLEKEVEVYZEZBIIf))"-v/@/@A`b`k`k`m

 	r)   )r)  r*  r  r,  r   labelsr   r   r   c                    ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j
                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t!        |||j"                  	      S )
a0  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr.  r   
regressionsingle_label_classificationmulti_label_classificationr:   r   )losslogitsr{   )rV   r0  r  r/  r>  problem_typer<  r   r   longrd   r	   squeezer   viewr   r   r{   )r4   r   r?  r   r   outputsr3  rE  rD  loss_fctr   s              r'   rE   z!BitForImageClassification.forward  s   & &1%<k$++B]B]((<>R`k(l1<--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F'+'7D7V#CVC3f\c\q\qrrr)   )NNNN)rH   rI   rJ   r0   r   r5  r   _IMAGE_CLASS_CHECKPOINTr   r7  _IMAGE_CLASS_EXPECTED_OUTPUTr   r   FloatTensor
LongTensorr  rE   rL   rM   s   @r'   r:  r:    s    
 ++?@*8$4	 59-1/3&*/su001/s ))*/s 'tn	/s
 d^/s 
./s A/sr)   r:  zL
    BiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                   v     e Zd Z fdZ ee       eee      	 dde	de
e   de
e   defd              Z xZS )	BitBackbonec                     t         |   |       t         | 	  |       t        |      | _        |j
                  g|j                  z   | _        | j                          y rZ   )	r/   r0   _init_backboner"  r  r   r   num_featuresr'  r   s     r'   r0   zBitBackbone.__init__L  sQ     v&F##223f6I6II 	r)   )r*  r  r   r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }| j                  |dd      }|j                  }d}t        | j                        D ]  \  }}|| j                  v s|||   fz  } |s|f}	|r|	|j                  fz  }	|	S t        ||r|j                  d      S dd      S )a`  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("google/bit-50")
        >>> model = AutoBackbone.from_pretrained("google/bit-50")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr.  r   )feature_mapsr{   
attentions)	rV   r0  r   r  r{   r   stage_namesout_featuresr   )
r4   r   r   r   rJ  r{   rV  idxr   r   s
             r'   rE   zBitBackbone.forwardV  s    2 &1%<k$++B]B]$8$D $++JjJj 	 ((<dPT(U--#D$4$45 	6JC)))s!3 55	6 "_F#70022M%3G'//
 	
MQ
 	
r)   r4  )rH   rI   rJ   r0   r   r5  r   r   r7  r   r   r  rE   rL   rM   s   @r'   rQ  rQ  E  s`     ++?@>Xos/
"/
:B4./
^fgk^l/
	/
 Y A/
r)   rQ  )Nr   r   r   )r;   F)   )ErK   rr   r`   typingr   r   numpyr   r   torch.utils.checkpointr   r   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   utils.backbone_utilsr   configuration_bitr   
get_loggerrH   loggerr7  r6  r8  rL  rM  r  r(   r  r+   r  rO   Moduler1   	MaxPool2drp   r}   r   r   r   r   r   r   r   r   r   r  BIT_START_DOCSTRINGr5  r"  r:  rQ  r   r)   r'   <module>rk     s-   @   "     A A !  .  2 ( 
		H	%  & (  * * &ERWY]R]L^ &R-ryy -`R\\ $0299 0f
2<< 
:/BII /fU\\ e T V[VbVb *-")) -A(bii A(HF FR'		 '.Gryy GTC
 C
L. .4	   R5
! 5
	5
p  Cs 2 CsCsL  	<
$m <
<
r)   