
    sgJ                       d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlmZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$  e!jJ                  e&      Z'dZ(dZ)dZ*d Z+ G d dejX                        Z- G d dejX                        Z.dej^                  de0de0dej^                  fdZ1 G d dejX                        Z2 G d dejX                        Z3 G d d ejX                        Z4 G d! d"ejX                        Z5	 dMd#ej^                  d$e0d%e0d&e6d'e6dej^                  fd(Z7 G d) d*ejX                        Z8 G d+ d,ejX                        Z9 G d- d.e      Z: G d/ d0ejX                        Z;e G d1 d2e             Z<d3Z=d4Z> ed5e=       G d6 d7e:             Z? ed8e=       G d9 d:e:             Z@ ed;e=        G d< d=e:      ZA ed>e=       G d? d@e:             ZB edAe=       G dB dCe:             ZC edDe=       G dE dFe:             ZD edGe=       G dH dIe:             ZE edJe=       G dK dLe:             ZFy)Nz!PyTorch Funnel Transformer model.    N)	dataclass)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )FunnelConfigr   zfunnel-transformer/smallg    .Ac                 |   	 ddl }ddl}ddl}t        j                  j                  |      }t        j                  d|        |j                  j                  |      }g }g }	|D ]^  \  }
}t        j                  d|
 d|        |j                  j                  ||
      }|j                  |
       |	j                  |       ` ddd	d
ddddddddddd}t        ||	      D ]  \  }
}|
j                  d      }
t!        d |
D              r(t        j                  ddj#                  |
              R|
d   dk(  r[| }d}|
dd D ]  }t%        |t&              s|j)                  d|      rt+        |j-                  d|      j/                         d         }||j0                  k  rQd}||j2                  |   k\  r*||j2                  |   z  }|dz  }||j2                  |   k\  r*|j4                  |   |   }||j0                  z  }|j6                  |   }|dk(  rt%        |t8              r|j:                  } n%||v rt=        |||         }		 t=        ||      } |rtE        |jB                        tE        |jB                        k7  r|jG                  |jB                        }dk(  r |jH                  |      }tK        jL                  |      |_'         | S # t        $ r t        j                  d        w xY w# t>        $ r. tA        ddj#                  |
       |jB                         d}Y  w xY w)z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape k_headq_headv_head	post_projlinear_1linear_2	attentionffnweightbiasword_embeddings
embeddings)kqvolayer_1layer_2rel_attnffkernelgammabetalookup_tableword_embeddinginput/c              3   $   K   | ]  }|d v  
 yw))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/funnel/modeling_funnel.py	<genexpr>z,load_tf_weights_in_funnel.<locals>.<genexpr>f   s      
 nn
s   z	Skipping 	generatorFr   z	layer_\d+zlayer_(\d+)rTr2   )(renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoin
isinstanceFunnelPositionwiseFFN	fullmatchintsearchgroupsnum_hidden_layersblock_sizesblockslayersFunnelRelMultiheadAttentionr_kernelgetattrAttributeErrorprintshapelenreshape	transposetorch
from_numpydata)modelconfigtf_checkpoint_pathrF   nptftf_path	init_varsnamesarraysnamerg   array
_layer_mappointerskippedm_namelayer_index	block_idxs                      rB   load_tf_weights_in_funnelr   8   sI   
 ggoo01G
KK8	BC''0IEF  e(l5'BC&&w5Te	  +J" 5&) +3ezz#  

 
 KK)CHHTN#3457k!12h 	Fg'<=",,|]cBd!"))NF"C"J"J"LQ"OP!9!99 !I%););I)FF#v'9'9)'DD!Q	 &););I)FF &nnY7DG6#;#;;K%nn[9G3:g7R#S!**:%!':f+=>%gv6G'	0 7==!S%55gmm4!$U+ ++E2GLW+3Z La  Q	
 	J & Ichhtn%56D"Gs   K! L! L3L;:L;c                        e Zd Zdeddf fdZ	 ddeej                     deej                     dej                  fdZ xZ	S )	FunnelEmbeddingsro   returnNc                 @   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idr(   	LayerNormd_modellayer_norm_eps
layer_normDropouthidden_dropoutdropoutselfro   	__class__s     rB   r   zFunnelEmbeddings.__init__   sh    !||F,=,=v?Q?Q_e_r_rs,,v~~6;P;PQzz&"7"78    	input_idsinputs_embedsc                 p    || j                  |      }| j                  |      }| j                  |      }|S N)r(   r   r   )r   r   r   r)   s       rB   forwardzFunnelEmbeddings.forward   s<       00;M__]3
\\*-
r   NN)
__name__
__module____qualname__r   r   r   rk   Tensorr   __classcell__r   s   @rB   r   r      sS    9| 9 9 ae!%,,/GOPUP\P\G]	r   r   c                       e Zd ZU dZdZeed<   deddf fdZ	 	 d de	j                  d	ee	j                     d
ee	j                     dee	j                     fdZd
e	j                  de	j                  fdZdede	j                  de	j                   deee	j                     eee	j                        f   fdZde	j                  defdZd!de	j                  dedede	j                  fdZdee	j                  ee	j                     ee	j                     f   deeee   ee   f   de	j                  fdZ	 d"dee	j                  ee	j                     ee	j                     f   dedede	j                  fdZdee	j                     dee	j                  ee	j                     f   fdZdee	j                     dee	j                     fdZ xZS )#FunnelAttentionStructurez>
    Contains helpers for `FunnelRelMultiheadAttention `.
       cls_token_type_idro   r   Nc                     t         |           || _        t        j                  |j
                        | _        t        j                  |j
                        | _        d | _        y r   )	r   r   ro   r   r   r   sin_dropoutcos_dropoutpooling_multr   s     rB   r   z!FunnelAttentionStructure.__init__   sM    ::f&;&;<::f&;&;< !r   r   attention_masktoken_type_idsc                 b   d| _         |j                  d      x| _        }| j                  ||j                  |j
                        }|| j                  |      nd}| j                  j                  r7t        j                  j                  |j                  |dz
  |dz
  g      d      nd}||||fS )zCReturns the attention inputs associated to the inputs of the model.r   N)r   r   r   r   )r   sizeseq_lenget_position_embedsdtypedevicetoken_type_ids_to_matro   separate_clsr   
functionalpadnew_ones)r   r   r   r   r   position_embedstoken_type_matcls_masks           rB   init_attention_inputsz.FunnelAttentionStructure.init_attention_inputs   s     !.!3!3A!66w227M<O<OQ^QeQefGUGa33NCgk {{'' MMm44gk7Q;5OPR^_ 	
  JJr   c                     |dddddf   |dddf   k(  }|| j                   k(  }|dddddf   |dddf   z  }||z  S )z-Convert `token_type_ids` to `token_type_mat`.N)r   )r   r   r   cls_idscls_mats        rB   r   z.FunnelAttentionStructure.token_type_ids_to_mat   sY    '1d
3~ag7NN D$:$::!Q*%4(88''r   r   r   r   c                 z   | j                   j                  }| j                   j                  dk(  rEt        j                  d|dt        j
                  |      j                  |      }t        j                  d|dz  dt        j
                  |      j                  |      }dd||dz  z  z  z  }|dddf   |d   z  }t        j                  |      }	| j                  |	      }
t        j                  |      }| j                  |      }t        j                  |
|
gd	
      }t        j                  ||	gd	
      }t        j                  ||gd	
      }t        j                  |	 |gd	
      }||||fS t        j                  d|dz  dt        j
                  |      j                  |      }dd||dz  z  z  z  }t        j                  | dz  |dz  dt        j
                  |      j                  |      }|dz  }|dddf   |d   z  }| j                  t        j                  |            }	| j                  t        j                  |            }t        j                  |	|gd	
      }t        j                  d|t        j
                  |      j                  |      }|}g }t        d| j                   j                        D ]  }|dk(  rd}ns| j                  ||      }d|dz
  z  }| j                  |||d      }|dddf   |z   }|j!                  |j#                  d      |      }t        j$                  |d|      }|}d|z  }| j                  ||      }|dddf   |z   }|j!                  |j#                  d      |      }t        j$                  |d|      }|j'                  ||g        |S )a  
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://arxiv.org/abs/2006.03236
        
factorizedr         ?r   r   r   r   i'  Ndim)shift)ro   r   attention_typerk   arangeint64tosinr   cosr   catrange
num_blocksstride_pool_posrelative_posexpandr   gatherrS   )r   r   r   r   r   pos_seqfreq_seqinv_freqsinusoid	sin_embedsin_embed_d	cos_embedcos_embed_dphipsipiomega
rel_pos_idzero_offset	pos_embedpos
pooled_posposition_embeds_listblock_indexposition_embeds_poolingstriderel_posposition_embeds_no_poolings                               rB   r   z,FunnelAttentionStructure.get_position_embeds   s|    ++%%;;%%5 ll1gs%++fUXXY^_G||Aw!|STZ[^^_deHEh'Q,&?@AHq$w'(4.8H		(+I**95K		(+I**95K))[+6B?C))Y	2;CK52>BII	z952>ES%(( ||Aw!|STZ[^^_deHEh'Q,&?@AHwhlGaKEKK`fgjjkpqJ!A+K!!T'*Xd^;H((8)<=I((8)<=I		9i"8bAI,,q'VLOOPUVCJ#% $Q(>(>? c !#.2+!%!5!5c;!GJ ;?3F"//VZq/QG%ag.<G%nnW\\!_gFG.3ll9a.Q+ !K++C8!!T'*[8!..a'B-2\\)Q-P*$++-GI`,ab9c: ('r   pos_idr   c                     | j                   j                  rW|j                  d|z   dz   g      }| j                   j                  r|dd n|dd }t	        j
                  ||ddd   gd      S |ddd   S )ze
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        r   r   r   Nr   )ro   r   
new_tensortruncate_seqrk   r   )r   r   r   cls_pospooled_pos_ids        rB   r   z(FunnelAttentionStructure.stride_pool_pos  s     ;;##
 ''1k>):Q)>(?@G,0KK,D,DF1RL&QRQS*M99g}SqS'9:A>>#A#;r   r   r   r   c                     ||}|d   |d   z
  }|t        |      z  }|||z  z   }|d   |d   z
  }t        j                  ||dz
  | t        j                  |j                        S )zV
        Build the relative positional vector between `pos` and `pooled_pos`.
        r   r   r   r   )rh   rk   r   longr   )	r   r   r   r   r   	ref_point
num_removemax_distmin_dists	            rB   r   z%FunnelAttentionStructure.relative_pos.  sx     JqMCF*	S_,
zF22a=3r7*||HhlVG5::VYV`V`aar   tensoraxisc                 H    |yt        t        t        f      rD ]  } j                  ||      } |S t        |t        t        f      r t	        |       fd|D              S |j
                  z   j                  j                  r# j                  j                  rt        ddd      nt        ddd      }t        d      gz  |gz   } j                  j                  r9t        d      gz  t        dd      gz   }t        j                  ||   |g      }||   S )zT
        Perform pooling by stride slicing the tensor along the given axis.
        Nc              3   B   K   | ]  }j                  |        y wr   )stride_pool)r@   xr   r   s     rB   rC   z7FunnelAttentionStructure.stride_pool.<locals>.<genexpr>O  s     Ja 0 0D 9Js   r   r   r   )r   )rX   listtupler   typendimro   r   r   slicerk   r   )r   r   r   ax
axis_slice	enc_slice	cls_slices   ` `    rB   r   z$FunnelAttentionStructure.stride_pool<  s    > dT5M* 6))&"56M fudm,4<J6JJJ 	 #'++":":t{{?W?WE$A]bcgimop]q 	 4[MD(J<7	;;##t,dA/??IYYy 16:FFi  r   modec                     yt        t        t        f      r t               fdD              S  j                  j
                  rE j                  j                  rddddf   n}t        j                  ddddf   |gd      j                  }|dk(  rddddddf   n|dk(  rdddddddf   dfdk(  r$t        j                  j                  d	
      n_dk(  r$t        j                  j                  d	
      n6dk(  r&t        j                  j                   d	
       nt        d      |dk(  rddddddf   S |dk(  r	dddf   S S )z3Apply 1D pooling to a tensor of size [B x T (x H)].Nc              3   F   K   | ]  }j                           yw))r  r   N)pool_tensor)r@   r   r  r   r   r   s     rB   rC   z7FunnelAttentionStructure.pool_tensor.<locals>.<genexpr>f  s$     cWX 0 0d6 0 Rcs   !r   r   r   r   r   meanT)r   	ceil_modemaxminz0The supported modes are 'mean', 'max' and 'min'.r   )rX   r   r   r   ro   r   r   rk   r   r  r   r   
avg_pool2d
max_pool2dNotImplementedError)r   r   r  r   suffixr  s   ````  rB   r
  z$FunnelAttentionStructure.pool_tensor]  s{    > fudm,4<c\bccc;;##'+{{'?'?VAssF^VFYYq"1"uv6A>F{{19AtQ,-FQYAtQM*F!6>]]--ffVW[-\FU]]]--ffVW[-\FU]mm..wvY].^^F%&XYY19!Q1*%%QY!Q$<r   attention_inputsc                    |\  }}}}| j                   j                  r| j                   j                  dk(  r| j                  |dd d      |dd z   }| j                  |d      }| j                  |d      }| j	                  || j                   j
                        }n| xj                  dz  c_        | j                   j                  dk(  r| j                  |d      }| j                  |ddg      }| j                  |ddg      }| j	                  |d      }| j	                  || j                   j
                        }||||f}||fS )zTPool `output` and the proper parts of `attention_inputs` before the attention layer.r   Nr   r   r   r  r  )ro   pool_q_onlyr   r   r
  pooling_typer   )r   outputr  r   r   r   r   s          rB   pre_attention_poolingz.FunnelAttentionStructure.pre_attention_pooling  sM    EUA;;""{{))\9"&"2"2?2A3F"J_]^]_M`"`!--na@N''!4H%%f4;;3K3K%LF"{{))\9"&"2"2?A"F!--nq!fEN''1a&9H!--n5-IN%%f4;;3K3K%LF+^^XV'''r   c                 L   |\  }}}}| j                   j                  r| xj                  dz  c_        | j                   j                  dk(  r|dd | j	                  |dd d      z   }| j	                  |d      }| j	                  |d      }| j                  |d      }||||f}|S )zFPool the proper parts of `attention_inputs` after the attention layer.r   r   Nr   r   r  r  )ro   r  r   r   r   r
  )r   r  r   r   r   r   s         rB   post_attention_poolingz/FunnelAttentionStructure.post_attention_pooling  s    DTA;;"""{{))\9"1"1"58H8HYZY[I\^_8`"`!--na@N''!4H!--n5-IN+^^XVr   r   Nr   )r  r   )r   r   r   __doc__r   r[   __annotations__r   r   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   strr
  r  r  r   r   s   @rB   r   r      sC    s!| ! ! 2615	K||K !.K !.	K
 
u||	K((ELL (U\\ (N(N(#(;;N(8=N(	uU\\"Dell);$<<	=N(`ell  b bc bSV b_d_k_k b!ellE%,,$7ell9KKL! CsT#Y./! 
	!D wx$ELL%*=tELL?QQR$Z]$ps$	$L((-ell(;(	u||U5<<00	1(, uU\\7J  uUZUaUaOb  r   r   positional_attncontext_lenr   r   c                     | j                   \  }}}}t        j                  | ||||g      } | d d d d |d d d f   } t        j                  | |||||z
  g      } | dd |f   } | S )N.)rg   rk   ri   )r   r!  r   
batch_sizen_headr   max_rel_lens          rB   _relative_shift_gatherr&    s    />/D/D,J mmOj&+W^5_`O%aEFAo6OmmOj&'S^afSf5ghO%c<K<&78Or   c                        e Zd Zdededdf fdZddZddZ	 ddej                  d	ej                  d
ej                  de
ej                     dede
ej                  df   fdZ xZS )rb   ro   r   r   Nc                 J   t         |           || _        || _        |j                  |j
                  |j                  }}}t        j                  |j                        | _	        t        j                  |j                        | _
        t        j                  |||z  d      | _        t        j                  |||z        | _        t        j                  |||z        | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  |||g            | _        t        j                  t!        j"                  ||g            | _        t        j                  t!        j"                  d||g            | _        t        j                  ||z  |      | _        t        j0                  ||j2                        | _        d|dz  z  | _        y )NF)r'   r   r   r   g      ?)r   r   ro   r   r   r$  d_headr   r   r   attention_dropoutLinearr   r   r    	Parameterrk   zerosr_w_biasr_r_biasrc   r_s_bias	seg_embedr!   r   r   r   scale)r   ro   r   r   r$  r)  r   s         rB   r   z$FunnelRelMultiheadAttention.__init__  s   &"(..&-- jj)>)>?!#F,D,D!Eii&uEii&9ii&9U[[&&1A%BCU[[&&1A%BCU[['661J%KLU[[&&1A%BCekk1ff2E&FG6F?G<,,wF4I4IJFCK(
r   c                 ~   | j                   j                  dk(  r|\  }}}}| j                  | j                  z  }	| j                  }
t        j                  d||	z   |
      }||dddf   z  }||dddf   z  }t        j                  d||      t        j                  d||      z   }n|j                  d   |k7  rdnd}|| j                     |dz
     }| j                  | j                  z  }| j                  }
t        j                  d||
      }t        j                  d||z   |      }t        |||      }|||z  }|S )	z5Relative attention score for the positional encodingsr   zbinh,dnh->bindNzbind,jd->bnijr   r   ztd,dnh->tnhzbinh,tnh->bnit)
ro   r   r/  r2  rc   rk   einsumrg   r   r&  )r   r   r   r!  r   r   r   r   r   uw_rq_r_attentionq_r_attention_1q_r_attention_2r   r   rE   r,   r_heads                      rB   relative_positional_attentionz9FunnelRelMultiheadAttention.relative_positional_attention  sK    ;;%%5 #2CS%

*A--C "LL)96A:sKM+c!T'l:O+bDk9O $ll?OSQTYT`T`%U O  aK7AQE   0 01%!)<A

*A--C \\-C8F#ll+;VaZPO4_kSXYOx'Or   c                    |y|j                   \  }}}| j                  | j                  z  }t        j                  d||z   | j
                        }|dddf   j                  ||j                   d   ||g      }t        j                  |dd      \  }	}
t        j                  ||
j                  |j                         |	j                  |j                               }|||z  }|S )z/Relative attention score for the token_type_idsNr   zbind,snd->bnisr   r   r   r   )	rg   r0  r2  rk   r4  r1  r   rU   where)r   r   r   r   r#  r   r!  r0  token_type_biasdiff_token_typesame_token_typetoken_type_attns               rB   relative_token_type_attentionz9FunnelRelMultiheadAttention.relative_token_type_attention  s    !+9+?+?(
G[ ==4::-  ,,'7(9JDNN['4077V\\RS_V]_j8kl+0;;r+R(++O22>3G3GH/J`J`aoauauJv
 x'Or   querykeyvaluer  output_attentions.c                    |\  }}}}	|j                   \  }
}}|j                   d   }| j                  j                  | j                  j                  }}| j	                  |      j                  |
|||      }| j                  |      j                  |
|||      }| j                  |      j                  |
|||      }|| j                  z  }| j                  | j                  z  }t        j                  d||z   |      }| j                  ||||	      }| j                  |||	      }||z   |z   }|j                  }|j                         }|%|t         d|d d d d f   j                         z
  z  z
  }t        j"                  |d|      }| j%                  |      }t        j                  d||      }| j'                  |j)                  |
|||z              }| j+                  |      }| j-                  ||z         }|r||fS |fS )Nr   zbind,bjnd->bnijr   )r   r   zbnij,bjnd->bind)rg   ro   r$  r)  r   viewr   r    r2  r.  rk   r4  r;  rB  r   floatINFsoftmaxr*  r!   ri   r   r   )r   rC  rD  rE  r  rF  r   r   r   r   r#  r   _r!  r$  r)  r   r   r    r.  content_scorer   rA  
attn_scorer   	attn_probattn_vecattn_outr  s                                rB   r   z#FunnelRelMultiheadAttention.forward  s    EUA!&
GQiil++T[[-?-? U#((WffMS!&&z;OU#(([&&Q$**$==4::-%68I6R<<_fVackl<<^VU]^ #_4F
   %%'
%#cQ41N1T1T1V-V&WWJMM*"EB	**95	 << 19fE >>("2"2:wQW"XY&&x0!12&7	"FfYFr   r   F)r   r   r   r   r[   r   r;  rB  rk   r   r   boolr   r   r   s   @rB   rb   rb     s    )| )# )$ ).(T< #(3G||3G \\3G ||	3G
  -3G  3G 
u||S 	!3Gr   rb   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )rY   ro   r   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                         | _        y r   )r   r   r   r+  r   d_innerr"   r   
hidden_actactivation_functionr   activation_dropoutr#   r   r   r   r   r   r   s     rB   r   zFunnelPositionwiseFFN.__init__H  s    		&..&..A#)&*;*;#< "$**V-F-F"G		&..&..Azz&"7"78,,v~~v7L7LMr   hiddenc                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }| j                  ||z         S r   )r"   rX  rY  r#   r   r   )r   rZ  hs      rB   r   zFunnelPositionwiseFFN.forwardQ  s^    MM&!$$Q'##A&MM!LLOvz**r   )	r   r   r   r   r   rk   r   r   r   r   s   @rB   rY   rY   G  s4    N| N N+ell +u|| +r   rY   c                        e Zd Zdededdf fdZ	 ddej                  dej                  dej                  d	ede	f
d
Z
 xZS )FunnelLayerro   r   r   Nc                 d    t         |           t        ||      | _        t	        |      | _        y r   )r   r   rb   r$   rY   r%   )r   ro   r   r   s      rB   r   zFunnelLayer.__init__[  s(    4V[I(0r   rC  rD  rE  rF  c                 n    | j                  |||||      }| j                  |d         }|r||d   fS |fS )NrF  r   r   )r$   r%   )r   rC  rD  rE  r  rF  attnr  s           rB   r   zFunnelLayer.forward`  sH     ~~eS%1AUf~g$q'"$5Q DF9Dr   rR  )r   r   r   r   r[   r   rk   r   rS  r   r   r   r   s   @rB   r^  r^  Z  si    1| 1# 1$ 1 #(
E||
E \\
E ||	
E  
E 

Er   r^  c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  deej                     deej                     ded	ed
ede	e
ef   fdZ xZS )FunnelEncoderro   r   Nc                 T   t         |           || _        t        |      | _        t        j                  t        |j                        D cg c];  \  }}t        j                  t        |      D cg c]  }t        ||       c}      = c}}}      | _        y c c}w c c}}}w r   )r   r   ro   r   attention_structurer   
ModuleList	enumerater_   r   r^  r`   )r   ro   r   
block_sizerL  r   s        rB   r   zFunnelEncoder.__init__n  s    #;F#C mm 099K9K/L +K zIZ[A{6;?[\
[s   $B#3B	B#B#r   r   r   rF  output_hidden_statesreturn_dictc           
         |j                  |      }| j                  j                  |||      }|}|r|fnd }	|rdnd }
t        | j                        D ]  \  }}|j                  d      | j                  j                  rdndkD  }|xr |dkD  }|r| j                  j                  ||      \  }}t        |      D ]  \  }}t        | j                  j                  |         D ]{  }|dk(  xr	 |dk(  xr |}|r}| j                  j                  r|n|x}}n|x}x}} ||||||      }|d   }|r| j                  j                  |      }|r|
|dd  z   }
|sv|	|fz   }	}   |st        d ||	|
fD              S t        ||	|
      S )	Nr   r   r?   r   r   r   ra  c              3   &   K   | ]	  }||  y wr   r?   r@   r,   s     rB   rC   z(FunnelEncoder.forward.<locals>.<genexpr>       aqSTS`a   last_hidden_statehidden_states
attentions)type_asrf  r   rh  r`   r   ro   r   r  r   block_repeatsr  r  r   r   )r   r   r   r   rF  rj  rk  r  rZ  all_hidden_statesall_attentionsr   blockpooling_flagpooled_hiddenr}   layerrepeat_index
do_poolingrC  rD  rE  layer_outputs                          rB   r   zFunnelEncoder.forwardy  s    (//>33II)) J 

 0D],$0d"+DKK"8 	JK!;;q>$++2J2JQPQRL';K!OL262J2J2`2`,3// '0&6 J"U$)$++*C*CK*P$Q JL".!"3!\+:J!\P\J! -040G0Gf]Ze.444e#(U<L`q#rL)!_F!+/+C+C+Z+Z[k+l(()7,qr:J)J+,=	,I)JJ	J2 aV->$OaaaGXesttr   NNFFTr   r   r   r   r   rk   r   r   rS  r   r   r   r   r   r   s   @rB   rd  rd  m  s    	
| 	
 	
 2615"'%* 0u||0u !.0u !.	0u
  0u #0u 0u 
uo%	&0ur   rd  r   r   
target_lenr   r   c           	      6   |dk(  r| S |r| ddddf   }| ddddf   } t        j                  | |d      }|rT|r)t        j                  j	                  |ddd|dz
  ddf      }|ddd|dz
  f   }t        j
                  |gd      }|S |ddd|f   }|S )z{
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    r   N)repeatsr   r   r   )rk   repeat_interleaver   r   r   r   )r   r   r  r   r   clsr  s          rB   upsampler    s     {2A2haeH$$QA>F]]&&v1a!Q/JKF+Z!^++,C=a0 M ;J;'Mr   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  dej                  deej                     deej                     d	ed
edede	e
ef   fdZ xZS )FunnelDecoderro   r   Nc           	          t         |           || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |d       c}      | _
        y c c}w )Nr   )r   r   ro   r   rf  r   rg  r   num_decoder_layersr^  ra   )r   ro   rL  r   s      rB   r   zFunnelDecoder.__init__  sR    #;F#C mmU6KdKdEe$f[%;$fg$fs   A-final_hiddenfirst_block_hiddenr   r   rF  rj  rk  c                    t        |dt        | j                  j                        dz
  z  |j                  d   | j                  j
                  | j                  j                        }||z   }	|r|	fnd }
|rdnd }| j                  j                  |	||      }| j                  D ]'  } ||	|	|	||      }|d   }	|r||dd  z   }|s"|
|	fz   }
) |st        d |	|
|fD              S t        |	|
|	      S )
Nr   r   )r   r  r   r   r?   rm  ra  r   c              3   &   K   | ]	  }||  y wr   r?   ro  s     rB   rC   z(FunnelDecoder.forward.<locals>.<genexpr>  rp  rq  rr  )r  rh   ro   r_   rg   r   r   rf  r   ra   r   r   )r   r  r  r   r   rF  rj  rk  upsampled_hiddenrZ  rx  ry  r  r}  r  s                  rB   r   zFunnelDecoder.forward  s%    $T[[4459:)//21111
 "$66)=VI40d33II)) J 
 [[ 	BE 9I]noL!!_F !/,qr2B!B#$5	$A!	B aV->$OaaaGXesttr   r  r  r   s   @rB   r  r    s    h| h h 2615"'%* 'ull'u "LL'u !.	'u
 !.'u  'u #'u 'u 
uo%	&'ur   r  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )FunnelDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.ro   r   Nc                     t         |           || _        t        j                  |j
                  |j
                        | _        t        j                  |j
                  d      | _        y r  )r   r   ro   r   r+  r   densedense_predictionr   s     rB   r   z'FunnelDiscriminatorPredictions.__init__  sF    YYv~~v~~>
 "		&..! <r   discriminator_hidden_statesc                     | j                  |      }t        | j                  j                     |      }| j	                  |      j                  d      }|S )Nr   )r  r   ro   rW  r  squeeze)r   r  rt  logitss       rB   r   z&FunnelDiscriminatorPredictions.forward  sJ    

#>?t{{556}E&&}5==bAr   )
r   r   r   r  r   r   rk   r   r   r   r   s   @rB   r  r    s4    O=| = =5<< ELL r   r  c                   "    e Zd ZdZeZeZdZd Z	y)FunnelPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    funnelc                    |j                   j                  }|j                  d      dk7  rt        |dd       | j                  j
                  >|j                  j                  \  }}t        j                  dt        ||z         z        }n| j                  j
                  }t        j                  j                  |j                  |       t        |dd       +t        j                  j                  |j                  d       y y |dk(  r<t        j                  j!                  |j"                  | j                  j$                  	       t        j                  j!                  |j&                  | j                  j$                  	       t        j                  j!                  |j(                  | j                  j$                  	       t        j                  j!                  |j*                  | j                  j$                  	       t        j                  j!                  |j,                  | j                  j$                  	       y |d
k(  r| j                  j
                  dn| j                  j
                  }t        j                  j                  |j.                  j                  |       |j.                  j0                  F|j.                  j                  j2                  |j.                  j0                     j5                          y y y )Nr+  r   r&   r   )stdr'   g        rb   )br   )r   r   findrd   ro   initializer_stdr&   rg   rq   sqrtrI  r   initnormal_	constant_r'   uniform_r.  initializer_ranger/  rc   r0  r1  r(   r   rm   zero_)r   module	classnamefan_outfan_inr  s         rB   _init_weightsz#FunnelPreTrainedModel._init_weights  s   $$--	>>(#r)vx.:;;..6&,mm&9&9OGV''#fw.>(?"?@C++55C37vvt,8!!&++s3 977GGV__0M0MNGGV__0M0MNGGV__0M0MNGGV__0M0MNGGV--1N1NO,,44<#$++B]B]CGGOOF2299sOC%%11=&&--2263I3I3U3UV\\^ > -r   N)
r   r   r   r  r   config_classr   load_tf_weightsbase_model_prefixr  r?   r   rB   r  r    s    
  L/O _r   r  c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )FunnelClassificationHeadro   n_labelsr   Nc                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |      | _	        y r   )
r   r   r   r+  r   linear_hiddenr   r   r   
linear_out)r   ro   r  r   s      rB   r   z!FunnelClassificationHead.__init__'  sU    YYv~~v~~Fzz&"7"78))FNNH=r   rZ  c                     | j                  |      }t        j                  |      }| j                  |      }| j	                  |      S r   )r  rk   tanhr   r  )r   rZ  s     rB   r   z FunnelClassificationHead.forward-  s=    ##F+F#f%v&&r   )
r   r   r   r   r[   r   rk   r   r   r   r   s   @rB   r  r  &  s8    >| >s >t >'ell 'u|| 'r   r  c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)FunnelForPreTrainingOutputa  
    Output type of [`FunnelForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss of the ELECTRA-style objective.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Prediction scores of the head (scores for each token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossr  rt  ru  )r   r   r   r  r  r   rk   FloatTensorr  r  rt  r   ru  r?   r   rB   r  r  4  sb    * )-D(5$$
%, $FE$8<M8E%"3"345<59Ju00129r   r  a(  

    The Funnel Transformer model was proposed in [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient
    Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FunnelConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    c                       e Zd Zdeddf fdZdej                  fdZdej                  ddfdZ e	e
j                  d             ed	ee
      	 	 	 	 	 	 	 	 	 ddeej"                     deej"                     deej"                     deej"                     deej"                     deej"                     dee   dee   dee   deeef   fd              Z xZS )FunnelBaseModelro   r   Nc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r   r   r   r)   rd  encoder	post_initr   s     rB   r   zFunnelBaseModel.__init__  s4     *62$V, 	r   c                 .    | j                   j                  S r   r)   r(   r   s    rB   get_input_embeddingsz$FunnelBaseModel.get_input_embeddings      ...r   new_embeddingsc                 &    || j                   _        y r   r  r   r  s     rB   set_input_embeddingsz$FunnelBaseModel.set_input_embeddings      *8'r   batch_size, sequence_lengthfunnel-transformer/small-base
checkpointoutput_typer  r   r   r   position_ids	head_maskr   rF  rj  rk  c
                 V   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      ||j                  n|j                  }|t        j                  |
|      }|&t        j                  |
t        j                  |      }| j                  ||      }| j                  ||||||	      }|S )NDYou cannot specify both input_ids and inputs_embeds at the same timer   5You have to specify either input_ids or inputs_embedsr   r   r   r   r   rF  rj  rk  )ro   rF  rj  use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   rk   onesr-  r   r)   r  )r   r   r   r   r  r  r   rF  rj  rk  input_shaper   encoder_outputss                rB   r   zFunnelBaseModel.forward  s=   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN 	O,,))/!5# ' 
 r   	NNNNNNNNN)r   r   r   r   r   r   r   r  r  r   FUNNEL_INPUTS_DOCSTRINGformatr   r   _CONFIG_FOR_DOCr   rk   r   rS  r   r   r   r   r   s   @rB   r  r    sE   |  /bll /92<< 9D 9 ++B+I+IJg+hi2#$ -11515/3,004,0/3&*/ELL)/ !./ !.	/
 u||,/ ELL)/  -/ $D>/ 'tn/ d^/ 
uo%	&/ j/r   r  zlThe bare Funnel Transformer Model transformer outputting raw hidden-states without any specific head on top.c                   x    e Zd Zdeddf fdZdej                  fdZdej                  ddfdZ e	e
j                  d             eeee	      	 	 	 	 	 	 	 dd
eej$                     deej$                     deej$                     deej$                     dee   dee   dee   deeef   fd              Z xZS )FunnelModelro   r   Nc                     t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        | j                          y r   )
r   r   ro   r   r)   rd  r  r  decoderr  r   s     rB   r   zFunnelModel.__init__  sG     *62$V,$V, 	r   c                 .    | j                   j                  S r   r  r  s    rB   r  z FunnelModel.get_input_embeddings  r  r   r  c                 &    || j                   _        y r   r  r  s     rB   r  z FunnelModel.set_input_embeddings  r  r   r  r  r   r   r   r   rF  rj  rk  c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }n!||j                         d d }nt	        d      ||j                  n|j                  }	|t        j                  ||	      }|&t        j                  |t        j                  |	      }| j                  ||      }| j                  ||||d|      }
| j                  |
d	   |
d
   | j                   j                  d	      |||||      }|s6d	}|d	   f}|r|d
z  }||
d
   ||   z   fz   }|r|d
z  }||
d   ||   z   fz   }|S t!        |d	   |r|
j"                  |j"                  z   nd |r|
j$                  |j$                  z         S d       S )Nr  r   r  r  r   r  Tr  r   r   )r  r  r   r   rF  rj  rk  r   rr  )ro   rF  rj  r  r  r  r   r   rk   r  r-  r   r)   r  r  r_   r   rt  ru  )r   r   r   r   r   rF  rj  rk  r  r   r  decoder_outputsidxoutputss                 rB   r   zFunnelModel.forward  sA     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN 	O,,))/!%# ' 
 ,,(+.q1$++2I2I!2LM))/!5# ' 
 C&q)+G#q!_Q%7/#:N%N$PP q!_Q%7/#:N%N$PPN-a0# +88?;X;XXTe22_5O5OO
 	

 lp
 	
r   )NNNNNNN)r   r   r   r   r   r   r   r  r  r   r  r  r   _CHECKPOINT_FOR_DOCr   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r    s#   
|  /bll /92<< 9D 9 ++B+I+IJg+hi&#$ -1151504,0/3&*H
ELL)H
 !.H
 !.	H

  -H
 $D>H
 'tnH
 d^H
 
uo%	&H
 jH
r   r  z
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    c                   R    e Zd Zdeddf fdZ eej                  d             ee	e
      	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deee	f   fd              Z xZS )FunnelForPreTrainingro   r   Nc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r   )r   r   r  r  r  discriminator_predictionsr  r   s     rB   r   zFunnelForPreTraining.__init__J  s3     !&))G)O&r   r  )r  r  r   r   r   r   labelsrF  rj  rk  c	           	      `   ||n| j                   j                  }| j                  |||||||      }	|	d   }
| j                  |
      }d}|t	        j
                         }|a|j                  d|
j                  d         dk(  }|j                  d|
j                  d         |   }||   } |||j                               }n4 ||j                  d|
j                  d         |j                               }|s|f|	dd z   }||f|z   S |S t        |||	j                  |	j                        S )a4  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```Nr   r   r   rF  rj  rk  r   r   r   r  r  rt  ru  )ro   r  r  r  r   r	   rH  rg   rI  r  rt  ru  )r   r   r   r   r   r  rF  rj  rk  r  discriminator_sequence_outputr  r  loss_fctactive_lossactive_logitsactive_labelsr  s                     rB   r   zFunnelForPreTraining.forwardR  sj   F &1%<k$++B]B]&*kk))'/!5# '2 '
# )DA(F%//0MN++-H),11"6S6Y6YZ[6\]abb &B0M0S0STU0V WXc d &{ 3}/B/B/DEB0M0S0STU0V WY_YeYeYghY!<QR!@@F)-)9TGf$EvE)5CC2==	
 	
r   NNNNNNNN)r   r   r   r   r   r   r  r  r   r  r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r  I  s   |   ++B+I+IJg+hi+ETcd -1151504)-,0/3&*D
ELL)D
 !.D
 !.	D

  -D
 &D
 $D>D
 'tnD
 d^D
 
u00	1D
 e jD
r   r  z@Funnel Transformer Model with a `language modeling` head on top.c                       e Zd ZdgZdeddf fdZdej                  fdZdej                  ddfdZ
 eej                  d	             eeeed
      	 	 	 	 	 	 	 	 ddeej(                     deej(                     deej(                     deej(                     deej(                     dee   dee   dee   deeef   fd              Z xZS )FunnelForMaskedLMzlm_head.weightro   r   Nc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
r   r   r  r  r   r+  r   r   lm_headr  r   s     rB   r   zFunnelForMaskedLM.__init__  sD     !&)yy1B1BC 	r   c                     | j                   S r   r   r  s    rB   get_output_embeddingsz'FunnelForMaskedLM.get_output_embeddings  s    ||r   r  c                     || _         y r   r  r  s     rB   set_output_embeddingsz'FunnelForMaskedLM.set_output_embeddings  s	    %r   r  z<mask>)r  r  r  maskr   r   r   r   r  rF  rj  rk  c	           	         ||n| j                   j                  }| j                  |||||||      }	|	d   }
| j                  |
      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|s|f|	dd z   }||f|z   S |S t        |||	j                  |	j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   r   r   r  )
ro   r  r  r   r
   rH  r   r   rt  ru  )r   r   r   r   r   r  rF  rj  rk  r  rs  prediction_logitsmasked_lm_lossr  r  s                  rB   r   zFunnelForMaskedLM.forward  s    0 &1%<k$++B]B]++))'/!5#  
 $AJ LL):;')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r   r  )r   r   r   _tied_weights_keysr   r   r   r+  r  r   r  r   r  r  r   r  r   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r    s;   *+|  ryy &BLL &T & ++B+I+IJg+hi&"$	 -1151504)-,0/3&*.
ELL).
 !..
 !.	.

  -.
 &.
 $D>.
 'tn.
 d^.
 
un$	%.
 j.
r   r  z
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    c                   T    e Zd Zdeddf fdZ eej                  d             ede	e
      	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deee	f   fd              Z xZS )FunnelForSequenceClassificationro   r   Nc                     t         |   |       |j                  | _        || _        t	        |      | _        t        ||j                        | _        | j                          y r   )	r   r   
num_labelsro   r  r  r  
classifierr  r   s     rB   r   z(FunnelForSequenceClassification.__init__  sN      ++%f-266;L;LMr   r  r  r  r   r   r   r   r  rF  rj  rk  c	           	      ,   ||n| j                   j                  }| j                  |||||||      }	|	d   }
|
dddf   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|s|f|	dd z   }||f|z   S |S t        |||	j                   |	j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr   r  )ro   r  r  r  problem_typer  r   rk   r   r[   r   r  r
   rH  r	   r   rt  ru  )r   r   r   r   r   r  rF  rj  rk  r  rs  pooled_outputr  r  r  r  s                   rB   r   z'FunnelForSequenceClassification.forward  s   . &1%<k$++B]B]++))'/!5#  
 $AJ)!Q$//{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r   r  )r   r   r   r   r   r   r  r  r   r   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r    s   |   ++B+I+IJg+hi2,$ -1151504)-,0/3&*A
ELL)A
 !.A
 !.	A

  -A
 &A
 $D>A
 'tnA
 d^A
 
u..	/A
 jA
r   r  z
    Funnel Transformer Model with a multiple choice classification head on top (two linear layer on top of the first
    timestep of the last hidden state, and a softmax) e.g. for RocStories/SWAG tasks.
    c                   T    e Zd Zdeddf fdZ eej                  d             ede	e
      	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deee	f   fd              Z xZS )FunnelForMultipleChoicero   r   Nc                     t         |   |       t        |      | _        t	        |d      | _        | j                          y r  )r   r   r  r  r  r  r  r   s     rB   r   z FunnelForMultipleChoice.__init__J  s4     %f-261=r   z(batch_size, num_choices, sequence_lengthr  r  r   r   r   r   r  rF  rj  rk  c	           	         ||n| j                   j                  }||j                  d   n|j                  d   }	|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  |||||||      }
|
d   }|dddf   }| j                  |      }|j                  d|	      }d}|t               } |||      }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r  r   r  )ro   r  rg   rH  r   r  r  r
   r   rt  ru  )r   r   r   r   r   r  rF  rj  rk  num_choicesr  rs  r  r  reshaped_logitsr  r  r  s                     rB   r   zFunnelForMultipleChoice.forwardR  s   . &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImq ( r=#5#5b#9=;M;Mb;QR 	 ++))'/!5#  
 $AJ)!Q$// ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   r  )r   r   r   r   r   r   r  r  r   r   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r  B  s	   |   ++B+I+IJt+uv2-$ -1151504)-,0/3&*:
ELL):
 !.:
 !.	:

  -:
 &:
 $D>:
 'tn:
 d^:
 
u//	0:
 w:
r   r  z
    Funnel Transformer Model with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    c                   T    e Zd Zdeddf fdZ eej                  d             ee	e
e      	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deee
f   fd              Z xZS )FunnelForTokenClassificationro   r   Nc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   )r   r   r  r  r  r   r   r   r   r+  r   r  r  r   s     rB   r   z%FunnelForTokenClassification.__init__  si      ++!&)zz&"7"78))F$6$68I8IJ 	r   r  r  r   r   r   r   r  rF  rj  rk  c	           	         ||n| j                   j                  }| j                  |||||||      }	|	d   }
| j                  |
      }
| j	                  |
      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|	dd z   }||f|z   S |S t        |||	j                  |	j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r   r  )ro   r  r  r   r  r
   rH  r  r   rt  ru  )r   r   r   r   r   r  rF  rj  rk  r  rs  r  r  r  r  s                  rB   r   z$FunnelForTokenClassification.forward  s    * &1%<k$++B]B]++))'/!5#  
 $AJ LL):;!23')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   r  )r   r   r   r   r   r   r  r  r   r  r   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r  r    s	   	| 	 	 ++B+I+IJg+hi&)$ -1151504)-,0/3&*-
ELL)-
 !.-
 !.	-

  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 j-
r   r  z
    Funnel Transformer Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   t    e Zd Zdeddf fdZ eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deee
f   fd              Z xZS )FunnelForQuestionAnsweringro   r   Nc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r   r   r  r  r  r   r+  r   
qa_outputsr  r   s     rB   r   z#FunnelForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r   r  r  r   r   r   r   start_positionsend_positionsrF  rj  rk  c
           	      $   |	|	n| j                   j                  }	| j                  |||||||	      }
|
d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|	s||f|
dd z   }||f|z   S |S t        ||||
j                  |
j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r   r   )ignore_indexr   )r  start_logits
end_logitsrt  ru  )ro   r  r  r$  rU   r  
contiguousrh   r   squezeclampr
   r   rt  ru  )r   r   r   r   r   r%  r&  rF  rj  rk  r  rs  r  r)  r*  
total_lossignored_indexr  
start_lossend_lossr  s                        rB   r   z"FunnelForQuestionAnswering.forward  s   8 &1%<k$++B]B]++))'/!5#  
 $AJ!23#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"8"8"<=%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   r  )r   r   r   r   r   r   r  r  r   r  r   r  r   rk   r   rS  r   r   r   r   r   s   @rB   r"  r"    s+   |   ++B+I+IJg+hi&0$ -11515042604,0/3&*D
ELL)D
 !.D
 !.	D

  -D
 "%,,/D
  -D
 $D>D
 'tnD
 d^D
 
u22	3D
 jD
r   r"  )TF)Gr  rL   dataclassesr   typingr   r   r   r   rG   rq   rk   r   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_funnelr   
get_loggerr   rJ   r  r  rJ  r   Moduler   r   r   r[   r&  rb   rY   r^  rd  rS  r  r  r  r  r  r  FUNNEL_START_DOCSTRINGr  r  r  r  r  r  r  r  r"  r?   r   rB   <module>r=     s   ( 	 ! / /    A A !  .  / 
		H	% 0  
Wtryy "A ryy A HELL s SV [`[g[g  MG")) MG`+BII +&E")) E&<uBII <u@ di|| .1AE\`
\\,.uBII .ubRYY   _O  _F'ryy ' : : :8 &$ N  E+ EEP r_
' _
	_
D  O
0 O
d \^tuG
- G
 vG
T  R
&; R
R
j  I
3 I
I
X  ?
#8 ?
?
D  U
!6 U
U
r   