
    sg                     f   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e%jP                  e)      Z*dZ+dZ,d Z- G d dej\                        Z/ G d de      Z0 G d dej\                        Z1 G d dej\                        Z2 G d dej\                        Z3 G d dej\                        Z4 G d dej\                        Z5 G d  d!ej\                        Z6 G d" d#ej\                        Z7 G d$ d%ej\                        Z8 G d& d'ej\                        Z9 G d( d)ej\                        Z:d*Z;d+Z< e#d,e;       G d- d.e0             Z= G d/ d0ej\                        Z> e#d1e;       G d2 d3e0             Z? G d4 d5ej\                        Z@ e#d6e;       G d7 d8e0             ZA e#d9e;       G d: d;e0             ZB e#d<e;       G d= d>e0             ZC e#d?e;       G d@ dAe0             ZDy)BzPyTorch ConvBERT model.    N)
attrgetter)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModelSequenceSummary)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ConvBertConfigzYituTech/conv-bert-baser   c                    	 ddl }t        j
                  j                  |      }t        j                  d|        |j                  j                  |      }i }|D ]A  \  }}t        j                  d| d|        |j                  j                  ||      }	|	||<   C ddd	d
dddd}
|j                  dkD  rd}nd}t        |j                        D ]:  }d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d |
d| d!<   d| d"|
d| d#<   d| d$|
d| d%<   d| d&|
d| d'<   d| d(|
d| d)<   d| d*|
d| d+<   d| d,|
d| d-<   d| d.|
d| d/<   d| d0|
d| d1<   d| d2|
d| d3<   d| d4|
d| d5<   d| d6| d7|
d| d8<   d| d6| d9|
d| d:<   d| d;| d7|
d| d<<   d| d;| d9|
d| d=<   d| d>|
d| d?<   d| d@|
d| dA<   = | j                         D ]  }|d   }t        |      } ||       }|
|   }t!        j"                  ||         }t        j                  dB| dC| dD       |j%                  d7      r.|j%                  dE      s|j%                  dF      s|j&                  }|j%                  dG      r|j)                  ddHd      }|j%                  dI      r|j)                  dHdd      }|j%                  dJ      r|j+                  dK      }||_         | S # t        $ r t        j                  d        w xY w)Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   g_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variable
num_groupsrangenum_hidden_layersnamed_parametersr   torch
from_numpyendswithTpermute	unsqueezedata)modelconfigtf_checkpoint_pathtftf_path	init_varstf_datanameshapearrayparam_mappinggroup_dense_namejparam
param_name	retrieverresulttf_namevalues                      a/var/www/html/venv/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbertrO   0   so    ggoo01G
KK8	BC''0IG  e(l5'BC&&w5 .R1Y3]'K%H%H#DM 1$"6++, Cw$QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'AB 	qc)CDE %QC'?@ 	qc)ABC %QC'CD 	qc)EFG %QC'AB 	qc)CDE %QC'UV 	qc)]^_ %QC'UV 	qc)]^_ %QC'IJ 	qc)QRS %QC'NO 	qc)QRS %QC'LM 	qc)OPQ %QC'MN 	qc)NOP %QC'KL 	qc)LMN %QC'EF 	qc)GHI %QC'HI 	qc)KLM %QC'CD 	qc)EFG %QC'GH 	qc)IJK %QC~6F5GwO 	qc)CDE %QC~6F5GuM 	qc)ABC %QCx0@/AI 	qc)=>? %QCx0@/AG 	qc);<= %QC'>? 	qc)ABC G]]^\__uDvqc)?@AGCwJ '') 1X
z*	5!
+  !12d7)6*Q78I&##$BC''(@A!GGE/0MM!Q*E/0MM!Q*E12OOB'E#$ Lk  Q	
 	s   L+ + Mc                        e Zd ZdZ fdZ	 	 	 	 d	deej                     deej                     deej                     deej                     dej                  f
dZ	 xZ
S )
ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_ids)r   r$   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr4   arangeexpandzerosrV   sizelongselfr<   	__class__s     rN   r[   zConvBertEmbeddings.__init__   s   !||F,=,=v?T?Tbhbubuv#%<<0N0NPVPePe#f %'\\&2H2H&J_J_%`" f&;&;AVAVWzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsrX   rV   inputs_embedsreturnc                 2   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	| j                  |      }
||	z   |
z   }| j                  |      }| j                  |      }|S )Nr$   r   rX   r   rY   device)rn   rV   hasattrrX   rl   r4   rm   ro   ry   r`   rb   rd   re   ri   )rq   rt   rX   rV   ru   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrb   rd   
embeddingss               rN   forwardzConvBertEmbeddings.forward   s,     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M"66|D $ : :> J"%88;PP
^^J/
\\*-
rs   )NNNN)__name__
__module____qualname____doc__r[   r   r4   
LongTensorFloatTensorr   __classcell__rr   s   @rN   rQ   rQ      s    Q
( 15593759$E,,-$ !!1!12$ u//0	$
   1 12$ 
		$rs   rQ   c                   &    e Zd ZdZeZeZdZdZ	d Z
y)ConvBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    convbertTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weights        meanstdNg      ?)
isinstancer   Linearweightr:   normal_r<   initializer_rangebiaszero_r\   rS   re   fill_)rq   modules     rN   _init_weightsz%ConvBertPreTrainedModel._init_weights   s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .rs   N)r   r   r   r   r   config_classrO   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr    rs   rN   r   r      s$    
 "L1O"&*#*rs   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )SeparableConv1DzSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t         |           t        j                  |||||dz  d      | _        t        j                  ||dd      | _        t        j                  t        j                  |d            | _	        | j                  j                  j                  j                  d|j                         | j
                  j                  j                  j                  d|j                         y )Nr#   F)kernel_sizegroupspaddingr   r   )r   r   r   r   )rZ   r[   r   Conv1d	depthwise	pointwise	Parameterr4   rm   r   r   r:   r   r   )rq   r<   input_filtersoutput_filtersr   kwargsrr   s         rN   r[   zSeparableConv1D.__init__  s    # 1$
 =.aV[\LL^Q!?@	""**9Q9Q*R""**9Q9Q*Rrs   hidden_statesrv   c                 h    | j                  |      }| j                  |      }|| j                  z  }|S N)r   r   r   )rq   r   xs      rN   r   zSeparableConv1D.forward  s0    NN=)NN1	TYYrs   	r   r   r   r   r[   r4   Tensorr   r   r   s   @rN   r   r     s'    ]S U\\ ell rs   r   c                        e Zd Z fdZd Z	 	 	 	 d
dej                  deej                     deej                     deej                     dee	   de
ej                  eej                     f   fd	Z xZS )ConvBertSelfAttentionc                 j   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  |j                  z  }|dk  r|j                  | _        d| _        n|| _        |j                  | _        |j                  | _        |j                  | j                  z  dk7  rt        d      |j                  | j                  z  dz  | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        ||j                  | j                  | j                        | _        t        j                  | j                  | j                  | j                  z        | _        t        j                  |j                  | j                        | _        t        j&                  | j                  dgt)        | j                  dz
  dz        dg	      | _        t        j,                  |j.                        | _        y )
Nr   r^   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr#   )r   r   )rZ   r[   hidden_sizenum_attention_headsrz   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r   querykeyrM   r   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldrg   attention_probs_dropout_probri   )rq   r<   new_num_attention_headsrr   s      rN   r[   zConvBertSelfAttention.__init__  s>    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)"<"<@Q@Q"Q"Q&$88DO'(D$'>D$$//DO & 7 7 8 88A=UVV$*$6$6$:R:R$RWX#X !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
#2F&&(:(:D<Q<Q$
  "$4+=+=t?W?WZ^ZoZo?o!p ii(:(:D<N<NOii..2S$BWBWZ[B[_`A`=acd<e
 zz&"E"EFrs   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr$   r   r#   r   r   )rn   r   r   viewr8   )rq   r   new_x_shapes      rN   transpose_for_scoresz*ConvBertSelfAttention.transpose_for_scoresF  sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$rs   r   attention_mask	head_maskencoder_hidden_statesoutput_attentionsrv   c                    | j                  |      }|j                  d      }|#| j                  |      }| j                  |      }	n"| j                  |      }| j                  |      }	| j	                  |j                  dd            }
|
j                  dd      }
| j                  |      }| j                  |      }| j                  |	      }t        j                  |
|      }| j                  |      }t        j                  |d| j                  dg      }t        j                  |d      }| j                  |      }t        j                  ||d| j                  g      }|j                  dd      j                         j!                  d      }t"        j$                  j'                  || j                  dgd| j                  dz
  dz  dgd      }|j                  dd      j                  |d| j                  | j                        }t        j                  |d| j(                  | j                  g      }t        j*                  ||      }t        j                  |d| j                  g      }t        j*                  ||j                  dd            }|t-        j.                  | j(                        z  }|||z   }t"        j$                  j                  |d      }| j1                  |      }|||z  }t        j*                  ||      }|j3                  dddd      j                         }t        j                  ||d| j4                  | j(                  g      }t        j6                  ||gd      }|j                         d d | j4                  | j(                  z  dz  fz   } |j8                  | }|r||f}|S |f}|S )	Nr   r   r#   r$   dim)r   dilationr   strider   )r   rn   r   rM   r   	transposer   r4   multiplyr   reshaper   softmaxr   r   
contiguousr9   r   
functionalr   r   matmulmathsqrtri   r8   r   catr   )rq   r   r   r   r   r   mixed_query_layer
batch_sizemixed_key_layermixed_value_layermixed_key_conv_attn_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                          rN   r   zConvBertSelfAttention.forwardK  s    !JJ}5"''*
 !,"hh'<=O $

+@ A"hh}5O $

= 9$($<$<]=T=TUVXY=Z$[!$=$G$G1$M!//0AB--o>	//0AB..)BDUV 22?C!MM*;b$BWBWYZ=[\!MM*;C,,];~
BHZHZ7[\'11!Q7BBDNNrR--..2++a/A5q9 . 
 (11!Q7??D..0E0E
 ~D<T<TVZVkVk7lmn6GH~D<N<N7OP !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF==*b$BZBZ\`\t\t1uv		=(";Q? #0"4"4"6s";$$t'?'??!C?
 #
 +**,CD6G=/2 O\M]rs   NNNF)r   r   r   r[   r   r4   r   r   r   boolr   r   r   r   s   @rN   r   r     s    %GN% 7;158<,1P||P !!2!23P E--.	P
  (5P $D>P 
u||Xell33	4Prs   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrT   )rZ   r[   r   r   r   r!   re   rf   rg   rh   ri   rp   s     rN   r[   zConvBertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rs   r   input_tensorrv   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r!   ri   re   rq   r   r   s      rN   r   zConvBertSelfOutput.forward  7    

=1]3}|'CDrs   r   r   r   r[   r4   r   r   r   r   s   @rN   r   r     s1    >U\\  RWR^R^ rs   r   c                        e Zd Z fdZd Z	 	 	 	 d
dej                  deej                     deej                     deej                     dee	   de
ej                  eej                     f   fd	Z xZS )ConvBertAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rZ   r[   r   rq   r   outputsetpruned_headsrp   s     rN   r[   zConvBertAttention.__init__  s0    )&1	(0Ers   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rq   r   r   r   r   r   r   rM   r   r!   r   union)rq   headsindexs      rN   prune_headszConvBertAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rs   r   r   r   r   r   rv   c                 l    | j                  |||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )rq   r   )	rq   r   r   r   r   r   self_outputsattention_outputr   s	            rN   r   zConvBertAttention.forward  sQ     yy!
  ;;|AF#%QR(88rs   r   )r   r   r   r[   r  r4   r   r   r   r   r   r   r   r   s   @rN   r   r     s    ";* 7;158<,1|| !!2!23 E--.	
  (5 $D> 
u||Xe&7&788	9rs   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GroupedLinearLayerc                    t         |           || _        || _        || _        | j                  | j                  z  | _        | j                  | j                  z  | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        j                  t        j                  |            | _        y r   )rZ   r[   
input_sizeoutput_sizer0   group_in_dimgroup_out_dimr   r   r4   emptyr   r   )rq   r  r  r0   rr   s       rN   r[   zGroupedLinearLayer.__init__  s    $&$ OOt>!--@ll5;;t@Q@QSWSeSe#fgLL[!9:	rs   r   rv   c                    t        |j                               d   }t        j                  |d| j                  | j
                  g      }|j                  ddd      }t        j                  || j                        }|j                  ddd      }t        j                  ||d| j                  g      }|| j                  z   }|S )Nr   r$   r   r#   )listrn   r4   r   r0   r  r8   r   r   r  r   )rq   r   r   r   s       rN   r   zGroupedLinearLayer.forward  s    -,,./2
MM-"doot?P?P)QRIIaALLDKK(IIaAMM!j"d.>.>?@		Mrs   r   r   s   @rN   r  r    s#    ;U\\ ell rs   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertIntermediatec                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y )Nr   r  r  r0   )rZ   r[   r0   r   r   r   intermediate_sizer!   r  r   
hidden_actstrr   intermediate_act_fnrp   s     rN   r[   zConvBertIntermediate.__init__  s    !6#5#5v7O7OPDJ+!--6;S;S`f`q`qDJ f''-'-f.?.?'@D$'-'8'8D$rs   r   rv   c                 J    | j                  |      }| j                  |      }|S r   )r!   r  rq   r   s     rN   r   zConvBertIntermediate.forward  s&    

=100?rs   r   r   s   @rN   r  r    s#    9U\\ ell rs   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ConvBertOutputc                    t         |           |j                  dk(  r0t        j                  |j
                  |j                        | _        n1t        |j
                  |j                  |j                        | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   r  rT   )rZ   r[   r0   r   r   r  r   r!   r  re   rf   rg   rh   ri   rp   s     rN   r[   zConvBertOutput.__init__  s    !6#;#;V=O=OPDJ+!33ASAS`f`q`qDJ f&8&8f>S>STzz&"<"<=rs   r   r   rv   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      rN   r   zConvBertOutput.forward  r   rs   r   r   s   @rN   r  r    s1    	>U\\  RWR^R^ rs   r  c                       e Zd Z fdZ	 	 	 	 	 ddej
                  deej                     deej                     deej
                     deej
                     dee   de	ej
                  eej                     f   fd	Z
d
 Z xZS )ConvBertLayerc                 b   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r*| j                  st        |  d      t	        |      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is added)rZ   r[   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr  intermediater  r   rp   s     rN   r[   zConvBertLayer.__init__  s    '-'E'E$*62 ++#)#=#= ##??4&(f ghh"3F";D08$V,rs   r   r   r   r   encoder_attention_maskr   rv   c                 >   | j                  ||||      }|d   }|dd  }	| j                  r?|=t        | d      st        d|  d      | j	                  |||||      }
|
d   }|	|
dd  z   }	t        | j                  | j                  | j                  |      }|f|	z   }	|	S )N)r   r   r   r+  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r'  r(  rz   AttributeErrorr+  r   feed_forward_chunkr%  r&  )rq   r   r   r   r   r-  r   self_attention_outputsr	  r   cross_attention_outputslayer_outputs               rN   r   zConvBertLayer.forward%  s     "&/	 "0 "
 2!4(,??4@4!12$=dV DD D  '+&9&9 &%!'#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+rs   c                 L    | j                  |      }| j                  ||      }|S r   )r,  r   )rq   r	  intermediate_outputr3  s       rN   r0  z ConvBertLayer.feed_forward_chunkM  s,    "//0@A{{#68HIrs   )NNNNF)r   r   r   r[   r4   r   r   r   r   r   r   r0  r   r   s   @rN   r#  r#    s    -" 7;158<9=,1&||& !!2!23& E--.	&
  (5& !) 6& $D>& 
u||Xe&7&788	9&Prs   r#  c                        e Zd Z fdZ	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej
                     deej
                     dee   dee   d	ee   d
e	e
ef   fdZ xZS )ConvBertEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rZ   r[   r<   r   
ModuleListr1   r2   r#  layergradient_checkpointing)rq   r<   _rr   s      rN   r[   zConvBertEncoder.__init__T  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#r   r   r   r   r-  r   output_hidden_statesreturn_dictrv   c	           
         |rdnd }	|rdnd }
|r| j                   j                  rdnd }t        | j                        D ]  \  }}|r|	|fz   }	|||   nd }| j                  r.| j
                  r"| j                  |j                  ||||||      }n |||||||      }|d   }|sf|
|d   fz   }
| j                   j                  s||d   fz   } |r|	|fz   }	|st        d ||	|
|fD              S t        ||	|
|      S )Nr   r   r   r#   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     rN   	<genexpr>z*ConvBertEncoder.forward.<locals>.<genexpr>  s      = s   )last_hidden_stater   
attentionscross_attentions)
r<   r)  	enumerater:  r;  training_gradient_checkpointing_func__call__tupler   )rq   r   r   r   r   r-  r   r=  r>  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                   rN   r   zConvBertEncoder.forwardZ  s`    #7BD$5b4%64;;;Z;Zr`d(4 	VOA|#$58H$H!.7.CilO**t}} $ A A ))!"#)*%! !-!"#)*%! *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(;	V>   1]4D D '):<OQef  
 2++*1	
 	
rs   )NNNNFFT)r   r   r   r[   r4   r   r   r   r   r   r   r   r   r   r   s   @rN   r7  r7  S  s    , 7;158<9=,1/4&*;
||;
 !!2!23;
 E--.	;

  (5;
 !) 6;
 $D>;
 'tn;
 d^;
 
u88	9;
rs   r7  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ConvBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rZ   r[   r   r   r   r!   r   r  r  r   transform_act_fnre   rf   rp   s     rN   r[   z(ConvBertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STrs   r   rv   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r!   rV  re   r  s     rN   r   z'ConvBertPredictionHeadTransform.forward  s4    

=1--m<}5rs   r   r   s   @rN   rT  rT    s$    UU\\ ell rs   rT  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a8
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:


            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.c                   |    e Zd Z fdZd Zd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 	 ddeej                      deej"                     d	eej                      d
eej                      deej"                     deej"                     dee   dee   dee   deeef   fd              Z xZS )ConvBertModelc                 "   t         |   |       t        |      | _        |j                  |j
                  k7  r/t        j                  |j                  |j
                        | _        t        |      | _
        || _        | j                          y r   )rZ   r[   rQ   r   r^   r   r   r   embeddings_projectr7  encoderr<   	post_initrp   s     rN   r[   zConvBertModel.__init__  sl     ,V4  F$6$66&(ii0E0EvGYGY&ZD#&v.rs   c                 .    | j                   j                  S r   r   r`   rq   s    rN   get_input_embeddingsz"ConvBertModel.get_input_embeddings  s    ...rs   c                 &    || j                   _        y r   r_  )rq   rM   s     rN   set_input_embeddingsz"ConvBertModel.set_input_embeddings  s    */'rs   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr\  r:  r'  r  )rq   heads_to_pruner:  r  s       rN   _prune_headszConvBertModel._prune_heads   sE    
 +002 	CLE5LLu%//;;EB	Crs   batch_size, sequence_length
checkpointoutput_typer   rt   r   rX   rV   r   ru   r   r=  r>  rv   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  |
|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j#                  || j                   j$                        }| j                  ||||      }t        | d      r| j'                  |      }| j)                  ||||||		      }|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer$   z5You have to specify either input_ids or inputs_embeds)ry   rX   rx   )rt   rV   rX   ru   r[  )r   r   r   r=  r>  )r<   r   r=  use_return_dictr   %warn_if_padding_and_no_attention_maskrn   ry   r4   onesrz   r   rX   rl   rm   ro   get_extended_attention_maskget_head_maskr2   r[  r\  )rq   rt   r   rX   rV   r   ru   r   r=  r>  r{   r   r|   ry   r}   r~   extended_attention_maskr   s                     rN   r   zConvBertModel.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZFCN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z"&"B"B>S^"_&&y$++2O2OP	l>iv ( 
 4-. 33MBM2/!5# % 
 rs   )	NNNNNNNNN)r   r   r   r[   ra  rc  rg  r   CONVBERT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r4   r   r   r   r   r   r   r   r   s   @rN   rY  rY    s-   

/0C ++D+K+KLi+jk&6$ 156:59371559,0/3&*<E,,-< !!2!23< !!1!12	<
 u//0< E--.<   1 12< $D>< 'tn< d^< 
u88	9< l<rs   rY  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                     t         |           t        d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _
        y )NgelurT   )rZ   r[   r   
activationr   re   r^   rf   r   r   r!   rp   s     rN   r[   z%ConvBertGeneratorPredictions.__init__P  sV    (0f&;&;AVAVWYYv1163H3HI
rs   generator_hidden_statesrv   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r!   r{  re   )rq   r|  r   s      rN   r   z$ConvBertGeneratorPredictions.forwardW  s3    

#:;6}5rs   )	r   r   r   r   r[   r4   r   r   r   r   s   @rN   rx  rx  M  s+    KJu/@/@ UEVEV rs   rx  z6ConvBERT Model with a `language modeling` head on top.c                       e Zd ZdgZ fdZd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 	 	 ddeej                      deej"                     d	eej                      d
eej                      deej"                     deej"                     deej                      dee   dee   dee   deeef   fd              Z xZS )ConvBertForMaskedLMzgenerator.lm_head.weightc                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  |j                        | _
        | j                          y r   )rZ   r[   rY  r   rx  generator_predictionsr   r   r^   r]   generator_lm_headr]  rp   s     rN   r[   zConvBertForMaskedLM.__init__c  sR     %f-%A&%I"!#6+@+@&BSBS!Trs   c                     | j                   S r   r  r`  s    rN   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddingsm  s    %%%rs   c                     || _         y r   r  )rq   r`   s     rN   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddingsp  s
    !0rs   rh  ri  rt   r   rX   rV   r   ru   labelsr   r=  r>  rv   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|Pt        j                         } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r$   r   losslogitsr   rE  )r<   rm  r   r  r  r   r	   r   r]   r   r   rE  )rq   rt   r   rX   rV   r   ru   r  r   r=  r>  r|  generator_sequence_outputprediction_scoresr  loss_fctr   s                    rN   r   zConvBertForMaskedLM.forwards  s   2 &1%<k$++B]B]"&-- 
#
 %<A$>! 667PQ 223DE**,H-222t{{7M7MNPVP[P[\^P_`D'),CAB,GGF)-)9TGf$EvE$1??.99	
 	
rs   
NNNNNNNNNN)r   r   r   _tied_weights_keysr[   r  r  r   rs  rt  r   ru  r   rv  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r  _  sG   45&1 ++D+K+KLi+jk&"$ 156:59371559-1,0/3&*4
E,,-4
 !!2!234
 !!1!12	4

 u//04
 E--.4
   1 124
 ))*4
 $D>4
 'tn4
 d^4
 
un$	%4
 l4
rs   r  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 h   t         |           t        j                  |j                  |j                        | _        |j                  |j                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        || _        y r   )rZ   r[   r   r   r   r!   classifier_dropoutrh   rg   ri   
num_labelsout_projr<   rq   r<   r  rr   s      rN   r[   z#ConvBertClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrs   r   rv   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )ri   r!   r   r<   r  r  )rq   r   r   r   s       rN   r   z"ConvBertClassificationHead.forward  se    !Q'"LLOJJqM4;;))*1-LLOMM!rs   r   r   s   @rN   r  r    s&    7	U\\  rs   r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   dee   deee	f   fd              Z xZS )!ConvBertForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        | j                          y r   )	rZ   r[   r  r<   rY  r   r  
classifierr]  rp   s     rN   r[   z*ConvBertForSequenceClassification.__init__  sH      ++%f-4V< 	rs   rh  ri  rt   r   rX   rV   r   ru   r  r   r=  r>  rv   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rX   rV   r   ru   r   r=  r>  r   r   
regressionsingle_label_classificationmulti_label_classificationr$   r  )r<   rm  r   r  problem_typer  rY   r4   ro   r   r
   squeezer	   r   r   r   r   rE  rq   rt   r   rX   rV   r   ru   r  r   r=  r>  r   sequence_outputr  r  r  r   s                    rN   r   z)ConvBertForSequenceClassification.forward  s   2 &1%<k$++B]B]--))%'/!5#   

 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rs   r  )r   r   r   r[   r   rs  rt  r   ru  r   rv  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    sB    ++D+K+KLi+jk&,$ 156:59371559-1,0/3&*D
E,,-D
 !!2!23D
 !!1!12	D

 u//0D
 E--.D
   1 12D
 ))*D
 $D>D
 'tnD
 d^D
 
u..	/D
 lD
rs   r  z
    ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   dee   deee	f   fd              Z xZS )ConvBertForMultipleChoicec                     t         |   |       t        |      | _        t	        |      | _        t        j                  |j                  d      | _	        | j                          y )Nr   )rZ   r[   rY  r   r   sequence_summaryr   r   r   r  r]  rp   s     rN   r[   z"ConvBertForMultipleChoice.__init__/  sM     %f- / 7))F$6$6: 	rs   z(batch_size, num_choices, sequence_lengthri  rt   r   rX   rV   r   ru   r  r   r=  r>  rv   c                 L   |
|
n| j                   j                  }
||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	|
	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r$   r   r  r   r  )r<   rm  rC   r   rn   r   r  r  r	   r   r   rE  )rq   rt   r   rX   rV   r   ru   r  r   r=  r>  num_choicesr   r  pooled_outputr  reshaped_logitsr  r  r   s                       rN   r   z!ConvBertForMultipleChoice.forward9  s   6 &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 --))%'/!5#   

 "!*--o>/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rs   r  )r   r   r   r[   r   rs  rt  r   ru  r   rv  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r  '  sE    +!(()ST  &-$ 156:59371559-1,0/3&*@
E,,-@
 !!2!23@
 !!1!12	@

 u//0@
 E--.@
   1 12@
 ))*@
 $D>@
 'tn@
 d^@
 
u//	0@
@
rs   r  z
    ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   dee   deee	f   fd              Z xZS )ConvBertForTokenClassificationc                 `   t         |   |       |j                  | _        t        |      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )rZ   r[   r  rY  r   r  rh   r   rg   ri   r   r   r  r]  r  s      rN   r[   z'ConvBertForTokenClassification.__init__  s      ++%f-)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rs   rh  ri  rt   r   rX   rV   r   ru   r  r   r=  r>  rv   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r$   r   r  )r<   rm  r   ri   r  r	   r   r  r   r   rE  r  s                    rN   r   z&ConvBertForTokenClassification.forward  s    . &1%<k$++B]B]--))%'/!5#   

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rs   r  )r   r   r   r[   r   rs  rt  r   ru  r   rv  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    s5    ++D+K+KLi+jk&)$ 156:59371559-1,0/3&*2
E,,-2
 !!2!232
 !!1!12	2

 u//02
 E--.2
   1 122
 ))*2
 $D>2
 'tn2
 d^2
 
u++	,2
 l2
rs   r  z
    ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deee	f   fd              Z xZS )ConvBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
rZ   r[   r  rY  r   r   r   r   
qa_outputsr]  rp   s     rN   r[   z%ConvBertForQuestionAnswering.__init__  sS      ++%f-))F$6$68I8IJ 	rs   rh  ri  rt   r   rX   rV   r   ru   start_positionsend_positionsr   r=  r>  rv   c                 (   ||n| j                   j                  }| j                  |||||||	|
|	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r$   r   )ignore_indexr#   )r  start_logits
end_logitsr   rE  )r<   rm  r   r  splitr  r   r  rn   clampr	   r   r   rE  )rq   rt   r   rX   rV   r   ru   r  r  r   r=  r>  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r   z$ConvBertForQuestionAnswering.forward  s   < &1%<k$++B]B]--))%'/!5#   

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rs   )NNNNNNNNNNN)r   r   r   r[   r   rs  rt  r   ru  r   rv  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    s[    ++D+K+KLi+jk&0$ 156:593715596:48,0/3&*H
E,,-H
 !!2!23H
 !!1!12	H

 u//0H
 E--.H
   1 12H
 "%"2"23H
   0 01H
 $D>H
 'tnH
 d^H
 
u22	3H
 lH
rs   r  )Er   r   r)   operatorr   typingr   r   r   r4   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_convbertr   
get_loggerr   r'   ru  rv  rO   ModulerQ   r   r   r   r   r   r  r  r  r#  r7  rT  CONVBERT_START_DOCSTRINGrs  rY  rx  r  r  r  r  r  r  r   rs   rN   <module>r     su     	  ) )    A A 1  ? l l u u 2 
		H	%/ "yx9 9x*o *8bii 4}BII }@ *		 *Z ,299 (RYY &:BII :zB
bii B
Jbii "	 2 j h]+ ]	]@299 $ RTlmM
1 M
 nM
` 0  U
(? U
U
p  S
 7 S
S
l  G
%< G
G
T  Y
#: Y
Y
rs   