
    sg3                     `   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z   ejB                  e"      Z#dZ$dZ% G d dejL                        Z' G d dejL                        Z( G d dejL                        Z)de(iZ* G d dejL                        Z+ G d dejL                        Z, G d dejL                        Z- G d dejL                        Z. G d  d!ejL                        Z/ G d" d#e      Z0d$Z1d%Z2 ed&e1       G d' d(e0             Z3 G d) d*ejL                        Z4 G d+ d,ejL                        Z5 ed-e1       G d. d/e0             Z6e G d0 d1e             Z7 ed2e1       G d3 d4e0             Z8y)5zPyTorch Splinter model.    N)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsModelOutputQuestionAnsweringModelOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )SplinterConfigztau/splinter-baser   c                        e Zd ZdZ fdZ	 	 	 	 	 d
deej                     deej                     deej                     deej                     dee	   de
fd	Z xZS )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       t+        |dd      | _        y )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr"   selfconfig	__class__s     a/var/www/html/venv/lib/python3.12/site-packages/transformers/models/splinter/modeling_splinter.pyr%   zSplinterEmbeddings.__init__+   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$    	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                  |      }||z   }	| j                  dk(  r| j                  |      }
|	|
z  }	| j                  |	      }	| j                  |	      }	|	S )Nr    r   dtypedevicer#   )sizer   r5   zeroslongrG   r*   r.   r"   r,   r/   r3   )r:   r?   r@   r   rA   rB   input_shape
seq_lengthr.   
embeddingsr,   s              r=   forwardzSplinterEmbeddings.forward<   s     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r>   )NNNNr   )__name__
__module____qualname____doc__r%   r   r5   
LongTensorFloatTensorintr   rN   __classcell__r<   s   @r=   r   r   (   s    Q^& 1559375901E,,- !!1!12 u//0	
   1 12 !) 
r>   r   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )SplinterSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r"   r#   relative_keyrelative_key_query   r   )r$   r%   r(   num_attention_headshasattr
ValueErrorrU   attention_head_sizeall_head_sizer   Linearquerykeyvaluer1   attention_probs_dropout_probr3   r8   r"   r+   r&   distance_embedding
is_decoderr:   r;   r"   r<   s      r=   r%   zSplinterSelfAttention.__init__`   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r>   xrC   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr    r   r_   r   r
   )rH   r`   rc   viewpermute)r:   rm   new_x_shapes      r=   transpose_for_scoresz*SplinterSelfAttention.transpose_for_scoresz   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r>   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   r_   dimr    r]   r^   rE   )rF   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) rf   rr   rg   rh   r5   catrk   matmul	transposer"   shapetensorrJ   rG   ro   r6   rj   r+   torF   einsummathsqrtrc   r   
functionalsoftmaxr3   rp   
contiguousrH   rd   )r:   rs   rt   ru   rv   rw   rx   ry   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r=   rN   zSplinterSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr>   NNNNNNF)rO   rP   rQ   r%   r5   Tensorrr   r   rT   r   boolrN   rV   rW   s   @r=   rY   rY   _   s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr>   rY   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r$   r%   r   re   r(   denser/   r0   r1   r2   r3   r9   s     r=   r%   zSplinterSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r>   rs   input_tensorrC   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r3   r/   r:   rs   r   s      r=   rN   zSplinterSelfOutput.forward   7    

=1]3}|'CDr>   rO   rP   rQ   r%   r5   r   rN   rV   rW   s   @r=   r   r      1    >U\\  RWR^R^ r>   r   eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )SplinterAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr"   )	r$   r%   SPLINTER_SELF_ATTENTION_CLASSES_attn_implementationr:   r   outputsetpruned_headsrl   s      r=   r%   zSplinterAttention.__init__   sC    3F4O4OP,C
	 )0Er>   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r{   )lenr   r:   r`   rc   r   r   rf   rg   rh   r   r   rd   union)r:   headsindexs      r=   prune_headszSplinterAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r>   rs   rt   ru   rv   rw   rx   ry   rC   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   )r:   r   )r:   rs   rt   ru   rv   rw   rx   ry   self_outputsattention_outputr   s              r=   rN   zSplinterAttention.forward  sW     yy!"
  ;;|AF#%QR(88r>   r   r   )rO   rP   rQ   r%   r   r5   r   r   rT   r   r   rN   rV   rW   s   @r=   r   r      s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r>   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )SplinterIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r$   r%   r   re   r(   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr9   s     r=   r%   zSplinterIntermediate.__init__/  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r>   rs   rC   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )r:   rs   s     r=   rN   zSplinterIntermediate.forward7  s&    

=100?r>   r   rW   s   @r=   r   r   .  s#    9U\\ ell r>   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )SplinterOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r$   r%   r   re   r   r(   r   r/   r0   r1   r2   r3   r9   s     r=   r%   zSplinterOutput.__init__?  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r>   rs   r   rC   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r=   rN   zSplinterOutput.forwardE  r   r>   r   rW   s   @r=   r   r   >  r   r>   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )SplinterLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedr#   r   )r$   r%   chunk_size_feed_forwardseq_len_dimr   	attentionrk   add_cross_attentionrb   crossattentionr   intermediater   r   r9   s     r=   r%   zSplinterLayer.__init__N  s    '-'E'E$*62 ++#)#=#= ##?? D6)g!hii"3FT^"_D08$V,r>   rs   rt   ru   rv   rw   rx   ry   rC   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr_   )ry   rx   r   r   r    r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r}   )	r   rk   ra   rb   r   r   feed_forward_chunkr   r   )r:   rs   rt   ru   rv   rw   rx   ry   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r=   rN   zSplinterLayer.forward\  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr>   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r:   r   intermediate_outputr   s       r=   r   z SplinterLayer.feed_forward_chunk  s,    "//0@A{{#68HIr>   r   )rO   rP   rQ   r%   r5   r   r   rT   r   r   rN   r   rV   rW   s   @r=   r   r   M  s    -" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br>   r   c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )SplinterEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r$   r%   r;   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r:   r;   _r<   s      r=   r%   zSplinterEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#rs   rt   ru   rv   rw   past_key_valuesr   ry   output_hidden_statesreturn_dictrC   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r    r   r_   c              3   $   K   | ]  }|| 
 y wr   r   ).0vs     r=   	<genexpr>z*SplinterEncoder.forward.<locals>.<genexpr>  s      
 = 
s   last_hidden_stater   rs   
attentionscross_attentions)r;   r   r   trainingloggerwarning_once	enumerater   _gradient_checkpointing_func__call__tupler   )r:   rs   rt   ru   rv   rw   r   r   ry   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskrx   layer_outputss                       r=   rN   zSplinterEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r>   )	NNNNNNFFT)rO   rP   rQ   r%   r5   r   r   rT   r   r   r   r   rN   rV   rW   s   @r=   r   r     s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r>   r   c                   "    e Zd ZdZeZdZdZd Zy)SplinterPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    splinterTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r   r   re   weightdatanormal_r;   initializer_rangebiaszero_r&   r   r/   fill_)r:   modules     r=   _init_weightsz%SplinterPreTrainedModel._init_weights  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r>   N)	rO   rP   rQ   rR   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r   r>   r=   r  r    s    
 "L"&*#*r>   r  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SplinterConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a/
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare Splinter Model transformer outputting raw hidden-states without any specific head on top.c            !           e Zd ZdZ fdZd Zd Zd Z ee	j                  d             eeee      	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej"                     d	eej"                     d
eej"                     deej"                     deej"                     deej"                     deej"                     deej"                     deeej&                        dee   dee   dee   dee   deeef   fd              Z xZS )SplinterModela*  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r$   r%   r;   r   rM   r   encoder	post_initr9   s     r=   r%   zSplinterModel.__init__e  s;     ,V4&v. 	r>   c                 .    | j                   j                  S r   rM   r*   )r:   s    r=   get_input_embeddingsz"SplinterModel.get_input_embeddingso  s    ...r>   c                 &    || j                   _        y r   r  )r:   rh   s     r=   set_input_embeddingsz"SplinterModel.set_input_embeddingsr  s    */'r>   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r:   heads_to_pruner   r   s       r=   _prune_headszSplinterModel._prune_headsu  sE    
 +002 	CLE5LLu%//;;EB	Cr>   batch_size, sequence_length
checkpointoutput_typer  r?   rt   r@   r   ru   rA   rv   rw   r   r   ry   r   r   rC   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  |||z   f|      }|&t        j                  |t        j                  |	      }| j                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j!                  |      }nd}| j#                  || j                   j$                        }| j'                  |||||
      }| j)                  ||||||	|
|||
      }|d   }|s	|f|dd z   S t+        ||j,                  |j.                  |j0                  |j2                        S )a  
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embedsr   r_   )rG   rE   )r?   r   r@   rA   rB   )	rt   ru   rv   rw   r   r   ry   r   r   r   r   )r;   ry   r   use_return_dictrk   r   rb   %warn_if_padding_and_no_attention_maskrH   rG   r   r5   onesrI   rJ   get_extended_attention_maskinvert_attention_maskget_head_maskr   rM   r  r   r   rs   r   r   )r:   r?   rt   r@   r   ru   rA   rv   rw   r   r   ry   r   r   rK   
batch_sizerL   rG   rB   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputs                               r=   rN   zSplinterModel.forward}  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,#%(;;;8-+;;)77&11,==
 	
r>   )NNNNNNNNNNNNN)rO   rP   rQ   rR   r%   r   r"  r&  r   SPLINTER_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r5   r   r   rT   r   r   r   rN   rV   rW   s   @r=   r  r  Z  s   
/0C ++D+K+KLi+jk&=$ -11515/3,0048<9==A$(,0/3&*w
ELL)w
 !.w
 !.	w

 u||,w
 ELL)w
  -w
  (5w
 !) 6w
 "$u'8'8"9:w
 D>w
 $D>w
 'tnw
 d^w
 
u??	@w
 lw
r>   r  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )SplinterFullyConnectedLayerc                     t         |           || _        || _        t	        j
                  | j                  | j                        | _        t        |   | _        t	        j                  | j                        | _	        y r   )
r$   r%   	input_dim
output_dimr   re   r   r   act_fnr/   )r:   rB  rC  r   r<   s       r=   r%   z$SplinterFullyConnectedLayer.__init__  sV    "$YYt~~t?
Z(doo6r>   inputsrC   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   rD  r/   )r:   rE  rs   s      r=   rN   z#SplinterFullyConnectedLayer.forward  s2    

6*M2}5r>   )gelur   rW   s   @r=   r@  r@    s#    7ell u|| r>   r@  c                   (     e Zd ZdZ fdZd Z xZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    t         |           t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                  d      | _        y )NF)r  )r$   r%   r@  r(   query_start_transformquery_end_transformstart_transformend_transformr   re   start_classifierend_classifierr9   s     r=   r%   z'QuestionAwareSpanSelectionHead.__init__  s    %@ASASU[UgUg%h"#>v?Q?QSYSeSe#f :6;M;MvOaOab89K9KVM_M_` "		&*<*<f>P>PW\ ] ii(:(:F<N<NUZ[r>   c                    |j                         \  }}}|j                  d      j                  dd|      }t        j                  |d|      }| j                  |      }| j                  |      }| j                  |      }	| j                  |      }
| j                  |      }|	j                  ddd      }	t        j                  ||	      }| j                  |      }|
j                  ddd      }
t        j                  ||
      }||fS )Nr    r   )r|   r   r   r_   )rH   	unsqueezerepeatr5   gatherrK  rL  rM  rN  rO  rp   r   rP  )r:   rE  	positionsr   r|   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrs   start_logits
end_logitss                 r=   rN   z&QuestionAwareSpanSelectionHead.forward   s    KKM	1c##B'..q!S9V%@55mD11-@))&1
%%f---.>?''1a0
||M:>++N;##Aq!,\\-:
Z''r>   )rO   rP   rQ   rR   r%   rN   rV   rW   s   @r=   rI  rI    s    
	\(r>   rI  z
    Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deej                     deee	f   fd              Z xZS )SplinterForQuestionAnsweringc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r$   r%   r  r	  rI  splinter_qassquestion_token_idr  r9   s     r=   r%   z%SplinterForQuestionAnswering.__init__=  C     %f-;FC!'!9!9 	r>   r'  r(  r?   rt   r@   r   ru   rA   start_positionsend_positionsry   r   r   question_positionsrC   c                    ||n| j                   j                  }d}||Dt        j                  t        j                  || j
                        j                         d      }nJt        j                  |j                  d      t        j                  |j                  |j                        }|j                  d      }d}| j                  |||||||	|
|	      }|d   }| j                  ||      \  }}|r"|j                  d	      |j                  d	      }}|d|d	|z
  t        j                   |j"                        j$                  z  z   }|d	|z
  t        j                   |j"                        j$                  z  z   }d}||t'        |j                               d	kD  r|j                  d      }t'        |j                               d	kD  r|j                  d      }|j                  d	      }|j)                  d|       |j)                  d|       t+        |
      } |||      } |||      }||z   dz  }|s||f|d	d z   }||f|z   S |S t-        ||||j.                  |j0                        S )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr    r{   r   )rF   layoutrG   Trt   r@   r   ru   rA   ry   r   r   r   ignore_indexr_   lossr[  r\  rs   r   )r;   r,  r5   argmaxeqrb  rU   rI   rH   rJ   rh  rG   rR  r	  ra  squeezefinforF   minr   clamp_r	   r   rs   r   )r:   r?   rt   r@   r   ru   rA   rd  re  ry   r   r   rf  question_positions_were_none"question_position_for_each_exampler   r:  r[  r\  
total_lossignored_indexloss_fct
start_lossend_lossr   s                            r=   rN   z$SplinterForQuestionAnswering.forwardG  s   H &1%<k$++B]B]',$%$5:\\XXi)?)?@EEGR62 6;[[!&&q)MDXDXanauau62 "D!M!Mb!Q+/(--))%'/!5#   

 "!*#'#5#5oGY#Z j''3';';A'>
@R@RST@U*L%'1~+=\M_M_A`AdAd*ddL#q>'9U[[IYIY=Z=^=^&^^J
&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r>   NNNNNNNNNNNN)rO   rP   rQ   r%   r   r;  r<  r   r=  r   r>  r   r5   r   rS   r   r   r   rN   rV   rW   s   @r=   r^  r^  5  sh    ++D+K+KLi+jk&0$ -11515/3,0046:48,0/3&*9=^
ELL)^
 !.^
 !.	^

 u||,^
 ELL)^
  -^
 "%"2"23^
   0 01^
 $D>^
 'tn^
 d^^
 %U%5%56^
 
u22	3^
 l^
r>   r^  c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
ej                  ed<   dZeeej                        ed<   dZeeej                        ed<   y)SplinterForPreTrainingOutputa  
    Class for outputs of Splinter as a span selection model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nrm  r[  r\  rs   r   )rO   rP   rQ   rR   rm  r   r5   rT   __annotations__r[  r\  rs   r   r   r   r>   r=   r}  r}    sr    . )-D(5$$
%,&*L%##*$(J!!(8<M8E%"3"345<59Ju00129r>   r}  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       e Zd Z fdZ eej                  d            	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   dee   deej                     deeef   fd       Zdej                  dej                  fdZ xZS )SplinterForPreTrainingc                     t         |   |       t        |      | _        t	        |      | _        |j                  | _        | j                          y r   r`  r9   s     r=   r%   zSplinterForPreTraining.__init__  rc  r>   z*batch_size, num_questions, sequence_lengthr?   rt   r@   r   ru   rA   rd  re  ry   r   r   rf  rC   c                 b   ||n| j                   j                  }|||t        d      ||t        d      || j                  |      }| j	                  |||||||	|
|	      }|d   }|j                         \  }}}| j                  ||      \  }}|j                  d      }||j                  d      j                  |||      }|d|z
  t        j                  |j                        j                  z  z   }|d|z
  t        j                  |j                        j                  z  z   }d}|||j                  dt        d|dz
               |j                  dt        d|dz
               t        | j                   j                         } ||j#                  ||z  |      |j#                  ||z              } ||j#                  ||z  |      |j#                  ||z              }||z   dz  }|s||f|dd z   }||f|z   S |S t%        ||||j&                  |j(                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedri  r   r   rj  r_   rl  )r;   r,  	TypeError_prepare_question_positionsr	  rH   ra  rR  r7   r5   rq  rF   rr  rs  maxr	   r)   ro   r}  rs   r   )r:   r?   rt   r@   r   ru   rA   rd  re  ry   r   r   rf  r   r:  r2  sequence_lengthr|   r[  r\  num_questions attention_mask_for_each_questionrv  rx  ry  rz  r   s                              r=   rN   zSplinterForPreTraining.forward  s   B &1%<k$++B]B]%/*E-Jcabb'I,=\]]'!%!A!A)!L--))%'/!5#   

 "!*+:+?+?+A(
OS#'#5#5oGY#Z j*//2%/=/G/G/J/Q/QM?0, (1/O+OSXS^S^_k_q_qSrSvSv*vvL#q+K'Ku{{[e[k[kOlOpOp&ppJ
&=+D""1c!_q-@&AB  C?Q+>$?@ (T[[5M5MNH!!!*}"<oN$$Z-%?@J  
] :OL"":#=>H %x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r>   c                    t        j                  || j                  j                  k(        \  }}t        j                  |      }t        j
                  |j                  d      |j                         f| j                  j                  t         j                  |j                        }t        j                  |D cg c]  }t        j                  |       c}      }||||f<   |S c c}w )Nr   rE   )r5   wherer;   rb  bincountfullrH   r  r)   rJ   rG   r~   r6   )r:   r?   rowsflat_positionsr  rU  ncolss           r=   r  z2SplinterForPreTraining._prepare_question_positionsH  s    ${{98U8U+UVnt,JJ^^A 1 1 34KK$$**##	
	 yy=Aa%,,q/AB .	$* Bs   <C(r{  )rO   rP   rQ   r%   r   r;  r<  r   r5   r   rS   r   r   r   r}  rN   r  rV   rW   s   @r=   r  r    so    +!(()UV
 -11515/3,0046:48,0/3&*9=b
ELL)b
 !.b
 !.	b

 u||,b
 ELL)b
  -b
 "%"2"23b
   0 01b
 $D>b
 'tnb
 d^b
 %U%5%56b
 
u22	3b
b
HU\\ ell r>   r  )9rR   r   dataclassesr   typingr   r   r   r   r5   torch.utils.checkpointr   torch.nnr	   activationsr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_splinterr   
get_loggerrO   r   r=  r>  Moduler   rY   r   r   r   r   r   r   r   r  SPLINTER_START_DOCSTRINGr;  r  r@  rI  r^  r}  r  r   r>   r=   <module>r     s     ! / /    % ! t t - l l u u 2 
		H	%) "3 3nCBII CN  "# 0		 0h299  RYY SBII SnZ
bii Z
z*o *8	 / d h\
+ \
	\
~")) $#(RYY #(L  o
#: o
o
d :; : :> 
 }4 }}r>   