
    sgo                     B   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ dd	lmZ  e       rdd
lmZ ddlmZ ddlmZmZmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddlm%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+  e(jX                  e-      Z.dZ/dZ0d Z1d Z2d Z3 G d dejh                        Z5 G d dejh                        Z6 G d dejh                        Z7 G d dejh                        Z8 G d d ejh                        Z9 G d! d"ejh                        Z: G d# d$ejh                        Z; G d% d&ejh                        Z< G d' d(ejh                        Z= G d) d*ejh                        Z> G d+ d,ejh                        Z? G d- d.ejh                        Z@ G d/ d0ejh                        ZA G d1 d2ejh                        ZB G d3 d4e"      ZCe G d5 d6e             ZDd7ZEd8ZF e&d9eE       G d: d;eC             ZG e&d<eE       G d= d>eC             ZH e&d?eE       G d@ dAeC             ZI e&dBeE       G dC dDeC             ZJ e&dEeE       G dF dGeC             ZK e&dHeE       G dI dJeC             ZL e&dKeE       G dL dMeC             ZM e&dNeE       G dO dPeC             ZNy)QzPyTorch FNet model.    N)	dataclass)partial)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )is_scipy_available)linalg)ACT2FN)	BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputModelOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
FNetConfigzgoogle/fnet-baser!   c                     | j                   d   }|d|d|f   }| j                  t        j                        } t        j                  d| ||      S )z4Applies 2D matrix multiplication to 3D input arrays.r    Nzbij,jk,ni->bnk)shapetypetorch	complex64einsum)xmatrix_dim_onematrix_dim_two
seq_lengths       Y/var/www/html/venv/lib/python3.12/site-packages/transformers/models/fnet/modeling_fnet.py_two_dim_matmulr-   @   sN    J#KZK*$<=N	uA<<(!^^LL    c                     t        | ||      S N)r-   )r(   r)   r*   s      r,   two_dim_matmulr1   I   s    1nn==r.   c                     | }t        t        | j                        dd       D ]#  }t        j                  j	                  ||      }% |S )z
    Applies n-dimensional Fast Fourier Transform (FFT) to input array.

    Args:
        x: Input n-dimensional array.

    Returns:
        n-dimensional Fourier transform of input n-dimensional array.
    r    N)axis)reversedrangendimr%   fft)r(   outr3   s      r,   fftnr9   N   sG     Cqvvqr*+ ,iimmCdm+,Jr.   c                   *     e Zd ZdZ fdZddZ xZS )FNetEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 x   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j
                        | _        t        j                   |j"                        | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  dt)        j.                  | j0                  j3                         t(        j4                        d       y )	N)padding_idxepsposition_ids)r    F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsLinear
projectionDropouthidden_dropout_probdropoutregister_bufferr%   arangeexpandzerosr@   sizelongselfconfig	__class__s     r,   rG   zFNetEmbeddings.__init__a   s<   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>ST))F$6$68J8JKzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r.   c                 X   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
| j                  |
      }
|
S )NrA   r    rC   r   rE   device)r\   r@   hasattrrC   rZ   r%   r[   r]   rd   rL   rP   rN   rQ   rT   rW   )r_   	input_idsrC   r@   inputs_embedsinput_shaper+   buffered_token_type_ids buffered_token_type_ids_expandedrP   
embeddingsrN   s               r,   forwardzFNetEmbeddings.forwardw   s=    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D))
^^J/
__Z0
\\*-
r.   )NNNN)__name__
__module____qualname____doc__rG   rl   __classcell__ra   s   @r,   r;   r;   ^   s    Q
,!r.   r;   c                   *     e Zd Z fdZd Zd Z xZS )FNetBasicFourierTransformc                 D    t         |           | j                  |       y r0   )rF   rG   _init_fourier_transformr^   s     r,   rG   z"FNetBasicFourierTransform.__init__   s    $$V,r.   c                    |j                   s+t        t        j                  j                  d      | _        y |j                  dk  rt               r| j                  dt        j                  t        j                  |j                        t        j                               | j                  dt        j                  t        j                  |j                        t        j                               t        t        | j                   | j"                        | _        y t%        j&                  d       t        | _        y t        | _        y )	N)r       dim   dft_mat_hiddenrD   dft_mat_seq)r)   r*   zpSciPy is needed for DFT matrix calculation and is not found. Using TPU optimized fast fourier transform instead.)use_tpu_fourier_optimizationsr   r%   r7   r9   fourier_transformrM   r   rX   tensorr   dftrJ   r&   tpu_short_seq_lengthr1   r}   r|   r   warning)r_   r`   s     r,   rv   z1FNetBasicFourierTransform._init_fourier_transform   s    33%,UYY^^%HD"++t3!#$$$ell6::f>P>P3QY^YhYh&i $$!5<<

6;V;V0W_d_n_n#o *1"43C3CTXTgTg*& * *.&%)D"r.   c                 >    | j                  |      j                  }|fS r0   )r   real)r_   hidden_statesoutputss      r,   rl   z!FNetBasicFourierTransform.forward   s"     ((7<<zr.   )rm   rn   ro   rG   rv   rl   rq   rr   s   @r,   rt   rt      s    -*.r.   rt   c                   $     e Zd Z fdZd Z xZS )FNetBasicOutputc                     t         |           t        j                  |j                  |j
                        | _        y Nr>   )rF   rG   r   rQ   rJ   rR   r^   s     r,   rG   zFNetBasicOutput.__init__   s,    f&8&8f>S>STr.   c                 .    | j                  ||z         }|S r0   )rQ   r_   r   input_tensors      r,   rl   zFNetBasicOutput.forward   s    |m'CDr.   rm   rn   ro   rG   rl   rq   rr   s   @r,   r   r      s    Ur.   r   c                   $     e Zd Z fdZd Z xZS )FNetFourierTransformc                 b    t         |           t        |      | _        t	        |      | _        y r0   )rF   rG   rt   r_   r   outputr^   s     r,   rG   zFNetFourierTransform.__init__   s&    -f5	%f-r.   c                 X    | j                  |      }| j                  |d   |      }|f}|S Nr   )r_   r   )r_   r   self_outputsfourier_outputr   s        r,   rl   zFNetFourierTransform.forward   s1    yy/\!_mD!#r.   r   rr   s   @r,   r   r      s    .
r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )FNetIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r0   )rF   rG   r   rS   rJ   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr^   s     r,   rG   zFNetIntermediate.__init__   s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r.   r   returnc                 J    | j                  |      }| j                  |      }|S r0   )r   r   r_   r   s     r,   rl   zFNetIntermediate.forward   s&    

=100?r.   rm   rn   ro   rG   r%   Tensorrl   rq   rr   s   @r,   r   r      s#    9U\\ ell r.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
FNetOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rF   rG   r   rS   r   rJ   r   rQ   rR   rU   rV   rW   r^   s     r,   rG   zFNetOutput.__init__   s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r.   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r0   )r   rW   rQ   r   s      r,   rl   zFNetOutput.forward   s7    

=1]3}|'CDr.   r   rr   s   @r,   r   r      s1    >U\\  RWR^R^ r.   r   c                   *     e Zd Z fdZd Zd Z xZS )	FNetLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y Nr    )
rF   rG   chunk_size_feed_forwardseq_len_dimr   fourierr   intermediater   r   r^   s     r,   rG   zFNetLayer.__init__   sI    '-'E'E$+F3,V4 (r.   c                     | j                  |      }|d   }t        | j                  | j                  | j                  |      }|f}|S r   )r   r   feed_forward_chunkr   r   )r_   r   self_fourier_outputsr   layer_outputr   s         r,   rl   zFNetLayer.forward   sO    #||M:-a00##T%A%A4CSCSUc
  /r.   c                 L    | j                  |      }| j                  ||      }|S r0   )r   r   )r_   r   intermediate_outputr   s       r,   r   zFNetLayer.feed_forward_chunk  s*    "//?{{#6Gr.   )rm   rn   ro   rG   rl   r   rq   rr   s   @r,   r   r      s    )
r.   r   c                   &     e Zd Z fdZddZ xZS )FNetEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rF   rG   r`   r   
ModuleListr5   num_hidden_layersr   layergradient_checkpointing)r_   r`   _ra   s      r,   rG   zFNetEncoder.__init__  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c                 2   |rdnd }t        | j                        D ]O  \  }}|r||fz   }| j                  r)| j                  r| j	                  |j
                  |      }n ||      }|d   }Q |r||fz   }|st        d ||fD              S t        ||      S )N r   c              3   &   K   | ]	  }||  y wr0   r   ).0vs     r,   	<genexpr>z&FNetEncoder.forward.<locals>.<genexpr>+  s     Xq!-Xs   )last_hidden_stater   )	enumerater   r   training_gradient_checkpointing_func__call__tupler   )r_   r   output_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss           r,   rl   zFNetEncoder.forward  s    "6BD(4 		-OA|#$58H$H!**t}} $ A A,BWBWYf g ,] ;)!,M		-   1]4D DX]4E$FXXXN_``r.   )FTr   rr   s   @r,   r   r     s    ,ar.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
FNetPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r0   )rF   rG   r   rS   rJ   r   Tanh
activationr^   s     r,   rG   zFNetPooler.__init__2  s9    YYv1163E3EF
'')r.   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S r   )r   r   )r_   r   first_token_tensorpooled_outputs       r,   rl   zFNetPooler.forward7  s6     +1a40

#566r.   r   rr   s   @r,   r   r   1  s#    $
U\\ ell r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )FNetPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rF   rG   r   rS   rJ   r   r   r   r   r   transform_act_fnrQ   rR   r^   s     r,   rG   z$FNetPredictionHeadTransform.__init__B  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr.   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r0   )r   r   rQ   r   s     r,   rl   z#FNetPredictionHeadTransform.forwardK  s4    

=1--m<}5r.   r   rr   s   @r,   r   r   A  s$    UU\\ ell r.   r   c                   ,     e Zd Z fdZd ZddZ xZS )FNetLMPredictionHeadc                 D   t         |           t        |      | _        t	        j
                  |j                  |j                        | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y r0   )rF   rG   r   	transformr   rS   rJ   rI   decoder	Parameterr%   r[   biasr^   s     r,   rG   zFNetLMPredictionHead.__init__S  si    4V< yy!3!3V5F5FGLLV->->!?@	 IIr.   c                 J    | j                  |      }| j                  |      }|S r0   )r   r   r   s     r,   rl   zFNetLMPredictionHead.forward^  s$    }5]3r.   c                     | j                   j                  j                  j                  dk(  r| j                  | j                   _        y | j                   j                  | _        y )Nmeta)r   r   rd   r$   r_   s    r,   _tie_weightsz!FNetLMPredictionHead._tie_weightsc  sC    <<##((F2 $		DLL ))DIr.   )r   N)rm   rn   ro   rG   rl   r   rq   rr   s   @r,   r   r   R  s    	&
*r.   r   c                   $     e Zd Z fdZd Z xZS )FNetOnlyMLMHeadc                 B    t         |           t        |      | _        y r0   )rF   rG   r   predictionsr^   s     r,   rG   zFNetOnlyMLMHead.__init__m  s    /7r.   c                 (    | j                  |      }|S r0   )r   )r_   sequence_outputprediction_scoress      r,   rl   zFNetOnlyMLMHead.forwardq  s     ,,_=  r.   r   rr   s   @r,   r   r   l  s    8!r.   r   c                   $     e Zd Z fdZd Z xZS )FNetOnlyNSPHeadc                 l    t         |           t        j                  |j                  d      | _        y Nrx   )rF   rG   r   rS   rJ   seq_relationshipr^   s     r,   rG   zFNetOnlyNSPHead.__init__x  s'     "		&*<*<a @r.   c                 (    | j                  |      }|S r0   )r   )r_   r   seq_relationship_scores      r,   rl   zFNetOnlyNSPHead.forward|  s    !%!6!6}!E%%r.   r   rr   s   @r,   r   r   w  s    A&r.   r   c                   $     e Zd Z fdZd Z xZS )FNetPreTrainingHeadsc                     t         |           t        |      | _        t	        j
                  |j                  d      | _        y r   )rF   rG   r   r   r   rS   rJ   r   r^   s     r,   rG   zFNetPreTrainingHeads.__init__  s4    /7 "		&*<*<a @r.   c                 N    | j                  |      }| j                  |      }||fS r0   )r   r   )r_   r   r   r   r   s        r,   rl   zFNetPreTrainingHeads.forward  s0     ,,_=!%!6!6}!E "888r.   r   rr   s   @r,   r   r     s    A
9r.   r   c                   "    e Zd ZdZeZdZdZd Zy)FNetPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    fnetTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r   r   rS   weightdatanormal_r`   initializer_ranger   zero_rH   r=   rQ   fill_)r_   modules     r,   _init_weightsz!FNetPreTrainedModel._init_weights  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r.   N)	rm   rn   ro   rp   r!   config_classbase_model_prefixsupports_gradient_checkpointingr
  r   r.   r,   r   r     s    
 L&*#*r.   r   c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
ej                  ed<   dZeeej                        ed<   y)FNetForPreTrainingOutputa  
    Output type of [`FNetForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    Nlossprediction_logitsseq_relationship_logitsr   )rm   rn   ro   rp   r  r   r%   FloatTensor__annotations__r  r  r   r   r   r.   r,   r  r    sZ    $ )-D(5$$
%,+/u((/15U..58<M8E%"3"345<r.   r  aG  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`FNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare FNet Model transformer outputting raw hidden-states without any specific head on top.c                   0    e Zd ZdZd fd	Zd Zd Z eej                  d             e
eee      	 	 	 	 	 	 ddeej                      deej                      d	eej                      d
eej"                     dee   dee   deeef   fd              Z xZS )	FNetModelz

    The model can behave as an encoder, following the architecture described in [FNet: Mixing Tokens with Fourier
    Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.

    c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd | _        | j                          y r0   )
rF   rG   r`   r;   rk   r   encoderr   pooler	post_init)r_   r`   add_pooling_layerra   s      r,   rG   zFNetModel.__init__  sK     (0"6*,=j(4 	r.   c                 .    | j                   j                  S r0   rk   rL   r   s    r,   get_input_embeddingszFNetModel.get_input_embeddings
  s    ...r.   c                 &    || j                   _        y r0   r  )r_   values     r,   set_input_embeddingszFNetModel.set_input_embeddings  s    */'r.   batch_size, sequence_length
checkpointoutput_typer  rf   rC   r@   rg   r   r   r   c                 |   ||n| j                   j                  }||n| j                   j                  }||t        d      ||j	                         }|\  }}	n&||j	                         d d }|\  }}	nt        d      | j                   j
                  r)|	dk  r$| j                   j                  |	k7  rt        d      ||j                  n|j                  }
|pt        | j                  d      r4| j                  j                  d d d |	f   }|j                  ||	      }|}n&t        j                  |t        j                  |
      }| j                  ||||      }| j                  |||	      }|d
   }| j                   | j!                  |      nd }|s
||f|dd  z   S t#        |||j$                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerA   z5You have to specify either input_ids or inputs_embedsr{   zThe `tpu_short_seq_length` in FNetConfig should be set equal to the sequence length being passed to the model when using TPU optimizations.rC   rc   )rf   r@   rC   rg   )r   r   r   r    )r   pooler_outputr   )r`   r   use_return_dict
ValueErrorr\   r~   r   rd   re   rk   rC   rZ   r%   r[   r]   r  r  r   r   )r_   rf   rC   r@   rg   r   r   rh   
batch_sizer+   rd   ri   rj   embedding_outputencoder_outputsr   r'  s                    r,   rl   zFNetModel.forward  s     %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"#..*K%0"J
&',,.s3K%0"J
TUU KK55d"00J>; 
 &/%:!!@T@T!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'	 + 
 ,,!5# ' 

 *!,8<8OO4UY#]3oab6III)-')77
 	
r.   )T)NNNNNN)rm   rn   ro   rp   rG   r  r!  r   FNET_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r%   
LongTensorr  boolr   r   rl   rq   rr   s   @r,   r  r    s    

/0 ++@+G+GHe+fg&#$ 15593759/3&*C
E,,-C
 !!1!12C
 u//0	C

   1 12C
 'tnC
 d^C
 
uo%	&C
 hC
r.   r  z
    FNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                   p    e Zd ZddgZ fdZd Zd Z eej                  d             e
ee      	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     dee   dee   deeef   fd              Z xZS )FNetForPreTrainingcls.predictions.decoder.biascls.predictions.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r0   )rF   rG   r  r   r   clsr  r^   s     r,   rG   zFNetForPreTraining.__init__f  s4     f%	'/ 	r.   c                 B    | j                   j                  j                  S r0   r8  r   r   r   s    r,   get_output_embeddingsz(FNetForPreTraining.get_output_embeddingso      xx##+++r.   c                     || j                   j                  _        |j                  | j                   j                  _        y r0   r8  r   r   r   r_   new_embeddingss     r,   set_output_embeddingsz(FNetForPreTraining.set_output_embeddingsr  ,    '5$$2$7$7!r.   r"  r%  r  rf   rC   r@   rg   labelsnext_sentence_labelr   r   r   c	                    ||n| j                   j                  }| j                  ||||||      }	|	dd \  }
}| j                  |
|      \  }}d}|u|st	               } ||j                  d| j                   j                        |j                  d            } ||j                  dd      |j                  d            }||z   }|s||f|	dd z   }||f|z   S |S t        ||||	j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, FNetForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
        >>> model = FNetForPreTraining.from_pretrained("google/fnet-base")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```NrC   r@   rg   r   r   rx   rA   )r  r  r  r   )	r`   r(  r   r8  r
   viewrI   r  r   )r_   rf   rC   r@   rg   rD  rE  r   r   r   r   r   r   r   
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   s                      r,   rl   zFNetForPreTraining.forwardv  s5   T &1%<k$++B]B])))%'!5#  
 *1!&48HH_m4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J')?@712;NF/9/EZMF*Q6Q'/$:!//	
 	
r.   NNNNNNNN)rm   rn   ro   _tied_weights_keysrG   r;  rA  r   r-  r.  r   r  r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   r4  r4  \  s    9:Z[,8 ++@+G+GHe+fg+CRab -115/304)-6:/3&*F
ELL)F
 !.F
 u||,	F

  -F
 &F
 &ell3F
 'tnF
 d^F
 
u..	/F
 c hF
r.   r4  z2FNet Model with a `language modeling` head on top.c                   R    e Zd ZddgZ fdZd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 ddeej                      d	eej                      d
eej                      deej                      deej                      dee   dee   deeef   fd              Z xZS )FNetForMaskedLMr5  r6  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r0   )rF   rG   r  r   r   r8  r  r^   s     r,   rG   zFNetForMaskedLM.__init__  4     f%	"6* 	r.   c                 B    | j                   j                  j                  S r0   r:  r   s    r,   r;  z%FNetForMaskedLM.get_output_embeddings  r<  r.   c                     || j                   j                  _        |j                  | j                   j                  _        y r0   r>  r?  s     r,   rA  z%FNetForMaskedLM.set_output_embeddings  rB  r.   r"  r#  rf   rC   r@   rg   rD  r   r   r   c                 ~   ||n| j                   j                  }| j                  ||||||      }|d   }	| j                  |	      }
d}|Ft	               } ||
j                  d| j                   j                        |j                  d            }|s|
f|dd z   }||f|z   S |S t        ||
|j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        NrG  r   rA   rx   r  logitsr   )	r`   r(  r   r8  r
   rH  rI   r   r   )r_   rf   rC   r@   rg   rD  r   r   r   r   r   rK  rJ  r   s                 r,   rl   zFNetForMaskedLM.forward  s    , &1%<k$++B]B])))%'!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY>:K[b[p[pqqr.   NNNNNNN)rm   rn   ro   rN  rG   r;  rA  r   r-  r.  r   r/  r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   rP  rP    s	   8:Z[,8 ++@+G+GHe+fg&"$ -115/304)-/3&*'rELL)'r !.'r u||,	'r
  -'r &'r 'tn'r d^'r 
un$	%'r h'rr.   rP  zJFNet Model with a `next sentence prediction (classification)` head on top.c                   <    e Zd Z fdZ eej                  d             eee	      	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
ej                     de
ej                     d	e
e   d
e
e   deeef   fd              Z xZS )FNetForNextSentencePredictionc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r0   )rF   rG   r  r   r   r8  r  r^   s     r,   rG   z&FNetForNextSentencePrediction.__init__
  rR  r.   r"  rC  rf   rC   r@   rg   rD  r   r   r   c                    d|v r+t        j                  dt               |j                  d      }||n| j                  j
                  }| j                  ||||||      }	|	d   }
| j                  |
      }d}|2t               } ||j                  dd      |j                  d            }|s|f|	dd z   }||f|z   S |S t        |||	j                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, FNetForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
        >>> model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base")
        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```rE  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.NrG  r    rA   rx   rV  )warningswarnFutureWarningpopr`   r(  r   r8  r
   rH  r   r   )r_   rf   rC   r@   rg   rD  r   r   kwargsr   r   seq_relationship_scoresrL  rJ  r   s                  r,   rl   z%FNetForNextSentencePrediction.forward  s   N !F*MM%
 ZZ 56F%0%<k$++B]B])))%'!5#  
  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_-/'!"+=F7I7U')F2a[aa*#*!//
 	
r.   rX  )rm   rn   ro   rG   r   r-  r.  r   r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   rZ  rZ    s    
 ++@+G+GHe+fg+FUde -115/304)-/3&*I
ELL)I
 !.I
 u||,	I

  -I
 &I
 'tnI
 d^I
 
u11	2I
 f hI
r.   rZ  z
    FNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   >    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	ee   d
ee   deee	f   fd              Z xZS )FNetForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r0   rF   rG   
num_labelsr  r   r   rU   rV   rW   rS   rJ   
classifierr  r^   s     r,   rG   z&FNetForSequenceClassification.__init__i  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r.   r"  r#  rf   rC   r@   rg   rD  r   r   r   c                 $   ||n| j                   j                  }| j                  ||||||      }|d   }	| j                  |	      }	| j	                  |	      }
d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                   j
                  dk(  r=t               } ||
j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } ||
|      }|s|
f|dd z   }||f|z   S |S t!        ||
|j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrG  r    
regressionsingle_label_classificationmulti_label_classificationrA   rx   rV  )r`   r(  r   rW   rh  problem_typerg  rE   r%   r]   intr   squeezer
   rH  r	   r   r   )r_   rf   rC   r@   rg   rD  r   r   r   r   rW  r  rJ  r   s                 r,   rl   z%FNetForSequenceClassification.forwardt  s   , &1%<k$++B]B])))%'!5#  
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'T&PWPePeffr.   rX  )rm   rn   ro   rG   r   r-  r.  r   r/  r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   rd  rd  a  s    	 ++@+G+GHe+fg&,$ -115/304)-/3&*9gELL)9g !.9g u||,	9g
  -9g &9g 'tn9g d^9g 
u..	/9g h9gr.   rd  z
    FNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                   >    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	ee   d
ee   deee	f   fd              Z xZS )FNetForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y r   )rF   rG   r  r   r   rU   rV   rW   rS   rJ   rh  r  r^   s     r,   rG   zFNetForMultipleChoice.__init__  sV     f%	zz&"<"<=))F$6$6: 	r.   z(batch_size, num_choices, sequence_lengthr#  rf   rC   r@   rg   rD  r   r   r   c                    ||n| j                   j                  }||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||      }	|	d   }
| j                  |
      }
| j                  |
      }|j                  d|      }d}|t               } |||      }|s|f|	dd z   }||f|z   S |S t        |||	j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr    rA   rG  rx   rV  )r`   r(  r#   rH  r\   r   rW   rh  r
   r   r   )r_   rf   rC   r@   rg   rD  r   r   num_choicesr   r   rW  reshaped_logitsr  rJ  r   s                   r,   rl   zFNetForMultipleChoice.forward  s   , &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 )))%'!5#  
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE(d?ZaZoZoppr.   rX  )rm   rn   ro   rG   r   r-  r.  r   r/  r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   rq  rq    s     ++@+G+GHr+st&-$ -115/304)-/3&*4qELL)4q !.4q u||,	4q
  -4q &4q 'tn4q d^4q 
u//	04q u4qr.   rq  z
    FNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   >    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	ee   d
ee   deee	f   fd              Z xZS )FNetForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r0   rf  r^   s     r,   rG   z#FNetForTokenClassification.__init__  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r.   r"  r#  rf   rC   r@   rg   rD  r   r   r   c                    ||n| j                   j                  }| j                  ||||||      }|d   }	| j                  |	      }	| j	                  |	      }
d}|<t               } ||
j                  d| j                        |j                  d            }|s|
f|dd z   }||f|z   S |S t        ||
|j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrG  r   rA   rx   rV  )
r`   r(  r   rW   rh  r
   rH  rg  r   r   )r_   rf   rC   r@   rg   rD  r   r   r   r   rW  r  rJ  r   s                 r,   rl   z"FNetForTokenClassification.forward  s    ( &1%<k$++B]B])))%'!5#  
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$$vWMbMbccr.   rX  )rm   rn   ro   rG   r   r-  r.  r   r/  r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   rx  rx    s    
 ++@+G+GHe+fg&)$ -115/304)-/3&*(dELL)(d !.(d u||,	(d
  -(d &(d 'tn(d d^(d 
u++	,(d h(dr.   rx  z
    FNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   ^    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   deee	f   fd              Z xZS )FNetForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r0   )
rF   rG   rg  r  r   r   rS   rJ   
qa_outputsr  r^   s     r,   rG   z!FNetForQuestionAnswering.__init__R  sS      ++f%	))F$6$68I8IJ 	r.   r"  r#  rf   rC   r@   rg   start_positionsend_positionsr   r   r   c	                    ||n| j                   j                  }| j                  ||||||      }	|	d   }
| j                  |
      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|	dd z   }||f|z   S |S t        ||||	j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        NrG  r   r    rA   ry   )ignore_indexrx   )r  start_logits
end_logitsr   )r`   r(  r   r~  splitro  
contiguouslenr\   clampr
   r   r   )r_   rf   rC   r@   rg   r  r  r   r   r   r   rW  r  r  rI  ignored_indexrJ  
start_lossend_lossr   s                       r,   rl   z FNetForQuestionAnswering.forward]  s   6 &1%<k$++B]B])))%'!5#  
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+,:]d]r]r
 	
r.   rM  )rm   rn   ro   rG   r   r-  r.  r   r/  r   r0  r   r%   r   r2  r   r   rl   rq   rr   s   @r,   r|  r|  J  s    	 ++@+G+GHe+fg&0$ -115/3042604/3&*>
ELL)>
 !.>
 u||,	>

  ->
 "%,,/>
  ->
 'tn>
 d^>
 
u22	3>
 h>
r.   r|  )Orp   r]  dataclassesr   	functoolsr   typingr   r   r   r%   torch.utils.checkpointr   torch.nnr	   r
   r   utilsr   scipyr   activationsr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   r   r   r   configuration_fnetr!   
get_loggerrm   loggerr/  r0  r-   r1   r9   Moduler;   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r  FNET_START_DOCSTRINGr-  r  r4  rP  rZ  rd  rq  rx  r|  r   r.   r,   <module>r     s     !  ) )    A A '  !
 
 
 . 6  + 
		H	%( M>
 :RYY :z#		 #Lbii 
299 
ryy   		 6a")) a>  ")) "*299 *4!bii !&bii &	9299 	9*/ *8 ={ = =2	   F dc
# c
	c
L  [
, [
[
| NPde@r) @r f@rF TU
$7 U
	U
p  Kg$7 KgKg\  Eq/ EqEqP  ;d!4 ;d;d|  P
2 P
P
r.   