
    sg                      d Z ddlmZ ddlZddlZddlZddlmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZmZmZ ddlmZmZmZmZmZ dd	lmZmZmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e#jN                  e(      Z)dZ*dZ+dZ,d.dZ-d/d0dZ.d1d2dZ/ G d dej`                  jb                        Z2 G d dej`                  jf                        Z4 G d dej`                  jf                        Z5 G d dej`                  jf                        Z6 G d de      Z7dZ8dZ9dZ:e G d  d!ej`                  jf                               Z;e G d" d#ej`                  jf                               Z<e G d$ d%ej`                  jf                               Z= e!d&e8       G d' d(e7             Z> G d) d*ej`                  jf                        Z? e!d+e8       G d, d-e7e             Z@y)3zTF 2.0 Blenderbot model.    )annotationsN)ListOptionalTupleUnion   )get_tf_activation)TFBaseModelOutput+TFBaseModelOutputWithPastAndCrossAttentionsTFSeq2SeqLMOutputTFSeq2SeqModelOutput)TFCausalLanguageModelingLossTFPreTrainedModelkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_end_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )BlenderbotConfigz facebook/blenderbot-400M-distillr   g    חc           
        t        j                  || j                        }t        j                  || j                        }t        j                  t	        |       d   dft        j
                  || j                              }t        j                  || d d d df   gd      }t        j                  |dk(  t        j                  t	        |      t        j
                  || j                              |      }t         j                  j                  |t        j                  d| j                              }t        j                  |g      5  t        j                  |      }d d d        |S # 1 sw Y   |S xY w)Nr   r   dtype)tfcastr"   fillr   convert_to_tensorconcatwhere	debuggingassert_greater_equalconstantcontrol_dependenciesidentity)	input_idspad_token_iddecoder_start_token_idstart_tokensshifted_input_idsassert_gte0s         h/var/www/html/venv/lib/python3.12/site-packages/transformers/models/blenderbot/modeling_tf_blenderbot.pyshift_tokens_rightr5   @   s8   77<9LWW%;Y__M77	I	q	!1%r';';<RT]TcTc'dL 		<1crc61B"CRHT!

,-r/C/CLR[RaRa/bc ,,334Er{{ST\e\k\kGlmK 
	 	 +	/ ;KK(9:; ; s   E..E8c           	        | d   }| d   }t        j                  ||f      t        z  }t        j                  t	        |      d         }t        j
                  |t        j                  |dz   t	        |      d   df      k  d|      }|dkD  r.t        j                  t        j                  ||f      |gd      }t        j                  |ddddddf   |dddf      S )zB
    Make causal mask used for bi-directional self-attention.
    r   r   r           axisN)
r#   onesLARGE_NEGATIVEranger   r(   reshaper'   zerostile)input_ids_shapepast_key_values_lengthbsztgt_lenmask	mask_conds         r4   _make_causal_maskrF   Y   s     !
Ca G77GW%&7DD)"-.I88I

9q=:d;KB;OQR:S TTVY[_`D!yy"((G-C#DEtLSUV774dAq()CAq>::    c                    t        |       d   }||n|}t        j                  d      }t        j                  | |j                        } t        j
                  | ddddddf   dd|df      }||z
  t        z  S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    r   N      ?r!   )r   r#   r+   r$   r"   r?   r;   )rD   rC   src_lenone_cstexpanded_masks        r4   _expand_maskrM   k   sx     q!G ,g'Gkk#G774w}}-DGGDD$!12Q7A4FGMm#~55rG   c                  >     e Zd ZdZd fdZ	 d	 	 	 	 	 d fdZ xZS )&TFBlenderbotLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    c                (    t        |   ||fi | y N)super__init__)selfnum_embeddingsembedding_dimkwargs	__class__s       r4   rS   z/TFBlenderbotLearnedPositionalEmbedding.__init__}   s    A&ArG   c                    |"|d   }t        j                  |dd      }||z  }t        |   t        j                  |t         j
                              S )z/Input is expected to be of size [bsz x seqlen].r   r<   )deltanamer!   )r#   r<   rR   callr$   int32)rT   input_shaperA   position_idsseq_lenrX   s        r4   r\   z+TFBlenderbotLearnedPositionalEmbedding.call   sO     !!nG88G17CL22Lw|BGGLABBrG   )rU   intrV   ra   )r   N)r^   tf.TensorShaperA   ra   r_   tf.Tensor | None__name__
__module____qualname____doc__rS   r\   __classcell__rX   s   @r4   rO   rO   x   s<    B nr	C)	CCF	CZj	C 	CrG   rO   c                  |     e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 	 	 d fdZddZ	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 	 	 d
dZddZ xZS )TFBlenderbotAttentionz6Multi-headed attention from "Attention Is All You Needc                z   t        |   d
i | || _        || _        t        j
                  j                  |      | _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _
        || _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d	      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      k_proj)use_biasr[   q_projv_projout_proj )rR   rS   	embed_dim	num_headsr   layersDropoutdropouthead_dim
ValueErrorscaling
is_decoderDensern   rp   rq   rr   )rT   rt   ru   rx   r|   biasrW   rX   s          r4   rS   zTFBlenderbotAttention.__init__   s    	"6"""||++G4!Y.MMI%$..8MdnnM]$YKr3  }}d*$ll((T(Qll((T(Qll((T(Q**9t**UrG   c           	         t        j                  t        j                  |||| j                  | j                  f      d      S )Nr      r   r   )r#   	transposer=   ru   ry   )rT   tensorr`   rB   s       r4   _shapezTFBlenderbotAttention._shape   s0    ||BJJvWdnndmm/\]_kllrG   c           
     	   |du}t        |      \  }}	}
| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j	                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f}t        j                  | j                  ||	|      |      }t        j                  ||      }t        j                  ||      }t        |      d   }t        j                  ||d      }t        j                  j                  t        |      || j                  z  |	|gd	|| j                  z  |	|f d
t        |              |t        j                  j                  t        |      |d|	|gd|d|	|f d
t        |              t        j                  ||j                         }t        j                  ||| j                  |	|f      |z   }t        j                  ||| j                  z  |	|f      }t#        |d      }|t        j                  j                  t        |      | j                  gd| j                   d
t        |              t        j                  |d      t        j                  ||| j                  |	|f      z  }t        j                  ||| j                  z  |	|f      }| j%                  ||      }t        j                  ||      }t        j                  j                  t        |      || j                  z  |	| j                  gd|| j                  |	| j                  f d
t        |              t        j&                  t        j                  ||| j                  |	| j                  f      d      }t        j                  |||	|
f      }| j)                  |      }t        j                  ||| j                  |	|f      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r   r   r8   Ttranspose_bz$Attention weights should be of size z	, but is messagez!Attention mask should be of size r!   z/Head mask for a single layer should be of size )r   r   r   r   trainingz `attn_output` should be of size r   )r   rp   r{   r   rn   rq   r#   r'   r|   ru   ry   r=   matmulr)   assert_equalr$   r"   r   rx   r   rr   )rT   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskr   is_cross_attentionrB   rC   rt   query_states
key_statesvalue_states
proj_shaperJ   attn_weights
attn_probsattn_outputs                      r4   r\   zTFBlenderbotAttention.call   s    .T9",]";Wi {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BKJ99nQ&7%FQOL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
zz$++lGS"I:VZZ
J7
zz,
;Z(+yyztL
!!|$4>>!7G46dnn8LgW^7_6` a|,-/	 	" 	
 %LL%%>*a'*7a'8R7S T">235	 &   WW^<;M;MNN::lS$..'SZ4[\_mmL::lS4>>5I7T[4\]L%l<&LL%%?+ Et~~EW X"?346	 &  ::o}E

sDNNGWEI L ::lS4>>5I7T[4\]L\\,\B
ii
L9
!!{#4>>!7DMM:2CRVR_R_3`2a b{+,.	 	" 	
 llJJ{S$..'4==$QRT`
 jjsGY.GHmmK0"$**\CQXZa;b"cL.88rG   c                   | j                   ry d| _         t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   AxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTrn   rp   rq   rr   )builtgetattrr#   
name_scopern   r[   buildrt   rp   rq   rr   rT   r^   s     r4   r   zTFBlenderbotAttention.build%  s   ::
44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@4T*6t}}112 B##T4$@AB B 7@ @@ @@ @B Bs0   )F32)G )G )G3F= G	GG!)r7   FT)
rt   ra   ru   ra   rx   floatr|   boolr~   r   )r   	tf.Tensorr`   ra   rB   ra   )NNNNF)r   r   r   rc   r   zTuple[Tuple[tf.Tensor]] | Noner   rc   r   rc   r   Optional[bool]returnz"Tuple[tf.Tensor, tf.Tensor | None]rQ   )	re   rf   rg   rh   rS   r   r\   r   ri   rj   s   @r4   rl   rl      s    @  VV V 	V
 V V8m .29=+/,0#(t9 t9 +t9 7	t9
 )t9 *t9 !t9 
,t9lBrG   rl   c                  B     e Zd Zd fdZ	 d	 	 	 	 	 	 	 ddZddZ xZS )TFBlenderbotEncoderLayerc                   t        |   d
i | |j                  | _        t	        | j                  |j
                  |j                  d      | _        t        j                  j                  dd      | _        t        j                  j                  |j                        | _        t        |j                        | _        t        j                  j                  |j"                        | _        t        j                  j%                  |j&                  d      | _        t        j                  j%                  | j                  d      | _        t        j                  j                  dd	      | _        || _        y )N	self_attn)rx   r[   h㈵>self_attn_layer_normepsilonr[   fc1r[   fc2final_layer_normrs   )rR   rS   d_modelrt   rl   encoder_attention_headsattention_dropoutr   r   rv   LayerNormalizationr   rw   rx   r	   activation_functionactivation_fnactivation_dropoutr}   encoder_ffn_dimr   r   r   configrT   r   rW   rX   s      r4   rS   z!TFBlenderbotEncoderLayer.__init__9  s   "6".NNF::FD\D\cn
 %*LL$C$CDWm$C$n!||++FNN;.v/I/IJ"',,"6"6v7P7P"Q<<%%f&<&<5%I<<%%dnn5%A % ? ?Se ? frG   c           
        |}| j                  |      }| j                  |||      \  }}}t        j                  j	                  t        |      t        |      dt        |       dt        |              | j                  ||      }||z   }|}| j                  |      }| j                  | j                  |            }| j                  ||      }| j                  |      }| j                  ||      }||z   }||fS )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(encoder_attention_heads,)*
        )r   r   r   z&Self attn modified the shape of query z to r   r   )r   r   r#   r)   r   r   rx   r   r   r   r   r   )rT   r   r   r   r   residualself_attn_weights_s           r4   r\   zTFBlenderbotEncoderLayer.callH  s    !11-@.2nn'Xg /= /
+(! 	!!}%x <Z=Q<RRVWaboWpVqr 	" 	
 ]XF =0 --m<**488M+BC///Q/]XF =0///rG   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   XxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r   r   r   r   )r   r   r#   r   r   r[   r   r   rt   r   r   r   r   r   r   s     r4   r   zTFBlenderbotEncoderLayer.buildp  s   ::
4d+7t~~223 +$$T*+4/6Bt88==> N))//tT^^0LMN4%1txx}}- =dDNN;<=4%1txx}}- JdDKK,G,GHIJ4+T2>t4499: J%%++T4,HIJ J ?+ +N N= =J JJ Js<   H%)H$)H133H=$)I	H!$H.1H:=I	Ir   r   )F)r   r   r   r   r   r   r   r   rQ   re   rf   rg   rS   r\   r   ri   rj   s   @r4   r   r   8  s?    ( $)&0 &0 "&0 #	&0
 !&0PJrG   r   c                  b     e Zd Zd fdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFBlenderbotDecoderLayerc                   t        |   di | |j                  | _        t	        | j                  |j
                  |j                  dd      | _        t        j                  j                  |j                        | _        t        |j                        | _        t        j                  j                  |j                        | _        t        j                  j!                  dd      | _        t	        | j                  |j
                  |j                  dd      | _        t        j                  j!                  dd	      | _        t        j                  j)                  |j*                  d
      | _        t        j                  j)                  | j                  d      | _        t        j                  j!                  dd      | _        || _        y )Nr   T)rt   ru   rx   r[   r|   r   r   r   encoder_attn)rx   r[   r|   encoder_attn_layer_normr   r   r   r   rs   )rR   rS   r   rt   rl   decoder_attention_headsr   r   r   rv   rw   rx   r	   r   r   r   r   r   r   r   r}   decoder_ffn_dimr   r   r   r   r   s      r4   rS   z!TFBlenderbotDecoderLayer.__init__  sa   "6".nn44,,
 ||++FNN;.v/I/IJ"',,"6"6v7P7P"Q$)LL$C$CDWm$C$n!1NN**,,
 (-||'F'FtZs'F't$<<%%f&<&<5%I<<%%dnn5%A % ? ?Se ? frG   c	                8   |}	| j                  |      }||dd nd}
| j                  ||
||      \  }}}| j                  ||      }|	|z   }d}d}|S|}	| j                  |      }||dd nd}| j	                  |||||      \  }}}| j                  ||      }|	|z   }||z   }|}	| j                  |      }| j                  | j                  |            }| j                  ||      }| j                  |      }| j                  ||      }|	|z   }||||fS )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            encoder_hidden_states (`tf.Tensor`):
                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(decoder_attention_heads,)*
            cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
                *(decoder_attention_heads,)*
            past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
        Nr   )r   r   r   r   r   )r   r   r   r   r   )
r   r   rx   r   r   r   r   r   r   r   )rT   r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   r   r   self_attn_past_key_valuer   present_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_values                   r4   r\   zTFBlenderbotDecoderLayer.call  s   4 !11-@ :H9S>"1#5Y] >Bnn'3)+	 ?M ?
;(*; ]XF =0 (,$! ,$H 88GM @N?Yrs(;_c%NRN_N_+!65 :8 O` OKM-/K !LLLJM$}4M !24P P !--m<**488M+BC///Q/]XF =0 	
 	
rG   c                b   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   sxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   rxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)	NTr   r   r   r   r   r   r   )r   r   r#   r   r   r[   r   r   rt   r   r   r   r   r   r   r   r   s     r4   r   zTFBlenderbotDecoderLayer.build  s   ::
4d+7t~~223 +$$T*+4/6Bt88==> N))//tT^^0LMN4.:t00556 .!!''-.42D9Et;;@@A Q,,22D$3OPQ4%1txx}}- =dDNN;<=4%1txx}}- JdDKK,G,GHIJ4+T2>t4499: J%%++T4,HIJ J ?#+ +N N. .Q Q= =J JJ JsT   K%)K%K2&)K?)L43L%)L%K"%K/2K<?L	LL"%L.r   )NNNNNNF)r   r   r   rc   r   rc   r   rc   r   rc   r   rc   r   zTuple[tf.Tensor] | Noner   r   r   z4Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]rQ   r   rj   s   @r4   r   r     s    > ,02637,07;26#(N
 N
 )N
  0	N

 !1N
 *N
 %5N
 0N
 !N
 
>N
`JrG   r   c                      e Zd ZeZdZy)TFBlenderbotPreTrainedModelmodelN)re   rf   rg   r   config_classbase_model_prefixrs   rG   r4   r   r     s    #LrG   r   a{	  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
au  
    Conversation example::

    ```py
    >>> from transformers import AutoTokenizer, TFBlenderbotForConditionalGeneration

    >>> mname = "facebook/blenderbot-400M-distill"
    >>> model = TFBlenderbotForConditionalGeneration.from_pretrained(mname)
    >>> tokenizer = AutoTokenizer.from_pretrained(mname)
    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> print("Human: ", UTTERANCE)

    >>> inputs = tokenizer([UTTERANCE], return_tensors="tf")
    >>> reply_ids = model.generate(**inputs)
    >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])

    >>> REPLY = "I'm not sure"
    >>> print("Human: ", REPLY)
    >>> NEXT_UTTERANCE = (
    ...     "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
    ...     "Are they trying to lose weight or are they just trying to be healthier?</s> "
    ...     "<s> I'm not sure."
    ... )
    >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf")
    >>> next_reply_ids = model.generate(**inputs)
    >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
    ```
aE  
    Args:
        input_ids (`tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
        decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
            range `[0, config.max_position_embeddings - 1]`.
        head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tf.FloatTensor`, *optional*):
            hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
            of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
c                  ^     e Zd ZeZ	 dd fdZd Zd Ze	 	 	 	 	 	 	 	 dd       Z	ddZ
 xZS )	TFBlenderbotEncoderc                   t        |   di | || _        t        j                  j                  |j                        | _        |j                  | _        |j                  | _
        |j                  | _        |j                  r2t        j                  j!                  t#        |j$                              nd| _        || _        t+        |j                  |j$                  d      | _        t/        |j0                        D cg c]  }t3        |d|        c}| _        t        j                  j5                  dd      | _        y c c}w )	NrI   embed_positionsr   layers.r   
layer_normr   rs   )rR   rS   r   r   rv   rw   rx   encoder_layerdrop	layerdropr/   padding_idxmax_position_embeddingsmax_source_positionsscale_embeddingr#   mathsqrtr   r   embed_scaleembed_tokensrO   r   r<   encoder_layersr   r   r   rT   r   r   rW   irX   s        r4   rS   zTFBlenderbotEncoder.__init__  s    "6"||++FNN;11!..$*$B$B!BHBXBX277<<fnn(=>^a(E**NN" 

 V[[a[p[pUqrPQ/wqc]Kr,,99$\9Z ss   E	c                    | j                   S rQ   r   rT   s    r4   get_embed_tokensz$TFBlenderbotEncoder.get_embed_tokens         rG   c                    || _         y rQ   r   rT   r   s     r4   set_embed_tokensz$TFBlenderbotEncoder.set_embed_tokens  
    (rG   c	           
        ||t        d      |t        |      }	n|t        |      dd }	nt        d      |>t        || j                  j                         | j                  |      | j
                  z  }| j                  |	      }
||
z   }| j                  ||      }|t        |      }nd}|rdnd}|rdnd}|gt        j                  j                  t        |      d   t        | j                        dt        | j                         d	t        |      d    d
       t        | j                        D ]R  \  }}|r||fz   }t        j                   dd      }|r|| j"                  k  r6 ||||||   nd      \  }}|sM||fz  }T | j%                  |      }|r||fz   }|st'        d |||fD              S t)        |||      S )a
  
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
                in the config will be used instead.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail. This argument can be used only in eager mode, in graph mode the value in the config
                will be used instead.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used
                in eager mode, in graph mode the value will always be set to True.
            training (`bool`, *optional*, defaults to `False`):
                Whether or not to use the model in training mode (some modules like dropout modules have different
                behaviors between training and evaluation).
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   rs   r   z&The head_mask should be specified for  layers, but it is for .r   r   c              3  &   K   | ]	  }||  y wrQ   rs   ).0vs     r4   	<genexpr>z+TFBlenderbotEncoder.call.<locals>.<genexpr>>  s     eqWXWdes   last_hidden_stater   
attentions)rz   r   r   r   	input_dimr   r   rx   rM   r#   r)   r   lenrv   	enumeraterandomuniformr   r   tupler
   )rT   r.   inputs_embedsr   	head_maskoutput_attentionsoutput_hidden_statesreturn_dictr   r^   	embed_posr   encoder_statesall_attentionsidxencoder_layerdropout_probabilityattns                     r4   r\   zTFBlenderbotEncoder.call  s   l  ]%>cdd"$Y/K&$]3CR8KTUU *9d6G6G6Q6QR --i84;K;KKM((5	%	1]XF %).9N!N30d  LL%%9%a(DKK <S=M<N O"9-a014	 &  #,DKK"8 	*C#!/=2B!B"(..A"604>>A"/"+"7	#T#M4 !4')	*" 6+}.>>Ne]NN$Seee +>Vd
 	
rG   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   nxY w# 1 sw Y   axY wNTr   r   rv   r   r   r#   r   r   r[   r   r   r   r   rv   rT   r^   layers      r4   r   zTFBlenderbotEncoder.buildC  "   ::
4*D1=t33889 1$$**4014t,8t334 I%%tT4;;3F3F&GHI44(4 &]]5::. &KK%& && 51 1I I& &$   D9%3EE9EEE	rQ   r   r   r   z Optional[keras.layers.Embedding])NNNNNNNFre   rf   rg   r   r   rS   r   r   r   r\   r   ri   rj   s   @r4   r   r     sS    #L[$!)  !v
 v
p&rG   r   c                  j     e Zd ZeZ	 dd fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Z	ddZ
 xZS )	TFBlenderbotDecoderc                z   t        |   di | || _        |j                  | _        || _        |j                  | _        t        |j                  |j                  d      | _        |j                  r2t        j                  j                  t!        |j                              nd| _        t%        |j&                        D cg c]  }t)        |d|        c}| _        t,        j*                  j/                  dd      | _        t,        j*                  j3                  |j4                        | _        y c c}w )	Nr   r   rI   r   r   r   r   rs   )rR   rS   r   r/   r   r   decoder_layerdropr   rO   r   r   r   r   r#   r   r   r   r   r<   decoder_layersr   rv   r   r   r   rw   rx   r   s        r4   rS   zTFBlenderbotDecoder.__init__^  s    "6"!..(11E**NN" 

 CIBXBX277<<fnn(=>^aUZ[a[p[pUqrPQ/wqc]Kr,,99$\9Z||++FNN; ss   D8c                    | j                   S rQ   r   r   s    r4   r   z$TFBlenderbotDecoder.get_embed_tokenso  r   rG   c                    || _         y rQ   r   r   s     r4   r   z$TFBlenderbotDecoder.set_embed_tokensr  r   rG   c                8   ||t        d      |t        |      }n|t        |      dd }nt        d      |	t        |	d   d         d   nd}|| j                  ||      }n| j                  ||      }|>t        || j                  j
                         | j	                  |      | j                  z  }|}|d   dkD  rt        ||	      }n.t        t        j                  |d   |d   |z   f      |d   
      }||t        ||d   
      z   }||t        ||d   
      }||z   }| j                  ||      }|rdnd}|rdnd}|r|dnd}|
rdnd}d|fd|ffD ]r  \  }}|	t        j                  j                  t        |      d   t        | j                        d| dt        | j                         dt        |      d    d       t t!        | j                        D ]z  \  }}|r||fz  }t#        j$                  dd      }|r|| j&                  k  r6|	|	|   nd} ||||||||   nd|||   nd|      \  }}}} |
r|| fz  }|sl||fz  }|u||fz  }| | j)                  |      }|r||fz  }|s|||||fS t+        |||||      S )a  
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
                range `[0, config.max_position_embeddings - 1]`.
            encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
                decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
                in the config will be used instead.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail. This argument can be used only in eager mode, in graph mode the value in the config
                will be used instead.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used
                in eager mode, in graph mode the value will always be set to True.
            training (`bool`, *optional*, defaults to `False`):
                Whether or not to use the model in training mode (some modules like dropout modules have different
                behaviors between training and evaluation).
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   )r_   r   )rA   )rC   r   rs   r  cross_attn_head_maskzThe z should be specified for r   r   r   )r   r   r   r   r   r   )r   past_key_valuesr   r   cross_attentions)rz   r   r   r   r   r   r   rF   rM   r#   r:   rx   r)   r   r  rv   r  r  r  r   r   r   )!rT   r.   r  r   r_   r   r   r  r#  r$  	use_cacher  r	  r
  r   r^   rA   	positionsr   combined_attention_maskall_hidden_statesall_self_attnsall_cross_attnspresent_key_valuesattn_mask_name	attn_maskr  decoder_layerr  r   layer_self_attnlayer_cross_attnr   s!                                    r4   r\   zTFBlenderbotDecoder.callu  s   n  ]%>stt"$Y/K&$]3CR8KdeeIXIdOA,>q,A!B1!Ejk ,,[:PQI,,[|,TI *9d6G6G6Q6QR --i84;K;KKM% r?Q&7\r&s#&2QQ:P)PQR\ghj\k'# %&=^epqset@u&u# ,1G1S%12HR]^`Ra%b"%	1]XF #7BD0d!27L7X"_c#,R$ ,7	*BE[]qDr)s 		%NI$))y)!,$~..GDKKHXGY Z&y1!45Q8	 * 		 #,DKK"8 	;C#!m%55!"(..A"604>>A5D5P_S1VZNR_6&;'=2;2G	#TH\Hh+?+Dnr-SOM?,<>O "'8&::" ?"44(4#(8'::O9	;< 6-!11 "46GYhhh>"/ 2/)!0 rG   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   nxY w# 1 sw Y   axY wr  r  r  s      r4   r   zTFBlenderbotDecoder.build4  r  r  rQ   r  )NNNNNNNNNNNNNFr  rj   s   @r4   r  r  S  sd    #L<"!)  "#!!| ||&rG   r  c                  p     e Zd ZeZd fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 dd       Z	d	dZ
 xZS )
TFBlenderbotMainLayerc                   t        |   di | || _        t        j                  j                  |j                  |j                  t        j                  j                  | j                  j                        d      | _        d| j                  _        t        || j                  d      | _        t        || j                  d      | _        y )N)stddevzmodel.shared)r   
output_dimembeddings_initializerr[   encoderr   decoderrs   )rR   rS   r   r   rv   	Embedding
vocab_sizer   initializersTruncatedNormalinit_stdsharedload_weight_prefixr   r9  r  r:  r   s      r4   rS   zTFBlenderbotMainLayer.__init__H  s    "6"ll,,''~~#(#5#5#E#ET[[MaMa#E#b	 - 
 *8&*64;;YO*64;;YOrG   c                    | j                   S rQ   )r@  r   s    r4   get_input_embeddingsz*TFBlenderbotMainLayer.get_input_embeddingsX  s    {{rG   c                ~    || _         | j                   | j                  _        | j                   | j                  _        y rQ   )r@  r9  r   r:  )rT   new_embeddingss     r4   set_input_embeddingsz*TFBlenderbotMainLayer.set_input_embeddings[  s)    $$(KK!$(KK!rG   c                ^   ||n| j                   j                  }|	| j                  ||||||||      }	nl|rHt        |	t              s8t	        |	d   t        |	      dkD  r|	d   nd t        |	      dkD  r|	d   nd       }	n"|s t        |	t              s|	j                         }	| j                  ||||	d   ||||
||||||      }|s||	z   S t        |j                  |j                  |j                  |j                  |j                  |	j                  |	j                  |	j                        S )N)r.   r   r  r  r  r	  r
  r   r   r   r   r   )r   r_   r   r   r  r#  r$  r  r&  r  r	  r
  r   r   r$  decoder_hidden_statesdecoder_attentionsr%  encoder_last_hidden_stater   encoder_attentions)r   r	  r9  
isinstancer
   r  r  to_tupler:  r   r   r$  r   r   r%  )rT   r.   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsr  decoder_head_maskr#  encoder_outputsr$  r  decoder_inputs_embedsr&  r  r	  r
  r   rW   decoder_outputss                       r4   r\   zTFBlenderbotMainLayer.call`  sb   . %9$D $++JjJj 	 ""ll#-#+"3%9'! + 	O O=N!O/"1!"4474H14Loa0RV14_1E1I?1-tO Z%G-668O,,1-"1!"4#1'!5+//!5# ' 
" "_44#-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
rG   c                   | j                   ry d| _         t        j                  | j                  j                  dz   | j                  j
                  z   dz         5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NT/r9  r:  )
r   r#   r   r@  rA  r[   r   r   r9  r:  r   s     r4   r   zTFBlenderbotMainLayer.build  s   ::
 ]]4;;99C?$++BRBRRUXXY 	$KKd#	$4D)5t||001 )""4()4D)5t||001 )""4() ) 6	$ 	$) )) )s$   D55EE5D>E
Er   NNNNNNNNNNNNNNNNF)rS  )Optional[Union[Tuple, TFBaseModelOutput]]rQ   )re   rf   rg   r   r   rS   rC  rF  r   r\   r   ri   rj   s   @r4   r4  r4  D  su    #LP 0
  #!!EI"!%L
 CL
 L
\)rG   r4  zXThe bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.c                      e Zd Zd
 fdZd Zd Zed fd       Ze e	e
j                  d             eeee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd Zdd	Z xZS )TFBlenderbotModelc                P    t        |   |g|i | t        |d      | _        y )Nr   r   )rR   rS   r4  r   rT   r   inputsrW   rX   s       r4   rS   zTFBlenderbotModel.__init__  s(    3&3F3*6@
rG   c                .    | j                   j                  S rQ   r   r9  r   s    r4   get_encoderzTFBlenderbotModel.get_encoder      zz!!!rG   c                .    | j                   j                  S rQ   r   r:  r   s    r4   get_decoderzTFBlenderbotModel.get_decoder  rb  rG   c                    |dk(  r1ddl m} t        j                  dt               |j                  |      S t        |   |g|i |S )Nfacebook/blenderbot-90Mr   )TFBlenderbotSmallModelThe checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical checkpoint `facebook/small_blenderbot-90M` with `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')` instead.)blenderbot_smallrh  warningswarnFutureWarningfrom_pretrainedrR   )clspretrained_model_name_or_path
model_argsrW   rh  rX   s        r4   rn  z!TFBlenderbotModel.from_pretrained  sU    (,EEAMM  *99:WXXw&'D\z\U[\\rG   zbatch_size, sequence_length)
checkpointoutput_typer   c                J    | j                  |||||||||	|
|||||||      }|S )N)r.   r   rO  rP  rQ  r  rR  r#  rS  r$  r  rT  r&  r  r	  r
  r   )r   )rT   r.   r   rO  rP  rQ  r  rR  r#  rS  r$  r  rT  r&  r  r	  r
  r   rW   outputss                       r4   r\   zTFBlenderbotModel.call  sQ    8 **)/#9!5/!5++'"7/!5##  
( rG   c           
        | j                   j                  r"t        j                  |j                        d   nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }t        |j                  |||||j                  ||      S )Nr   rH  )r   r&  r#   r  r$  r	  r&   rI  r  rJ  r%  r   rL  r   r   rK  rT   outputpkvdec_hs	dec_attnscross_attnsenc_hs	enc_attnss           r4   serving_outputz TFBlenderbotModel.serving_output  s   59[[5J5Jbhhv--.q1PTGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	GK{{GdGdb**6+B+BCjnGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	#$66"(((&,&F&F"((	
 		
rG   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr   )r   r   r#   r   r   r[   r   r   s     r4   r   zTFBlenderbotModel.build&  sg    ::
4$'3tzz/ '

  &' ' 4' 's   A11A:r   rp  z!Optional[Union[str, os.PathLike]]rX  )$r.   rc   r   rc   rO  rc   rP  rc   rQ  rc   r  rc   rR  rc   r#  rc   rS  rY  r$  List[tf.Tensor] | Noner  rc   rT  rc   r&  r   r  r   r	  r   r
  r   r   r   r   z-Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]rQ   )re   rf   rg   rS   ra  re  classmethodrn  r   r   BLENDERBOT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr\   r  r   ri   rj   s   @r4   r[  r[    sa   
A
"" ] ] *+F+M+MNk+lm&($ '++/.23715&*.215EI26*.26$(,0/3&*#(%)#) )) ,	)
 !1) /) $) ,) /) C) 0) ()  0) ") *)  -!)" $#)$ !%)( 
7)) n )X
&'rG   r[  c                  (     e Zd ZdZ fdZd Z xZS )	BiasLayerz
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    c                \    t        |   dd|i| | j                  ||||      | _        y )Nr[   r[   shapeinitializer	trainablers   )rR   rS   
add_weightr~   )rT   r  r  r  r[   rW   rX   s         r4   rS   zBiasLayer.__init__6  s3    -d-f- OOU_hOi	rG   c                     || j                   z   S rQ   )r~   )rT   xs     r4   r\   zBiasLayer.call=  s    499}rG   rd   rj   s   @r4   r  r  0  s    
jrG   r  zRThe BLENDERBOT Model with a language modeling head. Can be used for summarization.c                  N    e Zd ZddgZ fdZd Zd Zd Zd Zd Z	d	 Z
ed fd
       Ze ee       eee       ee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                            Zd Z	 	 	 	 	 	 	 	 ddZddZ xZS )$TFBlenderbotForConditionalGenerationz!model.encoder.embed_tokens.weightz!model.decoder.embed_tokens.weightc                    t        |   |g|i | t        |d      | _        |j                  | _        t        dd|j                  gdd      | _        y )Nr   r   final_logits_biasr   r>   Fr  )rR   rS   r4  r   r&  r  r<  
bias_layerr]  s       r4   rS   z-TFBlenderbotForConditionalGeneration.__init__K  sW    3&3F3*6@
))#$Q0A0A,BPWch
rG   c                .    | j                   j                  S rQ   rd  r   s    r4   re  z0TFBlenderbotForConditionalGeneration.get_decoderT  rb  rG   c                .    | j                   j                  S rQ   r`  r   s    r4   ra  z0TFBlenderbotForConditionalGeneration.get_encoderW  rb  rG   c                "    | j                         S rQ   )rC  r   s    r4   get_output_embeddingsz:TFBlenderbotForConditionalGeneration.get_output_embeddingsZ  s    ((**rG   c                &    | j                  |       y rQ   )rF  )rT   values     r4   set_output_embeddingsz:TFBlenderbotForConditionalGeneration.set_output_embeddings]  s    !!%(rG   c                2    d| j                   j                  iS )Nr  )r  r~   r   s    r4   get_biasz-TFBlenderbotForConditionalGeneration.get_bias`  s    #T__%9%9::rG   c                    |d   j                   d   }t        dd|gdd      | _        | j                  j                  j	                  |d          y )Nr  r   r   r>   Fr  )r  r  r  r~   assign)rT   r  r<  s      r4   set_biasz-TFBlenderbotForConditionalGeneration.set_biasc  sR    ./55b9
#$Q
O\a
 	##E*=$>?rG   c                    |dk(  r1ddl m} t        j                  dt               |j                  |      S t        |   |g|i |S )Nrg  r   ))TFBlenderbotSmallForConditionalGenerationri  )rj  r  rk  rl  rm  rn  rR   )ro  rp  rq  rW   r  rX   s        r4   rn  z4TFBlenderbotForConditionalGeneration.from_pretrainedk  sU    (,EETMM  =LLMjkkw&'D\z\U[\\rG   )rs  r   c                0   |t        j                  || j                  j                  k(  t        j                  t        j
                  t        |      d      |j                        |      }d}|7|5t        || j                  j                  | j                  j                        }| j                  ||||	||||||
|||||||      }t        j                  |d   | j                  j                  j                  d      }| j                  |      }|dn| j                  ||      }|s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  |j&                  |j(                  |j*                  |j,                  |j.                  		      S )
a  
        labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Nr    F)r   rO  rS  rP  rQ  r  rR  r#  r$  r  rT  r&  r  r	  r
  r   r   Tr   r   )	losslogitsr$  rI  rJ  r%  rK  r   rL  )r#   r(   r   r/   r$   r%   r   r"   r5   r0   r   r   r@  weightsr  hf_compute_lossr   r$  rI  rJ  r%  rK  r   rL  )rT   r.   r   rO  rP  rQ  r  rR  r#  rS  r$  r  rT  r&  r  r	  r
  labelsr   ru  	lm_logitsmasked_lm_lossrx  s                          r4   r\   z)TFBlenderbotForConditionalGeneration.call{  s   D XX$++222
6 2D96<<HF
 I (-B-J$6DKK44dkk6X6X%! **)/+#9!5/!5+'"7/!5##  
& IIgaj$***;*;*C*CQUV	OOI.	!'T5I5I&R[5\\GABK/F3A3M^%.YSYY #33")"?"?&99$55&-&G&G")"?"?&99

 
	
rG   c           
        | j                   j                  r"t        j                  |j                        d   nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }t        |j                  |||||j                  ||      S )Nr   )r  r$  rI  rJ  r%  rK  r   rL  )r   r&  r#   r  r$  r	  r&   rI  r  rJ  r%  r   rL  r   r  rK  rw  s           r4   r  z3TFBlenderbotForConditionalGeneration.serving_output  s   59[[5J5Jbhhv--.q1PTGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	GK{{GdGdb**6+B+BCjnGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	 =="(((&,&F&F"((	
 		
rG   c
                   ||d d dd f   }|,t         j                  j                  |dd      d d dd f   }n:||d   d   j                  d   }n"t        j                  |j                  d         }d |	|||||||||dS )Nr   T)r9   	exclusiver   r   r   )r.   rS  r$  rO  r   rP  rQ  r  rR  r#  r&  )r#   r   cumsumr  r<   )rT   rO  r$  r   rP  r  rR  r#  r&  rS  rW   rQ  s               r4   prepare_inputs_for_generationzBTFBlenderbotForConditionalGeneration.prepare_inputs_for_generation  s     & 1!RS& 9!-#%77>>2Hr]a>#bcdfhfici#j (#21#5a#8#>#>q#A #%88,=,C,CA,F#G  ..!2,&<$8"!2$8"
 	
rG   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr   r  )r   r   r#   r   r   r[   r   r  r   s     r4   r   z*TFBlenderbotForConditionalGeneration.build
  s    ::
4$'3tzz/ '

  &'4t,8t334 ,%%d+, , 9' ', ,s   C%CCC r  )NNNNNNNNNNNNNNNNNF)&r.   rc   r   rc   rO  rc   rP  rc   rQ  rc   r  rc   rR  rc   r#  rc   rS  rY  r$  r  r  rc   rT  rc   r&  r   r  r   r	  r   r
  r   r  rc   r   r   r   z*Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput])NNNNNNNNrQ   )re   rf   rg   "_keys_to_ignore_on_load_unexpectedrS   re  ra  r  r  r  r  r  rn  r   r   r  r   r   r  r   BLENDERBOT_GENERATION_EXAMPLEr\   r  r  r   ri   rj   s   @r4   r  r  A  s    	-,*&

""+);@ ] ] *+FG+<?[56 '++/.23715&*.215EI26*.26$(,0/3&*#'#('N
#N
 )N
 ,	N

 !1N
 /N
 $N
 ,N
 /N
 CN
 0N
 (N
  0N
 "N
 *N
  -!N
" $#N
$ !%N
& !'N
( 
4)N
 7 \ H N
b
. #!$
L	,rG   r  )r.   r   r/   ra   r0   ra   )r   )r@   rb   rA   ra   rQ   )rD   r   rC   zOptional[int])Arh   
__future__r   osr  rk  typingr   r   r   r   
tensorflowr#   activations_tfr	   modeling_tf_outputsr
   r   r   r   modeling_tf_utilsr   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   r   r   configuration_blenderbotr   
get_loggerre   loggerr  r  r;   r5   rF   rM   rv   r;  rO   Layerrl   r   r   r   BLENDERBOT_START_DOCSTRINGr  r  r   r  r4  r[  r  r  rs   rG   r4   <module>r     s    " 	   / /  /   S R  7 
		H	%8 $ 2;$
6CU\\-C-C C*gBELL.. gBVJJu||11 JJ\EJu||11 EJP "3  
' R! :K \ h&%,,,, h& h&V m&%,,,, m& m&` x)ELL.. x) x)v ^h'3 h'	h'X"" " XN,+FHd N,	N,rG   