
"""PyTorch BLOOM model."""

import math
import warnings
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...cache_utils import Cache, DynamicCache, StaticCache
from ...file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
)
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_bloom import BloomConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
_CONFIG_FOR_DOC = "BloomConfig"


def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
    """
    Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor

    Returns:
        Tensor shaped (batch_size * num_heads, 1, max_seq_len).
    """
    batch_size, seq_length = attention_mask.shape
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    base = torch.tensor(
        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
    )
    powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != num_heads:
        extra_base = torch.tensor(
            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32
        )
        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
        extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32)
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)

    # Note: alibi will be added to the attention bias that is applied to the query-key product of attention, so it
    # has to broadcast to shape (batch_size, num_heads, query_length, key_length).
    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
    alibi = slopes[..., None] * arange_tensor
    return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype)


def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
    """
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    """
    out = F.dropout(x, p=prob, training=training)
    out = residual + out
    return out


def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
    """
    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
    make the model jitable.

    Args:
        x (`torch.tensor`):
            input hidden states
    """
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    """
    Gradient of the tanh approximation of GELU. The gradient of the exact GELU is:
    0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)

    Args:
        g (`torch.tensor`):
            gradient output tensor
        x (`torch.tensor`):
            input tensor
    """
    x = x[0]  # x is a tuple of 1 element, needs to unpack it first
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff * g
                  dej
                  fd       Zedej
                  dej
                  fd       Zy)GeLUFunctioninputr!   c                 :    | j                  |       t        |      S N)save_for_backwardrW   )ctxr_   s     rC   forwardzGeLUFunction.forward   s    e$!%((rE   grad_outputc                 6    | j                   }t        ||      }|S ra   )saved_tensorsr\   )rc   re   r_   tmps       rC   backwardzGeLUFunction.backward   s    !!k51
rE   N)__name__
__module____qualname__staticmethodr-   Tensorrd   ri    rE   rC   r^   r^      sT    )ELL )U\\ ) ) 5<< ELL  rE   r^   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )	BloomGelua  
    BloomBiasGelu wrapper function that makes use of the simple function in inference mode, to keep the model
    torchscriptable, and of the autograd function in training mode, to get accurate gradients. Partly copied from
    Megatron-DeepSpeed code and adapted for our needs.

    See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            return GeLUFunction.apply(x)
        else:
            return bloom_gelu_forward(x)
dej                  dej                  fdZ	 	 	 	 	 dd	ej                  d
ej                  dej                  dej                  dee   deej                     dededeej                     fdZ xZS )BloomAttentionconfig	layer_idxc                    t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | _        |j                  | _	        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      dt        j                  | j                        z  | _        d| _        || _        |-t         j#                  d| j$                  j&                   d       t)        j*                  | j                  d| j                  z  d	      | _        t)        j*                  | j                  | j                        | _        t)        j0                  |j2                        | _        y )
NzA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rR   zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   Tbias)rt   ru   pretraining_tpslow_but_exacthidden_sizen_headr   head_dim
split_sizehidden_dropout
ValueErrorr*   sqrtinv_norm_factorbetar   loggerwarning_oncerw   rj   r   Linearquery_key_valuedenseDropoutattention_dropout)rv   r   r   rw   s      rC   ru   zBloomAttention.__init__   sz   $33$33!--((DNN:**$33==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==	" !8!8 9 :, ,  "yy)9)91t?O?O;OVZ[YYt//1A1AB
!#F,D,D!ErE   	fused_qkvr!   c                    |j                   \  }}}|j                  ||| j                  d| j                        }|ddddf   j	                  dd      }|ddddf   j	                  dd      }|ddddf   j	                  dd      }|||fS )a  
        Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape
        without making any copies, results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, num_heads, seq_length, head_dim]
            key: [batch_size, num_heads, seq_length, head_dim]
            value: [batch_size, num_heads, seq_length, head_dim]
        r   .r   Nr   r#   )r)   viewr   r   	transpose)rv   r   r8   r9   three_times_hidden_sizequery_layer	key_layervalue_layers           rC   _reshapezBloomAttention._reshape   s     ;D//7
J 7NN:z4>>1dmm\	Q	*44Q:c1ai(221a8	Q	*44Q:I{22rE   rF   c                    |j                   \  }}}|| j                  z  }|j                  || j                  || j                        }|j	                  dddd      }|j                  ||| j                  | j                  z        S )z
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        r   r#   r   r   )r)   r   r   r   permuter6   )rv   rF   batch_size_and_num_headsr9   _r8   s         rC   _merge_headszBloomAttention._merge_heads   sy     34''/ *a-?
 FF:t~~z4==I IIaAq! yyZ$--1OPPrE   hidden_statesrG   rB   r   
layer_past	head_mask	use_cacheoutput_attentionscache_positionc
                 &   |j                   \  }
}}| j                  |      }| j                  |      \  }}}|%d|	i}|j                  ||| j                  |      \  }}|j                  |
| j                  z  d| j                        }|j                  |
| j                  z  d| j                        j                  dd      }|j                  |
| j                  z  d| j                        }|j                  ||| j                  | j                        }|j                  |
| j                  |d      }|#|d d d d d d d |j                   d   f   }||z   }t        j                  |dt        j                         j#                  |j$                        }| j'                  |      }|||z  }|j                  |
| j                  z  |d      }t        j(                  ||      }| j+                  |      }| j,                  dkD  r| j.                  r| j0                  | j,                  z  }t        j2                  |      }t5        | j,                        D ]z  }|t        j6                  |d d d d t9        ||z        t9        |dz   |z        f   | j:                  j<                  d d t9        ||z        t9        |dz   |z        f         z   }| n| j;                  |      }t?        ||| j@                  | jB                        }||f}|r||fz  }|S )Nr   r(   )batch1batch2r   alpha)r'   r    r   )"r)   r   r   updater   r6   r   r   r   baddbmmr   r   r   rL   softmaxr-   r/   r7   r    r   bmmr   r   r   r   
zeros_likerangelinearintr   weightrO   r   rI   )rv   r   rG   rB   r   r   r   r   r   r   r8   q_lengthr   r   r   r   r   cache_kwargsattention_scoresattn_weightscausal_maskattention_probsattention_probs_reshapedcontext_layerslicesoutput_tensorioutputss                               rC   rd   zBloomAttention.forward   s    #0"5"5
Ha((7	.2mmI.F+Y!,n=L%/%6%6y+t~~_k%l"I{ "))*t~~*Er4==Y%%j4>>&A2t}}U__`bdfg	!))*t~~*Er4==Y !==&&	 ) 
 (,,ZSUV%(Aq2GIOOB4G2G)GHK'+5L ))LbNQQR]RcRcd 00A -	9O $3#7#7
T^^8SU]_a#b  		":KH ))-8 "t':':%%(;(;;F!,,];M4../  -!!QAJ#q1u>N:O(O"OPJJ%%aQZ3A?O;P)P&PQ1 ! !JJ}5M#M8T=P=PRVR_R_` *-))GrE   ra   NNFFN)rj   rk   rl   r   r   r   ru   r-   rn   r   r   r   r   bool
LongTensorrd   r{   r|   s   @rC   r~   r~      s   F{ Fx} FB3%,, 35u||UZUaUa9a3b 3(Qell Qu|| Q> '+,0"'59L||L ,,L ||	L
 L UOL ELL)L L  L !!1!12LrE   r~   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )BloomMLPr   c                 6   t         |           |j                  }|j                  | _        |j                  | _        t        j                  |d|z        | _        t               | _	        t        j                  d|z  |      | _
        |j                  | _        y )N   )rt   ru   r   r   r   r   r   dense_h_to_4hrq   	gelu_impldense_4h_to_hr   )rv   r   r   rw   s      rC   ru   zBloomMLP.__init__M  sz    (($33$33YY{AOD"YYq;D$33rE   r   rG   r!   c                    | j                  | j                  |            }| j                  dkD  r| j                  rt	        j
                  |      }| j                  j                  j                  d   | j                  z  }t        | j                        D ]z  }|t        j                  |d d d d t        ||z        t        |dz   |z        f   | j                  j                  d d t        ||z        t        |dz   |z        f         z   }| n| j                  |      }t        ||| j                  | j                        }|S )Nr   r(   )r   r   r   r   r-   r   r   r   r)   r   rL   r   r   rO   r   rI   )rv   r   rG   intermediate_outputr   r   outputs          rC   rd   zBloomMLP.forwardX  s.   t'9'9-'HI"t':':"'"2"28"<''..44R84;N;NNF4../ &9AHH!!QAJ#q1u>N:O(O"OP&&--aQZ3AQWGWCX1X.XY= '# #'"4"4]"C0(D<O<OQUQ^Q^_rE   )	rj   rk   rl   r   ru   r-   rn   rd   r{   r|   s   @rC   r   r   L  s5    	4{ 	4U\\ U\\ ell rE   r   c                        e Zd Zddedee   f fdZ	 	 	 	 	 ddej                  dej                  dej                  dee	   deej                     d	e


class BloomBlock(nn.Module):
    def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None):
        super().__init__()
        hidden_size = config.hidden_size

        self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.num_heads = config.n_head
        self.self_attention = BloomAttention(config, layer_idx)
        self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = BloomMLP(config)

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Cache] = None,
        head_mask: Optional[torch.Tensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)

        # Layer norm post the self attention.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Self attention.
        attn_outputs = self.self_attention(
            layernorm_output,
            residual,
            layer_past=layer_past,
            attention_mask=attention_mask,
            alibi=alibi,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        attention_output = attn_outputs[0]

        outputs = attn_outputs[1:]

        layernorm_output = self.post_attention_layernorm(attention_output)

        # Get residual
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = attention_output

        # MLP.
        output = self.mlp(layernorm_output, residual)

        if use_cache:
            outputs = (output,) + outputs
        else:
            outputs = (output,) + outputs[1:]

        return outputs  # hidden_states, past_kv, attentions
dZ fdZdej                  fdZ xZS )BloomPreTrainedModeltransformerTr   past_key_valuesc                 $    t        |   |i | y ra   rs   )rv   inputskwargsrw   s      rC   ru   zBloomPreTrainedModel.__init__  s    &+F+rE   modulec                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t              rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weights.        )meanstdNrR   )
isinstancer   r   r   datanormal_r   initializer_ranger   zero_	Embeddingpadding_idxr	   fill_)rv   r   s     rC   _init_weightsz"BloomPreTrainedModel._init_weights  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .	*KK""$MM$$S) +rE   )rj   rk   rl   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_static_cache_supports_quantized_cacheru   r   Moduler   r{   r|   s   @rC   r   r     sI    L%&*#%"3 ! $,*BII *rE   r   a,  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BloomConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BLOOM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance, see our
            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.",
    BLOOM_START_DOCSTRING,
)
class BloomModel(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size
        self.num_heads = config.n_head

        # Embedding + LN Embedding
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim)
        self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Transformer blocks
        self.h = nn.ModuleList([BloomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])

        # Final Layer Norm
        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
        return build_alibi_tensor(attention_mask, num_heads, dtype)

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.word_embeddings = new_embeddings

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        if deprecated_arguments.pop("position_ids", False) is not False:
            # `position_ids` could have been `torch.Tensor` or `None`, so defaulting pop to `False` allows to detect
            # whether users were passing explicitly `None`
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        batch_size, seq_length, _ = inputs_embeds.shape
        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length_with_past = seq_length + past_length
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device)

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = self.word_embeddings_layernorm(inputs_embeds)

        next_decoder_cache = None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        # Compute alibi tensor: check build_alibi_tensor documentation
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    alibi,
                    causal_mask,
                    past_key_values,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=past_key_values,
                    attention_mask=causal_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    alibi=alibi,
                    cache_position=cache_position,
                )

            hidden_states = outputs[0]
            if use_cache:
                next_decoder_cache = outputs[1]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        # Add last hidden state
        hidden_states = self.ln_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output_attentions is True, the sdpa implementation falls back to the eager one
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows
            # when using left padding. This is required by F.scaled_dot_product_attention's memory-efficient path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@add_start_docstrings(
    """
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    BLOOM_START_DOCSTRING,
)
class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.transformer = BloomModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case
                input_ids = input_ids[:, cache_position]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
        else:
            # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
            # `mode="reduce-overhead"`, as otherwise the input `input_ids` would have various strides.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

        # This part differs from other models because BLOOM needs a 2D mask to construct the alibi tensor
        # The only difference is the usage of a 2D instead of a 4D mask, but the shape will be static
        if isinstance(past_key_values, StaticCache) and attention_mask is not None:
            target_length = past_key_values.get_max_cache_shape()
            batch_size, seq_length = attention_mask.shape
            diff = target_length - seq_length

            new_attn_mask = torch.zeros(batch_size, diff, device=attention_mask.device, dtype=attention_mask.dtype)
            attention_mask = torch.cat([attention_mask, new_attn_mask], dim=-1)

        model_inputs.update(
            {
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        if deprecated_arguments.pop("position_ids", False) is not False:
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            batch_size, seq_length, vocab_size = shift_logits.shape
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def _reorder_cache(
        self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        # Get a copy of `beam_idx` on all the devices where we need those indices.
        device_to_beam_idx = {
            past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
        }
        reordered_past = tuple(
            (
                layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
                layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
            )
            for layer_past in past
        )
        return reordered_past


@add_start_docstrings(
    """
    The Bloom Model transformer with a sequence classification head on top (linear layer).

    [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    BLOOM_START_DOCSTRING,
)
class BloomForSequenceClassification(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = BloomModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if deprecated_arguments.pop("position_ids", False) is not False:
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
                sequence_lengths = sequence_lengths % input_ids.shape[-1]
                sequence_lengths = sequence_lengths.to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    Bloom Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BLOOM_START_DOCSTRING,
)
class BloomForTokenClassification(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = BloomModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if deprecated_arguments.pop("position_ids", False) is not False:
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The BLOOM Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BLOOM_START_DOCSTRING,
)
class BloomForQuestionAnswering(BloomPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = BloomModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
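

# Span-extraction sketch (hypothetical question/context pair; the QA head is untrained until
# fine-tuned on a QA dataset, and this is comment-only so it does not run at import):
#
#     from transformers import AutoTokenizer, BloomForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
#     model = BloomForQuestionAnswering.from_pretrained("bigscience/bloom-560m")
#     inputs = tokenizer("Who trained BLOOM?", "BLOOM was trained by the BigScience workshop.", return_tensors="pt")
#     outputs = model(**inputs)
#     start = outputs.start_logits.argmax(-1)          # predicted span start index
#     end = outputs.end_logits.argmax(-1)              # predicted span end index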