"""PyTorch BARK model."""

import math
from typing import Dict, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from ...generation import GenerationMixin
from ...generation.logits_process import (
    AlternatingCodebooksLogitsProcessor,
    BarkEosPrioritizerLogitsProcessor,
    SuppressTokensLogitsProcessor,
)
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput
from ...modeling_utils import PreTrainedModel, get_parameter_device
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_accelerate_available,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
)
from ..auto import AutoModel
from .configuration_bark import (
    BarkCoarseConfig,
    BarkConfig,
    BarkFineConfig,
    BarkSemanticConfig,
    BarkSubModelConfig,
)
from .generation_configuration_bark import (
    BarkCoarseGenerationConfig,
    BarkFineGenerationConfig,
    BarkSemanticGenerationConfig,
)


if is_flash_attn_2_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)


_CHECKPOINT_FOR_DOC = "suno/bark-small"
_CONFIG_FOR_DOC = "BarkConfig"


class BarkSelfAttention(nn.Module):
    # BarkSelfAttention can have two attention types, i.e. full attention or causal attention

    def __init__(self, config, is_causal=False):
        super().__init__()

        # regularization
        self.dropout = config.dropout
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = self.embed_dim // self.num_heads

        if config.hidden_size % config.num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        # key, query, value projections for all heads, but in a batch
        self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
        # output projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias)

        self.is_causal = is_causal
        if is_causal:
            block_size = config.block_size
            bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size)
            self.register_buffer("bias", bias)

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # re-assemble all head outputs side by side
        # (batch, num_heads, seq_len, attn_head_size) -> (batch, seq_len, num_heads * attn_head_size)
        tensor = tensor.transpose(1, 2).contiguous()
        tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))

        return tensor

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # scale the dot products by the square root of the head dimension
        attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim))

        if self.is_causal:
            query_length, key_length = query.size(-2), key.size(-2)

            # fill the upper triangular part of the attention weights with -inf
            attn_weights = attn_weights.masked_fill(
                self.bias[:, :, key_length - query_length : key_length, :key_length] == 0,
                torch.finfo(attn_weights.dtype).min,
            )

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        # (batch, num_heads, seq_len, seq_len) x (batch, num_heads, seq_len, attn_head_size)
        # -> (batch, num_heads, seq_len, attn_head_size)
        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        past_key_values=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if past_key_values is not None:
            past_key = past_key_values[0]
            past_value = past_key_values[1]
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = (key, value)
        else:
            present = None

        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class BarkSelfFlashAttention2(BarkSelfAttention):
    """
    Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stay
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment, which became the default for flash_attn>=2.1. This attribute is used to handle the difference.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        # Flash attention requires the input to have the shape (batch, seq_length, head, head_features)
        return tensor

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        # re-assemble all head outputs side by side
        # (batch, seq_len, num_heads, attn_head_size) -> (batch, seq_len, num_heads * attn_head_size)
        tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,))
        return tensor

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        past_key_values=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        batch_size, query_len, _ = hidden_states.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2)

        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        if past_key_values is not None:
            # (batch, head, seq_length, head_features) -> (batch, seq_length, head, head_features)
            past_key = past_key_values[0].transpose(1, 2)
            past_value = past_key_values[1].transpose(1, 2)
            # and merge on seq_length
            key = torch.cat((past_key, key), dim=1)
            value = torch.cat((past_value, value), dim=1)

        if use_cache is True:
            # (batch, head, seq_length, head_features)
            present = (key.transpose(1, 2), value.transpose(1, 2))
        else:
            present = None

        attn_output = _flash_attention_forward(
            query,
            key,
            value,
            attention_mask,
            query_len,
            dropout=self.dropout if self.training else 0.0,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            attn_weights = None
            outputs += (attn_weights,)

        return outputs


BARK_ATTENTION_CLASSES = {
    "eager": BarkSelfAttention,
    "flash_attention_2": BarkSelfFlashAttention2,
}


class BarkLayerNorm(nn.Module):
    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False."""

    def __init__(self, hidden_size, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps=1e-5)


class BarkMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.in_proj = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=config.bias)
        self.out_proj = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)
        self.gelu = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.in_proj(hidden_states)
        hidden_states = self.gelu(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class BarkBlock(nn.Module):
    def __init__(self, config, is_causal=False):
        super().__init__()

        if is_causal:
            # if causal, uses a handmade LayerNorm so that the LayerNorm bias is optional, to stick with Bark's
            # choice of leaving an optional bias in the autoregressive ("Text" and "Coarse") modules
            self.layernorm_1 = BarkLayerNorm(config.hidden_size, bias=config.bias)
            self.layernorm_2 = BarkLayerNorm(config.hidden_size, bias=config.bias)
        else:
            self.layernorm_1 = nn.LayerNorm(config.hidden_size)
            self.layernorm_2 = nn.LayerNorm(config.hidden_size)

        self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](config, is_causal=is_causal)

        self.mlp = BarkMLP(config)

    def forward(
        self,
        hidden_states,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        intermediary_hidden_states = self.layernorm_1(hidden_states)

        attn_outputs = self.attn(
            intermediary_hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        attn_output = attn_outputs[0]  # output_attn: output, present_key_values, (attn_weights)
        outputs = attn_outputs[1:]

        intermediary_hidden_states = hidden_states + attn_output
        intermediary_hidden_states = intermediary_hidden_states + self.mlp(
            self.layernorm_2(intermediary_hidden_states)
        )

        if use_cache:
            outputs = (intermediary_hidden_states,) + outputs
        else:
            outputs = (intermediary_hidden_states,) + outputs[1:]

        return outputs  # hidden_states, (present), (attentions)


class BarkPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BarkConfig
    supports_gradient_checkpointing = False
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """

        # if the model has a _hf_hook, it has been offloaded so the device has to be found in the hook
        if not hasattr(self, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

        return get_parameter_device(self)


BARK_MODEL_START_DOCSTRING = """
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`{config}`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
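
    Example (an illustrative sketch; the sub-models are typically reached through a composite [`BarkModel`] rather
    than loaded standalone; `semantic`, `coarse_acoustics` and `fine_acoustics` are the attribute names used by
    this module):

    ```python
    >>> from transformers import BarkModel

    >>> model = BarkModel.from_pretrained("suno/bark-small")
    >>> semantic, coarse, fine = model.semantic, model.coarse_acoustics, model.fine_acoustics
    ```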
"""

BARK_START_DOCSTRING = """
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BarkConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
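
    Example (a typical end-to-end sketch, following the documented Bark usage):

    ```python
    >>> from transformers import AutoProcessor, BarkModel

    >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
    >>> model = BarkModel.from_pretrained("suno/bark-small")

    >>> inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")
    >>> audio_array = model.generate(**inputs)
    >>> audio_array = audio_array.cpu().numpy().squeeze()
    ```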
"""

BARK_FINE_INPUTS_DOCSTRING = r"""
    Args:
        codebook_idx (`int`):
            Index of the codebook that will be predicted.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is
            predicted recursively by attending the previously predicted channels. The model predicts on windows of
            length 1024.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
            `past_key_values` is used, optionally only the last `input_embeds` have to be input (see
            `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
            associated vectors than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
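
    For illustration, the fine model expects one channel per codebook; a trivially valid `input_ids` layout
    (shapes only, the 8 total codebooks being an assumption inherited from Bark's defaults) would be:

    ```python
    >>> import torch

    >>> batch_size, sequence_length, number_of_codebooks = 2, 1024, 8
    >>> input_ids = torch.zeros((batch_size, sequence_length, number_of_codebooks), dtype=torch.long)
    ```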
"""

BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `input_ids` of shape `(batch_size, sequence_length)`.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you
            have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds`
            is used in priority instead of `input_ids`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# GPT2-like autoregressive model
class BarkCausalModel(BarkPreTrainedModel, GenerationMixin):
    config_class = BarkSubModelConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # initialize as an autoregressive GPT-like model
        self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size)
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)

        self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.input_embeds_layer

    def set_input_embeddings(self, new_embeddings):
        self.input_embeds_layer = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        input_embeds = kwargs.get("input_embeds", None)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if past_key_values is not None:
            # Omit tokens covered by past_key_values
            seq_len = input_ids.shape[1]
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

            # input_embeds have already been used and are not required anymore
            input_embeds = None
        else:
            if input_embeds is not None and kwargs.get("use_cache"):
                seq_len = input_embeds.shape[1]
            else:
                seq_len = input_ids.shape[1]

        # ensure that attention_mask and position_ids shapes are aligned with the weird Bark hack of reducing
        # sequence length on the first forward pass
        if attention_mask is not None:
            attention_mask = attention_mask[:, :seq_len]
        if position_ids is not None:
            position_ids = position_ids[:, :seq_len]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
        else:
            position_ids = None

        if input_embeds is not None and kwargs.get("use_cache"):
            return {
                "input_ids": None,
                "input_embeds": input_embeds,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "position_ids": position_ids,
                "attention_mask": attention_mask,
            }
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    @add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Verify if input_embeds already exists, then compute embeddings.
        loss = None
        if labels is not None:
            raise NotImplementedError(
                "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model."
            )

        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")
        elif input_embeds is not None and past_key_values is None:
            # we want to return the input_embeds in priority so that it is in line with a weird hack
            # of Bark which concatenates two bits of the input_embeds on the first forward pass of the semantic model
            pass
        elif input_ids is not None:
            input_embeds = self.input_embeds_layer(input_ids)  # token embeddings of shape (b, t, n_embd)
        elif input_embeds is not None:
            pass
        else:
            raise ValueError("You have to specify either input_ids or input_embeds")

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[-1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.layers))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            if self._use_flash_attention_2:
                attention_mask = attention_mask if 0 in attention_mask else None
            else:
                attention_mask = attention_mask.view(batch_size, -1)
                # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
                # from_seq_length is 1 to easily broadcast
                attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        present_key_values = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, (block, past_layer_key_values) in enumerate(zip(self.layers, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    attention_mask,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                )
            else:
                outputs = block(
                    hidden_states,
                    past_key_values=past_layer_key_values,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = outputs[0]

            if use_cache:
                present_key_values = present_key_values + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.layernorm_final(hidden_states)

        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_head(hidden_states)

        if not return_dict:
            return tuple(
                v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=present_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        # Necessary for beam_search
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )


@add_start_docstrings(
    """Bark semantic (or text) model. It shares the same architecture as the coarse model.
    It is a GPT-2 like autoregressive model with a language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"),
)
class BarkSemanticModel(BarkCausalModel):
    base_model_prefix = "semantic"
    config_class = BarkSemanticConfig

    def generate(
        self,
        input_ids: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids, i.e. tokenized input sentences. Will be truncated up to
                semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as
                long as the longest generation among the batch.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            attention_mask (`Optional[torch.Tensor]`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
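
        Example (an illustrative sketch; it assumes the composite [`BarkModel`] generation config exposes its
        semantic part as the dict `semantic_config`):

        ```python
        >>> from transformers import BarkModel, BarkProcessor
        >>> from transformers.models.bark.generation_configuration_bark import BarkSemanticGenerationConfig

        >>> model = BarkModel.from_pretrained("suno/bark-small")
        >>> processor = BarkProcessor.from_pretrained("suno/bark-small")
        >>> inputs = processor("A quick test sentence.")

        >>> semantic_config = BarkSemanticGenerationConfig(**model.generation_config.semantic_config)
        >>> semantic_tokens = model.semantic.generate(
        ...     inputs["input_ids"], semantic_generation_config=semantic_config
        ... )
        ```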
        Returns:
            torch.LongTensor: Output semantic tokens.
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        batch_size = input_ids.shape[0]

        max_input_semantic_length = semantic_generation_config.max_input_semantic_length

        input_ids = input_ids + semantic_generation_config.text_encoding_offset

        if attention_mask is not None:
            input_ids = input_ids.masked_fill((1 - attention_mask).bool(), semantic_generation_config.text_pad_token)

        if history_prompt is not None:
            semantic_history = history_prompt["semantic_prompt"][-max_input_semantic_length:]
            semantic_history = nn.functional.pad(
                semantic_history,
                (0, max_input_semantic_length - len(semantic_history)),
                value=semantic_generation_config.semantic_pad_token,
                mode="constant",
            )
        else:
            semantic_history = torch.tensor(
                [semantic_generation_config.semantic_pad_token] * max_input_semantic_length, dtype=torch.int
            ).to(self.device)

        semantic_history = torch.repeat_interleave(semantic_history[None], batch_size, dim=0)

        infer_array = torch.tensor(
            [[semantic_generation_config.semantic_infer_token]] * batch_size, dtype=torch.int
        ).to(self.device)

        input_embeds = torch.cat(
            [
                self.input_embeds_layer(input_ids[:, :max_input_semantic_length])
                + self.input_embeds_layer(semantic_history[:, : max_input_semantic_length + 1]),
                self.input_embeds_layer(infer_array),
            ],
            dim=1,
        )

        tokens_to_suppress = list(
            range(semantic_generation_config.semantic_vocab_size, semantic_generation_config.semantic_pad_token)
        )
        tokens_to_suppress.extend(
            list(range(semantic_generation_config.semantic_pad_token + 1, self.config.output_vocab_size))
        )

        suppress_tokens_logits_processor = SuppressTokensLogitsProcessor(tokens_to_suppress, device=input_ids.device)

        min_eos_p = kwargs.get("min_eos_p", semantic_generation_config.min_eos_p)
        early_stopping_logits_processor = BarkEosPrioritizerLogitsProcessor(
            eos_token_id=semantic_generation_config.eos_token_id, min_eos_p=min_eos_p, device=input_ids.device
        )

        # pass input_ids in order to stay consistent with the transformers generate method even though it is not used
        # (except to get the input seq_len - that's why we keep the first 257 tokens)
        semantic_output = super().generate(
            torch.ones((batch_size, max_input_semantic_length + 1), dtype=torch.int).to(self.device),
            input_embeds=input_embeds,
            logits_processor=[suppress_tokens_logits_processor, early_stopping_logits_processor],
            generation_config=semantic_generation_config,
            **kwargs,
        )

        # take the generated semantic tokens
        semantic_output = semantic_output[:, max_input_semantic_length + 1 :]

        return semantic_output


@add_start_docstrings(
    """Bark coarse acoustics model.
    It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a
    language modeling head on top.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"),
)
class BarkCoarseModel(BarkCausalModel):
    base_model_prefix = "coarse_acoustics"
    config_class = BarkCoarseConfig

    def preprocess_histories(
        self,
        max_coarse_history: int,
        semantic_to_coarse_ratio: int,
        batch_size: int,
        semantic_generation_config: BarkSemanticGenerationConfig,
        codebook_size: int,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
    ):
        """
        Preprocess the optional `Bark` speaker prompts before `self.generate`.

        Args:
            max_coarse_history (`int`):
                Maximum size of coarse tokens used.
            semantic_to_coarse_ratio (`int`):
                Ratio of semantic to coarse frequency.
            batch_size (`int`):
                Batch size, i.e. the number of samples.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            codebook_size (`int`):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`):
                Optional `Bark` speaker prompt.
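
        As a worked example: with the rates used by Bark (assumed here to be 75 Hz coarse, 49.9 Hz semantic and
        2 coarse codebooks), `semantic_to_coarse_ratio = 75 / 49.9 * 2 ≈ 3.01`, so roughly three coarse tokens are
        generated per semantic token and `max_semantic_history ≈ max_coarse_history / 3`.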
        Returns:
            `tuple(torch.FloatTensor)`:
            - **x_semantic_history** (`torch.FloatTensor`) -- Processed semantic speaker prompt.
            - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt.
        """
        if history_prompt is not None:
            x_semantic_history = torch.repeat_interleave(history_prompt["semantic_prompt"][None], batch_size, dim=0)
            # clone to avoid modifying history_prompt.coarse_prompt
            x_coarse_history = history_prompt["coarse_prompt"].clone()

            # offset x_coarse_history
            if codebook_size is not None:
                for n in range(1, x_coarse_history.shape[0]):
                    # offset
                    x_coarse_history[n, :] += codebook_size * n

            # flatten x_coarse_history
            x_coarse_history = torch.transpose(x_coarse_history, 0, 1).reshape(-1)

            x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size

            x_coarse_history = torch.repeat_interleave(x_coarse_history[None], batch_size, dim=0)
            # e.g.: after SEMANTIC_VOCAB_SIZE (10000), 1024 tokens dedicated to the first codebook, the next 1024
            # tokens dedicated to the second codebook.

            max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
            # trim histories correctly
            n_semantic_hist_provided = min(
                [
                    max_semantic_history,
                    x_semantic_history.shape[1] - x_semantic_history.shape[1] % 2,
                    int(np.floor(x_coarse_history.shape[1] / semantic_to_coarse_ratio)),
                ]
            )

            n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))

            x_semantic_history = x_semantic_history[:, -n_semantic_hist_provided:].int()
            x_coarse_history = x_coarse_history[:, -n_coarse_hist_provided:].int()
            # bit of a hack for time alignment (sounds better) - from the Bark original implementation
            x_coarse_history = x_coarse_history[:, :-2]

        else:
            # shape: (batch_size, 0)
            x_semantic_history = torch.tensor([[]] * batch_size, dtype=torch.int).to(self.device)
            x_coarse_history = torch.tensor([[]] * batch_size, dtype=torch.int).to(self.device)

        return x_semantic_history, x_coarse_history

    def generate(
        self,
        semantic_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,
    ) -> Union[torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]]:
        """
        Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*):
                Input text semantic ids, i.e. the output of `BarkSemanticModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the output lengths. Useful when batching.
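
        Example (an illustrative continuation of the sketch from `BarkSemanticModel.generate`; the
        `coarse_acoustics_config` dict on the composite generation config is an assumption):

        ```python
        >>> from transformers.models.bark.generation_configuration_bark import BarkCoarseGenerationConfig

        >>> coarse_config = BarkCoarseGenerationConfig(**model.generation_config.coarse_acoustics_config)
        >>> coarse_tokens, output_lengths = model.coarse_acoustics.generate(
        ...     semantic_tokens,
        ...     semantic_generation_config=semantic_config,
        ...     coarse_generation_config=coarse_config,
        ...     return_output_lengths=True,
        ... )
        ```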
        Returns:
            By default:
                torch.LongTensor: Output coarse acoustics tokens.
            If `return_output_lengths=True`:
                `tuple(torch.Tensor, torch.Tensor)`: The output coarse acoustics tokens, and the length of each sample
                of the batch.
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        max_coarse_input_length = coarse_generation_config.max_coarse_input_length
        max_coarse_history = coarse_generation_config.max_coarse_history
        sliding_window_len = coarse_generation_config.sliding_window_len

        # replace semantic_pad_token (eos_tok and pad_tok here) with coarse_semantic_pad_token, i.e. the pad_token
        # used in the next model
        semantic_output.masked_fill_(
            semantic_output == semantic_generation_config.semantic_pad_token,
            coarse_generation_config.coarse_semantic_pad_token,
        )

        semantic_to_coarse_ratio = (
            coarse_generation_config.coarse_rate_hz
            / semantic_generation_config.semantic_rate_hz
            * coarse_generation_config.n_coarse_codebooks
        )
        max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))

        output_lengths = (semantic_output != coarse_generation_config.coarse_semantic_pad_token).sum(1)
        output_lengths = torch.floor(
            output_lengths * semantic_to_coarse_ratio / coarse_generation_config.n_coarse_codebooks
        )
        output_lengths = torch.round(output_lengths * coarse_generation_config.n_coarse_codebooks).int()

        max_generated_len = torch.max(output_lengths).item()

        batch_size = semantic_output.shape[0]

        x_semantic_history, x_coarse = self.preprocess_histories(
            history_prompt=history_prompt,
            max_coarse_history=max_coarse_history,
            semantic_to_coarse_ratio=semantic_to_coarse_ratio,
            batch_size=batch_size,
            semantic_generation_config=semantic_generation_config,
            codebook_size=codebook_size,
        )
        base_semantic_idx = x_semantic_history.shape[1]

        semantic_output = torch.hstack([x_semantic_history, semantic_output])

        n_window_steps = int(np.ceil(max_generated_len / sliding_window_len))

        total_generated_len = 0

        len_coarse_history = x_coarse.shape[1]

        for _ in range(n_window_steps):
            semantic_idx = base_semantic_idx + int(round(total_generated_len / semantic_to_coarse_ratio))

            # pad from right side
            input_coarse = semantic_output[:, np.max([0, semantic_idx - max_semantic_history]) :]
            input_coarse = input_coarse[:, :max_coarse_input_length]
            input_coarse = F.pad(
                input_coarse,
                (0, max_coarse_input_length - input_coarse.shape[-1]),
                "constant",
                coarse_generation_config.coarse_semantic_pad_token,
            )

            input_coarse = torch.hstack(
                [
                    input_coarse,
                    torch.tensor([[coarse_generation_config.coarse_infer_token]] * batch_size).to(self.device),
                    x_coarse[:, -max_coarse_history:],
                ]
            )

            alternatingLogitsProcessor = AlternatingCodebooksLogitsProcessor(
                input_coarse.shape[1],
                semantic_generation_config.semantic_vocab_size,
                codebook_size,
            )

            output_coarse = super().generate(
                input_coarse,
                logits_processor=[alternatingLogitsProcessor],
                max_new_tokens=min(sliding_window_len, max_generated_len - total_generated_len),
                generation_config=coarse_generation_config,
                **kwargs,
            )

            input_coarse_len = input_coarse.shape[1]

            x_coarse = torch.hstack([x_coarse, output_coarse[:, input_coarse_len:]])
            total_generated_len = x_coarse.shape[1] - len_coarse_history

            del output_coarse

        coarse_output = x_coarse[:, len_coarse_history:]

        if return_output_lengths:
            return coarse_output, output_lengths

        return coarse_output


@add_start_docstrings(
    """Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and
    language modeling heads, one for each codebook.""",
    BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"),
)
class BarkFineModel(BarkPreTrainedModel):
    base_model_prefix = "fine_acoustics"
    config_class = BarkFineConfig
    main_input_name = "codebook_idx"

    def __init__(self, config):
        # non-causal GPT-like model with one embedding layer and one lm_head for each codebook of Encodec
        super().__init__(config)
        self.config = config

        # initialize a modified non-causal GPT-like model
        self.input_embeds_layers = nn.ModuleList(
            [nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)]
        )
        self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)])
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.layernorm_final = nn.LayerNorm(config.hidden_size)

        self.lm_heads = nn.ModuleList(
            [
                nn.Linear(config.hidden_size, config.output_vocab_size, bias=False)
                for _ in range(config.n_codes_given, config.n_codes_total)
            ]
        )
        self.gradient_checkpointing = False
        self.n_codes_total = config.n_codes_total

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # one embedding layer for each codebook
        return self.input_embeds_layers

    def set_input_embeddings(self, new_embeddings):
        # one embedding layer for each codebook
        self.input_embeds_layers = new_embeddings

    def get_output_embeddings(self):
        # one lm_head for each codebook
        return self.lm_heads

    def set_output_embeddings(self, new_output_embeddings):
        # one lm_head for each codebook
        self.lm_heads = new_output_embeddings

    def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
        old_embeddings_list = self.get_input_embeddings()
        new_embeddings_list = nn.ModuleList(
            [
                self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
                for old_embeddings in old_embeddings_list
            ]
        )
        self.set_input_embeddings(new_embeddings_list)
        new_num_tokens = new_embeddings_list[0].weight.shape[0]

        # if word embeddings are not tied, make sure that lm heads are resized as well
        if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
            old_lm_head_list = self.get_output_embeddings()
            new_lm_head_list = nn.ModuleList(
                [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list]
            )
            self.set_output_embeddings(new_lm_head_list)

        return self.get_input_embeddings()

    def resize_token_embeddings(
        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
    ) -> nn.Embedding:
        """
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        """
        model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        if new_num_tokens is None and pad_to_multiple_of is None:
            return model_embeds

        # Update base model and current model config
        self.config.output_vocab_size = model_embeds[0].weight.shape[0]
        self.config.vocab_size = model_embeds[0].weight.shape[0]
        self.output_vocab_size = model_embeds[0].weight.shape[0]
        self.vocab_size = model_embeds[0].weight.shape[0]

        # Tie weights again if needed
        self.tie_weights()

        return model_embeds

    def _tie_weights(self):
        if getattr(self.config, "tie_word_embeddings", True):
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

    def tie_weights(self):
        """
        Tie the weights between the input embeddings list and the output embeddings list.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        """
        if getattr(self.config, "tie_word_embeddings", True):
            self._tied_weights_keys = []
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()

            for i in range(self.config.n_codes_total - self.config.n_codes_given):
                # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight
                self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1])
                self._tied_weights_keys.append(f"lm_heads.{i}.weight")

        for module in self.modules():
            if hasattr(module, "_tie_weights"):
                module._tie_weights()

    @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING)
    def forward(
        self,
        codebook_idx: int,  # an additional idx corresponding to the id of the codebook that will be predicted
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        input_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        if codebook_idx == 0:
            raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model")

        if input_ids is not None and input_embeds is not None:
            raise ValueError("You cannot specify both input_ids and input_embeds at the same time")

        if input_ids is None and input_embeds is None:
            raise ValueError("You have to specify either input_ids or input_embeds")

        if input_ids is not None:
            # the input_embeddings are the sum of the j previous codebooks embeddings before
            # the current codebook_idx codebook

            # forward the GPT model itself
            input_embeds = [
                input_embeds_layer(input_ids[:, :, i]).unsqueeze(-1)
                for i, input_embeds_layer in enumerate(self.input_embeds_layers)
            ]  # token embeddings of shape (b, t, n_embd)
            input_embeds = torch.cat(input_embeds, dim=-1)
            input_embeds = input_embeds[:, :, :, : codebook_idx + 1].sum(dim=-1)

        input_shape = input_embeds.size()[:-1]
        batch_size = input_embeds.shape[0]
        seq_length = input_shape[1]

        device = input_ids.device if input_ids is not None else input_embeds.device

        if position_ids is None:
            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)  # shape (1, seq_length)

        position_embeds = self.position_embeds_layer(position_ids)  # position embeddings of shape (1, t, n_embd)

        # Attention mask.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            if self._use_flash_attention_2:
                attention_mask = attention_mask if 0 in attention_mask else None
            else:
                # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
                # from_seq_length is 1 to easily broadcast
                attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1)

        head_mask = self.get_head_mask(head_mask, self.config.num_layers)

        hidden_states = self.drop(input_embeds + position_embeds)
        output_shape = input_shape + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        for i, block in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.layernorm_final(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states)

        if not return_dict:
            return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None)

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def generate(
        self,
        coarse_output: torch.Tensor,
        semantic_generation_config: BarkSemanticGenerationConfig = None,
        coarse_generation_config: BarkCoarseGenerationConfig = None,
        fine_generation_config: BarkFineGenerationConfig = None,
        codebook_size: int = 1024,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker
        prompt.

        Args:
            coarse_output (`torch.Tensor` of shape (batch_size, seq_len)):
                Input coarse acoustics ids, i.e. the output of `BarkCoarseModel.generate`.
            semantic_generation_config (`BarkSemanticGenerationConfig`):
                Generation config indicating how to generate the semantic tokens.
            coarse_generation_config (`BarkCoarseGenerationConfig`):
                Generation config indicating how to generate the coarse tokens.
            fine_generation_config (`BarkFineGenerationConfig`):
                Generation config indicating how to generate the fine tokens.
            codebook_size (`int`, *optional*, defaults to 1024):
                Codebook channel size, i.e. the size of the output vocabulary per codebook channel.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt.
        Returns:
            torch.LongTensor: Output fine acoustics tokens.
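
        Example: a minimal sketch, assuming `coarse_output` comes from `BarkCoarseModel.generate` and that the
        sub-model generation configs were built from the nested `model.generation_config`, as `BarkModel.generate`
        does internally:

        ```python
        >>> fine_output = model.fine_acoustics.generate(
        ...     coarse_output,
        ...     semantic_generation_config=semantic_generation_config,
        ...     coarse_generation_config=coarse_generation_config,
        ...     fine_generation_config=fine_generation_config,
        ...     codebook_size=model.generation_config.codebook_size,
        ...     history_prompt=history_prompt,
        ... )
        ```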
        """
        if semantic_generation_config is None:
            raise ValueError("`semantic_generation_config` has to be provided")

        if coarse_generation_config is None:
            raise ValueError("`coarse_generation_config` has to be provided")

        if fine_generation_config is None:
            raise ValueError("`fine_generation_config` has to be provided")

        # since we don't really use GenerationConfig through the fine model (autoencoder),
        # manually impose the kwargs priority over the generation config
        temperature = kwargs.get("temperature", fine_generation_config.temperature)

        max_fine_history_length = fine_generation_config.max_fine_history_length
        max_fine_input_length = fine_generation_config.max_fine_input_length

        # shape: (batch, n_coarse_codebooks * seq_len)
        # new_shape: (batch, seq_len, n_coarse_codebooks)
        coarse_output = coarse_output.view(coarse_output.shape[0], -1, coarse_generation_config.n_coarse_codebooks)

        # brings ids into the range [0, codebook_size - 1]
        coarse_output = torch.remainder(coarse_output - semantic_generation_config.semantic_vocab_size, codebook_size)
        batch_size = coarse_output.shape[0]

        if history_prompt is not None:
            # transpose to get to shape (batch_size, seq_len, n_fine_codebooks)
            x_fine_history = torch.repeat_interleave(history_prompt["fine_prompt"].T[None], batch_size, dim=0)
        else:
            x_fine_history = None

        n_coarse = coarse_generation_config.n_coarse_codebooks

        # pad the remaining fine codebook channels
        fine_input = F.pad(
            coarse_output,
            (0, fine_generation_config.n_fine_codebooks - n_coarse),
            "constant",
            codebook_size,
        )

        # prepend the history, if any (at most max_fine_history_length tokens)
        if x_fine_history is not None:
            fine_input = torch.cat([x_fine_history[:, -max_fine_history_length:, :], fine_input], dim=1)

            # length of the fine history that has been added to fine_input
            n_history = x_fine_history[:, -max_fine_history_length:, :].shape[1]
        else:
            n_history = 0

        n_remove_from_end = 0
        # need to pad if the sequence is too short (since the model is non-causal)
        if fine_input.shape[1] < max_fine_input_length:
            n_remove_from_end = max_fine_input_length - fine_input.shape[1]
            fine_input = F.pad(fine_input, (0, 0, 0, n_remove_from_end), mode="constant", value=codebook_size)

        # if padding was needed, n_loops is 1 (because n_remove_from_end > 0); otherwise we loop at
        # least twice, sliding the prediction window by max_fine_history_length tokens each time
        n_loops = (coarse_output.shape[1] - (max_fine_input_length - n_history)) / max_fine_history_length
        n_loops = int(np.ceil(n_loops))
        n_loops = max(0, n_loops) + 1

        for n_outer in range(n_loops):
            start_idx = min([n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_input_length])

            start_fill_idx = min(
                [n_history + n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_history_length]
            )
            rel_start_fill_idx = start_fill_idx - start_idx
            input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :]
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                logits = self.forward(n_inner, input_buffer).logits
                if temperature is None or temperature == 1.0:
                    relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size]
                    codebook_preds = torch.argmax(relevant_logits, -1)
                else:
                    relevant_logits = logits[:, :, :codebook_size] / temperature
                    # apply softmax
                    probs = F.softmax(relevant_logits, dim=-1)[:, rel_start_fill_idx:max_fine_input_length]
                    # reshape to 2D: (batch_size, seq_len, codebook_size) -> (batch_size * seq_len, codebook_size)
                    probs = probs.reshape((-1, codebook_size))
                    # multinomial then reshape: (batch_size * seq_len) -> (batch_size, seq_len)
                    codebook_preds = torch.multinomial(probs, num_samples=1).view(batch_size, -1)
                codebook_preds = codebook_preds.to(torch.int32)
                input_buffer[:, rel_start_fill_idx:, n_inner] = codebook_preds
                del logits, codebook_preds

            # transfer the predictions from the buffer into fine_input
            for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks):
                fine_input[
                    :, start_fill_idx : start_fill_idx + (max_fine_input_length - rel_start_fill_idx), n_inner
                ] = input_buffer[:, rel_start_fill_idx:, n_inner]
            del input_buffer

        fine_input = fine_input.transpose(1, 2)[:, :, n_history:]
        if n_remove_from_end > 0:
            fine_input = fine_input[:, :, :-n_remove_from_end]

        if fine_input.shape[-1] != coarse_output.shape[-2]:
            raise ValueError("input and output should have the same seq_len")

        return fine_input


@add_start_docstrings(
    """
    The full Bark model, a text-to-speech model composed of 4 sub-models:
    - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that
    takes tokenized text as input, and predicts semantic text tokens that capture the meaning of the text.
    - [`BarkCoarseModel`] (also referred to as the 'coarse acoustics' model), also a causal autoregressive transformer,
    that takes as input the results of the last model. It aims at regressing the first two audio codebooks necessary
    to `encodec`.
    - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively
    predicts the last codebooks based on the sum of the previous codebooks embeddings.
    - having predicted all the codebook channels, Bark uses the [`EncodecModel`] to decode the output audio
      array.

    It should be noted that each of the first three modules can support conditional speaker embeddings to condition the
    output sound according to a specific predefined voice.
    """,
    BARK_START_DOCSTRING,
)
class BarkModel(BarkPreTrainedModel):
    config_class = BarkConfig

    def __init__(self, config):
        super().__init__(config)

        self.semantic = BarkSemanticModel(config.semantic_config)
        self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
        self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)

        self.codec_model = AutoModel.from_config(config.codec_config)

        self.config = config

    @property
    def device(self) -> torch.device:
        """
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        """
        # for BarkModel, the device must be verified on its sub-models: if the semantic model carries
        # a `_hf_hook` it has been offloaded, so the device has to be found in the hook
        if not hasattr(self.semantic, "_hf_hook"):
            return get_parameter_device(self)
        for module in self.semantic.modules():
            if (
                hasattr(module, "_hf_hook")
                and hasattr(module._hf_hook, "execution_device")
                and module._hf_hook.execution_device is not None
            ):
                return torch.device(module._hf_hook.execution_device)

    def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
        """
        Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
        method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains on the GPU until
        the next sub-model runs.

        Args:
            gpu_id (`int`, *optional*, defaults to 0):
                GPU id on which the sub-models will be loaded and offloaded.
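
        Example: a minimal usage sketch, assuming a CUDA device is available and `accelerate` is installed:

        ```python
        >>> from transformers import BarkModel

        >>> model = BarkModel.from_pretrained("suno/bark-small")
        >>> model.enable_cpu_offload(gpu_id=0)
        >>> # each sub-model is now moved to the GPU only while it runs, then offloaded back to CPU
        ```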
        """
        if is_accelerate_available():
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate`.")

        device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu")
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        # this layer is used outside the first forward pass of semantic, so it needs to be loaded before semantic
        self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)

        hook = None
        for cpu_offloaded_model in [
            self.semantic,
            self.coarse_acoustics,
            self.fine_acoustics,
        ]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        self.fine_acoustics_hook = hook

        _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)

        # the codec model is offloaded manually at the end of `generate`
        self.codec_model_hook = hook

    def codec_decode(self, fine_output, output_lengths=None):
        """Turn quantized audio codes into audio array using encodec."""

        fine_output = fine_output.transpose(0, 1)
        emb = self.codec_model.quantizer.decode(fine_output)

        if output_lengths is not None:
            # encodec uses LSTMs, which behave differently with appended padding;
            # decoding takes around 0.1% of the total generation time, so to keep
            # generation quality we break batching and decode sample by sample
            out = [sample[:, :length].unsqueeze(0) for (sample, length) in zip(emb, output_lengths)]
            audio_arr = [self.codec_model.decoder(sample).squeeze() for sample in out]
        else:
            out = self.codec_model.decoder(emb)
            audio_arr = out.squeeze(1)  # squeeze the codebook dimension

        return audio_arr

    @torch.no_grad()
    def generate(
        self,
        input_ids: Optional[torch.Tensor] = None,
        history_prompt: Optional[Dict[str, torch.Tensor]] = None,
        return_output_lengths: Optional[bool] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generates audio from an input prompt and an additional optional `Bark` speaker prompt.

        Args:
            input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
                Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
                longest generation among the batch.
            history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*):
                Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
            kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:

                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
                - With a *semantic_*, *coarse_*, *fine_* prefix, they will be passed to the `generate` method of the
                semantic, coarse and fine sub-models respectively. They take priority over the keywords without a
                prefix (see the example below).

                This means you can, for example, specify a generation strategy for all sub-models except one.
            return_output_lengths (`bool`, *optional*):
                Whether or not to return the waveform lengths. Useful when batching.
        Returns:
            By default:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
            When `return_output_lengths=True`:
                Returns a tuple made of:
                - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform.
                - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch.
        Example:

        ```python
        >>> from transformers import AutoProcessor, BarkModel

        >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
        >>> model = BarkModel.from_pretrained("suno/bark-small")

        >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
        >>> voice_preset = "v2/en_speaker_6"

        >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)

        >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
        >>> audio_array = audio_array.cpu().numpy().squeeze()
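        >>> # A hedged sketch of per-sub-model overrides (hypothetical values): kwargs prefixed with
        >>> # `semantic_`, `coarse_` or `fine_` are routed to that sub-model's `generate` method
        >>> audio_array = model.generate(**inputs, coarse_temperature=0.8, fine_temperature=0.5)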
        ```
        """
        semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
        coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
        fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)

        kwargs_semantic = {
            # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
            "attention_mask": kwargs.pop("attention_mask", None),
            "min_eos_p": kwargs.pop("min_eos_p", None),
        }
        kwargs_coarse = {}
        kwargs_fine = {}
        for key, value in kwargs.items():
            if key.startswith("semantic_"):
                key = key[len("semantic_") :]
                kwargs_semantic[key] = value
            elif key.startswith("coarse_"):
                key = key[len("coarse_") :]
                kwargs_coarse[key] = value
            elif key.startswith("fine_"):
                key = key[len("fine_") :]
                kwargs_fine[key] = value
            else:
                # if the key is already in a sub-model specific config, it has been set with a
                # sub-model specific value and we don't override it
                if key not in kwargs_semantic:
                    kwargs_semantic[key] = value
                if key not in kwargs_coarse:
                    kwargs_coarse[key] = value
                if key not in kwargs_fine:
                    kwargs_fine[key] = value

        # 1. Generate from the semantic model
        if "generation_config" in kwargs_semantic:
            kwargs_semantic.pop("generation_config")
        semantic_output = self.semantic.generate(
            input_ids,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            **kwargs_semantic,
        )

        # 2. Generate from the coarse model
        if "generation_config" in kwargs_coarse:
            kwargs_coarse.pop("generation_config")
        coarse_output = self.coarse_acoustics.generate(
            semantic_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            codebook_size=self.generation_config.codebook_size,
            return_output_lengths=return_output_lengths,
            **kwargs_coarse,
        )

        output_lengths = None
        if return_output_lengths:
            coarse_output, output_lengths = coarse_output
            # (batch_size, seq_len * coarse_codebooks) -> (batch_size, seq_len)
            output_lengths = output_lengths // coarse_generation_config.n_coarse_codebooks

        # 3. "Generate" from the fine model
        if "generation_config" in kwargs_fine:
            kwargs_fine.pop("generation_config")
        output = self.fine_acoustics.generate(
            coarse_output,
            history_prompt=history_prompt,
            semantic_generation_config=semantic_generation_config,
            coarse_generation_config=coarse_generation_config,
            fine_generation_config=fine_generation_config,
            codebook_size=self.generation_config.codebook_size,
            **kwargs_fine,
        )

        if getattr(self, "fine_acoustics_hook", None) is not None:
            # manually offload fine_acoustics to CPU and load codec_model to GPU,
            # since bark doesn't use codec_model's forward pass
            self.fine_acoustics_hook.offload()
            self.codec_model = self.codec_model.to(self.device)

        # 4. Decode the output and generate audio array
        audio = self.codec_decode(output, output_lengths)

        if getattr(self, "codec_model_hook", None) is not None:
            # offload codec_model to CPU
            self.codec_model_hook.offload()

        if return_output_lengths:
            output_lengths = [len(sample) for sample in audio]
            audio = nn.utils.rnn.pad_sequence(audio, batch_first=True, padding_value=0)
            return audio, output_lengths

        return audio

    @classmethod
    def _check_and_enable_flash_attn_2(
        cls,
        config,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        hard_check_only: bool = False,
        check_device_map: bool = False,
    ):
        """
        `_check_and_enable_flash_attn_2` originally doesn't expand flash attention enabling to the model
        sub-configurations. We override the original method to make sure that Bark sub-models are using Flash Attention
        if necessary.

        If you don't know about Flash Attention, check out the official repository of flash attention:
        https://github.com/Dao-AILab/flash-attention

        To use Flash Attention 1.0, you can do it directly via the `BetterTransformer` API; have a look at this
        specific section of the documentation to learn more about it:
        https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models
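
        Example of opting in at load time (a usage sketch, assuming a CUDA device, half precision and that the
        `flash-attn` package is installed):

        ```python
        >>> import torch
        >>> from transformers import BarkModel

        >>> model = BarkModel.from_pretrained(
        ...     "suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2"
        ... ).to("cuda")
        ```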

        The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
        half precision and not run on CPU.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute
        `_attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
        """
        config = super()._check_and_enable_flash_attn_2(
            config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
        )

        config.semantic_config._attn_implementation = config._attn_implementation
        config.coarse_acoustics_config._attn_implementation = config._attn_implementation
        config.fine_acoustics_config._attn_implementation = config._attn_implementation
        return config