
    sg;                       d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z"  e       rddl#m$Z$  ejJ                  e&      Z'dZ(e G d de             Z)e G d de             Z*e G d de             Z+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d dejX                        Z0 G d d ejX                        Z1 G d! d"ejX                        Z2d# Z3dGd$Z4 G d% d&ejX                        Z5d'e	jl                  d(e7d)e	jl                  fd*Z8 G d+ d,ejX                        Z9 G d- d.e9      Z: G d/ d0e9      Z;e9e:e;d1Z< G d2 d3ejX                        Z= G d4 d5ejX                        Z> G d6 d7ejX                        Z? G d8 d9ejX                        Z@ G d: d;ejX                        ZA G d< d=ejX                        ZB G d> d?ejX                        ZC G d@ dAe      ZDdBZEdCZF edDeE       G dE dFeD             ZGy)HzPyTorch Mimi model.    N)	dataclass)ListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheSlidingWindowCacheStaticCache)AttentionMaskConverter)BaseModelOutputWithPast)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardis_flash_attn_2_available#is_flash_attn_greater_or_equal_2_10loggingreplace_return_docstrings   )
MimiConfig)_flash_attention_forwardr   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ
eeeeej                     f      ed<   dZeeeeej                     f      ed<   y)
MimiOutputa  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*)
            Decoded audio values, obtained using the decoder part of Mimi.
        encoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
        decoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
    Naudio_codesaudio_valuesencoder_past_key_valuesdecoder_past_key_values)__name__
__module____qualname____doc__r   torch
LongTensor__annotations__r   FloatTensorr    r   r   r   r   r!        Y/var/www/html/venv/lib/python3.12/site-packages/transformers/models/mimi/modeling_mimi.pyr   r   4   ss    0 %)K!!(&*L%##*OSXeE48I8I3J,J&KLSOSXeE48I8I3J,J&KLSr+   r   c                   l    e Zd ZU dZdZej                  ed<   dZe	e
eeej                     f      ed<   y)MimiEncoderOutputaY  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        encoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
    Nr   r    )r"   r#   r$   r%   r   r&   r'   r(   r    r   r   r   r   r)   r*   r+   r,   r.   r.   T   s>     %)K!!(OSXeE48I8I3J,J&KLSr+   r.   c                   l    e Zd ZU dZdZej                  ed<   dZe	e
eeej                     f      ed<   y)MimiDecoderOutputaU  
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Mimi.
        decoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
    Nr   r!   )r"   r#   r$   r%   r   r&   r)   r(   r!   r   r   r   r   r*   r+   r,   r0   r0   h   s>     '+L%##*OSXeE48I8I3J,J&KLSr+   r0   c                        e Zd ZdZ	 	 	 	 	 ddededededededef fd	Zd
 Zd Zde	j                  de	j                  fdZedde	j                  deeef   dedefd       Zd Z xZS )
MimiConv1dz;Conv1d with asymmetric or causal padding and normalization.in_channelsout_channelskernel_sizestridedilationgroupsbiasc
           	      b   t         
|           |j                  | _        ||j                  n|| _        |dkD  r$|dkD  rt
        j                  d| d| d| d       t        j                  |||||||	      | _	        | j                  j                  d   }t        j                  | j                  j                  d   t        j                        }| j                  j                  d   }t        j                  |dz
  |z  dz   t        j                        }| j!                  d	|d
       | j!                  d|d
       | j!                  dt        j                  ||z
  t        j                        d
       | j"                  dz  | _        | j"                  | j$                  z
  | _        y )Nr   zNMimiConv1d has been initialized with stride > 1 and dilation > 1 (kernel_size=z stride=z, dilation=).)r7   r8   r9   r   dtyper6   F
persistentr5   padding_total   )super__init__use_causal_convcausalpad_modeloggerwarningr   Conv1dconvr5   r&   tensorr6   int64r7   register_bufferr@   padding_rightpadding_left)selfconfigr3   r4   r5   r6   r7   r8   rF   r9   	__class__s             r,   rC   zMimiConv1d.__init__   s    	,,+3+; A:(Q,NN!!,XfX[
RTV
 II{FXV\cg
	 ii++A.dii..q1E99%%a( llK!Ox#?!#C5;;WXv%@]KEJ_ell;;OW\WbWb.cpuv "//14 ..1C1CCr+   c                     t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         y Nweight_normr   utilsrU   hasattrparametrizationsrJ   rP   rU   s     r,   apply_weight_normzMimiConv1d.apply_weight_norm   F    hh**288,,m<((33??KDIIr+   c                 V    t         j                  j                  | j                         y Nr   rW   remove_weight_normrJ   rP   s    r,   r`   zMimiConv1d.remove_weight_norm       
##DII.r+   hidden_statesreturnc                 >   |j                   d   }|| j                  z
  | j                  z   | j                  z  dz   }t	        j
                  |      j                  t        j                        dz
  }|| j                  z  | j                  z   | j                  z
  }||z
  S )zSee `pad_for_conv1d`.r   )shaper5   r@   r6   r&   ceiltorL   )rP   rc   lengthn_framesideal_lengths        r,   _get_extra_padding_for_conv1dz(MimiConv1d._get_extra_padding_for_conv1d   s    
 $$R(T---0B0BBdkkQTUU::h'**5;;7!;$++-0@0@@4CUCUUf$$r+   paddingsmodevaluec                 l   | j                   d   }|\  }}|dk(  s"t        j                  j                  | |||      S t	        ||      }d}||k  r*||z
  dz   }t        j                  j                  | d|f      } t        j                  j                  | |||      }	|	j                   d   |z
  }
|	dd|
f   S )zTiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        rf   reflectr   r   .N)rg   r   
functionalpadmax)rc   rn   ro   rp   rj   rO   rN   max_pad	extra_padpaddedends              r,   _pad1dzMimiConv1d._pad1d   s     $$R(&.#my ==$$]HdEJJlM2	W&(1,IMM--ma^LM""=(D%Hll2*c4C4i  r+   c                 &   | j                  |      }| j                  r+| j                  || j                  |f| j                        }n7| j                  || j
                  | j                  |z   f| j                        }| j                  |      }|S )N)ro   )rm   rE   rz   r@   rF   rO   rN   rJ   )rP   rc   extra_paddings      r,   forwardzMimiConv1d.forward   s    ::=I;; KK8J8JM7ZaeananKoM KK 1 143E3E3UV]a]j]j ( M 		-0r+   )r   r   r   NT)zero        )r"   r#   r$   r%   intboolrC   r[   r`   r&   Tensorrm   staticmethodr   strfloatrz   r}   __classcell__rR   s   @r,   r2   r2   |   s    E (D (D 	(D
 (D (D (D (D (DT/
%||
% 

% !ell !eCHo !S !bg ! !$r+   r2   c                   R     e Zd ZdZ	 	 	 ddededededef
 fdZd Zd	 Zd
 Z xZ	S )MimiConvTranspose1dzDConvTranspose1d with asymmetric or causal padding and normalization.r3   r4   r5   r6   r8   c                    t         	|           |j                  | _        |j                  | _        t        j                  ||||||      | _        | j                  s| j                  dk(  st        d      | j                  j                  d   }| j                  j                  d   }||z
  }| j                  r(t        j                  || j                  z        | _        n
|dz  | _        || j                  z
  | _        y )N)r8   r9         ?zB`trim_right_ratio` != 1.0 only makes sense for causal convolutionsr   rA   )rB   rC   rD   rE   trim_right_ratior   ConvTranspose1drJ   
ValueErrorr5   r6   mathrh   rN   rO   )
rP   rQ   r3   r4   r5   r6   r8   r9   r@   rR   s
            r,   rC   zMimiConvTranspose1d.__init__   s     	,, & 7 7&&{L+v^dkop	t44;abbii++A.!!!$#f, ;; "&=4;P;P+P!QD "/!!3D)D,>,>>r+   c                     t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         y rT   rV   rZ   s     r,   r[   z%MimiConvTranspose1d.apply_weight_norm
  r\   r+   c                 V    t         j                  j                  | j                         y r^   r_   ra   s    r,   r`   z&MimiConvTranspose1d.remove_weight_norm  rb   r+   c                     | j                  |      }|j                  d   | j                  z
  }|d| j                  |f   }|S )Nrf   .)rJ   rg   rN   rO   )rP   rc   ry   s      r,   r}   zMimiConvTranspose1d.forward  sM    		-0 !!"%(:(::%c4+<+<s+B&BCr+   )r   r   T)
r"   r#   r$   r%   r   rC   r[   r`   r}   r   r   s   @r,   r   r      sX    N "? "? 	"?
 "? "? "?H/r+   r   c                   <     e Zd ZdZdededee   f fdZd Z xZ	S )MimiResnetBlockz;
    Residual block from SEANet model as used by Mimi.
    rQ   dim	dilationsc           	          t         |           |j                  df}t        |      t        |      k7  rt	        d      ||j
                  z  }g }t        t        ||            D ]R  \  }\  }}	|dk(  r|n|}
|t        |      dz
  k(  r|n|}|t        j                         gz  }|t        ||
|||	      gz  }T t        j                  |      | _        |j                  rt        |||d      | _        y t        j                         | _        y )Nr   z7Number of kernel sizes should match number of dilationsr   )r7   )r5   )rB   rC   residual_kernel_sizelenr   compress	enumeratezipr   ELUr2   
ModuleListblockuse_conv_shortcutshortcutIdentity)rP   rQ   r   r   kernel_sizeshiddenr   ir5   r7   in_chsout_chsrR   s               r,   rC   zMimiResnetBlock.__init__#  s   33Q7|I.VWW'*3Ci4P*Q 	[&A&XFSF#l"3a"77cVGbffhZEj+PXYZZE		[
 ]]5)
##&vsCQGDMKKMDMr+   c                 `    |}| j                   D ]
  } ||      } | j                  |      |z   S r^   )r   r   )rP   rc   residuallayers       r,   r}   zMimiResnetBlock.forward7  s:     ZZ 	1E!-0M	1 }}X&66r+   )
r"   r#   r$   r%   r   r   r   rC   r}   r   r   s   @r,   r   r     s+    *z * *S	 *(7r+   r   c                   .     e Zd ZdZdef fdZd Z xZS )MimiEncoderzSEANet encoder as used by Mimi.rQ   c           	      ~   t         |           t        ||j                  |j                  |j
                        g}d}t        |j                        D ]  }||j                  z  }t        |j                        D ]"  }|t        |||j                  |z  dg      gz  }$ |t        j                         gz  }|t        |||dz  |dz  |      gz  }|dz  } |t        j                         gz  }|t        |||j                  z  |j                  |j                        gz  }t        j                   |      | _        y )Nr   rA   r5   r6   )rB   rC   r2   audio_channelsnum_filtersr5   reversedupsampling_ratiosrangenum_residual_layersr   dilation_growth_rater   r   hidden_sizelast_kernel_sizer   layers)rP   rQ   modelscalingratiocurrent_scalejrR   s          r,   rC   zMimiEncoder.__init__B  s@   FF$9$96;M;MvOaOabc f667 	E#f&8&88M6556 g/&-&B]B]_`B`bcAdeffg bffhZEj8IW\_`W`inoppEqLG	 	"&&(*VWv/A/A%A6CUCUW]WnWnoppmmE*r+   c                 8    | j                   D ]
  } ||      } |S r^   r   rP   rc   r   s      r,   r}   zMimiEncoder.forwardX  %    [[ 	1E!-0M	1r+   r"   r#   r$   r%   r   rC   r}   r   r   s   @r,   r   r   ?  s    )+z +,r+   r   c                   B     e Zd ZdZ fdZdej                  fdZ xZS )MimiLayerScalezLayer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally the residual outputs close to 0, with a learnt scale.
    c                     t         |           |j                  }|j                  }t	        j
                  t        j                  |f|d            | _        y )NT)requires_grad)	rB   rC   r   layer_scale_initial_scaler   	Parameterr&   fullscale)rP   rQ   channelsinitial_scalerR   s       r,   rC   zMimiLayerScale.__init__c  sD    %%88\\%**h[-W["\]
r+   xc                      | j                   |z  S r^   )r   )rP   r   s     r,   r}   zMimiLayerScale.forwardi  s    zzA~r+   )	r"   r#   r$   r%   rC   r&   r   r}   r   r   s   @r,   r   r   ^  s    ^ r+   r   c                   N     e Zd Zd fd	Z ej
                         d        Z xZS )MimiRotaryEmbeddingc                 J   t         |           || _        || _        || _        d| j                  t        j                  d| j                  dt
        j                        j                         j                  |      | j                  z  z  z  }| j                  d|d       y )Nr   r   rA   r<   inv_freqFr>   )rB   rC   r   max_position_embeddingsbaser&   arangerL   r   ri   rM   )rP   r   r   r   devicer   rR   s         r,   rC   zMimiRotaryEmbedding.__init__o  s    '>$	$))Q!5;;(W(](](_(b(bci(jmqmumu(uvwZeDr+   c                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }|j                  j
                  }t        |t              r|dk7  r|nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         }|j                         }	d d d        j                  |j                  
      	j                  |j                  
      fS # 1 sw Y   AxY w)Nr   rf   r   mpscpuF)device_typeenabledrA   r   r<   )r   r   expandrg   r   type
isinstancer   r&   autocast	transposecatcossinri   r=   )
rP   r   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r,   r}   zMimiRotaryEmbedding.forwardx  s%   
 !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @ hhmm%/S%AkUZFZk`e^^UC 	&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')C'')C		
 vvAGGv$cff177f&;;;	 	s   !A+EE)i   i'  N)r"   r#   r$   rC   r&   no_gradr}   r   r   s   @r,   r   r   n  s%    E U]]_< <r+   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrf   rA   r   )rg   r&   r   )r   x1x2s      r,   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r+   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r,   apply_rotary_pos_embr     sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr+   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MimiMLPc                 $   t         |           || _        t        |j                     | _        t        j                  |j                  |j                  d      | _
        t        j                  |j                  |j                  d      | _        y )NFr9   )rB   rC   rQ   r
   
hidden_actactivation_fnr   Linearr   intermediate_sizefc1fc2rP   rQ   rR   s     r,   rC   zMimiMLP.__init__  sj    #F$5$5699V//1I1IPUV99V55v7I7IPUVr+   rc   rd   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r^   )r   r   r   )rP   rc   s     r,   r}   zMimiMLP.forward  s4    /**=9/r+   )r"   r#   r$   rC   r&   r   r}   r   r   s   @r,   r   r     s$    WU\\ ell r+   r   rc   n_reprd   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rg   r   reshape)rc   r  batchnum_key_value_headsslenhead_dims         r,   	repeat_kvr
    so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr+   c                   ,    e Zd ZdZddedee   f fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee   d	ed
edeej                     deej                  eej                     eeej                        f   fdZ xZS )MimiAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrQ   	layer_idxc                 j   t         |           || _        || _        |-t        j                  d| j                  j                   d       |j                  | _        |j                  | _	        |j                  | _        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  | _        |j                   | _        d| _        dt%        j&                  |j                        z  | _        | j                  | j                  z  dk7  r&t+        d| j                   d| j                   d      t-        j.                  | j                  | j                  | j                  z  |j0                  	      | _        t-        j.                  | j                  | j                  | j                  z  |j0                  	      | _        t-        j.                  | j                  | j                  | j                  z  |j0                  	      | _        t-        j.                  | j                  | j                  z  | j                  |j0                  	      | _        t;        | j                  | j                  | j                   
      | _        |j>                  | _        y )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tr   r   z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: r;   r   )r   r   ) rB   rC   rQ   r  rG   warning_oncerR   r"   attention_dropoutr   num_attention_heads	num_headsr	  r  num_key_value_groupsr   
rope_theta	is_causalr   sqrtr   r   r   r   attention_biasq_projk_projv_projo_projr   
rotary_embsliding_windowrP   rQ   r  rR   s      r,   rC   zMimiAttention.__init__  s   " !8!8 9 :, , "(!9!9!--33#)#=#= $(NNd6N6N$N!'-'E'E$ ++499V__55dnn,1QRVRbRbQc$T^^$4B8 
 ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii >@P@PW]WlWlm-MM$($@$@

 %33r+   rc   attention_maskr   past_key_valueoutput_attentions	use_cachecache_positionrd   c                    |j                         \  }}	}
| j                  |      }| j                  |      }| j                  |      }|j	                  ||	| j
                  | j                        j                  dd      }|j	                  ||	| j                  | j                        j                  dd      }|j	                  ||	| j                  | j                        j                  dd      }| j                  ||      \  }}t        ||||      \  }}|'|||d}|j                  ||| j                  |      \  }}t        || j                        }t        || j                        }t        j                   ||j                  dd            | j"                  z  }|#|d d d d d d d |j$                  d   f   }||z   }t&        j(                  j+                  |dt        j,                        j/                  |j0                        }t&        j(                  j3                  || j4                  | j6                        }t        j                   ||      }|j                         || j
                  |	| j                  fk7  r7t9        d	|| j
                  |	| j                  f d
|j                                |j                  dd      j;                         }|j	                  ||	d      }| j=                  |      }|sd }|||fS )Nr   rA   r   r   r#  r	   rf   )r   r=   )ptrainingz `attn_output` should be of size z	, but is )sizer  r  r  viewr  r	  r   r  r  r   updater  r
  r  r&   matmulr   rg   r   rs   softmaxfloat32ri   r=   dropoutr  r(  r   
contiguousr  )rP   rc   r  r   r   r!  r"  r#  bszq_len_query_states
key_statesvalue_statesr   r   cache_kwargsattn_weightscausal_maskattn_outputs                       r,   r}   zMimiAttention.forward  s    &**,UA{{=1[[/
{{=1#((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm??<>S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$Jz4+D+DE
 t/H/HI||L*2F2Fq!2LMPTP\P\\%(Aq2HJ4D4DR4H2H)HIK'+5L }},,\r,WZZ[g[m[mn}},,\T=S=S^b^k^k,lll<>#t~~udmm!LL2CPTP]P]3^2_ `$$&') 
 "++Aq1<<>!&&sE26kk+. LL.88r+   r^   NNNFFN)r"   r#   r$   r%   r   r   r   rC   r&   r   r'   r   r   r   r}   r   r   s   @r,   r  r    s    G%4z %4hsm %4T 2637*."'5989||89 !.89 u//0	89
 !89  89 89 !!1!1289 
u||Xell3XeELL>Q5RR	S89r+   r  c                       e Zd ZdZ fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee	   de
de
d	eej                     d
eej                  eej                     eeej                        f   fdZ xZS )MimiFlashAttention2aD  
    Mimi flash attention module. This module inherits from `MimiAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 D    t        |   |i | t                | _        y r^   )rB   rC   r   _flash_attn_uses_top_left_mask)rP   argskwargsrR   s      r,   rC   zMimiFlashAttention2.__init__:  s&    $)&)
 3V2W.W+r+   rc   r  r   r   r!  r"  r#  rd   c                    t        |t              rt        d      d}|j                         \  }}	}
| j	                  |      }| j                  |      }| j                  |      }|j                  ||	| j                  | j                        j                  dd      }|j                  ||	| j                  | j                        j                  dd      }|j                  ||	| j                  | j                        j                  dd      }| j                  ||      \  }}t        ||||      \  }}|'|||d}|j                  ||| j                  |      \  }}|j                  dd      }|j                  dd      }|j                  dd      }| j                   r| j"                  nd}|j$                  }|t&        j(                  k(  rt'        j*                         rt'        j,                         }nMt/        | j0                  d      r| j0                  j2                  }n | j                  j4                  j$                  }t6        j9                  d| d	       |j;                  |      }|j;                  |      }|j;                  |      }t=        |||||	||t?        | d
d       | j@                  | jB                  
      }|jE                  ||	d      jG                         }| jI                  |      }|sd }||fS )Nz`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersFr   rA   r%  r   _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r  )r   r/  r  r  use_top_left_maskrf   )%r   r   r   r)  r  r  r  r*  r  r	  r   r  r  r   r+  r  r(  r  r=   r&   r.  is_autocast_enabledget_autocast_gpu_dtyperX   rQ   rC  weightrG   r  ri   r   getattrr  r?  r  r0  r  )rP   rc   r  r   r   r!  r"  r#  r1  r2  r3  r4  r5  r6  r   r   r7  dropout_rateinput_dtypetarget_dtyper:  r8  s                         r,   r}   zMimiFlashAttention2.forwardB  s    nk2} 
 "%**,UA{{=1[[/
{{=1
 $((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm??<>S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J $--a3))!Q/
#--a315t--C #((%--'((*$;;=&?@#{{BB#{{1177 >$ (??<8L#|4J'??<8L.% "4)94@nn"AA
 "))#ub9DDFkk+. LL.88r+   r;  )r"   r#   r$   r%   rC   r&   r   r   r'   r   r   r   r}   r   r   s   @r,   r=  r=  3  s    X 6:37*."'59\9||\9 !!1!12\9 u//0	\9
 !\9  \9 \9 !!1!12\9 
u||Xell3XeELL>Q5RR	S\9r+   r=  c                       e Zd ZdZ	 	 	 	 	 	 ddej
                  deej
                     deej                     dee   de	de	deej                     d	e
ej
                  eej
                     ee
ej
                        f   f fd
Z xZS )MimiSdpaAttentionz
    Mimi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `MimiAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    rc   r  r   r   r!  r"  r#  rd   c           	      B   |r+t         j                  d       t        |   |||||||      S |j	                         \  }	}
}| j                  |      }| j                  |      }| j                  |      }|j                  |	|
| j                  | j                        j                  dd      }|j                  |	|
| j                  | j                        j                  dd      }|j                  |	|
| j                  | j                        j                  dd      }| j                  ||      \  }}t        ||||      \  }}|'|||d}|j                  ||| j                   |      \  }}t#        || j$                        }t#        || j$                        }|}||d d d d d d d |j&                  d   f   }|j(                  j*                  dk(  r2|0|j-                         }|j-                         }|j-                         }||
dkD  rdnd	}t.        j0                  j2                  j5                  ||||| j6                  r| j8                  nd
|      }|j                  dd      j-                         }|j                  |	|
d      }| j;                  |      }|d |fS )Na  MimiModel is using MimiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rc   r  r   r   r!  r"  r#  r   rA   r%  r&  cudaTFr   )	attn_mask	dropout_pr  rf   )rG   r  rB   r}   r)  r  r  r  r*  r  r	  r   r  r  r   r+  r  r
  r  rg   r   r   r0  r&   r   rs   scaled_dot_product_attentionr(  r  r  )rP   rc   r  r   r   r!  r"  r#  rA  r1  r2  r3  r4  r5  r6  r   r   r7  r9  r  r:  rR   s                        r,   r}   zMimiSdpaAttention.forward  s    [ 7?+-)-"3#- #   &**,UA{{=1[[/
{{=1#((eT^^T]]S]]^_abc__S%1I1I4==Yccdeghi
#((eT5M5Mt}}]gghiklm??<>S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$Jz4+D+DE
 t/H/HI$%%aA/E1A1A"1E/E&EFK ##v-+2I'224L#..0J'224L (/EAID5	hh))FF!04d,,3 G 
 "++Aq1<<>!&&sE26kk+.D.00r+   r;  )r"   r#   r$   r%   r&   r   r   r'   r   r   r   r}   r   r   s   @r,   rN  rN    s     2637*."'59M1||M1 !.M1 u//0	M1
 !M1  M1 M1 !!1!12M1 
u||Xell3XeELL>Q5RR	SM1 M1r+   rN  )eagerflash_attention_2sdpac                   (    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     deej                  eeej                  ej                  f      f   fdZ xZS )MimiTransformerLayerrQ   r  c                    t         |           |j                  | _        t        |j                     ||      | _        t        |      | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        |      | _        t        |      | _        y )N)rQ   r  )eps)rB   rC   r   MIMI_ATTENTION_CLASSES_attn_implementation	self_attnr   mlpr   	LayerNormnorm_epsinput_layernormpost_attention_layernormr   self_attn_layer_scalemlp_layer_scaler  s      r,   rC   zMimiTransformerLayer.__init__  s    !--/0K0KLTZfop6?!||F,>,>FOOT(*V5G5GV__(]%%3F%;"-f5r+   rc   r  r   r   r!  r"  r#  rd   c                 &   |}	| j                  |      } | j                  d|||||||d|\  }}
}|	| j                  |      z   }|}	| j                  |      }| j	                  |      }|	| j                  |      z   }|f}|r||
fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        rP  r*   )rb  r^  rd  rc  r_  re  )rP   rc   r  r   r   r!  r"  r#  rA  r   self_attn_weightspresent_key_valueoutputss                r,   r}   zMimiTransformerLayer.forward  s    < !,,]; ?Mdnn 	?
')%)/)	?
 	?
;(*; !4#=#=m#LL !55mD/ 4#7#7#FF ")++G)++Gr+   r;  )r"   r#   r$   r   r   rC   r&   r   r   r'   r   r   r   r)   r}   r   r   s   @r,   rY  rY    s    
6z 
6c 
6 2637*.,1$)59=||= !.= u//0	=
 != $D>= D>= !!1!12= 
u  (51B1BEDUDU1U+V"WW	X=r+   rY  c                       e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
eeej                     f      dee   d	ee   d
ee   dee   deej                     de
eef   fdZdej                  dej                  dej                  ded	ef
dZedej                  dededej*                  dej,                  dej                  dededefd       Z xZS )MimiTransformerModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MimiTransformerLayer`]

    Args:
        config: MimiConfig
    rQ   c           	          t         |           t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        |j                  | _        d| _	        || _
        y c c}w )NF)rB   rC   r   r   r   num_hidden_layersrY  r   r]  gradient_checkpointingrQ   r  s      r,   rC   zMimiTransformerModel.__init__V  sd    mmFKFLdLdFef!&)4f
 %+$?$?!&+# gs   A5rc   r  r   past_key_valuesr"  r!  output_hidden_statesreturn_dictr#  rd   c
                 2   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j
                  r%| j                  r|rt        j                  d       d}|rGt        |t              s7|t               }n*t        j                  |      }t        j                  d       |	F||j                         nd}
t        j                  |
|
|j                   d   z   |j"                        }	||	j%                  d      }d}|| j'                  |||	||      }|rdnd}|rdnd}d}| j(                  D ]p  }|r||fz  }| j
                  r/| j                  r#| j+                  |j,                  |||||||	      }n ||||||||		      }|d   }|r	||rd
nd   }|sh||d   fz  }r |r||fz  }|r|nd}|st/        d ||||fD              S t1        ||||      S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Embedded representation that will be contextualized by the model
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
                `past_key_values`).

                If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
                and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
                information on the default strategy.

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`.

                [What are position IDs?](../glossary#position-ids)
            past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

                Two formats are allowed:
                - a [`~cache_utils.Cache`] instance;
                - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
                cache format.

                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
                legacy cache format will be returned.

                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
                of shape `(batch_size, sequence_length)`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
                `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   r   r*   )r  r   r   r!  r"  r#  rA   c              3   &   K   | ]	  }||  y wr^   r*   ).0vs     r,   	<genexpr>z/MimiTransformerModel.forward.<locals>.<genexpr>  s     tqfgfsts   )last_hidden_statero  rc   
attentions)rQ   r!  rp  r"  use_return_dictrn  r(  rG   r  r   r   r   from_legacy_cacheget_seq_lengthr&   r   rg   r   r   _update_causal_maskr   _gradient_checkpointing_func__call__tupler   )rP   rc   r  r   ro  r"  r!  rp  rq  r#  past_seen_tokensr9  all_hidden_statesall_self_attnsnext_decoder_cachedecoder_layerlayer_outputs
next_caches                     r,   r}   zMimiTransformerModel.forwarda  s   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]&&4==Yj IZ?&".."."@"@"Q##^ !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L%22~PaK
 #7BD0d!![[  	6M#!m%55!**t}} $ A A!**! #%"	! !.!#.!-#2&7'#1! *!,M%28I1q%Q" =#3"55A 	6F  -!11+4'$
t]J@QSa$bttt&+&+%	
 	
r+   input_tensorc                 t   | j                   j                  dk(  r	|d|v r|S y ||j                         nd}t        |t              }t        |t
              }| j                   j                  dk(  r?|s=|s;|s9t        j                  |||| j                   j                  | j                        ry |j                  |j                  }
}	t        j                  |	      j                  }|j                  d   }|s|r|j!                         }n1t        |t        j"                        r|j                  d   n||z   dz   }| j%                  ||||	|
||j                  d   | j                   |	      }| j                   j                  dk(  r3|1|j                  j&                  d	k(  r|st        j(                  ||      }|S )
NrV  r   r   rW  )inputs_embedspast_key_values_lengthr  is_trainingr   rf   )sequence_lengthtarget_lengthr=   r   r#  
batch_sizerQ   ro  rQ  )rQ   r]  r|  r   r   r   r   _ignore_causal_mask_sdpar  r(  r=   r   r&   finfominrg   get_max_cache_shaper   5_prepare_4d_causal_attention_mask_with_cache_positionr   _unmask_unattended)rP   r  r  r#  ro  r!  r  using_static_cacheusing_sliding_window_cacher=   r   	min_dtyper  r  r9  s                  r,   r}  z(MimiTransformerModel._update_causal_mask  s    ;;++/BB)c^.C%%
 @O?Z?99;`a'E%/AS%T" KK,,6'+E%%>>*'7#{{99 MM $**L,?,?vKK&**	&,,Q/%);+??AM
 nell; $$R(%7!;  PP+')#))!,;;+ Q 

 KK,,6*%%**f4%
 1CCKQZ[Kr+   r  r  r=   r   r  c	                 >   | | j                         dk(  r| }	|	S t        j                  |      j                  }
t        j                  ||f|
||      }	t        j
                  ||      |j                  dd      kD  }|j                  ]t        |t              r||kD  rHt        j
                  ||      |j                  dd      |j                  z
  k  }|j                  |       |	|z  }	|	ddddddf   j                  |ddd      }	| |	j                         }	| j                  d   |kD  r| ddd|f   } | j                  d   }|	ddddddd|f   | ddddddf   z   }|dk(  }|	ddddddd|f   j                  ||
      |	ddddddd|f<   |	S )aS  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to plcae the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`MimiConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        N   )
fill_valuer=   r   rs  rf   r   r   )r   r&   r  r  r   r   r  r  r   r   bitwise_or_r   clonerg   masked_fill)r  r  r  r=   r   r#  r  rQ   ro  r9  r  diagonal_attend_masksliding_attend_maskmask_lengthpadding_masks                  r,   r  zJMimiTransformerModel._prepare_4d_causal_attention_mask_with_cache_positionR  s   J %.*<*<*>!*C(K6 3 E*..I** -0Ye\bK $)<<f#MP^PfPfgiklPm#m $$0 "/3EF/\iJi*/,,}V*T&..r158M8MM+' )445HI//K%dD!Q&67>>z1bRTUK))//1!''+m;%3A~~4E%FN,2226*1aL[L+@ANSTVZ\`bcScDdd+q05@Aq,;,AV5W5c5c )6Aq!\k\12 r+   )	NNNNNNNNN)r"   r#   r$   r%   r   rC   r&   r'   r   r   r   r   r   r)   r   r   r   r}   r}  r   r   r=   r   r  r   r   s   @r,   rk  rk  N  s   	z 	 +/1537KO$(,0/3&*59c
''c
 !.c
 u//0	c

 "%tE4E4E/F(F"GHc
 D>c
 $D>c
 'tnc
 d^c
 !!1!12c
 
u--	.c
LII llI 	I
 I  IV @@@ @ {{	@
 @ @ @ @ @ @r+   rk  c                   .     e Zd ZdZdef fdZd Z xZS )MimiDecoderzSEANet decoder as used by Mimi.rQ   c           	         t         |           t        dt        |j                        z        }t        ||j                  ||j                  z  |j                        g}|j                  D ]  }||j                  z  }|t        j                         gz  }|t        |||dz  |dz  |      gz  }t        |j                        D ]%  }|t        ||dz  |j                  |z  df      gz  }' |dz  } |t        j                         gz  }|t        ||j                  |j                   |j"                        gz  }t        j$                  |      | _        y )NrA   r   r   )rB   rC   r   r   r   r2   r   r   r5   r   r   r   r   r   r   r   r   r   r   r   )rP   rQ   r   r   r   r   r   rR   s          r,   rC   zMimiDecoder.__init__  s[   a3v77889FF$6$6&BTBT8TV\VhVhij -- 
	E#f&8&88MbffhZE#FM=A;M[`cd[dmrs E 6556 l/&-12DvGbGbdeGeghFijkklMG
	 	"&&(*VV%7%79N9NPVPgPghiimmE*r+   c                 8    | j                   D ]
  } ||      } |S r^   r   r   s      r,   r}   zMimiDecoder.forward  r   r+   r   r   s   @r,   r  r    s    )+z +0r+   r  c                   j     e Zd ZdZd
dedef fdZedej                  fd       Z
d Zd Zd	 Z xZS )MimiEuclideanCodebookz!Codebook with Euclidean distance.rQ   epsilonc                    t         |           t        j                  |j                  |j
                        }|j                  | _        | j                  dt        j                  dg             | j                  dt        j                  |j                               | j                  d|       d | _	        || _
        y )NinitializedTcluster_usage	embed_sum)rB   rC   r&   zeroscodebook_sizecodebook_dimrM   r   ones_embedr  )rP   rQ   r  embedrR   s       r,   rC   zMimiEuclideanCodebook.__init__  s    F00&2E2EF#11]ELL$,@A_ejj9M9M.NO[%0r+   rd   c                     | j                   ?| j                  | j                  j                  | j                        d d d f   z  | _         | j                   S )N)r  )r  r  r  clampr  ra   s    r,   r  zMimiEuclideanCodebook.embed  sJ    ;;..4+=+=+C+C+C+UVWY]V]+^^DK{{r+   c                     t        j                  |d    | j                  d    d      d   }|j                  d      }|S )NrA   )r'  r   rf   r   )r&   cdistr  argmin)rP   rc   dists	embed_inds       r,   quantizezMimiEuclideanCodebook.quantize  s?     M$/D1AQGJLLRL(	r+   c                     |j                   }|j                  d|d   f      }| j                  |      } |j                  |d d  }|S )Nrf   )rg   r  r  r*  )rP   rc   rg   r  s       r,   encodezMimiEuclideanCodebook.encode  sO    ##%--r59o>MM-0	"INNE#2J/	r+   c                 Z    t         j                  j                  || j                        }|S r^   )r   rs   	embeddingr  rP   r  r  s      r,   decodezMimiEuclideanCodebook.decode  s!    ==**9djjAr+   )gh㈵>)r"   r#   r$   r%   r   r   rC   propertyr&   r   r  r  r  r  r   r   s   @r,   r  r    sG    +
z 
E 
 u||  
r+   r  c                   4     e Zd ZdZdef fdZd Zd Z xZS )MimiVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    rQ   c                 B    t         |           t        |      | _        y r^   )rB   rC   r  codebookr  s     r,   rC   zMimiVectorQuantization.__init__  s    -f5r+   c                 b    |j                  ddd      }| j                  j                  |      }|S Nr   rA   r   )permuter  r  )rP   rc   embed_ins      r,   r  zMimiVectorQuantization.encode  s/    %--aA6==''6r+   c                 b    | j                   j                  |      }|j                  ddd      }|S r  )r  r  r  r  s      r,   r  zMimiVectorQuantization.decode  s/    ==''	2##Aq!,r+   )	r"   r#   r$   r%   r   rC   r  r  r   r   s   @r,   r  r    s    6z 6
r+   r  c                        e Zd ZdZd
dedef fdZd
dej                  de	e   dej                  fdZ
dej                  dej                  fd	Z xZS )MimiResidualVectorQuantizerzResidual Vector Quantizer.rQ   num_quantizersc                 b   t         |           |j                  | _        |j                  | _        ||n|j                  | _        t        j                  t        | j                        D cg c]  }t        |       c}      | _	        d | _
        d | _        |j                  |j                  k7  ryt        j
                  j                  |j                  |j                  dd      | _
        t        j
                  j                  |j                  |j                  dd      | _        y y c c}w )Nr   Fr   )rB   rC   r  
frame_rater  r   r   r   r  r   
input_projoutput_proj$vector_quantization_hidden_dimensionr   r&   rI   )rP   rQ   r  r3  rR   s       r,   rC   z$MimiResidualVectorQuantizer.__init__  s    #11 ++0>0JnPVPePemmUSWSfSfMg$h%;F%C$hi66&:L:LL#hhoo""F$O$OQRY^ . DO  %xx;;V=O=OQRY^  /  D	 M	 %is   -D,
embeddingsrd   c                 *   | j                   | j                  |      }||n| j                  }|}g }| j                  d| D ]:  }|j                  |      }|j	                  |      }||z
  }|j                  |       < t        j                  |      }|S )
        Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        N)r  r  r   r  r  appendr&   stack)	rP   r  r  r   all_indicesr   indices	quantizedout_indicess	            r,   r  z"MimiResidualVectorQuantizer.encode  s    
 ??&4J+9+E4K^K^[[.1 	(Ell8,GW-I)+Hw'		(
 kk+.r+   codesc                    t        j                  d|j                        }|j                  dd      }t	        |      D ]*  \  }}| j
                  |   }|j                  |      }||z   }, | j                  | j                  |      }|S )zJDecode the given codes of shape [B, K, T] to the quantized representation.r   rs  r   r   )r&   rK   r   r   r   r   r  r  )rP   r  quantized_outr   r  r   r  s          r,   r  z"MimiResidualVectorQuantizer.decode"  s    S>1%#E* 	6JAwKKNEW-I)I5M	6
 ' ,,];Mr+   r^   )r"   r#   r$   r%   r   r   rC   r&   r   r   r  r  r   r   s   @r,   r  r    s]    $z 3 " x} X]XdXd (ELL U\\ r+   r  c                        e Zd ZdZdef fdZd
dej                  dee	   dej                  fdZ
dej                  dej                  fd	Z xZS ) MimiSplitResidualVectorQuantizerz Split Residual Vector Quantizer.rQ   c                 R   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  |j                  z
  | _        t        || j                        | _	        t        || j                        | _
        y r^   )rB   rC   r  r  r  max_num_quantizersnum_semantic_quantizersnum_acoustic_quantizersr  "semantic_residual_vector_quantizer"acoustic_residual_vector_quantizerr  s     r,   rC   z)MimiSplitResidualVectorQuantizer.__init__3  s    #11 ++"("7"7'-'E'E$'-'<'<v?]?]']$2MfVZVrVr2s/2MfVZVrVr2s/r+   r  r  rd   c                    || j                   n|}|| j                   kD  rt        d| j                    d| d      || j                  k  rt        d| j                   d| d      | j                  j	                  |      }|| j                  kD  rC| j
                  j	                  ||| j                  z
        }t        j                  ||gd      }|S )r  cThe number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers , but is currently rD  zgThe number of quantizers (i.e codebooks) asked should be higher than the number of semantic quantizers )r  r   r   )r  r   r  r  r  r  r&   r   )rP   r  r  r  acoustic_codess        r,   r  z'MimiSplitResidualVectorQuantizer.encode?  s8    5C4J00P^D333uvz  wN  wN  vO  Ob  cq  br  rs  t  D888yz~  {W  {W  zX  Xk  lz  k{  {|  } 
 77>>zJD888!DDKK>D<X<X+X L N IIun51=Er+   r  c                     | j                   j                  |ddd| j                  f         }|j                  d   | j                  kD  r1|| j                  j                  |dd| j                  df         z  }|S )z7Decode the given codes to the quantized representation.Nr   )r  r  r  rg   r  )rP   r  r  s      r,   r  z'MimiSplitResidualVectorQuantizer.decode\  s     ??FFuQPnRVRnRnPnMnGop ;;q>D888TDDKKERSUYUqUqUsRsLtuuMr+   r^   )r"   r#   r$   r%   r   rC   r&   r   r   r   r  r  r   r   s   @r,   r  r  0  sX    *
tz 
t x Z_ZfZf :	ELL 	U\\ 	r+   r  c                   @    e Zd ZdZeZdZdZdZdgZ	dZ
dZdZdZdZd Zy)	MimiPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    mimiinput_valuesTMimiDecoderLayerro  c                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                  t        j                  f      rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t        j                        rt        j                  j                  |j                         |j                  jt!        j"                  |j$                  |j&                  |j(                  d   z  z        }t        j                  j+                  |j                  | |       yyt        |t        j,                        rz|j                  j                  j                  d| j                  j                         |j.                  2|j                  j                  |j.                     j                          yyt        |t        j0                        rb|j3                         D ]N  \  }}d|v r t        j                  j5                  |       *d|v s/t        j                  j7                  |d       P yy)	zInitialize the weightsr   )meanstdNr   r   )abrH  r9   )r   r   r   rH  datanormal_rQ   initializer_ranger9   zero_r`  	GroupNormfill_rI   initkaiming_normal_r   r  r8   r3   r5   uniform_	Embeddingpadding_idxLSTMnamed_parametersxavier_uniform_	constant_)rP   moduler   nameparams        r,   _init_weightsz!MimiPreTrainedModel._init_weightsz  s   fbii(MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' -MM&&CT[[5R5R&S!!-""6#5#56<<> .(%668 2et#GG++E2t^GG%%eS1	2 )r+   N)r"   r#   r$   r%   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_static_cacher	  r*   r+   r,   r  r  h  sJ    
 L$O&*#+,"3!N !2r+   r  aI  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MimiConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Raw audio input converted to Float.
        padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
            for *masked*.
        num_quantizers (`int`, *optional*):
            Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        encoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
        decoder_past_key_values (`Cache`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            The model will output the same cache format that is fed as input.

            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
            have their past key value states given to this model).
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z"The Mimi neural audio codec model.c                       e Zd Zdef fdZd Zd Z	 	 ddej                  de	de	de
eeeej                     f      d	e
e   d
eej                  e
ej                     f   fdZ	 	 	 	 ddej                  dej                  de
e   de
eeeej                     f      d	e
e   d
eeej                  e
ej                     f   ef   fdZ	 	 ddej                  de
eeeej                     f      d	e
e   d
ej                  fdZ	 	 	 ddej                  de
ej                     de
eeeej                     f      d	e
e   d
eeej                  ej                  f   ef   f
dZ ee       eee      	 	 	 	 	 	 ddej                  de
ej                     de
e	   de
ej                     de
eeeej                     f      de
eeeej                     f      d	e
e   d
eeej                  ej                  f   ef   fd              Z xZS )	MimiModelrQ   c           
      \   t         |   |       || _        t        |      | _        t        |      | _        d | _        d | _        |j                  |j                  k7  rt        ||j                  |j                  dt        |j                  |j                  z        z  ddd      | _        t        ||j                  |j                  dt        |j                  |j                  z        z  dd|j                        | _        t        |      | _        t#        |      | _        t'        |      | _        t        t+        j,                  | j                  j.                              | _        d| j0                  z  | j                  j.                  k7  rt3        d      | j5                          y )NrA   F	replicate)r5   r6   r9   rF   )r5   r6   r9   r8   z'The codebook_size must be a power of 2.)rB   rC   rQ   r   encoderrk  encoder_transformer
downsampleupsampler  encodec_frame_rater2   r   r   r   upsample_groupsdecoder_transformerr  decoderr  	quantizerr   log2r  bits_per_codebookr   	post_initr  s     r,   rC   zMimiModel.__init__  sf    "6*#7#?  9 99(""""F$=$=@Q@Q$Q RR$DO 0""""F$=$=@Q@Q$Q RR--DM $8#? "6*9&A!$TYYt{{/H/H%I!Jd$$$(A(AAFGG 	r+   c                     | j                   S r^   )r  ra   s    r,   get_encoderzMimiModel.get_encoder      ||r+   c                     | j                   S r^   )r  ra   s    r,   get_decoderzMimiModel.get_decoder  r&  r+   r  r  r  ro  rq  rd   c                 j   | j                  |      }| j                  |j                  dd      ||      }|r|j                  d      }nt	        |      dkD  r|d   }|d   j                  dd      }| j                  |      }| j                  j                  ||      }|j                  dd      }||fS )z
        Encodes the given input using the underlying VQVAE. The padding mask is required to compute the correct scale.
        r   rA   ro  rq  ro  r   )r  r  r   getr   r  r   r  )	rP   r  r  r  ro  rq  r  encoder_outputsr  s	            r,   _encode_framezMimiModel._encode_frame  s     \\,/
22  A&U` 3 
 -112CDO!A%-a0O$Q'11!Q7
__Z0
%%j.A1%o%%r+   r    c                    ||n| j                   j                  }|| j                   j                  n|}|| j                   j                  kD  r&t        d| j                   j                   d| d      |j                  \  }}}|dk  s|dkD  rt        d|       |#t        j                  |      j                         }| j                  |||j                         ||      \  }	}|s|	|fS t        |	|      S )aE  
        Encodes the input audio waveform into discrete codes.

        Args:
            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
                for *masked*.
            num_quantizers (`int`, *optional*):
                Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
            encoder_past_key_values (`Cache`, *optional*):
                Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
                This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

                The model will output the same cache format that is fed as input.

                If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
                have their past key value states given to this model).
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
            `codebook` of shape `[batch_size, num_codebooks, frames]`, the discrete encoded codes for the input audio waveform.
        r  r  rD  r   rA   z1Number of audio channels must be 1 or 2, but got r*  )
rQ   rq  r  r   rg   r&   	ones_liker   r-  r.   )
rP   r  r  r  r    rq  r3  r   input_lengthencoded_framess
             r,   r  zMimiModel.encode  sH   B &1%<k$++BYBY7E7M33SaDKK666uvz  wB  wB  wQ  wQ  vR  Re  ft  eu  uv  w  %1$6$6!8\a<8a<PQYPZ[\\ ??<8==?L262D2D3# 3E 3
// ' 
 !1HIIr+   r  c                 D   | j                   j                  |      }| j                  |      }| j                  |j	                  dd      ||      }|r|j                  d      }nt        |      dkD  r|d   }|d   j	                  dd      }| j                  |      }||fS )Nr   rA   r*  ro  r   )r   r  r  r  r   r+  r   r  )rP   r  ro  rq  r  decoder_outputsri  s          r,   _decode_framezMimiModel._decode_frameV  s     ^^**51
]]:.
22  A&U` 3 
 -112CDO!A%-a0O$Q'11!Q7
,,z*''r+   r   r!   c                     ||n| j                   j                  }| j                  |||      \  }}|5|j                  d   |j                  d   k  r|dd|j                  d   f   }|s||fS t	        ||      S )a  
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
                Discret code embeddings computed using `model.encode`.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
                for *masked*.
            decoder_past_key_values (`Cache`, *optional*):
                Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
                This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

                The model will output the same cache format that is fed as input.

                If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
                have their past key value states given to this model).
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Nr*  rf   .)rQ   rq  r4  rg   r0   )rP   r   r  r!   rq  r   s         r,   r  zMimiModel.decodej  s    > &1%<k$++BYBY040B0B)@k 1C 1
--
 #(:(:2(>ASASTVAW(W'-E|/A/A"/E-E(EFL'  !/FGGr+   )output_typer
  c                    ||n| j                   j                  }|#t        j                  |      j	                         }|B| j                  |||||      }|d   }|r|j                  d      }nt        |      dkD  r|d   }| j                  ||||      }	|	d   }
|r|	j                  d      }nt        |	      dkD  r|	d   }|s||
||fS t        ||
||      S )a  
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoFeatureExtractor, MimiModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model_id = "kyutai/mimi"
        >>> model = MimiModel.from_pretrained(model_id)
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```)rq  r   ro  r   )r   r   r    r!   )
rQ   rq  r&   r/  r   r  r+  r   r  r   )rP   r  r  r  r   r    r!   rq  r,  r3  r   s              r,   r}   zMimiModel.forward  s   D &1%<k$++BYBY ??<8==?L"kklN<Sal * O *!,K*9*=*=>O*P'_%)*9!*<'++k<AXfq+r&q)&5&9&9:K&L#!A%&5a&8#/FH_``#%$;$;	
 	
r+   )NN)NNNN)NNN)NNNNNN)r"   r#   r$   r   rC   r%  r(  r&   r   r   r   r   r   r   r)   r   r   r-  r   r.   r  r4  r0   r  r   MIMI_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr}   r   r   s   @r,   r  r    s)   
(z (T LP&*&ll& & 	&
 "%tE4E4E/F(F"GH& d^& 
u||Xell33	4&: &**.SW&*@Jll@J ll@J !	@J
 "*%tE<M<M7N0N*O!P@J d^@J 
uU\\8ELL#99:<MM	N@JJ LP&*	(||( "%tE4E4E/F(F"GH( d^	(
 
(. 04SW&*.H\\.H u||,.H "*%tE<M<M7N0N*O!P	.H
 d^.H 
uU\\5<</02CC	D.H` ++@A:OT 04(,.2SWSW&*>
ll>
 u||,>
 !	>

 ell+>
 "*%tE<M<M7N0N*O!P>
 "*%tE<M<M7N0N*O!P>
 d^>
 
uU\\5<</0*<	=>
 U B>
r+   r  )Nr   )Hr%   r   dataclassesr   typingr   r   r   r   r&   torch.utils.checkpointr   activationsr
   cache_utilsr   r   r   r   modeling_attn_mask_utilsr   modeling_outputsr   modeling_utilsr   rW   r   r   r   r   r   r   r   configuration_mimir   modeling_flash_attention_utilsr   
get_loggerr"   rG   r9  r   r.   r0   Moduler2   r   r   r   r   r   r   r   r   r   r   r
  r  r=  rN  r\  rY  rk  r  r  r  r  r  r  MIMI_START_DOCSTRINGr8  r  r*   r+   r,   <module>rG     sn     ! / /    ! O O > 7 -   + J			H	%  T T T> T T T& T T T&d dN7")) 7v7bii 7B")) >RYY  <")) <<(6bii "	UU\\ 	U# 	U%,, 	Ub9BII b9Lk9- k9^U1 U1r , J299 JZF299 FR
")) B*BII *\RYY (3")) 3l5ryy 5p)2/ )2X " @ (Q
# Q
	Q
r+   