
    sg                       d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mc mZ ddl	mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ  ej>                  e       Z!dd e"d       fdZ#d Z$d Z%d Z&d Z'd Z( G d dejR                        Z* G d dejR                        Z+ G d dejR                        Z, G d dejR                        Z- G d dejR                        Z. G d d ejR                        Z/ G d! d"ejR                        Z0 G d# d$ejR                        Z1 G d% d&ejR                        Z2d'Z3 ed(e3       G d) d*e             Z4 G d+ d,ejR                        Z5 G d- d.e      Z6 G d/ d0ejR                        Z7 G d1 d2ejR                        Z8 G d3 d4ejR                        Z9 G d5 d6ejR                        Z: G d7 d8ejR                        Z; G d9 d:ejR                        Z< G d; d<ejR                        Z= G d= d>ejR                        Z> G d? d@e      Z? G dA dBe      Z@dCZA edDe3       G dE dFe@             ZBy)GzPyTorch Jukebox model.    N)ListOptionalTuple)nn)	LayerNorm   )ACT2FN)PreTrainedModel)add_start_docstringslogging)tqdm   )ATTENTION_PATTERNSJukeboxConfigJukeboxPriorConfigJukeboxVQVAEConfig        Infc                    | j                         } t        || j                  d            }|dkD  r*| t        j                  | |d      d   dddf   k  }|| |<   |dkD  rt        j
                  | dd      \  }}t        j                  t        j                  |d      d      }||kD  }|dddf   j                         |dd	df<   d|d
<   t        j                  | t        j                        j                  d||      }|| |<   | S )a  
    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

    Args:
        logits (`torch.Tensor`):
            logits distribution shape (vocabulary size)
        top_k (`int`, *optional*, defaults to 0):
            When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
        top_p (`int`, *optional*, defaults to 0):
            When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
    r   dim.Nr   T)
descendingr   r   ).r   dtype)r   indexsrc)cloneminsizetorchtopksortcumsumFsoftmax
zeros_likeboolscatter_)	logitstop_ktop_pfilter_valueindices_to_removesorted_logitssorted_indicescumulative_probssorted_indices_to_removes	            j/var/www/html/venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.pyfilter_logitsr4   %   s    \\^Fv{{2'Eqy"UZZ2%Fq%I#rs(%SS$0 !s{(-

6dPR(S%~ <<		-R(HbQ $4e#; ,DS#2#X,N,T,T,V ab)+, ( ",,V5::FOO..F P 
 %1 !M    c           	      f   | d   } t        |       |k  rt        j                  t        j                  |t        |       z
  t        j                        j                  | j                        | g      }dg|t        |       z
  z  t        t        dt        |                   z   }nzt        t        |       ||dz  z   z  |z        }t        t        ||dz        t        |       |dz  z
        }| ||dz  z
  ||dz  z    }t        t        ||dz  z
  ||dz  z               }|j                  d      |fS )a  
    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
    returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
    the most relevant tokens (in time) for the sequence.

    Args:
        full_tokens (`List[int]`):
            List containing the token ids of the entire lyrics.
        total_length (`int`):
            Total expected length of the music (not all of it is generated, see duration), in samples.
        offset (`int`):
            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
            account
        duration (`int`):
            Expected duration of the generated music, in samples. The duration has to be smaller than the total length,
            which represent the overall length of the signal,
    r   r   r          @   r   )lenr!   catzeroslongtodevicelistrangeintr   max	unsqueeze)full_tokensmax_n_lyric_tokenstotal_lengthoffsetdurationtokensindicesmidpoints           r3   get_relevant_lyric_tokensrL   K   s?   & a.K
;,,[[+c+.>>ejjQTTU`UgUghjuv
 $,s;/??@4aQTU`QaHbCccs;'6HsN+BClRSs8%71%<=s;?ORdhiRi?ijX(:a(??(M_cdMdBdeuX(:a(??L^bcLcAcde"G++r5   c                 v    g }t        d| |z
  |z   |      D ]   }||z   | k\  r| |z
  }|j                  |       " |S Nr   )r@   append)rF   n_ctx
hop_lengthstartsstarts        r3   
get_startsrT   m   sR    Fq,.;ZH 5=L( 5(Ee	
 Mr5   c           	      8   |j                   dz
  }|j                  }| |   }|j                  d   |j                  d   }}||k  r\||z
  }	t        j                  |t        j
                  |||z
  |j                  |j                        gd      }|j                  d   }nd}	t        |j                  | dz
     |j                  z        }
|j                  d   |j                  d   }}|h}i }i }t        t        |||
      d      D ]  }||z   }|j                  |||j                  dd      \  }}t        j                   ||d      }t        j                   ||d      }g }t#        ||      D ]?  \  }}|j%                  |d d ||f   g ||	      }|j'                  |d   d d |f          ~A t        j                  |d      }~|j)                         j+                         j-                         }~|||<   |||<    g }t/        |      D ]  }|dd
d f   }t1        j
                  |t3        |      dz   f      }t5        t        |||
            D ]   }||z   }||   |   }||   |   } ||||| f<   " |d ||	z
  d df   }|j'                  |        |S )Nr   r   r   r>   r   z#Computing lyric to music alignment )descT)get_indicesrG   )get_attn_weights   r   )levelsrP   shaper!   r:   r;   r   r>   rA   hop_fractionprior_alignment_headprior_alignment_layerr   rT   get_metadatasample_lengthchunkzipforward_tokensrO   floatcpunumpyr@   npr9   reversed)!music_tokenslabelspriorconfiglevelrP   rI   
batch_sizerF   padding_lengthrQ   alignment_headalignment_layerattn_layersalignment_hopsindices_hopsrS   endmetadataindices_hop	tokens_bsmetadata_bsw_hopstokens_i
metadata_iw_hopweightsalignment_hop
alignmentsitemrD   	alignmentrJ   s!                                    r3   get_alignmentr   w   s   LL1EKKE% F%||AQJe-U[[U\-A^d^k^klmst
 ||AV((%!4u{{BCJ&,&A&A!&DfFbFbcdFeON"#KNLjujAHmn .em % 2 265&BVBVdhqr 2 s+KK
:	kk(JA>$'	;$? 	 Hj((!U3Y,)?Zbm(nEMM%(1n#456	 ))F*++-335 *U -u'.. Jj! 
%QUmHHlC,<q,@AB	jujIJ 	:E%-C*51$7M"5)$/G,9IeCi()		:
 =~ ==ssBC	)$
% r5   c                    t        j                  |dd      j                         j                         }t	        t        |j                  d               D ]u  }|Pt	        |      |   j                         \  }}}|  d| d| d| d|d d  d| }t        j                  |||          Ut        j                  |  d| d| ||          w y )Nr   r   r   z/lvl_-   z-sample-)
r!   clamprf   rg   r?   r@   r\   valuesrh   save)	fnamelvlmetasaudiartistsgenreslyricspaths	            r3   save_temp_audior      s    
++c2q
!
%
%
'
-
-
/C%		!%& =&*5k!n&;&;&=#GVVWE#ay&6"1:,asKDGGD#a&!GGugU3%xs3SV<=r5   c                    | |dk(  ry |r||z
  nt        ||z
  d      }| dk(  r(t        j                  |||      j                  |      } n| dk(  rt        j                  |||      j                         } t        j                  |||      j                         } | j	                  ||||z        d d d d| |z  d f   } t        j
                  j                  j                  | dd      j                         j	                  ||      } n,| d	k(  r't        j                  |||      j                  |      } | j	                  dd||      S )
Nr   r   autoregressiver>   summaryr   r   r   r   r   )valueprime)	rB   r!   onestrilviewr   
functionalpad
contiguous)	maskquery_lengthkey_value_lengthblocksspreadr>   samplesample_trG   s	            r3   get_maskr      sD   ||q((.X$C8H<8WYZ4[Fzz,(8HMMfU		zz,VDIIKzz,VDIIKyyv|v/EFq#2#P`O`djOjOlGlmHH## $ 
 Z\T, 01 	 
zz,(8HMMfU99Q<)9::r5   c                   $     e Zd Z fdZd Z xZS )JukeboxConv1Dc                     t         |           || _        || _        t	        j
                  ||      }t	        j                  |      }t        j                  |      | _	        t        j                  |      | _
        y N)super__init__input_widthoutput_widthr!   emptyr;   r   	Parameterweightbias)selfr   r   r   r   	__class__s        r3   r   zJukeboxConv1D.__init__   s\    &([,7{{<(ll6*LL&	r5   c           	      8   g |j                         d d | j                  }t        j                  | j                  j                  |      |j                  d|j                  d            | j                  j                  |            } |j                  | }|S )Nr   )r    r   r!   addmmr   type_asr   r   )r   hidden_statessize_outs      r3   forwardzJukeboxConv1D.forward   s    B]'')#2.B0A0ABIIm,r=#5#5b#9:KK.

 +**H5r5   __name__
__module____qualname__r   r   __classcell__r   s   @r3   r   r      s    'r5   r   c                   &     e Zd Zd fd	Zd Z xZS )JukeboxResConv1DBlockc                    t         |           |j                  |z  }|j                  |z  }|}|| _        t        j                         | _        t        j                  ||dd||      | _	        t        j                  ||ddd      | _
        y )NrZ   r   r   )r   r   res_convolution_multiplierres_dilation_growth_rate	res_scaler   ReLU
activationConv1dconv1d_1conv1d_2)	r   rm   
conv_widthdepthr   
hidden_dimdilationpaddingr   s	           r3   r   zJukeboxResConv1DBlock.__init__   sy    66C
22E9"'')		*j!QR		*j!QBr5   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|| j                  |z  z   S r   )r   r   r   r   )r   r   	residualss      r3   r   zJukeboxResConv1DBlock.forward   sS    !	6m46m44>>M999r5   )r         ?r   r   s   @r3   r   r      s    	C:r5   r   c                   &     e Zd Zd fd	Zd Z xZS )JukeboxResnet1Dc           	      p   t         	|           |j                  | _        |j                  sdndt        j                  |      z  }g }t        |      D ]<  }| j                  |n|| j                  z  }|j                  t        ||||             > |r|d d d   }t        j                  |      | _        y )Nr   r   )r   r   res_dilation_cycledilation_cycleconv_res_scalemathsqrtr@   rO   r   r   
ModuleListresnet_block)
r   rm   r   n_depthreverse_dilationr   r   r   block_depthr   s
            r3   r   zJukeboxResnet1D.__init__  s    $77%44C#		'@R:R	7^ 	]E#'#6#6#>%EDL_L_D_KMM/
KQZ[\	] DbD\FMM&1r5   c                 8    | j                   D ]
  } ||      } |S r   )r   r   r   blocks      r3   r   zJukeboxResnet1D.forward  s'    && 	1E!-0M	1r5   Fr   r   s   @r3   r   r      s    2r5   r   c                   $     e Zd Z fdZd Z xZS )JukeboxEncoderConvBlockc           
      ~   t         |           g }|dz  }|dz  }	|dkD  r[t        |      D ]M  }
|j                  t	        j
                  |
dk(  r|n|||||	             |j                  t        |||             O t	        j
                  ||j                  ddd      | _        t	        j                  |      | _
        y )Nr8   r   rZ   r   )r   r   r@   rO   r   r   r   	embed_dimproj_outr   downsample_block)r   rm   r   r   r   down_tstride_tr   filter_tpad_tr   r   s              r3   r   z JukeboxEncoderConvBlock.__init__  s    a<AA:6] JbiiQ!V	ZYackmrstofj%HIJ 		*f.>.>1aH "f 5r5   c                 Z    | j                   D ]
  } ||      } | j                  |      }|S r   )r   r   r   s      r3   r   zJukeboxEncoderConvBlock.forward"  s5    ** 	1E!-0M	1m4r5   r   r   s   @r3   r   r     s    
6r5   r   c                   $     e Zd Z fdZd Z xZS )JukeboxEncoderc                 R   t         |           || _        t        j                         | _        t        t        t        | j                              ||      }|D ]J  \  }}	}
| j
                  j                  t        ||dk(  r|j                  n|j                  |||	|
             L y rN   )r   r   r[   r   r   level_blocksrc   r?   r@   rO   r   conv_input_shaper   )r   rm   widthr   r[   downs_t	strides_titeratorr   r   r   r   s              r3   r   zJukeboxEncoder.__init__*  s    MMOtE$++./)D#+ 	Avx$$'qAvF336CSCSUZ\aciks	r5   c                     g }t        | j                        D ]*  }| j                  |   } ||      }|j                  |       , |S r   )r@   r[   r   rO   )r   r   all_hidden_statesrn   level_blocks        r3   r   zJukeboxEncoder.forward7  sR     4;;' 	4E++E2K'6M$$]3	4
 ! r5   r   r   s   @r3   r   r   )  s    	!r5   r   c                   &     e Zd Zd fd	Zd Z xZS )JukeboxDecoderConvBockc           
         || _         || _        t        |           g }|dkD  r|dz  }	|dz  }
t	        j
                  ||ddd      | _        t        |      D ]Q  }|j                  t        ||||             |j                  t	        j                  |||dz
  k  r|n||	||
             S t	        j                  |      | _        y )Nr   r8   rZ   r   )r   r   r   r   r   r   proj_inr@   rO   r   ConvTranspose1dr   upsample_block)r   rm   r   r   r   r   r   r   r   r   r   r   r   s               r3   r   zJukeboxDecoderConvBock.__init__D  s    "$A:!|HME99Y
Aq!DDL6] ofj%IYZ[&&"!fqj.JiQY[cej !mmF3r5   c                 Z    | j                  |      }| j                  D ]
  } ||      } |S r   )r  r  r   s      r3   r   zJukeboxDecoderConvBock.forwardV  s5    ]3(( 	1E!-0M	1r5   Tr   r   s   @r3   r   r   C  s    4$r5   r   c                   &     e Zd Z fdZddZ xZS )JukeboxDecoderc                    t         
|           || _        t        j                         | _        t        t        t        | j                              ||      D ]9  \  }}}	| j
                  j                  t        ||j                  ||||	             ; t        j                  |j                  |j                  ddd      | _        y )NrZ   r   )r   r   r[   r   r   r   rc   r?   r@   rO   r   r   r   r   out)r   rm   r   r   r[   r   r   rn   r   r   r   s             r3   r   zJukeboxDecoder.__init__^  s    MMO'*4dkk0B+CWi'X 	#E68$$&vv/?/?UTZ\de	
 99V--v/F/F1aPr5   c                     |d   }t        t        | j                              D ]-  }| j                  |   } ||      }|dk7  s |s#|||dz
     z   }/ | j	                  |      }|S )Nr   r   r   )ri   r@   r[   r   r
  )r   r   
all_levelshidden_statern   r   s         r3   r   zJukeboxDecoder.forwardi  sy    $R( eDKK01 	GE++E2K&|4Lzj+mEAI.FF	G xx-r5   r  r   r   s   @r3   r  r  ]  s    	Qr5   r  c                   b     e Zd Zdef fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd ZddZ xZS )JukeboxBottleneckBlockrm   c                 4   t         |           |j                  | _        |j                  | _        |j
                  | _        d| _        d| _        d | _	        d | _
        | j                  dt        j                  | j                  | j                               y )Nr   Fcodebook)r   r   nb_discrete_codesr   codebook_widthlmumu	thresholdinitcodebook_sumcodebook_elemregister_bufferr!   r;   )r   rm   r   s     r3   r   zJukeboxBottleneckBlock.__init__y  sz    !'!9!9$..**	 !ZT5K5KTM`M`)abr5   c                     |j                   \  }}|| j                  k  rZ| j                  |z   dz
  |z  }dt        j                  |      z  }|j	                  |d      }|t        j                  |      |z  z   }|S )Nr   {Gz?)r\   r  rh   r   repeatr!   
randn_like)r   r   r   embed_width	n_repeatsstds         r3   _tilezJukeboxBottleneckBlock._tile  s    (..['''//#59cAI--C)00A>M)E,<,<],Kc,QQMr5   c                 (   | j                   }d| _        | j                  |      }|t        j                  |j
                  d            d | | _        | j                  | _        t        j                  || j                  j                        | _
        y )NTr   r   )r  r  r"  r!   randpermr\   r  r  r   r>   r  )r   r   r  codess       r3   init_codebookz$JukeboxBottleneckBlock.init_codebook  ss     22	

=)ennU[[^<=>P?PQ MM"ZZ(9$--BVBVWr5   c           	      X   | j                   | j                  | j                  }}}t        j                         5  t        j
                  ||j                  d   |j                        }|j                  d|j                  d|j                  d         d       t        j                  ||      }|j                  d      }| j                  |      }	|	t        j                  |	j                  d            d | }
| j                  }|| j                  z  d|z
  |z  z   | _        || j                   z  d|z
  |z  z   | _        | j                   j                  |d      | j"                  k\  j%                         }| j                  j                  ||      | j                   j                  |d      z  }||z  d|z
  |
z  z   | _        |t        j                  |      z  }t        j                  |t        j&                  |dz         z         }|| j"                  k\  j                         }t        j                  |      }t        j(                  | j                  |z
        t+        j,                  t+        j.                  |j                              z  }d d d        dS # 1 sw Y   xY w)	Nr   r   r   r   r   r   g:0yE>)entropy	used_currusagedk)r  r  r  r!   no_gradr;   r\   r>   r)   r   matmulsumr"  r$  r  r  r  r  re   lognormrh   r   prod)r   r   latent_statesr  r  r  latent_states_onehot_codebook_sum_codebook_elemr%  _random_codebookold_codebookr*  	norm_code_codebook_probr(  r)  r+  s                     r3   update_codebookz&JukeboxBottleneckBlock.update_codebook  s]   049L9LdNdNd-N]]_ 	a $);;/@-BUBUVWBXanauau#v  ))!]-?-?=CVCVWXCY-Z\]^!LL)=}MM155"5=NJJ}-E$U^^EKKN%CDEWFWX  ==L "T%6%6 6#(m9S SD!#d&8&8!8C"H;V!VD'',,->BdnnT[[]E))../@.QTXTfTfTkTk!1U I "Y/1u9@P2PPDM+eii.GGNyy%))NT<Q2R!RSSG'4>>9>>@IIIe$EDMML89BGGBGGLL^L^D_<``B3	a4 #URTUU5	a 	as   IJ  J)c                    |j                  ddd      j                         }|j                  d|j                  d         }|j                  d   | j                  k(  rbt        j                  |t        j                  |      z
        t        j                  t        j                  |j                              z  }||fS |j                  d   d| j                  z  k(  r|dd | j                  f   |d| j                  d f   }}t        j                  |t        j                  |      z
        t        j                  t        j                  |j                              z  t        j                  |t        j                  |      z
        t        j                  t        j                  |j                              z  z   }||z   }|fS )Nr   r8   r   r   .)permuter   r   r\   r  r!   r0  meanrh   r   r1  )r   r   prenormx1x2s        r3   
preprocessz!JukeboxBottleneckBlock.preprocess  sw   %--aA6AAC%**2}/B/B2/FGr"d&9&99jjM1J!JKbggVXV]V]^k^q^qVrNssG g%%   $D,?,?(??"3(=$*=*=(=#=>cSWSfSfShNh@iBzz"uzz"~"56AR9SS

2

2./"''"''"((:K2LLG
 GMg%%r5   c                     |\  }}|j                  ||d      j                  ddd      j                         }|j                  ||      }||fS )Nr   r   r8   r   )r   r<  r   )r   r2  dequantised_statesx_shapero   times         r3   postprocessz"JukeboxBottleneckBlock.postprocess  sZ    "
D/44ZrJRRSTVWYZ[ffh%**:t<000r5   c                 >   | j                   j                         }t        j                  |dz  dd      dt        j                  ||      z  z
  t        j                  |dz  dd      z   }t        j
                  |d      \  }}t        j                  |      }||fS )Nr8   r   Tr   keepdimr   r   )r  tr!   r.  r-  r   r=  )r   r2  codebook_weightsdistancemin_distancerj   fits          r3   quantisezJukeboxBottleneckBlock.quantise  s    ==??,IImQ&B=%,,}.>??@ii(!+DAB 	
 &+YYxR%@"ljj&S  r5   c                 F    t        j                  || j                        }|S r   )r%   	embeddingr  )r   rj   rC  s      r3   
dequantisez!JukeboxBottleneckBlock.dequantise  s    [[t}}E!!r5   c                     |j                   \  }}}| j                  |      \  }}| j                  |      \  }}|j                  ||      }|S r   )r\   rA  rO  r   )r   r2  samples_seq_lenrj   s         r3   encodezJukeboxBottleneckBlock.encode  sW    +11G  ??=9q --6a $((':r5   c                     |j                   \  }}| j                  |      }|j                  ||| j                        j	                  ddd      j                         }|S Nr   r8   r   )r\   rR  r   r  r<  r   )r   rj   rT  rV  rC  s        r3   decodezJukeboxBottleneckBlock.decode  se    '-- "__\: ##GWd6I6IJRRSTVWYZ[ffh 	 "!r5   c           	         |j                   \  }}}| j                  |      \  }}|r| j                  s| j                  |       | j	                  |      \  }}| j                  |      }	|r| j                  ||      }
ni }
t        j                  |	j                         |z
        dz  t        j                  |j                         z  }||	|z
  j                         z   }	| j                  ||	||f      \  }}	||	|t        d||d|
fS )Nr8   )rN  pn )r\   rA  r  r&  rO  rR  r:  r!   r0  detachrh   r1  rF  dict)r   r   r:  rT  rU  rV  r>  rj   rN  rC  update_metricscommit_losss               r3   r   zJukeboxBottleneckBlock.forward  s   +11G "&!?w 499}- !MM-8c!__\: !11-NNN jj!3!:!:!<}!LMQRRUWU\U\]j]p]pUqq +.@=.P-X-X-ZZ ,0+;+;LJ\_fho^p+q((/d>isw>iZh>iiir5   r  )r   r   r   r   r   r"  r&  r:  rA  rF  rO  rR  rW  rZ  r   r   r   s   @r3   r  r  x  sG    	c1 	cXV<&"1
!"
"jr5   r  c                   2     e Zd Z fdZd ZddZd Z xZS )JukeboxBottleneckc                     t         |           || _        t        j                         | _        t        | j                        D ]&  }| j
                  j                  t        |             ( y r   )	r   r   r[   r   r   r   r@   rO   r  )r   rm   r[   rn   r   s       r3   r   zJukeboxBottleneck.__init__  sW    MMO4;;' 	EE$$%;F%CD	Er5   c                     t        | j                  |      D cg c]  \  }}|j                  |       }}}|S c c}}w r   )rc   r   rW  )r   	raw_audior   r   rj   s        r3   rW  zJukeboxBottleneck.encode  sG    RUVZVgVgirRs
2N;K}-
 
 
s   :c                     || j                   }t        | j                  || |      D cg c]  \  }}|j                  |       }}}|S c c}}w r   )r[   rc   r   rZ  )r   rj   start_level	end_levelr   zquantised_audios          r3   rZ  zJukeboxBottleneck.decode   s^    I:=d>O>OP[\e>fht:u
&6{AKq!
 
 
s   Ac                    g g g g f\  }}}}t        | j                        D ]  }| j                  | dz
     }||   } ||| j                        \  }	}
}}|j	                  |	       | j                  s|
j                         }
|j	                  |
       |j	                  |       | j                  s|j	                  |        ||||fS )Nr   )r:  )r@   r[   r   trainingrO   r^  )r   input_audiorj   quantised_statescommit_lossesmetricsrn   r   r   sampled_tokensquantised_statera  metrics                r3   r   zJukeboxBottleneck.forward(  s    ACRR>&w4;;' 	'E++UFQJ7K'.MCNt}}D@NO[& /== #2"8"8":##O4  -}}v&	' -}gEEr5   r   N)r   r   r   r   rW  rZ  r   r   r   s   @r3   rc  rc    s    EFr5   rc  a?  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
zThe Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/abs/2002.08111).

    c                        e Zd ZeZdZd Zdef fdZddZdde	j                  fdZddZdd	Zd
 Zde	j                  dee	j                  e	j                  f   fdZ xZS )JukeboxVQVAEvqvaec                 .   t        |t        j                        r@|j                  j                  j                  dd| j                  j                  z         nt        |t              rz| j                  j                  r%|j                  j                  j                          n|j                  j                  j                  dd| j                  j                  z         nt        |t              rr| j                  j                  r\|j                  j                  j                  j                          |j                  j                  j                  j                          t        |t        j                        rI|j                  j                  j                          |j                  j                  j                  d       t        |t        j                         r2|j                  %|j                  j                  j                          y y y )Nr   {Gz?r=  r!  r   )
isinstancer   	Embeddingr   datanormal_rm   
init_scaler   zero_outzero_r   r   r   r   fill_Linearr   modules     r3   _init_weightszJukeboxVQVAE._init_weightsX  sV   fbll+MM&&CTDKK<R<R5R&S.{{##""((*""**@V@V9V*W 564;;;O;OOO""''--/OO  %%++-fbll+KK""$MM$$S)fbii(V[[-DKK""$ .E(r5   rm   c                 r   t         |   |       |j                  }|j                  }|j                  s~t        ||      D cg c]
  \  }}||z   }}}t        j                  |      }|j                  |j                  z  |z  |z  |_        |j                  j                  t              |_        |j                  | _        |j                  | _        |j                  | _        t        ||      D cg c]
  \  }}||z   c}}| _        t        j                  | j                        | _        |j"                  x| _        }t%        |      D 	cg c]+  }	t        | j                  | j                   |	 dz
     z        - c}	| _        |j(                  |j(                  ndg|z  | _        t+        j,                         | _        t+        j,                         | _        t%        |      D ]  }	|j2                  | j(                  |	   z  }
|j4                  | j(                  |	   z  }| j.                  j7                  t9        ||
||	dz   |d |	dz    |d |	dz                 | j0                  j7                  t;        ||
||	dz   |d |	dz    |d |	dz                  t=        ||      | _        y c c}}w c c}}w c c}	w Nr   ) r   r   res_downs_tres_strides_tra   rc   rh   r1  sample_length_in_secondssampling_rateastyperA   r  commitdownsamplescumprodhop_lengthsr[   r@   music_tokens_shapesmultipliersr   r   encodersdecodersres_conv_widthres_conv_depthrO   r   r  rc  
bottleneck)r   rm   r   r   stridedownr  top_raw_to_tokensr[   rn   r   r   r   s               r3   r   zJukeboxVQVAE.__init__i  s    $$((	##<?	7<STLFD64<TKT " 4//&2F2FFJ[[!$"F  $*#7#7#>#>s#CF !'!9!9mm#11=@G=TU\VTFDLU::d&6&67%}},fSXY_S`$
JOS##t'7'7
'CCD$
  281C1C1O6--VWUX[aUa6] 	E))D,<,<U,CCE))D,<,<U,CCEMM  vueUQY%RS)@TV_`kbgjkbkVlm MM  vueUQY%RS)@TV_`kbgjkbkVlm	 ,FF;A U V$
s   J(5J.0J4c                     || j                   }| j                  j                  |||      }| j                  |   |dd }} ||d      }|j	                  ddd      }|S )Nrh  ri  r   r   Fr  r8   )r[   r  rZ  r  r<  )r   rj   rh  ri  r2  decoderdequantised_states          r3   _decodezJukeboxVQVAE._decode  sq    I..|`i.j%)]];%?qQRAS"#$5%H-55aA>  r5   returnc                     |D cg c]  }t        j                  ||d       }}g }t        |      D ]9  }|D 	cg c]  }	|	|   	 }
}	| j                  |
||      }|j	                  |       ; t        j
                  |d      S c c}w c c}	w )a  
        Transforms the input `music_tokens` to their `raw_audio` representation.

        Args:
            music_tokens (`torch.LongTensor`):
                Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
                should be an index to a corresponding `code` vector in the codebook.
            start_level (`int`, *optional*):
                Level at which the decoding process will start. Default to 0.
            end_level (`int`, *optional*):
                Level at which the decoding process will start. Default to None.
            bs_chunks (int, *optional*):
                Number of chunks to process at the same time.
        r   r   r  )r!   rb   r@   r  rO   r:   )r   rj   rh  ri  	bs_chunkstokentoken_chunksrC  r   chunksmusic_tokens_ir  s               r3   rZ  zJukeboxVQVAE.decode  s     KWWE9!<WWy! 	9A6BCFfQiCNC $^`i j%%&78	9 yy+33 X Ds
   BBc                 .   || j                   }|j                  ddd      j                         }g }t        | j                         D ]-  }| j                  |   } ||      }|j                  |d          / | j                  j                  |      }	|	|| S )Nr   r8   r   r   )r[   r<  re   r@   r  rO   r  rW  )
r   rf  rh  ri  rn  r2  rn   encoderlatent_staterj   s
             r3   _encodezJukeboxVQVAE._encode  s    I''1a06684;;' 	3EmmE*G";/L  b!12	3 --m<K	22r5   c                     t        j                  ||d      }g }|D ]'  }| j                  |||      }|j                  |       ) t	        | D 	cg c]  }	t        j
                  |	d       }
}	|
S c c}	w )a  
        Transforms the `input_audio` to a discrete representation made out of `music_tokens`.

        Args:
            input_audio (`torch.Tensor`):
                Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
                form the codebook will be computed for each sequence of samples.
            start_level (`int`, *optional*, defaults to 0):
                Level at which the encoding process will start. Default to 0.
            end_level (`int`, *optional*):
                Level at which the encoding process will start. Default to None.
            bs_chunks (int, *optional*, defaults to 1):
                Number of chunks of raw audio to process at the same time.
        r   r   r  )r!   rb   r  rO   rc   r:   )r   rn  rh  ri  r  audio_chunksmusic_tokens_listchunk_ir  music_tokens_levelrj   s              r3   rW  zJukeboxVQVAE.encode  s     {{;	qA# 	5G!\\'{V_\`N$$^4	5 X[\mWnoAS		"4!<oo ps   A3c           	          | j                   D cg c])  }t        j                  d| j                  |g|d      + }}| j	                  |      S c c}w )Nr   rf   )r    r>   )r  r!   randintr  rZ  )r   	n_samplesmusic_tokens_shaperj   s       r3   r   zJukeboxVQVAE.sample  s\     '+&>&>
" MM!T339:ZGY:Zchi
 
 {{<((	
s   .Arf  c                    |j                  ddd      j                         }g }t        | j                        D ]-  }| j                  |   } ||      }|j                  |d          / | j                  |      \  }}}	}g }
t        | j                        D ]C  }| j                  |   } ||||dz    d      }|
j                  |j                  ddd             E t        |	      }| j                  |z  }|
|fS )a"  
        Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
        The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
        computed.

        Args:
            raw_audio (`torch.FloatTensor`):
                Audio input which will be encoded and decoded.

        Returns:
            `Tuple[torch.Tensor, torch.Tensor]`


        Example:
        ```python
        >>> from transformers import JukeboxVQVAE, set_seed
        >>> import torch

        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
        >>> set_seed(0)
        >>> zs = [torch.randint(100, (4, 1))]
        >>> model.decode(zs).shape
        torch.Size([4, 8, 1])
        ```
        r   r8   r   r   Fr  )
r<  re   r@   r[   r  rO   r  r  r.  r  )r   rf  rn  r2  rn   r  r  rU  rj   rp  rC  r  r  ra  losss                  r3   r   zJukeboxVQVAE.forward  s   8  ''1a06684;;' 	3EmmE*G";/L  b!12	3
 -1OOM,J)<4;;' 	JEmmE*G 'UUQY(GTY Z%%&7&?&?1a&HI	J
 -({{[(!4''r5   ru  r   Nr   )r   r   r   r   config_classbase_model_prefixr  r   r  r!   TensorrZ  r  rW  r   FloatTensorr   r   r   r   s   @r3   rw  rw  M  su     &L%"%<1 %<N	!4RWR^R^ 4.3.)-(!2!2 -(uU\\5<<=W7X -(r5   rw  c                   $     e Zd Z fdZd Z xZS )
JukeboxMLPc                 &   t         |           |j                  }t        |j                  |z        }t        ||      | _        t        ||      | _        t        |j                     | _
        t        j                  |j                        | _        y r   )r   r   hidden_sizerA   mlp_multiplierr   c_fcc_projr	   act_fnactr   Dropoutresid_dropoutdropout)r   rm   r   r   r   s       r3   r   zJukeboxMLP.__init__  sp    &&	..:;
!)Z8	#J	:&--(zz&"6"67r5   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r  )r   r   s     r3   r   zJukeboxMLP.forward  s@    		-0/M2]3r5   r   r   s   @r3   r  r    s    	8r5   r  c                   *     e Zd Zd fd	Z fdZ xZS )JukeboxLayerNormc                     t         |   |||       t        j                  |      | _        d| j                  z  | _        y )N)epselementwise_affinei  )r   r   rh   r1  r   	max_numel)r   normalized_shaper  r  r   s       r3   r   zJukeboxLayerNorm.__init__"  s:    )sGYZWW-.
+r5   c                    |j                         | j                  kD  rPt        j                  || j                  | j
                  | j                  | j                        j                  |      S t        | )  |      j                  |      S r   )numelr  r%   
layer_normr  r   r   r  r   r   r   )r   inputr   s     r3   r   zJukeboxLayerNorm.forward'  sg    ;;=4>>)<<t'<'<dkk499VZV^V^_gghmnn7?5)11%88r5   )gh㈵>Tr   r   s   @r3   r  r  !  s    ,
9 9r5   r  c                        e Zd Zd fd	Zd Zd ZddZd Zd Zd Z	d Z
d	 Zd
 Zd ZddZddZddZddZed        Zd ZddZd Zd ZddZd Zd Z xZS )JukeboxAttentionc           	      D   t         |           |j                  | _        |j                  | _        |j
                  | _        t        |j                  | j                  z        }||j                  z  | _	        || _
        || _        | j                  dz  | _        |j                  | _        |dk(  r:t        | j                  |      | _        t        | j                  |dz        | _        nt        | j                  |dz        | _        t        || j                        | _        t%        j&                  |j
                        | _        t%        j&                  |j(                        | _        || _        |dk(  r| j,                  | _        n(|dk(  r| j0                  | _        n| j2                  | _        | j4                  df| j6                  df| j8                  df| j:                  d f| j<                  df| j>                  df| j4                  d f| j@                  dfd	}||   \  | _!        | _"        |jF                  | _#        |jH                  | _$        | jF                  | j                  | jF                  z  | _%        d
| _&        i | _'        |jP                  | _)        d| _*        y )Ng      пcross_attentionr8   rZ   
prime_attnr   r   r   )
dense_attn
block_attntranspose_block_attnprev_block_attnsummary_attnsummary_spread_attnr  r  r   F)+r   r   r  r   n_headsattn_dropoutr  rA   attention_multiplierhead_dimrP   r   scaler   r   c_attnc_enc_kvr  r   r  r  	attn_func
decode_qkvqkv	prime_qkvfactored_qkvr  r  r  r  r  r  r  attn	attn_maskr   r   	block_ctxr   cachenb_relevant_lyric_tokensencoder_lenrecord_attn)r   rm   rP   r  r   ATTENTION_MAPr   s         r3   r   zJukeboxAttention.__init__/  s*   ++~~**44t~~EF
"fnn4
$]]E)
KK	))'
CDK)$..*q.IDM'
QGDK#J?JJv':':;ZZ(<(<= #))DH,&~~DH((DH  ??,<=??,<=%)%>%>@P$Q $ 4 4d;!..	:$($<$<i#H $6??G4	
 %2)$<!	4>mmmm;;"!ZZ4;;6DN
!:: r5   c           
      >   | j                   }| j                  rt        j                  ||z  ||z        }n*t        j                  ||      }|j	                  ||z         |j
                  }|j                         }| j                  rrt        | j                  |j                  d      |j                  d      | j                  | j                  |j                  || j                        }|||z  dd|z
  z  z   }t        j                   |d      j#                  |      }	| j$                  rJ|	| _        | j(                  dk(  r4| j&                  d d d d | j*                  d d | j*                  f   | _        | j-                  |	      }	t        j                  |	|      }
|
S )Nr   g    er   r   r  )r  rm  r!   r-  mul_r   re   r   r   r  r    r   r   r>   r   r%   r&   typer  attention_probr  r  r  )r   query_states
key_statesvalue_statesr   r  attention_weightattn_weight_typer   r  context_statess              r3   _attnzJukeboxAttention._attne  ss   

==$||L5,@*uBTU$||L*E!!%%-0+11+11399 !!"%# ''	D #3d#:TQX=N#N #3<AABRS"0D~~-&*&9&9!Q@P@P@RTfVZVfVfTf:f&g#**>:nlCr5   c                     |j                  dddd      j                         }g |j                         d d |j                  d      |j                  d      z  } |j                  | S )Nr   r8   r   rZ   r  r   )r<  r   r    r   )r   r   new_hidden_states_shapes      r3   merge_headszJukeboxAttention.merge_heads  ss    %--aAq9DDF"oM$6$6$8"$="o}?Q?QRT?UXeXjXjkmXn?n"o!}!!#:;;r5   c                     g |j                         d d | j                  |j                  d      | j                  z  } |j                  | }|r|j                  dddd      S |j                  dddd      S )Nr   r   r8   rZ   r   )r    r  r   r<  )r   r   is_keyr  s       r3   split_headszJukeboxAttention.split_heads  s    #
!#2&#
LL#
 r"dll2#

 +**,CD ((Aq!44 ((Aq!44r5   c                     | j                  |      }| j                  |d      }| j                  |      }| j                  ||||      }| j                  |      }|S )NT)r  )r  r   r  )r   querykeyr   r   r  s         r3   r  zJukeboxAttention.dense_attn  sa      's40  'E3v>)).9r5   c                    | j                   }|j                  \  }}}|r%| j                  ||||      j                  |d|      S |j                  d   }	|j                  ||	z  |z  ||      }|	|k  r6|	}|d d | d f   j	                         }|d d | d f   j	                         }|j                  ||z  |z  ||      }|j                  ||z  |z  ||      }| j                  ||||      j                  |||      S r  )r  r\   r  r   r   )
r   r  r	  r   r   r  ro   rV  r   r   s
             r3   r  zJukeboxAttention.block_attn  s   NN	).&
GY??5#uf=BB:qR[\\ ;;q>LJJzL8IEyR[\Eg%&!gXY,'224a'l+668((:/9<iSCJJzG3y@)YWE??5#uf=BB:wXabbr5   c                    | j                   }|j                  \  }}}|rK|dz
  |z  }	|d d |	d |d d f   }|d d |	d |d d f   }| j                  ||||      j                  |d|      S |j                  d   }
|j                  ||
|z  ||      }|j	                  dd      j                         }|j                  ||z  |
|z  |      }|j                  |||z  ||      }|j	                  dd      j                         }|j                  ||z  ||z  |      }|j                  |||z  ||      }|j	                  dd      j                         }|j                  ||z  ||z  |      }| j                  ||||      }|j                  |||
|z  |      }|j	                  dd      j                         }|j                  ||
|      }|S )Nr   r8   )r  r\   r  r   	transposer   )r   r  r	  r   r   r  ro   rV  r   	block_lenr   r  s               r3   r  z%JukeboxAttention.transpose_block_attn  s   NN	).&
GY 1	1Ia-I-q01C!Y1	1145E??5#uf=BB:qR[\\ ;;q>LJJz<9+DiQZ[EOOAq)446EJJzI5|y7PR[\E((:w)';Y	RC--1%002C((:	17i3GSCJJz7i+?IVEOOAq)446EJJzI5w)7KYWEUFCJ#YPY@Y[deJ#--a3>>@J#\9MJr5   c                    | j                   }|j                  \  }}}|r|dz
  |z  }	|	dz
  |z  }
|	dkD  r#|d d |
|
|z   d d f   }|d d |
|
|z   d d f   }n\t        j                  ||||j                  |j
                        }t        j                  ||||j                  |j
                        }| j                  ||||      j                  |d|      S |j                  d   }|j                  ||z  |z  ||      }|j                  |||z  ||      d d d dd d d d f   }t        j                  j                  j                  |d      }|j                  ||z  |z  ||      }|j                  |||z  ||      d d d dd d d d f   }t        j                  j                  j                  |d      }|j                  ||z  |z  ||      }||k  r||z  }||z  }|}|j                  ||||      d d | d f   }|j                         j                  ||z  ||      }|j                  ||||      d d | d f   }|j                         j                  ||z  ||      }| j                  ||||      j                  |||      S )Nr   r   r>   r   r   r   r   r   r   r   r   )r  r\   r!   r;   r>   r   r  r   r   r   r   r   )r   r  r	  r   r   r  ro   rV  r   r   prev_lr   nb_query_blocksnb_key_blockss                 r3   r  z JukeboxAttention.prev_block_attn  s   NN	).&
GYq[Y.Eai9,Fqy!Vfy&88!;<a&9*<!<a?@kk*i5<<_d_j_jkJ	9U\\afalalm??5#uf=BB:qR[\\ ;;q>LJJzL8IEyR[\E((:w)';Y	RSTVYWYVY[\^_S_`C((%%))#/ABC((:/9<iSCJJz7i+?IVWXZ][]Z]_`bcWcdEHH''++E3EFEJJzG3y@)YWEg%".)"; '9 4&hhz=)YOPQTcScSdPdenn&++J,H)U^_

:}iSTUXgWgWhThi((*//
_0LiYbc??5#uf=BB:wXabbr5   c                 *   | j                   }| j                  }|j                  \  }}}	|r|d d |dz
  ||z  dz
  |d d f   }t        j                  j
                  j                  |d      }|d d |dz
  ||z  dz
  |d d f   }t        j                  j
                  j                  |d      }| j                  ||||      j                  |d|	      S |j                  ||||z  |	      d d d ddd d f   }t        j                  j
                  j                  |d      }|j                  ||||z  |	      d d d ddd d f   }t        j                  j
                  j                  |d      }| j                  ||||      j                  |||	      S )Nr   r   r   )	r   r  r\   r!   r   r   r   r  r   )
r   r  r	  r   r   r   r  ro   rV  r   s
             r3   r  zJukeboxAttention.summary_attn  s   NN	).&
GYaQ));a)?)KQNOC((%%))#|<C!Y]Vi-?!-CiOQRRSEHH''++E<@E??5#uf=BB:qR[\\((:vw&/@)LQPSQSPSUWYZ][C((%%))#|<CJJz67f3DiPQRTWUWTWY[]^Q^_EHH''++E<@E??5#uf=BB:wXabbr5   c                    | j                   }| j                  }|j                  \  }}}	|rt        |j	                  ||||z  |	      d d d d| d d d f   }t
        j                  j                  j                  |d      j                         }|j	                  |||z  |	      }|j	                  ||||z  |	      d d d d| d d d f   }t
        j                  j                  j                  |d      j                         }|j	                  |||z  |	      }| j                  ||||      j	                  |||	      S )Nr   r  )r   r   r\   NotImplementedErrorr   r!   r   r   r   r   r  )
r   r  r	  r   r   r   r   ro   rV  r   s
             r3   r  z$JukeboxAttention.summary_spread_attn  s7   ).&
GY%%((:vw&/@)LQPSQSPSV\U\U]_`M`aC((%%))#/ABMMOC((:v	BCJJz67f3DiPQRTWUWTWZ`Y`YacdQdeEHH''++E3EFQQSEJJz6F?IFE??5#uf=BB:wXabbr5   c                 n    | j                   }|d d d |f   }|d d d |f   }| j                  ||||      S r   )_encoder_lenr  )r   r  r	  r   r   r  s         r3   r  zJukeboxAttention.prime_attn  sF    ''!\k\/"a+o&uc5&99r5   c                 
   |j                   d   }|t        d      |j                  dd      \  }}}|r| xj                  |z  c_        | j	                  ||      \  }}| j                         }| j                         |kD  r| j                  |        |dkD  rG| j                  dk7  r5| j                  |d      }| j                  |      }| j                  |      }d	}n| j                  d
   }| j                  d   }||||fS )Nr   )last_encoder_hidden_states should be NonerZ   r8   r   r  T)r  Fr	  r   )r\   	TypeErrorrb   r   _append_cache_suff_cache_len
_cache_len_slice_cacher  _pad_to_block_ctxr  )	r   r   last_encoder_hidden_statesr   curr_ctxr  r	  r   l_caches	            r3   r  zJukeboxAttention.factored_qkv  s    &&q)%1GHH)//q/9sEMMX%M++C7JC**,G 7*!!7(+!|>>\1 2252EE005C 2259Ejj'

7+c5&((r5   c                    |j                   d   }|t        d      |j                  dd      \  }}}|r| j                         | j                  k  r| j                  ||       | j                         | j                  kD  r| j                  d| j                         | j                  d   | j                  d   }}| xj                  |z  c_        ||||fS )	Nr   r  rZ   r8   r   r   r	  r   )	r\   r  rb   r  r  r  r  r  r   r   r   r!  r   r"  r  r	  r   s           r3   r  zJukeboxAttention.prime_qkv4  s     &&q)%1GHH)//q/9sE 4#4#44""3. 4#4#44!!!T%6%67E*DJJw,?CMMX%Mc5&((r5   c                    |j                   d   }|}|r| j                  dk(  rN| j                  |j                  |            j	                  dd      \  | j
                  d<   | j
                  d<   | j
                  d   | j
                  d   }}| xj                  |z  c_        n4| j                  |j                  |            j	                  dd      \  }}||||fS )Nr   r   r8   r   r	  r   )r\   r   r  r   rb   r  r%  s           r3   r  zJukeboxAttention.decode_qkvB  s     &&q)}}!9=.66}E:%q%/ 7

5!4::g#6 E*DJJw,?CMMX%M'A'I'I-'XY__`agh_iJCc5&((r5   c                 d   |j                   d   }| j                  |      }| j                  |||      \  }}}}| j                  ||||      }|j                   d   |k7  r0| j	                  |      }	|d d |	|	|z   d d f   j                         }| j                  |      }| j                  |      S )Nr   r!  r   )r\   r  r  r  _offsetr   r  r  )
r   r   r!  r   r"  r  r	  r   attention_scoresrG   s
             r3   r   zJukeboxAttention.forwardP  s     &&q)M2$(HH6PY_ %- %
!sE6  99UC?!!!$0\\(+F/6FX<M3Mq0PQ\\^;;'78!!"233r5   c                 \    | j                   }|| j                  z  dz   }|| j                  z  S r  )r  r   )r   r  encoder_blockss      r3   r  zJukeboxAttention._encoder_len]  s/    &&%49++r5   c                 Z    | j                   dk(  ry| j                  |z
  | j                  z  S )Nr  r   )r  r   r  )r   r"  s     r3   r)  zJukeboxAttention._offsetc  s)    >>\)(DNN::r5   c                    |j                   d   }|r| j                  |      nd}||z   | j                  z   dz
  | j                  z  }|| j                  z  |z
  |z
  }|dk(  r|dk(  r|S t        j                  |dd||f      S )Nr   r   )r\   r)  r  r%   r   )r   r   r  rV  rG   n_blocksr   s          r3   r   z"JukeboxAttention._pad_to_block_ctxh  s    %%a(*/g&Qf$t~~59dnnL''1F:!8!  55Avs(;<<r5   c                 Z    d| j                   vrdS | j                   d   j                  d   S )Nr	  r   r   )r  r\   r   s    r3   r  zJukeboxAttention._cache_lenr  s,    +qKE1B1H1H1KKr5   c           	         | j                   dz
  | j                  z  dz   | j                  z   }| j                   | j                   dz
  | j                  z  dz   | j                   | j                   | j                  k  r| j                   n|| j                  t        | j                   | j                        d}|| j
                     S )z
        Precondition:
            key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
            location in the context.
        r   )r  r  r  r  
cross_attnr  )r   r  r  r   r  r  )r   previous_block_lengthREQUIRED_CACHE_LENs      r3   r  z JukeboxAttention._suff_cache_lenu  s     "&!2dnn Dq H4>> Y--==1,>B$(MM040Ot}}Uj**dmmT->->?
 "$..11r5   c                     | j                   d   d d ||f   | j                   d<   | j                   d   d d ||f   | j                   d<   y )Nr	  r   )r  )r   rS   rv   s      r3   r  zJukeboxAttention._slice_cache  sJ     JJu-asl;

5"jj1!U3Y,?

7r5   c                    d| j                   vr|| j                   d<   || j                   d<   n||}}t        j                  | j                   d   |gd      }t        j                  | j                   d   |gd      }| j                   d= | j                   d= ~~|| j                   d<   || j                   d<   | j                   d   | j                   d   fS )Nr	  r   r   r   )r  r!   r:   )r   r	  r   old_key	old_values        r3   r  zJukeboxAttention._append_cache  s    

" #DJJu"'DJJw!$eYG))TZZ.8a@CIItzz'2I>AFE

5!

7# #DJJu"'DJJwzz% $**W"555r5   c                     d| _         d| j                  v r| j                  d= d| j                  v r| j                  d= i | _        y )Nr   r	  r   )r   r  r1  s    r3   	del_cachezJukeboxAttention.del_cache  s@    DJJ

5!djj 

7#
r5   r  r   NFr   )r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r   propertyr  r)  r   r  r  r  r  r;  r   r   s   @r3   r  r  .  s    4!l D<

5c :#cJc&c$:).))4 , ,
;
=L2$@6 r5   r  c                   (     e Zd Zd fd	ZddZ xZS )JukeboxBlockc                 D   t         |           |j                  | _        t	        |||      | _        t        |j                        | _        t        |      | _	        t        |j                        | _
        |j                  rd|j                  z  nd| _        || _        y )Nr  r   )r   r   r  r   r  r  r  layer_norm_0r  mlplayer_norm_1attn_res_scale
num_layersr   r  )r   rm   rP   r  r   s       r3   r   zJukeboxBlock.__init__  s    ''
$VUiH	,V-?-?@f%,V-?-?@4:4I4Iv000s"r5   c                     |}| j                  |      }| j                  |||      }| j                  ||z         }| j                  |      }| j                  dk(  r
||z   |z   }|S || j                  ||z   z  z   }|S )Nr   )rC  r  rE  rD  r   )r   r   r!  r   r   output_statesoutputs          r3   r   zJukeboxBlock.forward  s    !	))-8		-1KVT)))m*CD/>>S .>F  ==3P!QQFr5   r<  r   r   r   s   @r3   r@  r@    s    	#r5   r@  c                   2     e Zd Z fdZd ZddZd Z xZS )JukeboxLayerStackc           
      0   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        | j                  || j                  z  | _        |j                  | _
        |j                  | _        t        | j                     }t        j                         | _        t!        | j
                        D ]/  }| j                  j#                  t%        || ||                   1 g | _        y )NrB  )r   r   rP   r  r   rG  r   attention_patternr  r  r  r  r   r   r   
_attn_modsr@   rO   r@  saved_attn_weights)r   rm   rP   rN  r   r   s        r3   r   zJukeboxLayerStack.__init__  s    
''
 ++mm!'!9!9;;""dkk1DN!::~~ /t/E/EF--/4??+ 	dEOO""<IZ[`Ia#bc	d #%r5   c                     fd}t        | j                        D ]  \  }} ||      |j                  _         sg | _        yy)a-  
        Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.

        Args:
            record_attn (`Union[bool,set]`):
                Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether
                to dump all.
        c                 0    t        t              rS | v S r   )r|  r(   )	layer_idxr  s    r3   _should_record_attnz>JukeboxLayerStack.set_record_attn.<locals>._should_record_attn  s    +t,""++r5   N)	enumeraterO  r  r  rP  )r   r  rT  r   layers    `   r3   set_record_attnz!JukeboxLayerStack.set_record_attn  sI    	,
 "$//2 	<HAu%8%;EJJ"	< &(D# r5   c                 ,   t        | j                        D ]{  \  }}|j                  dk(  r ||||      }n ||d |      }|j                  j                  sC| j
                  j                  |j                  j                  j                         } |S )Nr  r(  )	rU  rO  r  r  r  rP  rO   r  r   )r   r   r!  r   r   
attn_layers         r3   r   zJukeboxLayerStack.forward  s    &t7 	NMAz##'88 *!>Xag! !+=UYbh i**''..z/E/E/L/LM	N r5   c                 Z    | j                   D ]  }|j                  j                           y r   )rO  r  r;  )r   rY  s     r3   r;  zJukeboxLayerStack.del_cache  s%    // 	(JOO%%'	(r5   r=  )r   r   r   r   rW  r   r;  r   r   s   @r3   rL  rL    s    %()*(r5   rL  c                   $     e Zd Z fdZd Z xZS )JukeboxPositionalEmbeddingc                     t         |           t        j                  t	        j
                  ||f            | _        y r   )r   r   r   r   r!   r   pos_emb)r   r   r   r   s      r3   r   z#JukeboxPositionalEmbedding.__init__  s,    ||EKKE0B$CDr5   c                     | j                   }|S r   )r^  )r   r^  s     r3   r   z"JukeboxPositionalEmbedding.forward  s    ,,r5   r   r   s   @r3   r\  r\    s    Er5   r\  c                   |     e Zd Z	 	 	 	 	 d fd	Z	 	 	 	 	 	 ddZd Z	 	 	 	 	 	 	 	 d	dZd Z	 	 	 	 	 	 	 	 	 d
dZ xZ	S ) JukeboxConditionalAutoregressivec                 N   t         |           |j                  | _        |j                  | _        ||n|j
                  | _        ||n|j                  | _        t        j                  | j                  |j                        | _
        t        j                  |j                        | _        || _        || _        |s9t        j                   t#        j$                  d|j                  f            | _        t)        | j
                  |j                        | _        t        j                  |j                        | _        t/        || j
                        | _        || _        |j4                  | _        |j8                  rd| _        d| _        nd| _        d| _        |st        j>                  |j                  | j                  d      | _         | j<                  r%| j                  jB                  | j@                  _!        t"        j                  jE                         | _#        yy)aa  
        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
        set fro each configuration.

        Args:
            config (`JukeboxPriorConfig`):
                Model configuration class with all the parameters of the model. Initializing with a config file does
                not load the weights associated with the model, only the configuration. Check out the
                [`~PreTrainedModel.from_pretrained`] method to load the model weights.
            n_ctx (`int`, *optional*):
                Number of tokens or lyrics tokens provided in a single pass.
            embed_dim (`int`, *optional*):
                Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
                if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
            audio_conditioning (`bool`, *optional*, defaults to `False`):
                Whether or not the prior supports conditionning on audio.
            metadata_conditioning (`bool`, *optional*, defaults to `False`):
                Whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
            is_encoder (`bool`, *optional*, defaults to `False`):
                Whether the model is an encoder only model.
        Nr   )rP   FTr   )$r   r   r  r   rG  rP   music_vocab_sizer   r   r}  embed_tokensr  emb_dropoutembed_tokens_dropoutmetadata_conditioningaudio_conditioningr   r!   r   start_tokenr\  r^  pos_emb_dropoutrL  transformer
is_encoderr  r  merged_decoderadd_cond_after_transformershare_embed_tokens_fc_proj_outr  fc_proj_outr   CrossEntropyLossr  )r   rm   rP   r   ri  rh  rm  r   s          r3   r   z)JukeboxConditionalAutoregressive.__init__  s   > 	''
 ++#/UV\\
&/&;AXAXLL9K9KL$&JJv/A/A$B!%:""4$!||EKKF<N<N8O,PQD1$**f>P>PQ!zz&*<*<=,V4::F$!::  .3D+27D/.2D+26D/!yy););T^^RWXD22*.*;*;*B*B  '113DI	 r5   c                 B   |j                   d   }t        j                         5  |j                  |d      j	                         }ddd       | j
                  snt        j                  |d| j                  f|j                  | j                  j                  d   j                  j                  j                  j                        }|}	| j                  |      }
t        j                   |
ddddf   |
ddddf   fd      }
| j"                  r$|j                  || j                        |
dddf<   n| j$                  |
dddf<   | j'                  |
      | j)                  | j+                               z   |z   }
| j                  |
|      }
| j,                  r|
|z   }
|
}| j.                  r|
S | j1                  |
      }
t3        j4                         }|r|
ddd| j6                  f   j9                  d| j:                        }|
dd| j6                  df   j9                  d| j:                        } |||	ddd| j6                  f   j9                  d            t=        j>                  d      z  } |||	dd| j6                  df   j9                  d            t=        j>                  d      z  }||f}nH ||
j                  d| j:                        |	j                  d            t=        j>                  d      z  }|r||
fS |r||fS |dfS # 1 sw Y   xY w)	z
        Args:
            tokens (`torch.tensor`):
                Can represent music tokens, lyrics tokens or both, depending on the configuration.
        r   r   Nr   r  r   )r!  r7   ) r\   r!   r,  r   r<   ri  r;   r   r>   rl  rO  rD  r  r   r   re  r:   rh  rj  rg  rk  r^  ro  rm  rq  r   rr  r  reshaper   rh   r/  )r   rI   ri  rh  r!  	get_predsget_actsget_sep_lossro   targetr   activationsloss_fnlyric_hidden_statestoken_hidden_states
lyric_lossmusic_token_lossr  s                     r3   r   z(JukeboxConditionalAutoregressive.forwardD  s     \\!_
]]_ 	8[[R0557F	8 &&!&Q

+}}&&11!488==DDJJ" ))&1		=BC#8-3B3:O"PVWX%%"7"<"<Z"TM!Q$"&"2"2M!Q$ %%m4t7K7KDLLN7[[^pp 	 ((6P ) 
 **),>>M#??  ((7%%'"/3ET5E5E3E0E"F"N"NrSWSaSa"b"/43C3C3E0E"F"N"NrSWSaSa"b !4fQ@R$BRBR@R=R6S6[6[\^6_`cecicijmcnnJ&':F1dFVFVFXCX<Y<a<abd<efikioiopsitt 01D=--b$..A6;;r?SVXV\V\]`VaaD&&$$:g	8 	8s   !LLc                 F   |dk(  rt        j                  |d| j                  | j                  j                  j
                        j                  | j                  j                  j                        }| j                  r$|j                  || j                        |d d df<   n%| j                  |d d df<   n| j                  |      }|j                  || j                  | j                  fk(  r|d d ||dz   d d f   }n|}|| j                         ||dz    z   |z   }||fS )Nr   r   r   )r!   r   r   re  r   r   r=   r>   rh  r   rj  r\   rP   r^  )r   r   r  rI   ri  rh  r   conds           r3   get_embz(JukeboxConditionalAutoregressive.get_emb  s	   q=!KK	1djjHYHYH`H`HfHfgjj!!((//M ))&;&@&@DJJ&Wad#&*&6&6ad# --f5M##	4::tzz'JJ%aHqL)@!&CDD%D%x(Q,(OORVVd""r5   c
           	      R   |	| j                   }	| j                  st        j                  |d| j                  f| j
                  j                  d   j                  j                  j                  j                        j                  | j                  j                        }t        j                         5  g }
d }|rg }t        t!        d|	      d      }|D ]  }|j#                  d|	 dd	       | j%                  |||||      \  }}| j                  ||d
      }| j&                  r||z   }| j                  |      }|rj)                  |j+                                ||z  }t-        |||      }t        j.                  j1                  |      j3                         }|
j)                  |j+                                 ~| j
                  j5                          t        j6                  |
d      }|rt        j6                  d      }d d d        |rfS S # 1 sw Y   xY w)Nr   r   r   FleavezAncestral sampling  music tokensTrefreshr(  r+   r,   r*   r   )rP   ri  r!   r;   r   rl  rO  rD  r  r   r   r=   rq  r>   r,  r   r@   set_descriptionr  ro  rO   r   r4   distributionsCategoricalr   r;  r:   )r   r  ri  rh  r!  tempr+   r,   ru  sample_tokensrr  rI   predsiterr   r   r  s                    r3   r   z'JukeboxConditionalAutoregressive.sample  s      JJM&&!&Atzz*$2B2B2M2Ma2P2T2T2Y2Y2`2`2f2f"b!!(()  ]]_ !	0NFa/u=D  6$$':=/%Wae$f&*lli1CEZ'#t !% 0 0!>Xae !1 ! 22$1D$8M $ 0 0 ?LL!4!4!67 - 4 -m5PU V,,888NUUW%%flln5'6* &&(YY~15F		%Q/C!	0D 5= MK!	0 !	0s   5EHH&c                 H    ||z   dz
  |z  }g |g|dz
  z  |dz
  |z  dz   }|S r  r]  )r   length
chunk_sizen_passeschunk_sizess        r3   split_chunksz-JukeboxConditionalAutoregressive.split_chunks  sE    Z'!+
:U15U
j7PST7TUr5   c                    || j                   }|j                  d   }t        j                         5  |j	                  |d      j                         }d d d        t        j                  |dd      }t        |      }| j                  s|t        j                  |d| j                  f| j                  j                  d   j                  j                  j                  j                         j#                  |j$                        }t        j                         5  |	rg }|
t'        |      }
| j)                  t'        |      |
      }g }d}d }t+        |dd      D ]  }g g }}t-        |||z         D ]A  }| j/                  |||||      \  }}||   }|j1                  |       |j1                  |       C ||z   }t        j2                  |d      t        j2                  |d      }}~~|	s~| j                  ||d	
      }|	r$| j4                  r|z   }~|j1                  |       ~ |	r9t        j2                  |d      }| j7                  |      }j1                  |       |d   }t+        t-        t'        |      |      dt'        t-        t'        |      |             dd      }|D ]  }| j/                  |||||      \  }}| j                  ||d	
      }| j4                  r||z   }| j7                  |      }|	rj1                  |       ||z  }t9        |||      }t        j:                  j=                  |      j?                         }|j1                  |jA                                |} ~~| j                  jC                          t        j2                  |d      }|	rt        j2                  d      }d d d        |	rfS S # 1 sw Y   xY w# 1 sw Y   xY w)Nr   r   r   r   r   zPreparing past key valueF)rW   r  Tr(  	Sampling r  r  r  )"rP   r\   r!   r,  r   r<   splitr?   ri  r;   r   rl  rO  rD  r  r   r   r=   r>   r9   r  r   r@   r  rO   r:   ro  rq  r4   r  r  r   r   r;  )r   r  lyric_and_music_tokensri  rh  r!  r  r+   r,   ru  r  r  ro   sampled_audior  r  x_primesrS   r  current_chunk_sizesampled_audio_primeconds_primer   x_prime
cond_primeinput_tokens
itereratorr   r  rj   s                                 r3   primed_samplez.JukeboxConditionalAutoregressive.primed_sample  s      JJM+11!4
]]_ 	X%;%@%@R%P%U%U%W"	X $:A1E]+&&!&Atzz*$2B2B2M2Ma2P2T2T2Y2Y2`2`2f2f"b'../  ]]_ M	0 ! /
++C,>
KKHEE&*;=W_d&e  "35r[# %eU5G-G H 3H*.,, )U4FH]+'GZ *(3E'..w7&&z23  22&+ii0C&KUYYWbhiMj' "**7Oirv*w66")J"6"OOG,/ 2 ))H!4**73W% ),Lc-(-8 U3}+=}%M!N O}]J
 ' ,&*lli7IK`'#t !% 0 0!>Xae !1 ! 22$1D$8M $ 0 0 ?LL/ - 4 -m5PU V$22>>m>T[[]$$\%7%7%9:+',* l&&( 99]:L		%Q/[M	0\ &&y	X 	XM	0 M	0s   !OJ'OOO')NNFFF)NNNFFF)NNNr   r   r   FN)	NNNr   r   r   FNN)
r   r   r   r   r   r  r   r  r  r   r   s   @r3   ra  ra    s      #=4D  "#'DL#,  "#'9v  "#'n r5   ra  c                   *     e Zd ZdZ fdZddZ xZS )JukeboxMusicTokenConditionerz
    The `JukeboxMusicTokenConditioner` takes music tokens as an input (coresponding to the codes of the VQVAE's
    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
    c           	      t   t         |           t        j                  |j                  |j
                        | _        |j                  |_        t        ||j
                  |j                  |j                  |j                  |   |j                  |   d      | _        t        |j
                        | _        y )NF)r   )r   r   r   r}  rd  r  re  r   r   r  r  r  r  	upsamplerr  r  )r   rm   rn   r   s      r3   r   z%JukeboxMusicTokenConditioner.__init__T  s    LL)@)@&BTBTU!22/!!!!u%  '"
 +6+=+=>r5   c                     |d}|j                         }| j                  |      }||z   }|j                  ddd      }| j                  |      }|j                  ddd      }| j	                  |      }|S )a?  
        Args:
            music_tokens (`torch.LongTensor`):
                Music tokens form the uper level in range(nb_discrete_codes)
            raw_audio_conditionning (`torch.LongTensor`, *optional*):
                Audio used when primed sampling, raw audio information that conditions the generation
        r   r   r8   r   )r<   re  r<  r  r  )r   rj   raw_audio_conditionningr   s       r3   r   z$JukeboxMusicTokenConditioner.forwardd  s     #*&)##((*)),7%(?? &--aA6}5%--aA66r5   r   r   r   r   __doc__r   r   r   r   s   @r3   r  r  N  s    
? r5   r  c                   ,     e Zd ZdZd fd	ZddZ xZS )JukeboxRangeEmbeddinga  
    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional
    embedding of length `n_ctx`.

    Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end)
    -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <=
    end
    c                     t         |           || _        || _        t	        j
                  ||      | _        |\  | _        | _        || _	        y r   )
r   r   n_timer   r   r}  embpos_minpos_maxr   )r   r  r   r@   	out_widthr   r   s         r3   r   zJukeboxRangeEmbedding.__init__  sC    "<<	95%*"dl
r5   c                 z   t        |j                        dk(  st        d|j                         | j                  |k  j	                         sE|| j
                  k  j	                         r(t        d| j                   d| j
                   d|       |j                         }|B| j                  r&|j                  | j                  | j
                        }|j                         }| j                  }|dk7  rPt        j                  d|t        j                  |j                        j                  d|      |z  }|||z
  |z  z   }n|}|| j                  z
  | j
                  | j                  z
  z  }| j                  |z  j                         j                         j!                         }| j#                  |      S )	Nr8   z Expected shape with 2 dims, got z
Range is [,z), got r   r   rV   )r9   r\   r  r  allr  re   r   r  r!   aranger>   r   r   floorr<   r^  r  )r   	pos_startpos_endr  interpolationpositionnormalised_positionbins_s           r3   r   zJukeboxRangeEmbedding.forward  sr   9??#q(>y>OPQQ	)..0i$,,6N5S5S5Uja~WYKXYYOO%	zz!--dllCmmoGQ;Qekk)BRBRSXXYZ\bcfll  !Gi$7=#HHH H  ($,,64<<$,,;VW"55<<>CCELLNxxr5   r   r   r  r   s   @r3   r  r  {  s    r5   r  c                   $     e Zd Z fdZd Z xZS )JukeboxLabelConditionerc                 .   t         |           |j                  }|j                  }|j                  }|j
                  \  }}|j                  }|j                  | _        t        j                  ||      | _
        t        j                  ||      | _        || _        | j                  rm|j                  |z  |j                  |z  f}	d|j                  |z  f}
d}t        d||	|      | _        t        |||
|      | _        t        ||||d      | _        y y )Nr   )r   r   r   T)r   )r   r   r  timing_dimsr  metadata_dimsrP   max_nb_genresr   r}  bow_genre_emb
artist_embinclude_time_signalmin_durationmax_durationr  total_length_embabsolute_pos_embrelative_pos_emb)r   rm   r  r   r  r  	nb_genres
nb_artistsr  total_length_rangeabsolute_pos_rangerelative_pos_ranger   s               r3   r   z JukeboxLabelConditioner.__init__  s   &&	((,, & 4 4	:#\\#11\\)Y?,,z9=#6 ##"("5"5"EvGZGZ]jGj!k"%v':':]'J!K!+$9![J\^g$hD!$9"K1CY%D! %:"K1CYVZ%D! $r5   c                 \   |d d ddf   }|d d ddf   }|d d ddf   }|d d ddf   }|d d dd f   }| j                  |      }|dk\  j                         j                  d      }| j                  |j	                  d            |z  j                  dd      }	|	|z   }
| j                  rx|||z   }}|j                         }|j                         }|j                         }| j                  |      | j                  ||      z   | j                  ||z  ||z        z   }|
|fS d }|
|fS )Nr   r   r8   rZ   r   TrH  )
r  re   rC   r  r   r.  r  r  r  r  )r   rw   rF   rG   r  artistgenrer  r   	genre_emb	start_embrS   rv   r^  s                 r3   r   zJukeboxLabelConditioner.forward  sa   1Q3'!QqS&!!QqS&!!QqS&!AB __V,

!!#--a0''A7$>CCSWCX	
*	 ##&3E'--/LKKME))+C%%l3''s34''(<cL>PQR  '!! G'!!r5   r   r   s   @r3   r  r    s    2"r5   r  c                   "    e Zd ZdZeZd Zddef fdZddZd Z	d	 Z
d
 Zd Zd ZddZddZd Z	 	 	 	 	 	 	 	 ddZddZd Zg dddfdZ	 	 ddej,                  deeej2                        dee   dee   deej,                     f
dZ xZS )JukeboxPrioru  
    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
    seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù
    is defined, it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist,
    genre, lyrics and codes from lower-levels Priors.

    Args:
        config (`JukeboxPriorConfig`):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        level (`int`, *optional*):
            Current level of the Prior. Should be in range `[0,nb_priors]`.
        nb_priors (`int`, *optional*, defaults to 3):
            Total number of priors.
        vqvae_encoder (`Callable`, *optional*):
            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
            the vqvae module to avoid getting the parameters.
        vqvae_decoder (`Callable`, *optional*):
            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
            the vqvae module to avoid getting the parameters.
    c                 B   | j                   j                  }t        |t        j                        r,|j
                  j                  j                  dd|z         nt        |t              rh| j                   j                  r&|j
                  j                  j                          n|j
                  j                  j                  dd|z         nt        |t              r,|j                  j                  j                  dd|z         n`t        |t              r6|j                  j
                  j                  j                  dd|z         nt        |t              rAt!        |d      r5|j"                  j
                  j                  j                  dd|z         nt        |t              r7t!        |d      r+|j$                  j                  j                  dd|z         nt        |t&              rr| j                   j                  r\|j(                  j*                  j                  j                          |j(                  j,                  j                  j                          t        |t        j.                        rI|j,                  j                  j                          |j
                  j                  j1                  d       t        |t        j2                        r2|j,                  %|j,                  j                  j                          y y y )Nr   rz  r{  r  lm_headrj  r   )rm   r  r|  r   r}  r   r~  r  r   r  r  r\  r^  r  r  ra  hasattrr  rj  r   r   weigthr   r   r  r  )r   r  r  s      r3   r  zJukeboxPrior._init_weights  s'   [[++
fbll+MM&&CTJ5F&G.{{##""((*""**
9J*K :;NN''SdZ6G'H 56JJ""**
9J*K @AgfV_F`NN!!&&..CTJ=N.O @AgfVcFd##++$:K+L 564;;;O;OOO""''--/OO  %%++-fbll+KK""$MM$$S)fbii(V[[-DKK""$ .E(r5   Nrm   c           
         t         	|   |       || _        || _        || _        ||n|j
                  | _        d| j
                   | _        |j                  | _        |j                  dkD  | _	        |j                  | _        |j                  | _
        | j
                  dk7  | _        | j
                  dz
  | _        | j                  rt        || j
                        | _        |j                  | _        | j                  rt!        || j                         | _        |j$                  | _        |j$                  r|j                  |j                  g| _        d|j(                  g| _        |j,                  | _        |j                  | _        t1        ||j                  |j                  z   |j(                  |j2                  z   | j                  xs | j                  d      | _        nO|j6                  }| j                  dk7  r| j                  r|j,                  | _        |j,                  | _        |j(                  | _        t1        || j                  | j<                  ddd      | _        tA        |j,                  |j,                        | j>                  _!        tE        |j,                        | j>                  _#        tI        jJ                  |j,                  |j(                  d	      | j>                  _&        nd| _        t1        || j                  xs | j                  | j                  
      | _        |j                  | _'        | j                  | jN                  z   | _(        tS        |jT                  |jV                        D cg c]
  \  }}||z   c}}| _,        | j
                  dk7  r| jX                  | j
                     nd | _-        t]        j^                  | jX                  d || j
                  z
         | _0        | j                  | j`                  z  | _1        td        jg                  d| j
                   d| jZ                   d| j`                   d| jb                          y c c}}w )Nzpriors.r   r   )r  T)rP   r   ri  rh  F)rP   r   ri  rh  rm  rc  )ri  rh  zLevel:z, Cond downsample:z, Raw to tokens:z, Sample length:)4r   r   vqvae_encodervqvae_decoderr[   rn   r  rP   r  lyric_conditioningencoder_loss_fractionri  
cond_levelr  conditioner_blocksrh  r  metadata_embeddingis_encoder_decoderinput_shapeslyric_vocab_sizeembed_dim_shiftr  r   ra  rd  rl   encoder_configlyric_acts_widthencoder_widthencoder_dimr  r   r  r  final_layer_normr   r  r  next_token_prediction_loss_dimstotal_loss_dimsrc   r  r  r  cond_downsamplerh   r1  raw_to_tokensra   loggerinfo)
r   rm   rn   	nb_priorsr  r  r  r  r  r   s
            r3   r   zJukeboxPrior.__init__  s     +*#/UV\\
#*4::,!7\\
"("A"AA"E(.(G(G%%+%A%A" #'**/**q.""&B64::&VD# &,%A%A"%%&=f^b^u^uZu&vD# #)";";$$!'!@!@&,, OD$%v'>'>#?D ++DJ,2,K,KD)955D 11F4K4KK$($;$;$Yt?Y?Y&*DJ $22N,,1d6M6M(6(B(B%%+%7%7"#)#:#: ?"77"..',*/#  (5^5O5OQWQcQc'd$0@ASAS0T-')yy1C1CVE\E\ch'i$01- :$($;$;$Yt?Y?Y&*&@&@DJ 06||,#<<t?c?cc=@AUAUW]WiWi=jk\VTFDLk?CzzQt//

;TXWWT%5%56N	DJJ8N%OP!ZZ$*<*<<TZZL 243G3G2HHXY]YkYkXl m))*,	
 ls   Q/Fc                    |j                         }||d d df<   t        | j                        |d d df<   t        || j                  z        t        || j                  z        z   |d d ddf<   | j	                  |      \  }}|r||fS |S rY  )r   rA   ra   r  set_metadata_lyric_tokens)r   rk   rS   rF   rG   rX   rw   rJ   s           r3   r`   zJukeboxPrior.get_metadataq  s    <<>%AT//0A v(:(::;c%$J\J\B\>]]AaC !::8D'W$$Or5   c                 f   | j                   dkD  rt        j                  |j                  d   | j                   ft        j                  |j
                        }g }t        |j                  d         D ]}  }|j                         ddd| j                  j                  z   df   }||df   ||df   ||df   }}}t        || j                   |||      \  }	}
|	||ddf<   |j                  |
        t        j                  |dddd| j                  j                  z   f   |fd      |fS |dfS )	z
        Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
        r   rV   Nr   r   r8   r   r   )r  r!   r;   r\   r<   r>   r@   r   r  r  rL   rO   r:   )r   rk   tokens_listindices_listidxrD   rF   rG   rH   rI   rJ   s              r3   r  z&JukeboxPrior.set_metadata_lyric_tokens  sJ    ((1,++a$"?"?@

[a[h[hK LV\\!_- -$llnQD4K4K4Y4Y0Y0[-[\17QQQWX[]^X^Q_hf";!>!>fV^# '-CF###G,- 		6!%Pq4+B+B+P+P'P%P"PQS^_egh 
 4<r5   c                    | j                   dk7  r|| j                   dz
     }|dd|| j                  z  || j                  z  f   }| j                  | j                  z  |d   j                  d   z
  }|dkD  rVt	        j
                  d|      j                  |j                        }t	        j                  ||fd      j                         }|g}|S d}|S )zE
        Extracts current level's conditioning music tokens.
        r   r   Nr   r   )
rn   r  rP   r\   r!   r;   r=   r>   r:   r<   )r   rj   rS   rv   music_tokens_condmissing_cond_len	init_condmusic_tokens_condss           r3   get_music_tokens_condsz#JukeboxPrior.get_music_tokens_conds  s     ::? ,TZZ!^ <,Q9M9M0MPSW[WkWkPk0k-klL#zzT-A-AADUVXDYD_D_`bDcc!#!KK+;<??@Q@X@XY	$)II/@).LRT$U$Z$Z$\!"3!4 "! "&!!r5   c                    |d   j                   d   }t        t        |            D ]3  }||   t        | j                  |         z   j                  |d      ||<   5 t        t        |            D ]W  }||   	t        j                  || j                  |   | j                  f|d   j                  |d   j                        ||<   Y t        j                  |d      t        j                  |d      fS )z
        Shifts the input tokens to account for the dictionary merge. The embed_dim_shift give by how much the music
        tokens should be shifted by. It is equal to `lyric_vocab_size`.
        r   r   rV   r   r   )r\   r@   r9   rA   r  r   r!   r;   r  r   r   r>   r:   )r   rI   condsro   r   s        r3   prior_preprocesszJukeboxPrior.prior_preprocess  s    
 AY__Q'
s6{# 	XAS)=)=a)@%AAGG
TVWF1I	X s5z" 	AQx ;;!2!21!5tzzB&QR)//bhijbkbrbra	 yyQ'5a)@@@r5   c                    |j                   d   }| j                  d   |j                   d   | j                  d   z
  f}t        t        j                  ||d            }t        t        |            D ]R  }t        | j                  |         }||   |z
  j                  |d      ||<   t        j                  ||   d      ||<   T |d   S )z
        Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
        tokens.
        r   r   r   r   )r   )r\   r  r?   r!   r  r@   r9   rA   r  r   r   )r   rI   ro   dimsr   
bins_shifts         r3   prior_postprocesszJukeboxPrior.prior_postprocess  s     \\!_
!!!$fll1o8I8I!8L&LMekk&$A67 s6{# 	6AT11!45JZ/55j"EF1IF1I15F1I	6
 bzr5   c                     |d| j                   dz    }d}t        t        t        || j                  g                  D ]  \  }} |||      } |S )zj
        Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
        Nr   )r  ri   r?   rc   r  )r   r   ri  r  conditioner_blocks        r3   re  zJukeboxPrior.embed_tokens  sj     00E$//A2EF!4<T#FX[_[r[rZsBt=u4v 	Z00!23DFX!Y	Z!!r5   c                     || j                   }|| j                  }t        j                         5  | j	                  ||||      }ddd       |S # 1 sw Y   S xY w)zi
        Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
        Nrh  ri  r  )rn   r[   r!   r,  r  )r   r   rh  ri  r  r2  s         r3   rW  zJukeboxPrior.encode  sj     **KI]]_ 	 ..;)W` / M	 		    AAc                     || j                   }|| j                  }t        j                         5  | j	                  ||||      }ddd       |S # 1 sw Y   S xY w)zK
        Usamples the sequence of codebook vectors to a raw audio.
        Nr  )rn   r[   r!   r,  r  )r   rj   rh  ri  r  rJ  s         r3   rZ  zJukeboxPrior.decode  sh     **KI]]_ 	''+V_ ( F	 		 r  c                    |3|j                   d   | j                  z
  }|ddd|f   |dd|df   }}nd\  }}| j                  r| j                  |      nd\  }}| j                  r| j                  |      n|}|||fS )z
        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens
        can be None.
        Nr   )NN)r\   r  rh  r  ri  re  )r   r   rw   n_labelslyric_tokensrh  metadata_posri  s           r3   get_condzJukeboxPrior.get_cond  s    
 ~~a(4+H+HHH%-a(l%;Xal=SlH%/"Hl151K1KD##H-Q] 	,| GKF]F]T../ABco!#8,FFr5   c
                    |du xs |j                   d   dk(  }
ddd|
   }t        j                  | d| d| d	| d
| 	       t        j                         5  | j                  ||      \  }}}| j                  r{|
r| j                  |gd|g      \  }}n| j                  ||gd|g      \  }}|	|	| j                  z  }	| j                  j                  |||||||||		      }| j                  |      }n^| j                  |d      }|
r$| j                  j                  ||||||||	      }n%| j                  j                  ||||||||||	
      }ddd       |S # 1 sw Y   |S xY w)a  
        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.

        Args:
            n_samples (`int`):
                Number of samples to generate.
            music_tokens (`List[torch.LongTensor]`, *optional*):
                Previously gemerated tokens at the current level. Used as context for the generation.
            music_tokens_conds (`List[torch.FloatTensor]`, *optional*):
                Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
                conditionned on the upper-level tokens.
            metadata (`List[torch.LongTensor]`, *optional*):
                List containing the metatdata tensor with the artist, genre and the lyric tokens.
            temp (`float`, *optional*, defaults to 1.0):
                Sampling temperature.
            top_k (`int`, *optional*, defaults to 0):
                Top k probabilities used for filtering.
            top_p (`float`, *optional*, defaults to 0.0):
                Top p probabilities used for filtering.
            chunk_size (`int`, *optional*):
                Size of the chunks used to prepare the cache of the transformer.
            sample_tokens (`int`, *optional*):
                Number of tokens to sample.

        Nr   r   	AncestralPrimed)TFz
 sampling z samples with temp=z, top_k=z, top_p=)r  r+   r,   r  r  T)r   )r  r+   r,   r  )r\   r  r  r!   r,  r  r  r  r  rl   r  r  get_encoder_statesr   )r   r  rj   r   rw   r  r+   r,   r  r  no_past_contextnameri  rh  r  r  r!  s                    r3   r   zJukeboxPrior.sample  s   J '$.L,2D2DQ2G12L!(3ODtfJyk1DTF(SXRYYabgahij]]_ 3	FJmmTfhpFqC 5|&&"AEAVAV%/A(BB>*,> BFAVAV%|4t=O6PB>*,> !,!T%B%BBM#zz77*&))"/  8 
   $55lC-1-D-D\Z^-D-_*"#'::#4#4!*-2!##&3 $5 	$L $(::#;#;!$*-2!###-&3 $< $LQ3	h i3	h s   C<EE(c                 0   | j                   dk7  r| j                  rx|r*| j                  j                  |j                        | _        | j                  |ddd      }| j                  j                  |      }| j                  j                  |      }|S d}|S )z
        Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
        the lyric encoder.
        r   N)r  r  r  r=   r>   r  r  )r   r  r   
lyric_actsr!  s        r3   r  zJukeboxPrior.get_encoder_statesd  s    
 ((A-$2I2I#|||/B/BClD$EJ--j9J)-)F)Fz)R& *) *.&))r5   c                 X   | j                   r|| j                  j                  |      }t        j                  j                  |j                  d| j                        |j                  d            t        j                  d      z  }|S t        j                  d|j                        }|S )zW
        Computes the loss for the lyric encoder: next lyric token prediction.
        r   r7   r   r   )r  r  r  r   r   cross_entropyr   r  rh   r/  r!   tensorr>   )r   r!  target_lyricsencoder_losss       r3   get_encoder_losszJukeboxPrior.get_encoder_losss  s     "")-)=)=>X)Y&==66*//D4D4DE}GYGYZ\G]sL
  !<<4N4U4UVLr5   c                    |r%| j                   j                  j                  |       | j                  ||      \  }}}| j                  r6| j                  ||gd|g      \  }	}| j                  |	||d|      \  \  }
}}n<| j                  |      }| j                  ||      }
| j                  |||||      \  }}| j                  |
z  | j                  z  | j                  z  }||| j                  z  | j                  z  z  }|j                         j                         |
j                         j                         |j                         j                         d}|r!|j                         j                         |d<   |rG| j                   j                  j                  }| j                   j                  j                  d       |S ||fS )z
        Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
        vqvae's encoding layers.
        NT)rw  ru  )ru  )bpdr   next_token_prediction_lossr  F)rl   rl  rW  r  r  r  r  r!  r  r  r  r  r   r^  rP  )r   rj   r   rw   ru  rY   ri  rh  r  rI   r   r$  r  r!  r  rq  rP  s                    r3   rd   zJukeboxPrior.forward_tokens  s    JJ""223CDBF--PbdlBm?1<"")-)>)>|,t5G.H*&F& AE

*,APT`i AK A=6\5 *.)@)@)N&001K\ZL04

"%*# 1; 1-& ))L84;X;XX[_[o[oo*T-Q-QQTXThThhh .335<<>(..0779*D*J*J*L*S*S*U

 ${{}335GG!%!7!7!J!JJJ""2259%%= r5   r   rw   rZ  ru  r  c                     |j                   d   }| j                  ||      ^}}| j                  ||||      \  }}	|r| j                  |g|      }
nd}
|
||	fS )a  
        Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
        function. The loss is the sum of the `encoder` loss and the `decoder` loss.

        Args:
            hidden_states (`torch.Tensor`):
                Hidden states which should be raw audio
            metadata (`List[torch.LongTensor]`, *optional*):
                List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
            decode (`bool`, *optional*, defaults to `False`):
                Whether or not to decode the encoded to tokens.
            get_preds (`bool`, *optional*, defaults to `False`):
                Whether or not to return the actual predicitons of the model.
        r   )r  )rj   r   rw   ru  N)r\   rW  rd   rZ  )r   r   rw   rZ  ru  ro   rj   r   r  rq  rC  s              r3   r   zJukeboxPrior.forward  s    * #((+
,0KKQ[K,\))++%1	 , 
g !%l-P=O-P!Q!%!400r5   )NrZ   NNr   )NNr   )NNNr   r   r   NN)FF)r   r   r   r  r   r  r  r   r`   r  r  r  r  re  rW  rZ  r  r   r  r!  rd   r!   r  r   r   
LongTensorr(   r   r   r   s   @r3   r  r    s    . &L%6W
1 W
r" 2" A"$"G& ]~* 02DEdi+!b "'$)!1||!1 4 0 012!1 	!1
 D>!1 
ell	!1r5   r  c                   4     e Zd ZdZeZdZdZd Z fdZ	 xZ
S )JukeboxPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    jukeboxFc                 |    t        |t              st        |t              r|j                  |j                         y y r   )r|  r  rw  applyr  r  s     r3   r  z$JukeboxPreTrainedModel._init_weights  s-    fl+z&,/OLL--. 0Pr5   c                 $    t        |   |i | y r   )r   r   )r   inputskwargsr   s      r3   r   zJukeboxPreTrainedModel.__init__  s    &+F+r5   )r   r   r   r  r   r  r  supports_gradient_checkpointingr  r   r   r   s   @r3   r(  r(    s*    
 !L!&+#/, ,r5   r(  a"  
            labels (`List[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` :
                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                condition the generation.
            sampling_kwargs (`Dict[Any]`):
                Various additional sampling arguments that are used by the `_sample` function. A detail list of the
                arguments can bee seen in the [`_sample`] function documentation.
ao  The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
    `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If
    you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior
    individually.
    c                       e Zd ZdgZ fdZd ZddZddZd Zd Z	d Z
d	 Z ej                         	 	 	 	 	 	 	 	 	 	 	 dd
eej                     fd       Z ed      dd
eej                     fd       Z ede      d
eej                     fd       Z ede      d
eej                     fd       Z ede      d
eej                     fd       Z xZS )JukeboxModelr@  c           	      ,   t         |   |       |j                  }t        |      | _        | j                  |       t        j                  t        |j                        D cg c]  }t        |j                  |   |       c}      | _        y c c}w r   )r   r   vqvae_configrw  rx  set_shared_paramsr   r   r@   r  r  prior_configspriors)r   rm   r3  rn   r   s       r3   r   zJukeboxModel.__init__  st     **!,/
v&mmKPQWQaQaKbc%\&..u5u=c
cs   $ Bc                     |j                   D ]h  }|j                  |_        |j                  |_        |j                  |_        |j                  |_        |j
                  |_        |j                  |_        j y)z
        Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig`
        is nest, and is thus unreachable in the `from_dict` function
        N)r5  r  r  r  r  r  rh  )r   model_configrm   s      r3   r4  zJukeboxModel.set_shared_params	  so    
 #00 	NF#/#=#=F !-!9!9F".";";F".";";F#/#=#=F +7+M+MF(	Nr5   c                 >    | j                   j                  ||||      S r   )rx  rZ  )r   rj   rh  ri  r  s        r3   rZ  zJukeboxModel.decode	  s    zz  {IyQQr5   c                 >    | j                   j                  ||||      S r   )rx  rW  )r   rn  rh  ri  r  s        r3   rW  zJukeboxModel.encode	  s    zz  k9iPPr5   c                 6   ||z   dz
  |z  }t        |t        j                        rt        j                  ||d      S t        |t              r5t	        t        |D cg c]  }t        j                  ||d       c}       S |d g|z  S t        d      c c}w )Nr   r   r   zUnknown input type)r|  r!   r  r  r?   rc   r  )r   objr  
split_sizer  r   s         r3   split_batchzJukeboxModel.split_batch	  s    
*Q.:=c5<<(;;sJA66T"sStekk$
BSTUU[6H$$011	 Ts   Bc           	          | j                   |   }||   }	|j                  }
|	j                  d   }||
|z
  k  r||z   |d<   d}n|
|d<   ||
z
  |z   }| j                  |||||||      S )Nr   r  r   )r6  rP   r\   sample_single_window)r   rj   rk   rG   sampling_kwargsrn   tokens_to_samplemax_batch_sizerl   rr  rP   nb_sampled_tokensrS   s                r3   sample_partial_windowz"JukeboxModel.sample_partial_window 	  s     E"%e,*003u'777/@CS/SOO,E/4OO,%-0@@E((vvX]_dftuur5   c                    | j                   |   }|d   j                  d   }	|j                  }
||
z   }||   d d ||f   }|j                  dd       }d|v r||z
  }|j                  d   }||j                  d   z
  }t        j                  d| d| d||z    d| d	       |dk  r|S |j                  |||      }|j                  ||| j                  |      }| j                  ||	|      }| j                  ||	|      }| j                  ||	|      }g }t        t        |||      d	
      }|D ]  \  }}}ddg|j                  d   dk(     }|j                  d| d| d| d| j                  |j                  z   d        |j                  d|j                  d   |||d|}|j                  |        t!        j"                  |d      }|d d | d f   }t!        j"                  ||   |gd      ||<   |S )Nr   r  r   r  z tokens for [r  z]. Conditioning on z tokensFr  r  r  z[prior level z] z
 Sampling z tokens out of Tr  )r  rj   r   rw   r   r]  )r6  r\   rP   getr  r  r  r`   rF   r>  r   rc   r  r  r   rO   r!   r:   )r   rj   rk   rG   rA  rn   rS   rC  rl   r  rP   rv   previous_sampled_tokensr  conditioning_tokens
new_tokensr   rw   r  music_tokens_conds_listmetadata_listrI   r   r  music_tokens_conds_ir}   r  r|   rr  music_tokens_news                                 r3   r@  z!JukeboxModel.sample_single_window1	  s   E" O))!,	em".u"5asl"C'++OTBo-%KM5;;A>"%<%B%B1%EE
mE7!E-<O;P Q#$G-	

 ? #99,sS %%feT5F5FO ,,-DiQ_`"&"2"23EyR`"a((9nM-/FV^cd@H 	$<N0**>+?+?+Ba+GHD$$wbj H%%u':'::;= % 
 $u|| (..q1+#7#	
 "H MM(#	$ 6q1 *!j[\/:#iie)<>N(OUVWUr5   c	           
          || j                   |   j                  k\  rDt        || j                   |   j                  |      }	|	D ]  }
| j                  ||||||
|      } |S | j	                  |||||||      }|S r   )r6  rP   rT   r@  rE  )r   rj   rk   rG   rA  rn   rF   rQ   rC  r   rS   s              r3   sample_levelzJukeboxModel.sample_levell	  s     4;;u-333!,E0B0H0H*UH! #88 &&/5%Q_    55ffoulTbL r5   r  c                 t   | j                   d   }||}n<t        |	| j                  j                  z        |j                  z  |j                  z  }|t        t        | j                               }|| _        |D ]"  }|t        | j                         dz
  k(  rdn|||d}|| j                   |   j                  z  }t        | j                  j                  |   | j                   |   j                  z        }||k7  r|n|}| j                  |||   ||||||      }|s| j                  j                  ||   j                         t        j                         5  t        | j                         |z
  dz
  }| j                  j!                  |d|dz    |||   j"                  d         }ddd       d| }t$        j&                  j)                  |      st%        j*                  |       t-        |||j/                                |
s| j                   d   | j                   d   j0                  dkD  st        j                         5  t3        ||d   | j                   d   | j                        }ddd       t        j4                  d	i| d
       % |S # 1 sw Y   xY w# 1 sw Y   5xY w)aH  
        Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
        the generated raw audio at each step.

        Args:
            music_tokens (`List[torch.LongTensor]`):
                A sequence of music tokens of length `self.levels` which will be used as context to continue the
                sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
                level.
            labels (`List[torch.LongTensor]`):
                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
                which are used to condition the generation.
            sample_levels (`List[int]`):
                List of the desired levels at which the sampling will be done. A level is equivalent to the index of
                the prior in the list of priors
            metas (`List[Any]`, *optional*):
                Metadatas used to generate the `labels`
            chunk_size (`int`, *optional*, defaults to 32):
                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks
                means faster memory filling but more consumption.
            sampling_temperature (`float`, *optional*, defaults to 0.98):
                Temperature used to ajust the randomness of the sampling.
            lower_batch_size (`int`, *optional*, defaults to 16):
                Maximum batch size for the lower level priors
            max_batch_size (`int`, *optional*, defaults to 16):
                Maximum batch size for the top level priors
            sample_length_in_seconds (`int`, *optional*, defaults to 24):
                Desired length of the generation in seconds
            compute_alignments (`bool`, *optional*, defaults to `False`):
                Whether or not to compute the alignment between the lyrics and the audio using the top_prior
            sample_tokens (`int`, *optional*):
                Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
                experiments
            offset (`int`, *optional*, defaults to 0):
                Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
                greater than 0, the lyrics will be shifted take that intoaccount
            save_results (`bool`, *optional*, defaults to `True`):
                Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
                time.
            sample_length (`int`, *optional*):
                Desired length of the generation in samples.

        Returns: torch.Tensor

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
        >>> import torch

        >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()

        >>> labels = tokenizer(**metas)["input_ids"]
        >>> set_seed(0)
        >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
        >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
        >>> zs[0]
        tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
              353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
             1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
             1804,  541, 1804, 1434]])
        ```
        r   Nr   gGz?)r  r  r  )rh  r  zjukebox/level_)r   r   r   z/lyric_alignments.pt)r6  rA   rm   r  r  r@   r9   rF   r]   rP   rP  rx  r=   r>   r!   r,  rZ  r\   osr   existsmakedirsr   re   r  r   r   )r   rj   rk   sample_levelsr   r  sampling_temperaturelower_batch_sizerC  r  compute_alignmentsr  rG   save_resultsra   	top_priorrF   rn   rA  total_token_to_samplerQ   rh  rf  logdirr   s                            r3   _samplezJukeboxModel._sample|	  s   j KKN	$(L ,t{{/H/HHIYMdMdd''(L  !#dkk"23M )" %	\E %T[[)9A)= =CW(!.O %1DKK4F4T4T$T!T[[55e<t{{5?Q?W?WWXJ16-1G-^N,,u%	L 

l51889]]_ "%dkk"2U":Q">K $

 1 1$[uqy1{VbchViVoVopqVr !2 !I
 *%1ww~~f-KK'U	@QR%$++a.*DUVIpIpstIt i%2<DKKXYN\`\g\g%h
iJJj9fXEY;Z[K%	\N  i is   AJ")J."J+	.J7	a  
        Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
        upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
        the VQ-VAE decoder to convert the music tokens to raw audio.

        Args:
            labels (`List[torch.LongTensor]`) :
                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
                which are used to condition the generation.
            n_samples (`int`, *optional*, default to 1) :
                Number of samples to be generated in parallel.
        c           
      b   |j                  dt        t        t        | j                                          }t        t        | j                              D cg c]6  }t        j                  |dt
        j                  |d   j                        8 }} | j                  |||fi |}|S c c}w )aR  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed

        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")

        >>> lyrics = "Hey, are you awake? Can you talk to me?"
        >>> artist = "Zac Brown Band"
        >>> genre = "Country"
        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
        >>> set_seed(0)
        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)

        >>> with torch.no_grad():
        ...     model.decode(music_tokens)[:, :10].squeeze(-1)
        tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
            -0.0818, -0.0697]])
        ```
        rU  r   rV   )
popr?   r@   r9   r6  r!   r;   r<   r>   r]  )r   rk   r  rA  rU  rU  rj   s          r3   ancestral_samplezJukeboxModel.ancestral_sample
  s    N (++OT%DKKHXBY=Z[Z_`cdhdodo`pZq
UVEKK	1EJJvay?O?OP
 
 $t||L&-[?[	
s   ;B,az  Generates a continuation of the previously generated tokens.

        Args:
            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c           
          |j                  dt        t        t        | j                                          } | j
                  |||fi |}|S )NrU  r_  r?   r@   r9   r6  r]  r   rj   rk   rA  rU  s        r3   continue_samplezJukeboxModel.continue_sample5
  sH     (++OT%DKKHXBY=Z[#t||L&-[?[r5   a  Upsamples a sequence of music tokens using the prior at level `level`.

        Args:
            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                A sequence of music tokens which will be used as context to continue the sampling process. Should have
                `self.levels` tensors, each corresponding to the generation at a certain level.
        c           
          |j                  dt        t        t        | j                        dz
                    } | j
                  |||fi |}|S )NrU  r   rb  rc  s        r3   upsamplezJukeboxModel.upsampleD
  sN     (++OT%DKKHX[\H\B]=^_#t||L&-[?[r5   a'  Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
        used: as conditioning for each level, which means that no ancestral sampling is required.

        Args:
            raw_audio (`List[torch.Tensor]` of length `n_samples` ) :
                A list of raw audio that will be used as conditioning information for each samples that will be
                generated.
        c           
         |j                  dt        t        t        | j                                          }| j
                  j                  |j                        j                          t        j                         5  | j
                  j                  |dt        | j                        |j                  d         }d d d         | j                  ||fi |}|S # 1 sw Y    xY w)NrU  r   r  )r_  r?   r@   r9   r6  rx  r=   r>   re   r!   r,  rW  r\   r]  )r   rf  rk   rA  rU  rj   s         r3   r  zJukeboxModel.primed_sampleS
  s     (++OT%DKKHXBY=Z[

i&&'--/]]_ 	::,,qC4DPYP_P_`aPb - L	 $t||L&-[?[	 	s   ?A CC'r  )N    g\(\?   ri     FNr   TN)r   )r   r   r   _no_split_modulesr   r4  rZ  rW  r>  rE  r@  rP  r!   r,  r   r&  r]  r   r`   JUKEBOX_SAMPLING_INPUT_DOCSTRINGrd  rf  r  r   r   s   @r3   r1  r1    sb    ((
NRQ	2v"8v  U]]_ !!# H  
e	!H HT 	$uO_O_J` > 	 	)	$uO_O_J` 	
 	 	)	4HXHXCY 	
 	 	)T%JZJZE[ r5   r1  )Cr  r   rR  typingr   r   r   rg   rh   r!   torch.nn.functionalr   r   r%   torch.nnr   FusedLayerNormry  r	   modeling_utilsr
   utilsr   r   utils.loggingr   configuration_jukeboxr   r   r   r   
get_loggerr   r  re   r4   rL   rT   r   r   r   Moduler   r   r   r   r   r   r  r  rc  JUKEBOX_START_DOCSTRINGrw  r  r  r  r@  rL  r\  ra  r  r  r  r  r(  rl  r1  r]  r5   r3   <module>rx     sC     	 ( (      0 " . 3 " l l 
		H	% !"E%L= #L,D6r=;6BII *:BII :*bii *bii (!RYY !4RYY 4RYY 6WjRYY Wjt'F		 'FT "  v(? v(v(r (
9~ 
9sryy sl299 49(		 9(x G ryy G T
*299 *Z-BII -`5"bii 5"pk1? k1\,_ ,$$   
 r) rrr5   