
    sg4                       d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mc mZ ddlZ	ddl	mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e%jP                  e)      Z*dZ+dZ,dZ-g dZ.dZ/dZ0dZ1ddgZ2dZ3dZ4	 	 dXdee5e5f   de6de5dee	jn                     de5dejp                  fdZ9 G d d ejt                        Z; G d! d"ejt                        Z< G d# d$ejt                        Z= G d% d&ejt                        Z> G d' d(ejt                        Z? G d) d*ejt                        Z@ G d+ d,e@      ZA G d- d.ejt                        ZB G d/ d0ejt                        ZC G d1 d2ejt                        ZD G d3 d4ejt                        ZE G d5 d6ejt                        ZF G d7 d8ejt                        ZG G d9 d:ejt                        ZH G d; d<ejt                        ZI G d= d>ejt                        ZJ G d? d@ejt                        ZK G dA dBe      ZLdCZMdDZN e"dEeM       G dF dGeL             ZO e"dHeM       G dI dJeL             ZP e"dKeM       G dL dMeL             ZQ e"dNeM       G dO dPeL             ZR G dQ dRejt                        ZS G dS dTejt                        ZT e"dUeM       G dV dWeL             ZUy)YzPyTorch WavLM model.    N)OptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_peft_availablelogging   )WavLMConfig   r   z1patrickvonplaten/wavlm-libri-clean-100h-base-plus)r   i$  i   zZ'mister quilter is the aposle of the middle classes and we are glad to welcome his gospel'gQ)@zmicrosoft/wavlm-base-plus-sdzmicrosoft/wavlm-base-plus-svg
ףp=
?shape	mask_probmask_lengthattention_mask	min_masksreturnc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                  d      j                         j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr   r   r   sequence_lengths     [/var/www/html/venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.pycompute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spanr   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFO    Ndtyper   F)replace)
ValueErrornprandomranditemsumdetachtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper%   put_along_axis)r   r   r   r   r   
batch_sizer+   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr&   r'   spec_aug_mask_idxdummy_mask_idxoffsetsr(   r)   s    `` `            @@r*   _compute_mask_indicesrP   L   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	2%%'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   &     e Zd Zd fd	Zd Z xZS )WavLMNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   rY   z"WavLMNoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r,   c                 J    | j                  |      }| j                  |      }|S N)ra   rc   re   hidden_statess     r*   forwardz!WavLMNoLayerNormConvLayer.forward   s$    		-06r,   r   __name__
__module____qualname__rY   rm   __classcell__rh   s   @r*   rR   rR      s    Ar,   rR   c                   &     e Zd Zd fd	Zd Z xZS )WavLMLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rT   T)elementwise_affine)rX   rY   rZ   r[   r\   r   r]   r^   r_   r`   ra   	LayerNorm
layer_normr	   rb   rc   rd   s      r*   rY   z WavLMLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r,   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr-   )ra   	transposerz   rc   rk   s     r*   rm   zWavLMLayerNormConvLayer.forward   sV    		-0%//B76%//B76r,   rn   ro   rt   s   @r*   rv   rv      s    Ar,   rv   c                   &     e Zd Zd fd	Zd Z xZS )WavLMGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rT   T)
num_groupsnum_channelsaffine)rX   rY   rZ   r[   r\   r   r]   r^   r_   r`   ra   r	   rb   rc   	GroupNormrz   rd   s      r*   rY   z WavLMGroupNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr,   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rj   )ra   rz   rc   rk   s     r*   rm   zWavLMGroupNormConvLayer.forward  s2    		-066r,   rn   ro   rt   s   @r*   r   r      s    r r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr   )rU   paddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)rX   rY   r   r]   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsra   utilsr   hasattrr   r
   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterWavLMSamePadLayerr   r	   rb   rc   )re   rf   r   r   r   r   rh   s         r*   rY   z%WavLMPositionalConvEmbedding.__init__  s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI()G)GH !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S Nr   r   )r}   ra   r   rc   rk   s     r*   rm   z$WavLMPositionalConvEmbedding.forward1  sV    %//15		-0]36%//15r,   ro   rt   s   @r*   r   r     s    ABr,   r   c                   $     e Zd Z fdZd Z xZS )r   c                 P    t         |           |dz  dk(  rd| _        y d| _        y Nr   r   r   )rX   rY   num_pad_remove)re   r   rh   s     r*   rY   zWavLMSamePadLayer.__init__>  s)    #:Q#>!#Car,   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r   rk   s     r*   rm   zWavLMSamePadLayer.forwardB  s6    ")!Q0F43F3F2F0F*FGMr,   ro   rt   s   @r*   r   r   =  s    Kr,   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )WavLMFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )rg   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rX   rY   feat_extract_normr   r9   num_feat_extract_layersrR   rv   r1   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)re   rf   ir   rh   s       r*   rY   zWavLMFeatureEncoder.__init__L  s    ##w.26AFGKPQWQoQorsQsKtKFG)&1q5AK K %%0PUV\VtVtPuv126AFvKv01I1I0JJst  ==5&+#"K ws   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_gradr   re   params     r*   _freeze_parametersz&WavLMFeatureEncoder._freeze_parameters]  s(    __& 	(E"'E	(#r,   c                 
   |d d d f   }| j                   r| j                  rd|_        | j                  D ]K  }| j                   r5| j                  r)| j                  r| j                  |j                  |      }D ||      }M |S )NT)r   trainingr   r   r   _gradient_checkpointing_func__call__)re   input_valuesrl   
conv_layers       r*   rm   zWavLMFeatureEncoder.forwardb  s    $QW- 4==*.M'** 	:J""t'B'Bt}} $ A A''!!
 !+= 9	: r,   )rp   rq   rr   __doc__rY   r   rm   rs   rt   s   @r*   r   r   I  s    8#"$
r,   r   c                        e Zd Z fdZ xZS )WavLMFeatureExtractorc                     t         |   |       t        j                  d| j                  j
                   d| j                  j                  d   j
                   dt               y )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)rX   rY   warningswarnrh   rp   	__bases__FutureWarningre   rf   rh   s     r*   rY   zWavLMFeatureExtractor.__init__v  s[     $..112 3NN,,Q/889E 		
r,   )rp   rq   rr   rY   rs   rt   s   @r*   r   r   u  s    
 
r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMFeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Nr-   eps)rX   rY   r   ry   rZ   layer_norm_epsrz   Linearr   
projectionDropoutfeat_proj_dropoutdropoutr   s     r*   rY   zWavLMFeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r,   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS rj   )rz   r   r   )re   rl   norm_hidden_statess      r*   rm   zWavLMFeatureProjection.forward  s:    !__];(:;]3000r,   ro   rt   s   @r*   r   r     s    <1r,   r   c                       e Zd ZdZ	 	 	 	 ddedededededef fdZ	 	 	 	 dd	ej                  d
e
ej                     de
ej                     dedeej                  e
ej                     e
eej                        f   f
dZd	ej                  d
eej                  ej                   f   dej                  dedej                  ej                  ff
dZdededej                  fdZdej                  dej                  fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   num_bucketsmax_distancehas_relative_position_biasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        || _        || _        t        j                   t#        j$                  d| j                  dd            | _        t        j                  | j
                  d      | _        |r0t        j*                  | j                  | j                        | _        y y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )rX   rY   r   r   r   head_dimr1   scalingr   r   k_projv_projq_projout_projr   r   	Parametertorchr@   gru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)re   r   r   r   r   r   r   rh   s          r*   rY   zWavLMAttention.__init__  s7    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*ii	95ii	95ii	95		)Y7&(!#ejjDNNAq.Q!R"$))DMM1"=%"$,,t/?/?"PD &r,   rl   r   position_biasoutput_attentionsr    c                     |j                         \  }}}|S| j                  ||      }|j                  d      j                  |ddd      j	                  || j
                  z  ||      }|j	                  |j                  dd | j
                  dfz         }	|	j                  dddd      }	| j                  |	      }
|
j	                  |	j                  dd dz         j                  d      }
t        j                  |
      j                  dd      \  }}||| j                  z  d	z
  z  d
z   }|j	                  || j
                  z  dd      |z  }|j	                  d||f      }| j                  ||||      \  }}|||fS )z'Attention layer with relative attentionNr   r   r-   r   r   )r      r         ?g       @)sizecompute_bias	unsqueezerepeatviewr   r   permuter   r6   r   sigmoidchunkr   torch_multi_head_self_attention)re   rl   r   r   r   indexbsztgt_lenrH   gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r*   rm   zWavLMAttention.forward  s    (,,.Wa   --gw?M''*11#q!Q?DDS4>>EY[bdkl  ,001D1DSb1IT^^]_L`1`a199!Q1E "&!8!89L!M!7!<!<=P=V=VWZXZ=[^d=d!e!i!ijl!m '=>DDQBDO)?)? ?# EFL *..sT^^/CRKm[166GW7MN$($H$H>+>@Q%
!\ L-77r,   r  c                 X   |j                  dd      x}x}}||j                  d      nd}dx}	}
d}t        j                  |||| j                  | j
                  t        j                  dg      t        j                  | j                  j                  | j                  j                  | j                  j                  f      |	|
|| j                  | j                  j                  | j                  j                  | j                   |||d| j                  j                  | j                  j                  | j                  j                        \  }}|j                  dd      }|C|dddf   j#                  |j$                  dd | j
                  fz   |j$                  dd z         }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)r}   neFmulti_head_attention_forwardr   r   r   emptycatr   rW   r   r   r   r   r   r   rD   r   )re   rl   r   r  r   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr  r  s                 r*   r   z.WavLMAttention.torch_multi_head_self_attention  s    ,55a;;;e3A3M>,,Q/SW  %&$B$BNNNNKKIIt{{'')9)94;;;K;KLMLLMM  MMMM%)++,,++,,++,,+%
!\2 "++Aq1# (40==""2A&$..)::\=O=OPQPR=SSL L((r,   query_length
key_lengthc                    t        j                  |t         j                        d d d f   }t        j                  |t         j                        d d d f   }||z
  }| j                  |      }|j	                  | j
                  j                  j                        }| j                  |      }|j                  g d      }|S )Nr.   )r   r   r   )	r   r=   long_relative_positions_buckettor   r   devicer   )re   r  r  context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r*   r   zWavLMAttention.compute_bias  s     <<EJJG4P,,zDT1WM+.>>#'#B#BCT#U #;#>#>t?R?R?Y?Y?`?`#a $$%=>	*r,   relative_positionsc                 $   | j                   dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }t        j
                  |j                         |z        }|t        j
                  | j                  |z        z  }|||z
  z  }||z   j                  t        j                        }t        j                  |t        j                  ||dz
              }|t        j                  |||      z  }|S r   )r   r  r   r  abslogfloatmathr   min	full_likewhere)re   r!  r   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r*   r  z)WavLMAttention._relative_positions_bucket!  s   &&!+.266uzzB[P"YY'9:1$	%	1&+ii0B0H0H0JY0V&W#&ADHHTM^M^ajMjDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\R\%]"%*YY&8RT_bcTc(d&
" 	EKK2DF`aar,   )        i@  i   TNNFr   )rp   rq   rr   r   r$   r%  r;   rY   r   Tensorr   r   rm   FloatTensorr   
LongTensor
BoolTensorr   r   r  rs   rt   s   @r*   r   r     s   G +/"Q"Q "Q 	"Q
 "Q "Q %)"QN 2604"''8||'8 !.'8  -	'8
  '8 
u||Xell3XeELL>Q5RR	S'8R5)((5) e..0@0@@A5) #..	5)
  5) 

U..	/5)n # %BSBS  U=N=N  SXSdSd  r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y rj   )rX   rY   r   r   activation_dropoutintermediate_dropoutr   r   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r*   rY   zWavLMFeedForward.__init__8  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r,   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S rj   )r;  r?  r9  r@  rB  rk   s     r*   rm   zWavLMFeedForward.forwardE  sX    //>00?11-@))-8++M:r,   ro   rt   s   @r*   r6  r6  7  s    @r,   r6  c                   2     e Zd Zddedef fdZddZ xZS )WavLMEncoderLayerrf   r   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y N)r   r   r   r   r   r   r   rX   rY   r   r   num_attention_headsattention_dropoutr   max_bucket_distance	attentionr   r   rA  r   ry   r   rz   r6  feed_forwardfinal_layer_normre   rf   r   rh   s      r*   rY   zWavLMEncoderLayer.__init__P      '((00,,**33'A
 zz&"7"78,,v'9'9v?T?TU,V4 "V-?-?VEZEZ [r,   c                     |}| j                  |||||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }||f}|r||fz  }|S )Nr   r   r   r   )rL  r   rz   rM  rN  )	re   rl   r   r   r   r   attn_residualr  outputss	            r*   rm   zWavLMEncoderLayer.forward_  s    %59^^)'/ 6D 6
2|] ]3%56%(9(9-(HH--m< -0&Gr,   Tr0  rp   rq   rr   r   r;   rY   rm   rs   rt   s   @r*   rE  rE  O  s    \{ \ \r,   rE  c                   2     e Zd Zddedef fdZddZ xZS ) WavLMEncoderLayerStableLayerNormrf   r   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y rG  rH  rO  s      r*   rY   z)WavLMEncoderLayerStableLayerNorm.__init__y  rP  r,   c                     |}| j                  |      }| j                  ||||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }||f}|r||fz  }|S )N)r   r   r   )rz   rL  r   rM  rN  )re   rl   r   r   r   rS  r  rT  s           r*   rm   z(WavLMEncoderLayerStableLayerNorm.forward  s    %659^^)'/	 6D 6
2|] ]3%5%(9(9$:O:OP]:^(__ -0&Gr,   rU  )NNFrV  rt   s   @r*   rX  rX  x  s    \{ \ \r,   rX  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w Nr   r   )r   F)rX   rY   rf   r   pos_conv_embedr   ry   r   r   rz   r   rA  r   r   r9   num_hidden_layersrE  layersr   re   rf   r   rh   s      r*   rY   zWavLMEncoder.__init__  s    :6B,,v'9'9v?T?TUzz&"7"78mmUZ[a[s[sUtuPQv16Ku
 ',# v   !Cc                    |rdnd }|rdnd }|d|| <   | j                  |      }||z   }| j                  |      }| j                  |      }t               xs t	        |       }	d }
t        | j                        D ]  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|	rM| j                  r,| j                  r | j                  |j                  |||
|      }n ||||
||      }|d d \  }}
|rd}|s|d   fz   } |r||fz   }|st        d |||fD              S t!        |||      S )	N r/  r   rR  r   NNNc              3   &   K   | ]	  }||  y wrj   re  .0vs     r*   	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>       mq_`_lm   last_hidden_staterl   
attentions)r_  rz   r   r
   r   	enumeratera  r   r4   r   rf   	layerdropr   r   r   tupler   re   rl   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsposition_embeddingssynced_gpusr   r   r   dropout_probabilityskip_the_layerlayer_outputss                   r*   rm   zWavLMEncoder.forward  s    #7BD$5b4%-0M>/*"11-@%(;;6]302R6LT6R!$++. !	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![..4==$($E$E%&%)%M %*%'5&3*;%M 0=Ra/@,} 2 &9]1=M<O&O#C!	PF   1]4D Dm]4EGZ$[mmm++*
 	
r,   NFFTro   rt   s   @r*   r\  r\    s    	, "C
r,   r\  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w r^  )rX   rY   rf   r   r_  r   ry   r   r   rz   r   rA  r   r   r9   r`  rX  ra  r   rb  s      r*   rY   z$WavLMEncoderStableLayerNorm.__init__  s    :6B,,v'9'9v?T?TUzz&"7"78mm v778 1UVZ[U[]
 ',#rc  c                    |rdnd }|rdnd }|d|| <   | j                  |      }||z   }| j                  |      }t               xs t        |       }	d }
t	        | j
                        D ]  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|	rL| j                  r,| j                  r | j                  |j                  |||
|      }n |||||
      }|d d \  }}
|rd}|s|d   fz   } | j                  |      }|r||fz   }|st        d |||fD              S t!        |||      S )Nre  r   )r   r   r   r   rf  c              3   &   K   | ]	  }||  y wrj   re  rh  s     r*   rk  z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr><  rl  rm  rn  )r_  r   r
   r   rq  ra  r   r4   r   rf   rr  r   r   r   rz   rs  r   rt  s                   r*   rm   z#WavLMEncoderStableLayerNorm.forward  s    #7BD$5b4%-.M>/*"11-@%(;;]302R6LT6R!$++.  	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ ..4==$($E$E%&%)%M %*%'5*;&3	%M 0=Ra/@,} 2 &9]1=M<O&O#A 	PD 6 1]4D Dm]4EGZ$[mmm+;LYl
 	
r,   r~  ro   rt   s   @r*   r  r    s    ," "A
r,   r  c                   8     e Zd ZdZ fdZed        Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    c                 0   t         |           |j                  | _        |j                  | _        |j                  | j                  z  dk7  r&t        d|j                   d| j                   d      t        j                  t        j                  d| j                  | j
                  z  |j                  | j                  z              | _        t        j                  |j                  d   | j                  | j
                  z        | _        d| _        y )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   r-   r   )rX   rY   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimr1   r   r   r   r2  codevectorsr   rZ   weight_projtemperaturer   s     r*   rY   z#WavLMGumbelVectorQuantizer.__init__H  s     6688  4??2a7)&*?*?)@ A66:oo5F G%%  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r,   c           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   r   gHz>r-   )meanr   expr6   r$  )probsmarginal_probs
perplexitys      r*   _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity]  sR    *YY		.599^VZEZ;[*[ac ddeiik
r,   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      }|j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )Nr-   T)tauhardr   r   r   r|   )r   r  r   r   r   r   
functionalgumbel_softmaxr%  r  type_asr   softmaxr  argmax	new_zerosscatter_r   r  r  r6   )re   rl   rG   r)   r   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  s              r*   rm   z"WavLMGumbelVectorQuantizer.forwardc  s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;M<O<O<QW[WgWgnr;s/77F $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r,   )	rp   rq   rr   r   rY   staticmethodr  rm   rs   rt   s   @r*   r  r  B  s&    
*  
"'r,   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterc                    t         |           j                  j                  k7  rTt	        j
                  j                  j                        | _        t	        j                  j                        | _        nd x| _        | _        t	        j                  fdt        j                        D              | _        j                  | _        y )Nc              3   4   K   | ]  }t                y wrj   )WavLMAdapterLayer)ri  rH   rf   s     r*   rk  z(WavLMAdapter.__init__.<locals>.<genexpr>  s     #h!$5f$=#hs   )rX   rY   output_hidden_sizer   r   r   projry   proj_layer_normr   r9   num_adapter_layersra  rr  r   s    `r*   rY   zWavLMAdapter.__init__  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#huVMfMfGg#hh))r,   c                 h   | j                   .| j                  "| j                  |      }| j                  |      }|j                  dd      }| j                  D ]D  }t        j
                  j                         }| j                  r|| j                  kD  s= ||      }F |j                  dd      }|S r   )r  r  r}   ra  r2   r3   r   rr  )re   rl   r   layerdrop_probs       r*   rm   zWavLMAdapter.forward  s    99 T%9%9%E IIm4M 00?M%//15[[ 	5EYY--/N==^dnn%D %m 4	5
 &//15r,   ro   rt   s   @r*   r  r    s    *r,   r  c                   $     e Zd Z fdZd Z xZS )r  c                     t         |           t        j                  |j                  d|j                  z  |j
                  |j                  d      | _        y )Nr   r   )rV   r   )rX   rY   r   r]   r  adapter_kernel_sizeadapter_stridera   r   s     r*   rY   zWavLMAdapterLayer.__init__  sJ    II%%)))&&((
	r,   c                 j    | j                  |      }t        j                  j                  |d      }|S )Nr   r   )ra   r   r  glurk   s     r*   rm   zWavLMAdapterLayer.forward  s/    		-0))-Q)?r,   ro   rt   s   @r*   r  r    s    
r,   r  c                       e Zd ZdZeZdZdZdZd Z		 dde
ej                  ef   dee   fd	Z	 dd
edej                  fdZy)WavLMPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    wavlmr   Tc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weightsr/  r   )r  stdr   r   )abNr   )r<  r  r  r   datanormal_rW   zero_r   inituniform_r  r   ra   r&  sqrtrU   in_channels	constant_r   r   in_featuresr   rf   initializer_rangery   r   fill_r]   kaiming_normal_r   )re   moduleks      r*   _init_weightsz"WavLMPreTrainedModel._init_weights  s    f89%%**222C##((..0GGV//0 <=GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 67		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r,   NrI   add_adapterc                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   divr&   rU   rV   s      r*   _conv_out_lengthzOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s"     99\K7wWZ[[[r,   r   )rf   r  zipr^   r_   r9   r  r  )re   rI   r  r  rU   rV   rH   s          r*    _get_feat_extract_output_lengthsz5WavLMPreTrainedModel._get_feat_extract_output_lengths  s     2=1Ddkk--+	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q 4;;99: _ 04;;C]C] ^_ r,   feature_vector_lengthr   c                     |j                  d      d d df   }| j                  ||      }|j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr-   r   r  r   )r/   r  r   )r  )cumsumr  r  r   r  r   r:   r/   r  r=   flipr;   )re   r  r   r  non_padded_lengthsoutput_lengthsrG   s          r*   "_get_feature_vector_attention_maskz7WavLMPreTrainedModel._get_feature_vector_attention_mask  s    
 ,22r2:1b5A>>?Q_j>k'**5::6#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr,   rj   )rp   rq   rr   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r   r   r3  r$   r   r;   r  r  re  r,   r*   r  r    sx    
 L$O&*#9D Z^"5#3#3S#89HPQU0 Y]%(:?:J:Jr,   r  a  
    WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled
    Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo
    Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian,
    Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`WavLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aI  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should
            **not** be passed to avoid degraded performance when doing batched inference. For such models
            `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these
            models also yield slightly different results depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z_The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.c                   b    e Zd Zdef fdZd Zd Z	 	 ddej                  de	ej                     de	ej                     fdZ ee       eeeed	e
      	 	 	 	 	 dde	ej&                     de	ej&                     de	ej                     de	e   de	e   de	e   deeef   fd              Z xZS )
WavLMModelrf   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        |j(                  rt+        |      nd | _        | j/                          y )Nr/  )rX   rY   rf   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   r   r   r1  r   r  masked_spec_embeddo_stable_layer_normr  encoderr\  r  r  adapter	post_initr   s     r*   rY   zWavLMModel.__init__R  s     !4V!<"8"@   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&6v>DL'/DL/5/A/A|F+t 	r,   c                 X    t        j                  dt               | j                          yz
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.Nr   r   r   freeze_feature_encoderre   s    r*   freeze_feature_extractorz#WavLMModel.freeze_feature_extractorf  '    
 	Q	

 	##%r,   c                 8    | j                   j                          y
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r   r  s    r*   r  z!WavLMModel.freeze_feature_encoderr  s    
 	113r,   rl   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        apply_spec_augmentTNr   )r   r   r   r   )r  r/   )r   r   r   r-   )getattrrf   r   r  r  r/   r  r   rP   mask_time_lengthmask_time_min_masksr   tensorr  r;   r  mask_feature_lengthmask_feature_min_masksexpand)re   rl   r  r   rG   r)   r   mask_feature_indicess           r*   _mask_hidden_stateszWavLMModel._mask_hidden_statesy  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r,   audio
checkpointoutput_typer  modalityexpected_outputr   r   ru  rv  r    c                 H   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|!| j                  |j                  d   |d      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }| j                  | j                  |      }|s
||f|	dd  z   S t        |||	j                  |	j                        S )	Nr   r   Fr  )r  r   r   r   ru  rv  r   )ro  extract_featuresrl   rp  )rf   r   ru  use_return_dictr  r}   r  r   r  r  r  r  r   rl   rp  )
re   r   r   r  r   ru  rv  r  rl   encoder_outputss
             r*   rm   zWavLMModel.forward  sb   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!#34qr7JJJ&+-)77&11	
 	
r,   )NNNNNNN)rp   rq   rr   r   rY   r  r  r   r2  r   r3  r  r   WAVLM_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr1  r;   r   r   rm   rs   rt   s   @r*   r  r  L  s   { (
&4 :>59	,((, $E$5$56, !!1!12	,\ ++AB&+$. 269=,0/3&*2
u||,2
 !.2
 $E$5$56	2

 $D>2
 'tn2
 d^2
 
u--	.2
 C2
r,   r  zcWavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                       e Zd Zddee   f fdZd Zd Zd Zd Z	 e
e       eeeeee      	 	 	 	 	 ddeej&                     d	eej&                     d
ee   dee   dee   deej&                     deeef   fd              Z xZS )WavLMForCTCtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )rX   rY   r  r  r   r   final_dropoutr   r  
vocab_sizer1   rh   r   r  r  r   r   lm_headr  )re   rf   r  r  rh   s       r*   rY   zWavLMForCTC.__init__  s     '
zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r,   c                     | j                   }|&t        | j                  dd      t        d| d      |-t        | j                  dd      t        j                  d       y|| j                  |d       yy)a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  rf   r1   loggerinfoload_adapter)re   r  s     r*   tie_weightszWavLMForCTC.tie_weights  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r,   c                 X    t        j                  dt               | j                          yr  r  Nr  r  s    r*   r  z$WavLMForCTC.freeze_feature_extractor  r  r,   c                 L    | j                   j                  j                          yr  r  r  r   r  s    r*   r  z"WavLMForCTC.freeze_feature_encoder"      
 	

$$779r,   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  r   r   r   s     r*   freeze_base_modelzWavLMForCTC.freeze_base_model)  (    
 ZZ**, 	(E"'E	(r,   )r  r  r  r	  expected_lossr   r   r   ru  rv  labelsr    c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }| j                  |      }| j                  |      }	d}
|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |	dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }
ddd       |s|	f|t6        d z   }|
|
f|z   S |S t9        |
|	|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r.   r-   )r   r/   r   F)enabled)blank	reductionzero_infinitylosslogitsrl   rp  )rf   r  r%   r  r1   r  r   r  r   	ones_liker  r  r6   r  masked_selectr   r  log_softmaxfloat32r}   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rl   rp  )re   r   r   r   ru  rv  r-  rT  rl   r5  r4  rI   labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r*   rm   zWavLMForCTC.forward1  s'   0 &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]**)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIrj   r  )rp   rq   rr   r   r>  rY   r!  r  r  r*  r   r  r   r  r   r  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSr   r1  r;   r   r   rm   rs   rt   s   @r*   r  r    s    HSM .<*
&:( ++AB&"$,( 26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 CD
r,   r  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Z ee       e	e
eed      	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd              Z xZS )WavLMForSequenceClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nr  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )rX   rY   r   r  r1   r  r  r`  use_weighted_layer_sumr   r   r   r@   layer_weightsr   r   classifier_proj_size	projector
num_labels
classifierr  re   rf   
num_layersrh   s      r*   rY   z'WavLMForSequenceClassification.__init__  s     6=)f.@.@n   '
--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r,   c                 X    t        j                  dt               | j                          yr  r  r  s    r*   r  z7WavLMForSequenceClassification.freeze_feature_extractor  r  r,   c                 L    | j                   j                  j                          yr  r%  r  s    r*   r  z5WavLMForSequenceClassification.freeze_feature_encoder  r&  r,   c                 P    | j                   j                         D ]	  }d|_         yr(  r)  r   s     r*   r*  z0WavLMForSequenceClassification.freeze_base_model  r+  r,   r  )r  r  r  r  r   r   r   ru  rv  r-  r    c                    ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }||j                  d      }
nZ| j                  |j                   d   |      }d|| <   |j                  d      |j                  d      j                  dd      z  }
| j#                  |
      }d}|Ft%               } ||j                  d| j                   j&                        |j                  d            }|s|f|t        d z   }||f|z   S |S t)        |||j*                  |j,                  	      S )
  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   r-   r   r/  r3  )rf   r  rL  r  rA  r   stackr   r  r  rM  r   r6   rO  r  r  r   rQ  r   rP  r   rl   rp  )re   r   r   r   ru  rv  r-  rT  rl   norm_weightspooled_outputpadding_maskr5  r4  loss_fctrF  s                   r*   rm   z&WavLMForSequenceClassification.forward  s   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL+.M<-()--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r,   r  )rp   rq   rr   rY   r  r  r*  r   r  r   r  r   r  r   r   r1  r;   r   r   rm   rs   rt   s   @r*   rJ  rJ    s    $
&:( ++AB&,$	 26,0/3&*)-;
u||,;
 !.;
 $D>	;

 'tn;
 d^;
 &;
 
u..	/;
 C;
r,   rJ  za
    WavLM Model with a frame classification head on top for tasks like Speaker Diarization.
    c                        e Zd Z fdZd Zd Zd Z ee       e	e
eede      	 	 	 	 	 ddeej                      deej                      d	eej                      d
ee   dee   dee   deeef   fd              Z xZS ) WavLMForAudioFrameClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        |j                   | _        | j%                          y )Nr  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )rX   rY   r   r  r1   r  r  r`  rL  r   r   r   r@   rM  r   r   rP  rQ  init_weightsrR  s      r*   rY   z)WavLMForAudioFrameClassification.__init__  s     6=)f.@.@q   '
--1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r,   c                 X    t        j                  dt               | j                          yr#  r  r  s    r*   r  z9WavLMForAudioFrameClassification.freeze_feature_extractor  r  r,   c                 L    | j                   j                  j                          yr  r%  r  s    r*   r  z7WavLMForAudioFrameClassification.freeze_feature_encoder!  r&  r,   c                 P    | j                   j                         D ]	  }d|_         yr(  r)  r   s     r*   r*  z2WavLMForAudioFrameClassification.freeze_base_model(  r+  r,   r  r  r   r   r-  r   ru  rv  r    c           	         ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }
d}|\t               } ||
j                  d| j                        t        j                   |j                  d| j                        d            }|s|
f|t        d z   }|S t#        ||
|j$                  |j&                  	      S )
rX  NTr  r   r   r-   r   )axisr3  )rf   r  rL  r  rA  r   rY  r   r  r  rM  r   r6   rQ  r   rP  r  r   rl   rp  )re   r   r   r-  r   ru  rv  rT  rl   rZ  r5  r4  r]  rF  s                 r*   rm   z(WavLMForAudioFrameClassification.forward0  sh   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r,   r  )rp   rq   rr   rY   r  r  r*  r   r  r   _FRAME_CLASS_CHECKPOINTr   r  _FRAME_EXPECTED_OUTPUTr   r   r1  r;   r   r   rm   rs   rt   s   @r*   r_  r_    s     
&:( ++AB*)$. 26)-,0/3&*3
u||,3
 !.3
 &	3

 $D>3
 'tn3
 d^3
 
u++	,3
 C3
r,   r_  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLossc                     t         t        |           || _        || _        || _        t        j                  t        j                  ||      d      | _
        t        j                         | _        y )NT)r   )rX   rj  rY   scalemarginrP  r   r   r   randnr   r   r4  )re   	input_dimrP  rl  rm  rh   s        r*   rY   zAMSoftmaxLoss.__init__p  sS    mT+-
$ll5;;y*#EUYZ'')	r,   c                    |j                         }t        j                  j                  | j                  d      }t        j                  j                  |d      }t        j                  ||      }|| j                  z
  }t        j                  j                  || j                        }| j                  t        j                  |j                         ||      z  }| j                  ||      }|S )Nr   r   r   )flattenr   r  	normalizer   r   mmrm  one_hotrP  rl  r)  r;   r4  )	re   rl   r-  r   	cos_thetapsionehotr5  r4  s	            r*   rm   zAMSoftmaxLoss.forwardx  s    !((!(<//1/EHH]F3	$++%&&vt?ekk&++-iHHyy(r,   )g      >@g?ro   rt   s   @r*   rj  rj  o  s    *r,   rj  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )	TDNNLayerc                    t         |           |dkD  r|j                  |dz
     n|j                  |   | _        |j                  |   | _        |j
                  |   | _        |j                  |   | _        t        j                  | j                  | j                  z  | j                        | _        t        j                         | _        y )Nr   r   )rX   rY   tdnn_dimr[   r\   tdnn_kernelrU   tdnn_dilationdilationr   r   kernelReLUrc   rd   s      r*   rY   zTDNNLayer.__init__  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r,   rl   r    c                    t               r1ddlm} t        | j                  |      rt        j                  d       |j                  dd      }| j                  j                  j                  | j                  | j                  | j                        j                  dd      }t        j                  j                  ||| j                  j                   | j"                        }|j                  dd      }| j%                  |      }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )r~  )r   peft.tuners.lorar  r<  r  r   r   r}   r   r   r\   rU   r[   r   r  conv1drW   r~  rc   )re   rl   r  r   s       r*   rm   zTDNNLayer.forward  s    2$++y1O &//15##(():):D<L<LdN^N^_iijkmno,,]FDKKDTDT_c_l_l,m%//156r,   rn   )rp   rq   rr   rY   r   r1  rm   rs   rt   s   @r*   ry  ry    s#    $U\\ ell r,   ry  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                   *    e Zd Z fdZd Zd Zd Zdeej                  e
f   fdZ ee       eeeede      	 	 	 	 	 dd	eej(                     d
eej(                     dee   dee   dee   deej(                     deeef   fd              Z xZS )WavLMForXVectorc                    t         |   |       t        |      | _        |j                  dz   }|j
                  r0t        j                  t        j                  |      |z        | _
        t        j                  |j                  |j                  d         | _        t        t!        |j                              D cg c]  }t#        ||       }}t        j$                  |      | _        t        j                  |j                  d   dz  |j(                        | _        t        j                  |j(                  |j(                        | _        t/        |j(                  |j0                        | _        | j5                          y c c}w )Nr   r   r-   r   )rX   rY   r  r  r`  rL  r   r   r   r@   rM  r   r   r{  rO  r9   r>   ry  r   tdnnxvector_output_dimr  rQ  rj  rP  	objectivera  )re   rf   rS  r   tdnn_layersrh   s        r*   rY   zWavLMForXVector.__init__  s    '
--1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQy+QQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   >Fc                 X    t        j                  dt               | j                          yr#  r  r  s    r*   r  z(WavLMForXVector.freeze_feature_extractor  r  r,   c                 L    | j                   j                  j                          yr  r%  r  s    r*   r  z&WavLMForXVector.freeze_feature_encoder  r&  r,   c                 P    | j                   j                         D ]	  }d|_         yr(  r)  r   s     r*   r*  z!WavLMForXVector.freeze_base_model  r+  r,   rI   c                 V    d }| j                   j                  D ]  } |||d      } |S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr   re  r  s      r*   r  zBWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !;.69A==r,   r   )rf   r|  )re   rI   r  rU   s       r*   _get_tdnn_output_lengthsz(WavLMForXVector._get_tdnn_output_lengths  s:    
	>
  ;;22 	LK,]KKM	L r,   r  r  r   r   r   ru  rv  r-  r    c                    ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }| j                  D ]
  }
 |
|      } |%|j                  d      }|j!                  d      }n| j#                  |j                  d            }| j%                  |      }g }g }t'        |      D ]U  \  }}|j)                  ||d|f   j                  d             |j)                  ||d|f   j!                  d             W t        j                  |      }t        j                  |      }t        j*                  ||gd      }| j-                  |      }| j/                  |      }d}|| j1                  ||      }|s||f|t        d z   }||f|z   S |S t3        ||||j4                  |j6                        S )	rX  NTr  r   r   r-   r   )r4  r5  
embeddingsrl   rp  )rf   r  rL  r  rA  r   rY  r   r  r  rM  r   r6   rO  r  r  r  r  r  rq  rB   r  r  rQ  r  r   rl   rp  )re   r   r   r   ru  rv  r-  rT  rl   rZ  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr5  r4  rF  s                         r*   rm   zWavLMForXVector.forward  s   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5)) 	6J&}5M	6 !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':; J	6$$]1gvg:%>%C%C%C%JK##M!WfW*$=$A$Aa$A$HIJ "KK6M ;;|4L!II}l&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r,   r  )rp   rq   rr   rY   r  r  r*  r   r   r3  r$   r  r   r  r   _XVECTOR_CHECKPOINTr   r  _XVECTOR_EXPECTED_OUTPUTr   r1  r;   r   rm   rs   rt   s   @r*   r  r    s    &
&:(eE<L<Lc<Q6R  ++AB&!$0 26,0/3&*)-I
u||,I
 !.I
 $D>	I

 'tnI
 d^I
 &I
 
um#	$I
 CI
r,   r  r   )Vr   r&  r   typingr   r   r   numpyr2   r   torch.nn.functionalr   r  r
  torch.utils.checkpointtorch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   r   r   r   r   r   configuration_wavlmr   
get_loggerrp   r  rA  r  r  r  rG  rH  rg  rh  r  r  r$   r%  r3  ndarrayrP   ModulerR   rv   r   r   r   r   r   r   r   r6  rE  rX  r\  r  r  r  r  r  WAVLM_START_DOCSTRINGr  r  r  rJ  r_  rj  ry  r  re  r,   r*   <module>r     sW      ) )       % ! @ 7  .  - 
		H	% !"    J &  t   9 Q  5   26tc?tt t U--.	t
 t ZZtp		 ,bii 8bii 2*299 *\		 )")) )X
/ 
1RYY 1c RYY c Nryy 0&		 &R"ryy "JO
299 O
dP
")) P
fC' C'N299 @		 $U? Up (" J e
P
% P

P
f m
T
& T

T
n  s
%9 s
s
l  	g
'; g
g
VBII 0		 >  	O
* O
O
r,   