
    sg                     T   d Z ddlmZ ddlmZmZmZ ddlZddlm	Z
 ddlZddlmZ ddlZddlmZmZmZ ddlmZ ddlmZmZ ddlmZ d	d
lmZmZ d	dlmZmZm Z m!Z! d	dl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e&jR                  e*      Z+ejX                  jZ                   G d de#             Z.ejX                  jZ                   G d de#             Z/	 	 dRdee0e0f   de1de0deejd                     de0dejd                  fdZ3dSdede0deejd                     fdZ4dZ5dZ6 G d d e
jn                        Z8 G d! d"e
jn                        Z9 G d# d$e
jn                        Z: G d% d&e
jn                        Z; G d' d(e
jn                        Z< G d) d*e
jn                        Z= G d+ d,e
jn                        Z> G d- d.e
jn                        Z? G d/ d0e
jn                        Z@ G d1 d2e
jn                        ZA G d3 d4e
jn                        ZB G d5 d6e
jn                        ZC G d7 d8e
jn                        ZD G d9 d:e
jn                        ZE G d; d<e
jn                        ZF G d= d>e      ZG G d? d@e
jn                        ZH e$dAe5       G dB dCeG             ZIdDZJ e!eIe6eJz           e eIe.e(E        G dF dGe
jn                        ZK e$dHe5       G dI dJeG             ZLdKZM e!eLe6eMz           e eLee(E        G dL dMe
jn                        ZN e$dNe5       G dO dPeG             ZOdQZP e!eOe6ePz           e eOe/e(E       y)TzFlax Wav2Vec2 model.    )partial)OptionalTupleUnionN)
FrozenDictfreezeunfreeze)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxCausalLMOutput)ACT2FNFlaxPreTrainedModel append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )Wav2Vec2Configc                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	e
eej                        ed<   dZe
eej                        ed<   y)FlaxWav2Vec2BaseModelOutputa  
    Output type of [`FlaxWav2Vec2BaseModelOutput`], with potential hidden states and attentions.

    Args:
        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        extract_features (`jnp.ndarray` of shape `(batch_size, sequence_length, last_conv_dim)`):
            Sequence of extracted feature vectors of the last convolutional layer of the model with `last_conv_dim`
            being the dimension of the last convolutional layer.
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlast_hidden_stateextract_featureshidden_states
attentions)__name__
__module____qualname____doc__r   jnpndarray__annotations__r   r   r   r   r         f/var/www/html/venv/lib/python3.12/site-packages/transformers/models/wav2vec2/modeling_flax_wav2vec2.pyr   r   ,   sW    , &*s{{)$(ckk(26M8E#++./6/3Js{{+,3r)   r   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   y) FlaxWav2Vec2ForPreTrainingOutputa%  
    Output type of [`FlaxWav2Vec2ForPreTrainingOutput`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `jnp.ndarray` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
        projected_states (`jnp.ndarray` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`jnp.ndarray` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprojected_statesprojected_quantized_statescodevector_perplexityr   r    )r!   r"   r#   r$   r-   r%   r&   r'   r.   r/   r   r   r   r    r(   r)   r*   r,   r,   J   sf    4 %)ckk(.22)-3;;-26M8E#++./6/3Js{{+,3r)   r,   shape	mask_probmask_lengthattention_mask	min_masksreturnc                    | \  }}|dk  rt        d      ||kD  rt        d| d| d      t        ||z  |z  t        j                  j	                  d      j                         z         }t        ||      }||z  |kD  r||z  }t        j                  ||ft              }t        j                  t        |      D 	cg c]=  }	t        j                  j                  t        j                  ||dz
  z
        |d      ? c}	      }
t        j                  |
d	d	d	d	d	f   |||f      }
|
j                  |||z        }
t        j                  |      d	d	d	d	f   }t        j                  ||||f      j                  |||z        }|
|z   }
t        j                  ||
dd
       |t        j                   ||d      }|S c c}	w )aw  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        mask_prob:
            probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: size of the mask
        min_masks: minimum number of masked spans

    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `dtypeF)replaceN)
ValueErrorintnprandomranditemmaxzerosboolarrayrangechoicearangebroadcast_toreshapeput_along_axiswhere)r0   r1   r2   r3   r4   
batch_sizesequence_lengthnum_masked_spansspec_aug_mask_spec_aug_mask_idxsoffsetss               r*   _compute_mask_indicesrT   m   s   . #(JQABB_$]^i]j k##2"316
 	
 96Dryy~~VWGXG]G]G__`+Y7 +%7*k9 HHj/:$GM  :&	
 IIRYY+/'JKM]glm	
 );Aq$J)G*VfhsItu+33J@PS^@^_ii$T4]3Goog
4Dk'RS[[${2G ,g5 m%7B?!F/	
s   >AG features_shapenum_negativesc                 8   | \  }}}|dk  rt        d|||f d      g }t        |      D ]V  }|||   j                         dz
  n|dz
  }t        j                  j                  d|||z  f      }	|j                  |	       X t        j                  |t        j                        }t        j                  t        j                  |      dddf   ||f      j                         }
|||
k\  xx   dz  cc<   t        d|      D ]  }||xx   ||z  z  cc<    |S )z>
    Sample `num_negatives` vectors from feature vectors.
    r   zl`features should have `sequence_length` > 1, but are of shape (batch_size, sequence_length, hidden_size) = ().Nr   )sizer8   )r<   rF   sumr>   r?   randintappendasarrayint32rI   rH   flatten)rU   rV   r3   rM   rN   hidden_sizesampled_negative_indices	batch_idxhighsampled_indices_slicefeature_indicess              r*   _sample_negative_indicesrf      sQ    0>,J!==GZe=e<ffhj
 	
  ":& ?	6D6P~i(,,.2VehiVi "		 1 1!TQ`A`@b 1 c ''(=>?
  "zz*B"((S oobii&@D&IO]jKklttvO 5HIQNI 1j) K	 +y?/JJ+K $#r)   a  
    Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
    Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
    Auli.

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a	  
    Args:
        input_values (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `jnp.ndarray`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask) .. warning:: `attention_mask` should only be passed
            if the corresponding processor has `config.return_attention_mask == True`. For all models whose processor
            has `config.return_attention_mask == False`, such as
            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be
            passed to avoid degraded performance when doing batched inference. For such models `input_values` should
            simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly
            different results depending on whether `input_values` is padded or not.
        mask_time_indices (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   h    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
d Zy)FlaxWav2Vec2LayerNormConvLayerconfigr   layer_idr9   c           	         | j                   dkD  r#| j                  j                  | j                      nd| _        | j                  j                  | j                      | _        t        j                  | j                  j                  | j                      | j                  j                  | j                      f| j                  j                  | j                      f| j                  j                  t        j
                  j                  j                         d| j                        | _        t        j                  | j                  j                   | j                        | _        t$        | j                  j&                     | _        y )Nr   r   VALID)featureskernel_sizestridesuse_biaskernel_initpaddingr9   epsilonr9   )rj   ri   conv_dimin_conv_dimout_conv_dimnnConvconv_kernelconv_stride	conv_biasjaxinitializers	he_normalr9   conv	LayerNormlayer_norm_eps
layer_normr   feat_extract_activation
activationselfs    r*   setupz$FlaxWav2Vec2LayerNormConvLayer.setup&  s   BF--RSBS4;;//>YZ KK00?GG[[))$--800?A[[,,T]];=[[**++557**
	 ,,t{{/I/IQUQ[Q[\ !D!DEr)   c                 l    | j                  |      }| j                  |      }| j                  |      }|S N)r   r   r   r   r   s     r*   __call__z'FlaxWav2Vec2LayerNormConvLayer.__call__6  s2    		-066r)   N)r!   r"   r#   r   r'   rj   r=   r%   float32r9   r   r   r(   r)   r*   rh   rh   !  s/    Hc{{E399"F r)   rh   c                   `    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	d Z
y)FlaxConvWithWeightNormri   r9   c                 f    t        j                   j                  j                   j                  j                  ft
        j                   j                  j                         d j                  j                   j                         _
         j                  j                   j                  j                   j                  j                  z   j                  j                  d   f} j                  dt
        j                   j                  j                         |       _         j                  d fd       _         j                  dt
        j                   j                  j"                   j                  j                  f       _         j                  j                  d   dz   _        y )	Nrl   )rm   rn   rq   rr   feature_group_countr9   r   weight_vweight_gc                 j    t         j                  j                  j                  d      d d d d f   S N)r   r   axis)r%   linalgnormr   )rQ   r   s    r*   <lambda>z.FlaxConvWithWeightNorm.setup.<locals>.<lambda>P  s,    ]c9deikoqrer9s r)   bias   )rx   ry   ri   r`   num_conv_pos_embeddingsr}   r~   r   num_conv_pos_embedding_groupsr9   r   rm   r   rn   paramr   r   rC   r   prev_padding)r   weight_shapes   ` r*   r   zFlaxConvWithWeightNorm.setupA  s+   GG[[,,<<>++557 $ I I**
	 IIII$))"?"??II!!!$

 

:svv/B/B/L/L/NP\]

:/stJJvsvv':':'@'@499CUCUBWX	 II11!49r)   c                     t         j                  j                  | j                  d      d d d d f   }t        j                  | j                  |      }t        j
                  || j                        }|S r   )r%   r   r   r   dividemultiplyr   )r   weight_v_normnormed_weight_vnormed_kernels       r*   _get_normed_weightsz*FlaxConvWithWeightNorm._get_normed_weightsT  sV    

FCD$PQMR**T]]MB_dmmDr)   c                     | j                         }t        j                  |d| j                  | j                  fdf      }| j                  j                  d|j                  | j                  di|      }|S )N)r   r   params)kernelr   )r   r%   padr   r   applyTr   )r   r   r   s      r*   r   zFlaxConvWithWeightNorm.__call__Z  si    ))+9J9JDL]L]8^`f/gh		fhhPTPYPY3Z([]jkr)   N)r!   r"   r#   r   r'   r%   r   r9   r   r   r   r(   r)   r*   r   r   =  s)    {{E399":&r)   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)#FlaxWav2Vec2PositionalConvEmbeddingri   r9   c                     t        | j                  | j                        | _        t        | j                  j
                     | _        | j                  j                  dz  dk(  rd| _        y d| _        y )Nr8   r   r   r   )	r   ri   r9   r   r   r   r   r   num_pad_remover   s    r*   r   z)FlaxWav2Vec2PositionalConvEmbedding.setupe  sT    *4;;djjI	 !D!DE#';;#F#F#Ja#OaUVr)   c                     |j                  d      }| j                  |      }| j                  dkD  r|d d d | j                   d d f   }| j                  |      }|j                  d      }|S )N)r   r   r   r   )	transposer   r   r   r   s     r*   r   z,FlaxWav2Vec2PositionalConvEmbedding.__call__j  sr    %//	:		-0")!-C0C0C/C-CQ*FGM6%//	:r)   N
r!   r"   r#   r   r'   r%   r   r9   r   r   r(   r)   r*   r   r   a  s%    {{E399"W

r)   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxConvLayersCollectionri   r9   c           
         | j                   j                  dk(  r]t        | j                   j                        D cg c].  }t	        | j                   |t        |      | j                        0 c}| _        y | j                   j                  dk(  rt        d      t        d| j                   j                   d      c c}w )Nlayer)rj   namer9   groupzFAt the moment only ``config.feat_extact_norm == 'layer'`` is supportedz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer'])
ri   feat_extract_normrF   num_feat_extract_layersrh   strr9   layersNotImplementedErrorr<   r   is     r*   r   zFlaxConvLayersCollection.setup{  s    ;;((G3 t{{BBC /t{{QSQRV[_[e[efDK [[**g5%&noo01N1N0O P  s   3B>c                 P    t        | j                        D ]  \  }} ||      } |S r   )	enumerater   )r   r   r   
conv_layers       r*   r   z!FlaxConvLayersCollection.__call__  s.    &t{{3 	6MAz&}5M	6r)   Nr   r(   r)   r*   r   r   w  s$    {{E399"r)   r   c                   `    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	ddZ
y)FlaxWav2Vec2FeatureEncoderz.Construct the features from raw audio waveformri   r9   c                 P    t        | j                  | j                        | _        y )Nr8   )r   ri   r9   conv_layersr   s    r*   r   z FlaxWav2Vec2FeatureEncoder.setup  s    3DKKtzzRr)   c                     |d d d d d f   }| j                  |      }|rt        j                  j                  |      }|S r   )r   r}   r   stop_gradient)r   input_valuesfreeze_feature_encoderr   s       r*   r   z#FlaxWav2Vec2FeatureEncoder.__call__  s?    $Q4Z0((7!GG11-@Mr)   N)F)r!   r"   r#   r$   r   r'   r%   r   r9   r   r   r(   r)   r*   r   r     s(    8{{E399"Sr)   r   c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxWav2Vec2FeatureProjectionri   r9   c                    t        j                  | j                  j                  | j                        | _        t        j                  | j                  j                  t        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                        | _        y )Nrs   rq   r9   rate)rx   r   ri   r   r9   r   Denser`   r}   r~   normalinitializer_range
projectionDropoutfeat_proj_dropoutdropoutr   s    r*   r   z#FlaxWav2Vec2FeatureProjection.setup  s    ,,t{{/I/IQUQ[Q[\((KK##++224;;3P3PQ**

 zzt{{'D'DEr)   c                 t    | j                  |      }| j                  |      }| j                  ||      }||fS Ndeterministic)r   r   r   )r   r   r   norm_hidden_statess       r*   r   z&FlaxWav2Vec2FeatureProjection.__call__  s>    !__];(:;]-P000r)   NTr   r(   r)   r*   r   r     s%    {{E399"F1r)   r   c                      e Zd ZU eed<   eed<   eed<   dZeed<   dZe	ed<   e
j                  Ze
j                  ed<   ddZd Zd Z	 	 	 dde
j                   dee
j                      dee
j                      de	d	ee
j                      f
dZy
)FlaxWav2Vec2Attentionri   	embed_dim	num_heads        r   Tr   r9   r5   Nc           	      r   | j                   | j                  z  | _        | j                  | j                  z  | j                   k7  r&t        d| j                    d| j                   d      t	        t
        j                  | j                   | j                  | j                  t        j
                  j                  j                  | j                  j                              } |        |        |       c| _        | _        | _         |       | _        t        j$                  | j&                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rX   )rp   r9   rq   r   )r   r   head_dimr<   r   rx   r   r   r9   r}   r~   r   ri   r   q_projk_projv_projout_projr   r   dropout_layer)r   denses     r*   r   zFlaxWav2Vec2Attention.setup  s    $..8==4>>)T^^;MdnnM] ^NN#2' 
 HHNNYY**++224;;3P3PQ
 16%'-T[$+ZZT\\:r)   c                 p    |j                  |j                  d d | j                  | j                  fz         S Nr   )rJ   r0   r   r   r   s     r*   _split_headsz"FlaxWav2Vec2Attention._split_heads  s5    $$]%8%8!%<PTP]P]?^%^__r)   c                 Z    |j                  |j                  d d | j                  fz         S r   )rJ   r0   r   r   s     r*   _merge_headsz"FlaxWav2Vec2Attention._merge_heads  s,    $$]%8%8!%<?P%PQQr)   r   key_value_statesr3   r   c                 z   | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|t	        j
                  |d      }|t        j                  |dkD  t	        j                  |j                  d      j                  | j                        t	        j                  |j                  t	        j                  | j                        j                        j                  | j                              }nd}d}	|s | j                  dkD  r| j                  d      }	t!        ||||	| j                  d|| j                  d	      }
t	        j"                  d	|
|      }| j%                  |      }| j'                  |      }||
fS )
z#Input shape: Batch x Time x ChannelN)r   r   r   r   T)r   dropout_rngdropout_ratebroadcast_dropoutr   r9   	precisionz...hqk,...khd->...qhd)r   r   r   r   r%   expand_dimsr   selectfullr0   astyper9   finfominr   make_rngr
   einsumr   r   )r   r   r   r3   r   query_states
key_statesvalue_statesattention_biasr   attn_weightsattn_outputs               r*   r   zFlaxWav2Vec2Attention.__call__  s    {{=1[[/
{{=1((6&&z2
((6% __^(KN % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!3--	2K4#"'**

 jj!8,U''4mmK0L((r)   )r5   N)NNT)r!   r"   r#   r   r'   r=   r   floatr   rD   r%   r   r9   r   r   r   r&   r   r   r   r(   r)   r*   r   r     s    NNGUD${{E399";*`R 3704"5){{5) #3;;/5) !-	5)
 5) 
s{{	5)r)   r   c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxWav2Vec2FeedForwardri   r9   c                 \   t        j                  | j                  j                        | _        t        j
                  | j                  j                  t        j                   j                  j                  | j                  j                        | j                        | _        t        | j                  j                  t              r#t         | j                  j                     | _        n| j                  j                  | _        t        j
                  | j                  j$                  t        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j(                        | _        y )Nr   r   )rx   r   ri   activation_dropoutintermediate_dropoutr   intermediate_sizer}   r~   r   r   r9   intermediate_dense
isinstance
hidden_actr   r   intermediate_act_fnr`   output_densehidden_dropoutoutput_dropoutr   s    r*   r   zFlaxWav2Vec2FeedForward.setup  s   $&JJDKK4R4R$S!"$((KK))++224;;3P3PQ**#

 dkk,,c2'-dkk.D.D'ED$'+{{'='=D$HHKK##++224;;3P3PQ**

 !jjdkk.H.HIr)   c                     | j                  |      }| j                  |      }| j                  ||      }| j                  |      }| j	                  ||      }|S r   )r  r  r  r  r  r   r   r   s      r*   r   z FlaxWav2Vec2FeedForward.__call__'  sb    //>00?11-}1]))-8++M+Wr)   Nr   r   r(   r)   r*   r  r    s%    {{E399"J(r)   r  c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)'FlaxWav2Vec2EncoderLayerStableLayerNormri   r9   c                 `   t        | j                  | j                  j                  | j                  j                  | j                  j                  | j
                        | _        t        j                  | j                  j                        | _
        t        j                  | j                  j                  | j
                        | _        t        | j                  | j
                        | _        t        j                  | j                  j                  | j
                        | _        y )N)ri   r   r   r   r9   r   rs   r8   )r   ri   r`   num_attention_headsattention_dropoutr9   	attentionrx   r   r  r   r   r   r   r  feed_forwardfinal_layer_normr   s    r*   r   z-FlaxWav2Vec2EncoderLayerStableLayerNorm.setup5  s    .;;kk--kk55KK11**
 zzt{{'A'AB,,t{{/I/IQUQ[Q[\3DKKtzzR "T[[5O5OW[WaWa br)   Nc                     |}| j                  |      }| j                  |||      \  }}| j                  ||      }||z   }|| j                  | j	                  |      |      z   }|f}|r||fz  }|S )N)r3   r   r   )r   r  r   r  r   )r   r   r3   r   output_attentionsattn_residualr  outputss           r*   r   z0FlaxWav2Vec2EncoderLayerStableLayerNorm.__call__B  s    %6&*nn. '5 '
#| ]-P%5%(9(9!!-0 ): )
 
 !"&Gr)   )NTFr   r(   r)   r*   r  r  1  s%    {{E399"cr)   r  c            	       x    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 d
de	de	de	de	fd	Z
y)1FlaxWav2Vec2EncoderLayerStableLayerNormCollectionri   r9   c           	          t        | j                  j                        D cg c]-  }t        | j                  t	        |      | j
                        / c}| _        y c c}w N)r   r9   )rF   ri   num_hidden_layersr  r   r9   r   r   s     r*   r   z7FlaxWav2Vec2EncoderLayerStableLayerNormCollection.setupZ  sJ     4;;889
 4DKKc!fTXT^T^_
 
   2ANr   r"  output_hidden_statesreturn_dictc                     |rdnd }|rdnd }t        | j                        D ]*  \  }	}
|r||fz  } |
||||      }|d   }|s"||d   fz  }, |r||fz  }|||f}|st        d |D              S t        |||      S )Nr(   )r   r"  r   r   c              3   &   K   | ]	  }||  y wr   r(   .0vs     r*   	<genexpr>zMFlaxWav2Vec2EncoderLayerStableLayerNormCollection.__call__.<locals>.<genexpr>       =qq}=   r   r   r    )r   r   tupler   )r   r   r3   r   r"  r+  r,  all_attentionsall_hidden_statesr   r   layer_outputsr$  s                r*   r   z:FlaxWav2Vec2EncoderLayerStableLayerNormCollection.__call__`  s      1d"6BD!$++. 	6HAu#!m%55!!~]^oM *!,M =#3"55	6  -!11 "3^D=G==="+;LYg
 	
r)   NTFFT)r!   r"   r#   r   r'   r%   r   r9   r   rD   r   r(   r)   r*   r&  r&  V  s]    {{E399"
 ""'%* #
 	#

  #
 ##
 #
r)   r&  c                   f    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 ddZ	y)"FlaxWav2Vec2StableLayerNormEncoderri   r9   c                 n   t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                        | _        t	        j                  | j                  j                        | _
        t        | j                  | j                        | _        y )Nr8   rs   r   )r   ri   r9   pos_conv_embedrx   r   r   r   r   r  r   r&  r   r   s    r*   r   z(FlaxWav2Vec2StableLayerNormEncoder.setup  sr    A$++UYU_U_`,,t{{/I/IQUQ[Q[\zzt{{'A'ABG[_[e[efr)   Nc                    |?t        j                  t        j                  |d d d d d f   |j                        |d      }| j	                  |      }||z   }| j                  ||      }| j                  |||||      }| j                  |d         }	d }|r|d   }|d d |	fz   }|s#|	|f|r|dd  n|dd  z   }t        d |D              S t        |	||j                        S )	Nr   r   )r"  r+  r,  r   r;   r   c              3   &   K   | ]	  }||  y wr   r(   r/  s     r*   r2  z>FlaxWav2Vec2StableLayerNormEncoder.__call__.<locals>.<genexpr>  r3  r4  r5  )r%   rL   rI   r0   r>  r   r   r   r6  r   r    )
r   r   r3   r   r"  r+  r,  position_embeddingsr$  r   s
             r*   r   z+FlaxWav2Vec2StableLayerNormEncoder.__call__  s!    %II  1d
!;]=P=PQS`bcM #11-@%(;;]-P++/!5#  
 !OOGAJ7 #AJM)#2.2C1EEM(-8K_GABKelmnmoepqG=G==="/}Y`YkYk
 	
r)   r:  r   r(   r)   r*   r<  r<    s6    {{E399"g "*
r)   r<  c                   r    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	e
dd       Zd	dZy)
!FlaxWav2Vec2GumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    ri   r9   c                    | j                   j                  | _        | j                   j                  | _        | j                   j
                  | j                  z  dk7  r0t        d| j                   j
                   d| j                   d      | j                  dt        j                  j                  j                         d| j                  | j                  z  | j                   j
                  | j                  z  f      | _        t        j                  | j                  | j                  z  t        j                  j                  j                  d      | j                        | _        y )	Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationcodevectorsr         ?r   )ri   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimr<   r   r}   rx   r~   uniformrE  r   r   r9   weight_projr   s    r*   r   z'FlaxWav2Vec2GumbelVectorQuantizer.setup  s   ++;;==;;%%71<)$++*D*D)E F337??2CCUW   ::FF'')$--/1K1Kt1^_

 88OOdmm+++2237**
r)   Nc           	         |t        j                  |j                         d d d d f   | j                        }t        j                  || t        j
                  |             } | j                  d      |j                         z  }n| j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   r   gHz>r;   )
r%   rI   r_   r0   rL   
zeros_likerZ   meanexplog)probsmaskmask_extendedmarginal_probs
perplexitys        r*   _compute_perplexityz5FlaxWav2Vec2GumbelVectorQuantizer._compute_perplexity  s    ,,T\\^AtTM-JEKKXMIImUCNN54IJE"YYAY.;N"ZZQZ/NWWcggnsww~PT?T7U&U\^__`ddf
r)   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }|s| j	                  d      }t
        j                  j                  ||j                         }	t        j                  ||	z   |z        }
t        j                  |j                  ||z  | j                  d      d      }| j                  ||      }nt|j                  d      }t
        j                  j                  ||j                   d         dz  }
|
j                  ||z  | j                  d      }
| j                  |
|      }|
j                  ||z  d      }
t        j                  |
d      | j                  z  }|j                  ||z  | j                  | j                   d      }|j#                  d      j                  ||d      }||fS )Nr;   gumbelr   rF  r   )r0   rM  rJ   rH  r  r}   r?   rZ  rx   softmaxrX  argmaxone_hotr%   r   rE  rJ  rZ   )r   r   mask_time_indicesr   temperaturerM   rN   r`   
gumbel_rnggumbelscodevector_probscodevector_soft_distrW  codevector_idxcodevectors_per_grouprE  s                   r*   r   z*FlaxWav2Vec2GumbelVectorQuantizer.__call__  s   3@3F3F0
O[ ((7%--j?.JT__.\^`ax0Jjj''
M4G4GHG!zz=7+Bk*QR $&::%%j?&BDOOUWX_a$  112FHYZJ +11r1:N"vv~~nm>Q>QRT>UVY\\/77
_8TVZVeVegij112BDUVJ+33J4PRTU #0@r JTM]M] ]+33J4PRVRaRacgcpcprtu!oob)11*orRJ&&r)   r   )NTr   )r!   r"   r#   r$   r   r'   r%   r   r9   r   staticmethodrX  r   r(   r)   r*   rC  rC    s?    
 {{E399"
, 	 	 'r)   rC  c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxWav2Vec2Adapterri   r9   c                 (   | j                   j                  | j                   j                  k7  rt        j                  | j                   j                  t
        j                  j                  j                  | j                   j                        | j                        | _
        t        j                  | j                   j                  | j                        | _        nd x| _
        | _        t        | j                   | j                        | _        y )Nr   rs   r8   )ri   output_hidden_sizer`   rx   r   r}   r~   r   r   r9   projr   r   proj_layer_norm#FlaxWav2Vec2AdapterLayersCollectionr   r   s    r*   r   zFlaxWav2Vec2Adapter.setup  s    ;;))T[[-D-DD..FF//66t{{7T7TUjjDI
 $&<<8R8RZ^ZdZd#eD /33DI,9$++TZZXr)   c                     | j                   .| j                  "| j                  |      }| j                  |      }| j                  |      }|S r   )rk  rl  r   r  s      r*   r   zFlaxWav2Vec2Adapter.__call__  sI    99 T%9%9%E IIm4M 00?MM2r)   Nr   r   r(   r)   r*   rh  rh    s%    {{E399"Yr)   rh  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxWav2Vec2AdapterLayerri   r9   c           	      P   t        j                  d| j                  j                  z  | j                  j                  f| j                  j
                  fdt        j                   j                  j                  | j                  j                        | j                        | _        y )Nr   ))r   r   )rm   rn   ro   rr   rq   r9   )rx   ry   ri   rj  adapter_kernel_sizeadapter_strider}   r~   r   r   r9   r   r   s    r*   r   zFlaxWav2Vec2AdapterLayer.setup,  sp    GG77788:[[//1++224;;3P3PQ**
	r)   c                 V    | j                  |      }t        j                  |d      }|S )Nr   r   )r   rx   glur   s     r*   r   z!FlaxWav2Vec2AdapterLayer.__call__6  s&    		-0}15r)   Nr   r(   r)   r*   rp  rp  (  s$    {{E399"
r)   rp  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)rm  ri   r9   c           	          t        | j                  j                        D cg c]-  }t        | j                  t	        |      | j
                        / c}| _        y c c}w r(  )rF   ri   num_adapter_layersrp  r   r9   r   r   s     r*   r   z)FlaxWav2Vec2AdapterLayersCollection.setupA  sG     4;;99:
 %T[[s1vTZZP
 
r*  c                 8    | j                   D ]
  } ||      } |S r   )r   )r   r   r   s      r*   r   z,FlaxWav2Vec2AdapterLayersCollection.__call__G  s'    ++ 	6J&}5M	6 r)   Nr   r(   r)   r*   rm  rm  =  s$    {{E399"
r)   rm  c                       e Zd ZU dZeZdZeed<   dZ	dZ
ej                  ed<   ddej                  d	fd
edededej"                  def
 fdZddej*                  j,                  dededefdZ ee      	 	 	 	 	 	 	 	 	 ddedej*                  j,                  dedee   dee   dedee   fd       Z	 ddeej>                  ef   dee   fdZ  xZ!S ) FlaxWav2Vec2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    wav2vec2base_model_prefixr   Nmodule_class)r   i   r   Tri   input_shapeseedr9   _do_initc                 Z     | j                   d||d|}t        | 	  ||||||       y )N)ri   r9   )r  r  r9   r  r(   )r~  super__init__)	r   ri   r  r  r9   r  kwargsmodule	__class__s	           r*   r  z$FlaxWav2Vec2PreTrainedModel.__init__Y  s=     #""H&HH[tSXcklr)   rngr   r5   c                    t        j                  |d      }t        j                  |      }t        j                  j                  |d      \  }}||d}| j                  j                  |||d      d   }	|dt        t        |	            }	t        t        |            }| j                  D ]
  }
|	|
   ||
<    t               | _
        t        t        |            S |	S )Ni4r8   r   )r   r   F)r,  r   )r%   rC   	ones_liker}   r?   splitr  initr   r	   _missing_keyssetr   r   )r   r  r  r   r   r3   
params_rngr   rngsrandom_paramsmissing_keys              r*   init_weightsz(FlaxWav2Vec2PreTrainedModel.init_weightse  s    yyD9|4"%**"2"23":
K$=((|^Y^(_`hi(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r)   r   trainr"  r+  r   r,  c                    ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
|j                  \  }}|t        j                  ||f      }i }|||d<   d|xs | j                  i}| j                  j                  |t        j                  |d      t        j                  |d      || |||	|
|
      S )Nr   r   f4r8   r  r  ri   r"  r+  r,  r0   r%   onesr   r  r   rE   )r   r   r3   r^  r   r   r  r"  r+  r   r,  rM   rN   r  inputss                  r*   r   z$FlaxWav2Vec2PreTrainedModel.__call__x  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY&2&8&8#
O! XXz?&CDN ")DOF1dkk2{{  IIl$/IInD1I " ! 
 	
r)   input_lengthsadd_adapterc                 <    | j                   j                  ||      S )Nr  )r   _get_feat_extract_output_lengths)r   r  r  s      r*   r  z<FlaxWav2Vec2PreTrainedModel._get_feat_extract_output_lengths  s     {{;;MWb;ccr)   r   )	NNNNFNNFN)"r!   r"   r#   r$   r   config_classr}  r   r'   main_input_namer~  rx   Moduler%   r   r   r=   r9   rD   r  r}   r?   PRNGKeyr   r  r   WAV_2_VEC_2_INPUTS_DOCSTRINGdictr   r   r   r&   r  __classcell__)r  s   @r*   r{  r{  N  sq   
 "L's'$O"L"))"
 ';;
m
m 
m 	
m
 yy
m 
m!

 2 2 ! !PZ !fp !& ++GH *.,0/3',&**

 *
 ZZ''*
 *
 $D>*
 'tn*
 !%*
 d^*
 I*
Z UYd"3;;#34dCKD>dr)   r{  c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 	 	 ddZ		 dde
ej                  ef   dee   fdZ	 dd	ed
ej                  fdZy)FlaxWav2Vec2Moduleri   r9   c                 Z   t        | j                  | j                        | _        t	        | j                  | j                        | _        | j                  dt        j                  j                  j                         | j                  j                  f      | _        | j                  j                  r't        | j                  | j                        | _        nt!        d      | j                  j"                  r't%        | j                  | j                        | _        y d | _        y )Nr8   masked_spec_embedzD``config.do_stable_layer_norm is False`` is currently not supported.)r   ri   r9   feature_extractorr   feature_projectionr   r}   rx   r~   rL  r`   r  do_stable_layer_normr<  encoderr   r  rh  adapterr   s    r*   r   zFlaxWav2Vec2Module.setup  s    !;DKKtzz!Z"?SWS]S]"^!%!4!4!<!<!>AXAX@Z"
 ;;++=dkkQUQ[Q[\DL%&lmmMQ[[MdMd*4;;djjIjnr)   Nc	           
      L   | j                  ||      }	|!| j                  |	j                  d   |d      }| j                  |	|      \  }
}	|ot	        j
                  t	        j                  |d d d d d f   |
j                        t	        j                  | j                  d d d d f   |
j                        |
      }
| j                  |
|||||      }|d   }
| j                  | j                  |
      }
|s
|
|	f|dd  z   S t        |
|	|j                  |j                        S )	N)r   r   Fr  r   )r3   r   r"  r+  r,  r   )r   r   r   r    )r  "_get_feature_vector_attention_maskr0   r  r%   rL   rI   r  r  r  r   r   r    )r   r   r3   r^  r   r"  r+  r   r,  r   r   encoder_outputss               r*   r   zFlaxWav2Vec2Module.__call__  sU     11,Wm1n %!DD &&q)>u E N +/*A*ABRbo*A*p''(II  !21a:!>@S@ST  !7!7dA!FH[H[\M ,,)'/!5# ' 
 (*<<# LL7M!#34qr7JJJ*+-)77&11	
 	
r)   r  r  c                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )H
        Computes the output length of the convolutional layers
        c                     | |z
  |z  dz   S Nr   r(   input_lengthrn   strides      r*   _conv_out_lengthzMFlaxWav2Vec2Module._get_feat_extract_output_lengths.<locals>._conv_out_length       !;.69A==r)   r   ri   r  ziprz   r{   rF   rx  rs  r   r  r  r  rn   r  rQ   s          r*   r  z3FlaxWav2Vec2Module._get_feat_extract_output_lengths       2=1Ddkk--+	>
 $'t{{'>'>@W@W#X 	QK,]KPM	Q 4;;99: _ 04;;C]C] ^_ r)   feature_vector_lengthr3   c                    |j                  d      d d df   }| j                  ||      }|j                  d   }t        j                  ||f|j
                        }|j                  t        j                  |j                  d         |dz
  f   j                  d      }t        j                  t        j                  |d      j                  d      d      j                  d      }|S )Nr;   r   r  r   r8   r   rD   )cumsumr  r0   r%   rC   r9   atrH   r  flipr   )r   r  r3   r  non_padded_lengthsoutput_lengthsrM   s          r*   r  z5FlaxWav2Vec2Module._get_feature_vector_attention_mask  s    
 ,222;ArEB>>?Q_j>k#))!,
J0E#FnNbNbc (**3::n6J6J16M+NP^abPb+bcgghij#((>2">"E"Eb"I2NUUV\]r)   NNTNNFNr   )r!   r"   r#   r   r'   r%   r   r9   r   r   r   r&   r=   r   rD   r  r  r(   r)   r*   r  r    s    {{E399"o" !$2
j UY"3;;#34CKD>0 TX%(:=++r)   r  zbThe bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxWav2Vec2ModelN)r!   r"   r#   r  r~  r(   r)   r*   r  r    s	    
 &Lr)   r  aJ  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoProcessor, FlaxWav2Vec2Model
    >>> from datasets import load_dataset
    >>> import soundfile as sf

    >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
    >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")


    >>> def map_to_array(batch):
    ...     speech, _ = sf.read(batch["file"])
    ...     batch["speech"] = speech
    ...     return batch


    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> ds = ds.map(map_to_array)

    >>> input_values = processor(
    ...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
    ... ).input_values  # Batch size 1
    >>> hidden_states = model(input_values).last_hidden_state
    ```
)output_typer  c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 	 	 d	dZ		 d
de
ej                  ef   dee   fdZy)FlaxWav2Vec2ForCTCModuleri   r9   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                        | _        t	        j                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                        | _        y )Nr8   r   r   )r  ri   r9   r|  rx   r   final_dropoutr   r   
vocab_sizer}   r~   r   r   lm_headr   s    r*   r   zFlaxWav2Vec2ForCTCModule.setupN  sx    *4;;djjIzzt{{'@'@AxxKK""++224;;3P3PQ**
r)   Nc	           
          | j                  ||||||||      }	|	d   }
| j                  |
|      }
| j                  |
      }|s	|f|	dd  z   S t        ||	j                  |	j
                        S )N)r3   r^  r   r"  r+  r   r,  r   r   r   )logitsr   r    )r|  r   r  r   r   r    )r   r   r3   r^  r   r"  r+  r   r,  r$  r   r  s               r*   r   z!FlaxWav2Vec2ForCTCModule.__call__W  s     --)/'/!5#9#   	
  
]-Pm,9wqr{**!w?T?Tahasasttr)   r  r  c                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )r  c                     | |z
  |z  dz   S r  r(   r  s      r*   r  zSFlaxWav2Vec2ForCTCModule._get_feat_extract_output_lengths.<locals>._conv_out_length  r  r)   r   r  r  s          r*   r  z9FlaxWav2Vec2ForCTCModule._get_feat_extract_output_lengthsw  s     2=1Ddkk--+	>
 $'t{{'>'>@W@W#X 	QK,]KPM	Q 4;;99: _ 04;;C]C] ^_ r)   r  r   )r!   r"   r#   r   r'   r%   r   r9   r   r   r   r&   r=   r   rD   r  r(   r)   r*   r  r  J  sk    {{E399"
 !$uF '+S[[#-. d^r)   r  zfWav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                       e Zd ZeZy)FlaxWav2Vec2ForCTCN)r!   r"   r#   r  r~  r(   r)   r*   r  r    s	    
 ,Lr)   r  a  
    Returns:

    Example:

    ```python
    >>> import jax.numpy as jnp
    >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC
    >>> from datasets import load_dataset
    >>> import soundfile as sf

    >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
    >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")


    >>> def map_to_array(batch):
    ...     speech, _ = sf.read(batch["file"])
    ...     batch["speech"] = speech
    ...     return batch


    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> ds = ds.map(map_to_array)

    >>> input_values = processor(
    ...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
    ... ).input_values  # Batch size 1
    >>> logits = model(input_values).logits
    >>> predicted_ids = jnp.argmax(logits, axis=-1)

    >>> transcription = processor.decode(predicted_ids[0])
    >>> # should give:  "A MAN SAID TO THE UNIVERSE SIR I EXIST"
    ```
c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 	 	 	 dde	de
fdZ	 ddeej                  e	f   d	ee
   fd
Zy) FlaxWav2Vec2ForPreTrainingModuleri   r9   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                        | _        t        | j                  | j                        | _	        t	        j                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                        | _        t	        j                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                        | _        y )Nr8   r   )r  ri   r9   r|  rx   r   feat_quantizer_dropoutdropout_featuresrC  	quantizerr   proj_codevector_dimr}   r~   r   r   	project_qproject_hidr   s    r*   r   z&FlaxWav2Vec2ForPreTrainingModule.setup  s    *4;;djjI "

4;;+M+M N:4;;djjYKK++++224;;3P3PQ**

 88KK++++224;;3P3PQ**
r)   Ngumbel_temperaturer   c
           
      p   |	|	n| j                   j                  }	| j                  ||||||||	      }
| j                  |
d         }| j	                  |
d   |      }| j                  ||||      \  }}| j                  |      }|	s|||f|
dd z   S t        ||||
j                  |
j                        S )	zC
        Returns:

        Example:

        ```python

        ```N)r3   r"  r+  r^  r   r   r,  r   r   r   )r   r_  r   )r-   r.   r/   r   r    )
ri   use_return_dictr|  r  r  r  r  r,   r   r    )r   r   r3   r^  r  r   r"  r+  r   r,  r$  transformer_featuresr   quantized_featuresr/   s                  r*   r   z)FlaxWav2Vec2ForPreTrainingModule.__call__  s    * &1%<k$++B]B]--)/!5/'#9#   	
  $//
;  00=0Y48NN/}Zl 5C 5
11 "^^,>?(*<>STW^_`_aWbbb/1'9"7!//))
 	
r)   r  r  c                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )r  c                     | |z
  |z  dz   S r  r(   r  s      r*   r  z[FlaxWav2Vec2ForPreTrainingModule._get_feat_extract_output_lengths.<locals>._conv_out_length  r  r)   r   r  r  s          r*   r  zAFlaxWav2Vec2ForPreTrainingModule._get_feat_extract_output_lengths  r  r)   )NNr   TNNFNr   )r!   r"   r#   r   r'   r%   r   r9   r   r=   rD   r   r   r&   r   r  r(   r)   r*   r  r    s    {{E399"
& "#"!$5

  5
 5
p UY"3;;#34CKD>r)   r  z5Wav2Vec2 Model with a quantizer and `VQ` head on top.c                       e Zd ZeZ ee      	 	 	 	 	 	 	 	 	 	 	 ddedede	j                  j                  de	j                  j                  dedee   dee   d	ed
ee   fd       Zy)FlaxWav2Vec2ForPreTrainingNr  r   r   r`  r  r"  r+  r   r,  c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }|j                  \  }}|t        j                  ||f      }i }|||d<   |||d<   d|xs | j                  i}| j                  j                  |t        j                  |d      t        j                  |d      ||| |	|
|||      S )Nr   rZ  r   r  r8   r  r  r  )r   r   r3   r^  r  r   r   r`  r  r"  r+  r   r,  rM   rN   r  r  s                    r*   r   z#FlaxWav2Vec2ForPreTraining.__call__*  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY&2&8&8#
O! XXz?&CDN ")DO!'DNF1dkk2{{  IIl$/IInD1I " ! 
 	
r)   )NNr   NNNFNNFN)r!   r"   r#   r  r~  r   r  r=   r  r}   r?   r  rD   r   r   r(   r)   r*   r  r  &  s    3L*+GH
 "#*.)-,0/3',&*0

  0
 0
 ZZ''0
 JJ&&0
 0
 $D>0
 'tn0
 !%0
 d^0
 I0
r)   r  a  
    Returns:

    Example:

    ```python
    >>> import optax
    >>> import numpy as np
    >>> import jax.numpy as jnp
    >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining
    >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
    >>> from datasets import load_dataset
    >>> import soundfile as sf

    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
    >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")


    >>> def map_to_array(batch):
    ...     speech, _ = sf.read(batch["file"])
    ...     batch["speech"] = speech
    ...     return batch


    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> ds = ds.map(map_to_array)

    >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values  # Batch size 1

    >>> # compute masked indices
    >>> batch_size, raw_sequence_length = input_values.shape
    >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
    >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)

    >>> outputs = model(input_values, mask_time_indices=mask_time_indices)

    >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
    >>> cosine_sim = optax.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states)

    >>> # show that cosine similarity is much higher than random
    >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
    ```
)Nr   r   )Qr$   	functoolsr   typingr   r   r   flax
flax.linenlinenrx   r}   	jax.numpynumpyr%   r>   flax.core.frozen_dictr   r   r	   flax.linen.attentionr
   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   r   configuration_wav2vec2r   
get_loggerr!   loggerstruct	dataclassr   r,   r=   r	  r&   rT   rf   WAV_2_VEC_2_START_DOCSTRINGr  r  rh   r   r   r   r   r   r   r  r  r&  r<  rC  rh  rp  rm  r{  r  r  FLAX_WAV2VEC2_MODEL_DOCSTRINGr  r  FLAX_WAV2VEC2_FOR_CTC_DOCSTRINGr  r  'FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRINGr(   r)   r*   <module>r     s     ) )   
   > > > ;  L  g f 2 
		H	% 4+ 4 4: 4{ 4 4L ,0Fc?FF F RZZ(	F
 F ZZFR$U $3 $X`acakakXl $B$ N   FRYY 8!RYY !H")) ,ryy 0 "1BII 1(X)BII X)vbii D"bii "J-
		 -
`4
 4
nK'		 K'\")) :ryy *")) "Zd"5 Zdzm m` h&3 &	&! <  #@@ !#>^
Dryy DN l,4 ,	,!# F  #BB !!3ASbp q`ryy `F QSno5
!< 5
 p5
p*+ 'X  #JJ !,L[ir)   