
"""
Feature extractor class for Wav2Vec2
"""

from typing import List, Optional, Union

import numpy as np

from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging


logger = logging.get_logger(__name__)


class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*,
            [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.

            <Tip>

            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
            should be passed.

            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
            [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
            passed for batched inference.

            </Tip>input_valuesattention_maskc                 H    t        |   d|||d| || _        || _        y )N)feature_sizesampling_ratepadding_value )super__init__return_attention_maskdo_normalize)selfr   r   r   r   r   kwargs	__class__s          k/var/www/html/venv/lib/python3.12/site-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.pyr   z!Wav2Vec2FeatureExtractor.__init__C   s0     	wl-_lwpvw%:"(    r   returnc                    |t        j                  |t         j                        }g }t        | |j	                  d            D ]m  \  }}||d| j                         z
  t        j                  |d| j                         dz         z  }||j                  d   k  r|||d |j                  |       o |S | D cg c]<  }||j                         z
  t        j                  |j                         dz         z  > }}|S c c}w )z[
        Every array in the list is normalized to have zero mean and unit variance
        NgHz>r   )
nparrayint32zipsummeansqrtvarshapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slicexs           r   zero_mean_unit_var_normz0Wav2Vec2FeatureExtractor.zero_mean_unit_var_normP   s    %XXnbhh?N"$"%lN4F4Fr4J"K 9 &)=)=)? ?2776RYSY?K^K^K`cgKgChhL..q11,9L)#**<89 #" Vb"bPQALBGGAEEGdN4K#K"b"b"" #cs   :AC?

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
                [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
                `attention_mask` should be passed.

                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
                [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
                be passed for batched inference.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of a list of Python floats. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # convert input values to correct format
        input_values = padded_inputs["input_values"]
        if not isinstance(input_values[0], np.ndarray):
            padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
        elif (
            not isinstance(input_values, np.ndarray)
            and isinstance(input_values[0], np.ndarray)
            and input_values[0].dtype is np.dtype(np.float64)
        ):
            padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]
        elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64):
            padded_inputs["input_values"] = input_values.astype(np.float32)

        # convert attention_mask to correct format
        attention_mask = padded_inputs.get("attention_mask")
        if attention_mask is not None:
            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            attention_mask = (
                attention_mask
                if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
                else None
            )
            padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
            )

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs
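

# A minimal usage sketch (illustrative, not part of the upstream module):
#
#     import numpy as np
#     from transformers import Wav2Vec2FeatureExtractor
#
#     extractor = Wav2Vec2FeatureExtractor(
#         feature_size=1, sampling_rate=16000, do_normalize=True, return_attention_mask=True
#     )
#     batch = [
#         np.random.randn(16000).astype(np.float32),  # 1.0 s of mono audio
#         np.random.randn(8000).astype(np.float32),   # 0.5 s of mono audio
#     ]
#     inputs = extractor(batch, sampling_rate=16000, padding=True, return_tensors="np")
#     # inputs["input_values"].shape == (2, 16000); the attention mask marks the
#     # real samples, so the normalization statistics ignore the padded tail.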