
    sg*                         d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ  ej$                  e      Z G d	 d
e      Zy)z"
Feature extractor class for CLVP
    )ListOptionalUnionN   )mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                   .    e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 d fd	Zdej                  dej                  fdZ		 	 	 	 	 	 	 dd	e
ej                  ee   eej                     eee      f   d
ee   dedee   dee
eef      dee   dee   dee   defdZ xZS )ClvpFeatureExtractora!  
    Constructs a CLVP feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
    Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 22050):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        default_audio_length (`int`, *optional*, defaults to 6):
            The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
            automatically be set to default_audio_length * `self.sampling_rate`.
        hop_length (`int`, *optional*, defaults to 256):
            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 1024):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        mel_norms (`list` of length `feature_size`, *optional*):
            If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
            mel-filter.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether to return the attention mask. If left to the default, it will return the attention mask.

            [What are attention masks?](../glossary#attention-mask)
    input_featuresattention_maskc
           	          t        |   d	||||	d|
 || _        || _        || _        ||z  | _        | j
                  |z  | _        || _        || _        || _	        t        d|dz  z   |dd|dd      | _        y )
N)feature_sizesampling_ratepadding_valuereturn_attention_mask              g     @@slaneyhtk)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   default_audio_length	mel_normsr   mel_filters)selfr   r   r*   r&   r'   r%   r   r+   r   kwargs	__class__s              c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/clvp/feature_extraction_clvp.pyr$   zClvpFeatureExtractor.__init__G   s     	 	
%''"7		

 	
 
$(%5!^^z9*$8!"* EQJ/( '
    waveformreturnc           	      N   t        |t        | j                  d      | j                  | j                  d| j                  d      }t        j                  t        j                  |dd            }| j                  )|t        j                  | j                        dddf   z  }|S )z
        This method first computes the log-mel spectrogram of the provided audio then applies normalization along the
        each mel-filterbank, if `mel_norms` is provided.
        hanng       @N)frame_lengthr&   powerr,   log_melgh㈵>)a_mina_max)
r   r	   r%   r&   r,   nplogclipr+   array)r-   r2   log_specs      r0   _np_extract_fbank_featuresz/ClvpFeatureExtractor._np_extract_fbank_featuresm   s    
 DJJ/((
 66"''($dCD>>%"((4>>":1d7"CCHr1   
max_length
raw_speechr   
truncationpad_to_multiple_ofreturn_tensorsr   paddingc	                 (   |O|| j                   k7  rUt        d| j                  j                   d| j                    d| j                    d| d	      t        j                  d       t        |t        j                        xr t        |j                        dkD  }
|
r&t        |j                        dkD  rt        d	|        |
xs@ t        |t        t        f      xr( t        |d
   t        j                  t        t        f      }|r>|D cg c]2  }t        j                  |gt        j                        j                  4 }}n|s@t        |t        j                        s&t        j                  |t        j                        }nht        |t        j                        rN|j                   t        j                   t        j"                        u r|j%                  t        j                        }|s!t        j                  |g      j                  g}t'        d|i      }|| j(                  | j                   z  n|}| j+                  ||||||      }|j-                  d      j/                  dd
d      }|d
   D cg c]0  }| j1                  |      j%                  t        j                        2 }}t        |d
   t2              r'|D cg c]  }t        j                  |       c}|d<   n||d<   |j5                  |      S c c}w c c}w c c}w )a	  
        `ClvpFeatureExtractor` is used to extract various voice specific properties such as the pitch and tone of the
        voice, speaking speed, and even speaking defects like a lisp or stuttering from a sample voice or `raw_speech`.

        First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
        seconds long and then the log-mel spectrogram is extracted from it.

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask. If left to the default, it will return the attention mask.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            max_length (`int`, *optional*):
                The maximum input length of the inputs.
        z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zIt is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   )dtyper   )rF   rA   rC   rD   r   )r   
ValueErrorr/   __name__loggerwarning
isinstancer;   ndarraylenshapelisttupleasarrayfloat32TrI   float64astyper   r*   padget	transposer@   r   convert_to_tensors)r-   rB   r   rC   rD   rE   r   rF   rA   r.   is_batched_numpy
is_batchedspeechbatched_speechpadded_inputsr   r2   features                     r0   __call__zClvpFeatureExtractor.__call__   s   f $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  NN\
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 Q[\v"**fXRZZ@BB\J\Jz2::$FJbjjAJ
BJJ/J4D4DQSQ[Q[H\4\#**2::6J **j\2445J%'7&DEGQGYT..1C1CC_i
!!1"7 ! 
 '**+;<FFq!QO ZhhiYj
MUD++H5<<RZZH
 
 nQ'.R`.awrzz'/B.aM*+.<M*+//??G ]4

 /bs   7L>5L
L)	P   i"V           i   r   NF)NTNNTrA   N)rK   
__module____qualname____doc__model_input_namesr$   r;   r>   rO   r@   r   r   floatr   intboolstrr   r   rc   __classcell__)r/   s   @r0   r   r   !   s'   !F *+;< #$
L288 

 2 (,,0;?04!-$(k@"**d5k4

3CT$u+EVVWk@  }k@ 	k@
 %SMk@ !sJ!78k@  (~k@ #k@ SMk@ 
k@r1   r   )rj   typingr   r   r   numpyr;   audio_utilsr   r   r	   !feature_extraction_sequence_utilsr
   feature_extraction_utilsr   utilsr   r   
get_loggerrK   rL   r   r"   r1   r0   <module>rx      sI     ) (  H H I 4 ( 
		H	%M@3 M@r1   