
    sgE                         d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZ  ej.                  e      Z G d	 d
e      Zy)z%Feature extractor class for SpeechT5.    N)AnyDictListOptionalUnion   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                        e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dededededed	ed
edededededededef fdZ	e
	 d$deej                     deej                     dedeej                     fd       Zdej                  dej                  fdZ	 	 	 	 	 	 	 	 	 d%deeej                  ee   eej                     eee      f      deeej                  ee   eej                     eee      f      deeeef   dee   dedee   dee   deeeef      dee   defdZ	 	 	 	 	 	 	 d&deej                  ee   eej                     eee      f   d edeeeef   dee   dedee   dee   deeeef      defd!Zdeeef   f fd"Z xZS )'SpeechT5FeatureExtractora
  
    Constructs a SpeechT5 feature extractor.

    This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
    the SpeechT5 speech encoder prenet.

    This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
    prenet.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 80):
            The number of mel-frequency bins in the extracted spectrogram features.
        hop_length (`int`, *optional*, defaults to 16):
            Number of ms between windows. Otherwise referred to as "shift" in many papers.
        win_length (`int`, *optional*, defaults to 64):
            Number of ms per window.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        frame_signal_scale (`float`, *optional*, defaults to 1.0):
            Constant multiplied in creating the frames before applying DFT. This argument is deprecated.
        fmin (`float`, *optional*, defaults to 80):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*, defaults to 7600):
            Maximum mel frequency in Hz.
        mel_floor (`float`, *optional*, defaults to 1e-10):
            Minimum value of mel frequency banks.
        reduction_factor (`int`, *optional*, defaults to 2):
            Spectrogram length reduction factor. This argument is deprecated.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
    input_valuesattention_maskfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionframe_signal_scalefminfmax	mel_floorreduction_factorreturn_attention_maskc           	         t        |   d|||d| || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        ||z  dz  | _        ||z  dz  | _        t        | j                        | _        | j                   dz  dz   | _        t%        | j                  | j                  d      | _        t)        | j"                  | j                  | j                  | j                  | j*                  dd      | _        |	d	k7  rt/        j0                  d
t2               |dk7  rt/        j0                  dt2               y y )N)r   r   r   i        T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale      ?zeThe argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformersg       @zcThe argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers )super__init__r   r#   r   r   r   r   r   r   r    r!   r"   sample_sizesample_strider
   n_fftn_freqsr   windowr	   r   mel_filterswarningswarnFutureWarning)selfr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   kwargs	__class__s                   k/var/www/html/venv/lib/python3.12/site-packages/transformers/models/speecht5/feature_extraction_speecht5.pyr4   z!SpeechT5FeatureExtractor.__init__N   sQ   $ 	wl-_lwpvw(%:"($$("4		" 0%5='-74?'(8(89


a1,%D4D4D4K\K\gkl*#|| --)))),,
 $MMw s"MMu #    returnc                    |t        j                  |t         j                        }g }t        | |j	                  d            D ]m  \  }}||d| j                         z
  t        j                  |d| j                         dz         z  }||j                  d   k  r|||d |j                  |       o |S | D cg c]<  }||j                         z
  t        j                  |j                         dz         z  > }}|S c c}w )z[
        Every array in the list is normalized to have zero mean and unit variance
        NgHz>r   )
nparrayint32zipsummeansqrtvarshapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slicexs           rA   zero_mean_unit_var_normz0SpeechT5FeatureExtractor.zero_mean_unit_var_norm   s    %XXnbhh?N"$"%lN4F4Fr4J"K 9 &)=)=)? ?2776RYSY?K^K^K`cgKgChhL..q11,9L)#**<89 #" Vb"bPQALBGGAEEGdN4K#K"b"b"" #cs   :AC?one_waveformc           
          t        || j                  | j                  | j                  | j                  | j
                  | j                  d      }|j                  S )zZ
        Extracts log-mel filterbank features for one waveform array (unbatched).
        log10)r9   frame_lengthr   
fft_lengthr:   r!   log_mel)r   r9   r5   r6   r7   r:   r!   T)r>   rV   log_mel_specs      rA   _extract_mel_featuresz.SpeechT5FeatureExtractor._extract_mel_features   sP     #;;))))zz((nn	
 ~~rB   audioaudio_targetpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc
                    ||t        d      |	;|	| j                  k7  rAt        d|  d| j                   d| j                   d|	 d	      t        j                  d       | | j                  |d	||||||fi |
}nd}|> | j                  |d
||||||fi |
}||S |d   |d<   |j                  d      }|||d<   |S )aA  
        Main method to featurize and prepare for the model one or several sequence(s).

        Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
        spectrogram features.

        Args:
            audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
                be mono channel audio, not stereo, i.e. single float per timestep.
            audio_target (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
                list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
                spectrogram features.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
                to pass `sampling_rate` at the forward call to prevent silent errors.
        Nz9You must provide either `audio` or `audio_target` values.z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zIt is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.FTr   labelsr   decoder_attention_mask)
ValueErrorr   loggerwarning_process_audioget)r>   r_   r`   ra   rb   rc   rd   r#   re   r   r?   inputsinputs_targetri   s                 rA   __call__z!SpeechT5FeatureExtractor.__call__   sI   ~ =\1XYY$ 2 22 I$ P**+ ,**+9]O1F  NN\
 (T(("%
 
F F#/D//"%
 
M ~$$#0#@x )6):):;K)L&)57MF34rB   speech	is_targetc	           	      Z   t        |t        j                        xr t        |j                        dkD  }
|
r&t        |j                        dkD  rt        d|        |
xs@ t        |t        t        f      xr( t        |d   t        j                  t        t        f      }|r4|D cg c]'  }t        j                  |t        j                        ) c}}n|s@t        |t        j                        s&t        j                  |t        j                        }nht        |t        j                        rN|j                  t        j                  t        j                        u r|j                  t        j                        }|s|g}| j                  }|r=|D cg c]  }| j                  |       }}t        d|i      }| j                   | _        nt        d|i      } | j"                  |f|||||d|	}|| _        |d   }t        |d   t        j                        s8|D cg c]'  }t        j                  |t        j                        ) c}|d<   nt        |t        j                        st        |d   t        j                        rc|d   j                  t        j                  t        j                        u r1|D cg c]!  }|j                  t        j                        # c}|d<   nkt        |t        j                        rQ|j                  t        j                  t        j                        u r"|j                  t        j                        |d<   |j%                  d      }|6|D cg c]'  }t        j                  |t        j&                        ) c}|d<   |sW| j(                  rK| j+                  ||	      t,        j.                  ur|nd }| j1                  |d   || j2                  
      |d<   ||j5                  |      }|S c c}w c c}w c c}w c c}w c c}w )Nr&   r%   z2Only mono-channel audio is supported for input to r   )dtyper   )ra   rb   rc   rd   r#   r   )rb   )r   r   )
isinstancerF   ndarraylenrN   rj   listtupleasarrayfloat32ru   float64astyper   r^   r   r   padrn   rH   r   _get_padding_strategiesr   
DO_NOT_PADrU   r   convert_to_tensors)r>   rr   rs   ra   rb   rc   rd   r#   re   r?   is_batched_numpy
is_batchedfeature_size_hackwaveformfeaturesencoded_inputspadded_inputsr   rG   r   s                       rA   rm   z'SpeechT5FeatureExtractor._process_audio)  s|    &fbjj9Sc&,,>ORS>SFLL 1A 5QRVQWXYY% 
ve}-d:fQi"**V[]aIb3c 	 IOPvbjjrzz:PFJvrzz$BZZbjj9F

+@T0T]]2::.F XF !-- MST228<THT)>8*DEN $ 1 1D)>6*BCN 
!!1"7
 
 . %^4,q/2::6^j,kUZRZZRZZ-P,kM.)<4<?BJJ7Q%%"**)==S_,`%U\\"**-E,`M.)bjj1l6H6HBHHUWU_U_L`6`,8,?,?

,KM.) '**+;<%^l.mUZrzz%rxx/P.mM*+ T.. //J/OWfWqWqq  
 -1,H,Hn-n\`\n\n -I -M.) %)<<^LMC Q U* -l -a /ns   ',PP$,P&P#4,P(c                 J    t         |          }g d}|D ]
  }||v s||=  |S )N)r9   r:   r5   r6   r7   r8   )r3   to_dict)r>   outputnamesr(   r@   s       rA   r   z SpeechT5FeatureExtractor.to_dict  s;    " ^ 	!Dv~4L	! rB   )r&   i>          FP      @   hann_windowr1   r   i  g|=r%   T)r   )	NNFNFNNNN)FFNFNNN)__name__
__module____qualname____doc__model_input_namesintfloatboolstrr4   staticmethodr   rF   rw   rU   r^   r   r   r   r   r   rq   rm   r   r   r   __classcell__)r@   s   @rA   r   r      sI   *X ()9: """)$'  !&*:: : 	:
 : : : : : ": : : : :  $:x  be#2::&#8<RZZ8H#Y^#	bjj	# #*jj 
* `dfj5:$( ,004;?'+sbjj$u+tBJJ7GdSXkIZZ[\s uRZZed2::>NPTUYZ_U`Pa%abcs tS/12	s
 SMs s %SMs  (~s !sJ!78s  }s 
sp  5:$( ,004;?Ubjj$u+tBJJ/?d5kARRSU U tS/12	U
 SMU U %SMU  (~U !sJ!78U 
Un	c3h 	 	rB   r   )r   r;   typingr   r   r   r   r   numpyrF   audio_utilsr	   r
   r   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerr   rk   r   r2   rB   rA   <module>r      sK    ,  3 3  \ \ I 4 9 9 
		H	%j7 jrB   