
    sg%Y                         d Z ddlmZmZmZmZmZ ddlZddl	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZmZmZ  ej,                  e      Z G d	 d
e      Zy)z)Feature extractor class for UnivNetModel.    )AnyDictListOptionalUnionN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc            )           e Zd ZdZg dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d-dedededededed	ed
ede	e   dedede	e   dedededededededef( fdZ
d Zd Zdej                  dej                  fdZ	 d.dede	ej                   j"                     dej                  fdZd.deej                     fd Z	 	 	 	 	 	 	 	 	 	 	 	 d/d!eej                  ee   eej                     eee      f   de	e   d"eeeef   d#e	e   d$ed%e	e   d&ede	ej                   j"                     d'ed(e	e   de	e   d)e	e   d*e	eeef      defd+Zdeeef   f fd,Z xZS )0UnivNetFeatureExtractora  
    Constructs a UnivNet feature extractor.

    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
    STFT implementation follows that of TacoTron 2 and Hifi-GAN.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value to pad with when applying the padding strategy defined by the `padding` argument to
            [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
            `__call__` will also use this padding value.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
            performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 100):
            The number of mel-frequency bins in the extracted spectrogram features. This should match
            `UnivNetModel.config.num_mel_bins`.
        hop_length (`int`, *optional*, defaults to 256):
            The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
            that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
            the `hop_length` in ms.
        win_length (`int`, *optional*, defaults to 1024):
            The direct number of samples for each sliding window. Note that this is different from other audio feature
            extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        filter_length (`int`, *optional*, defaults to 1024):
            The number of FFT components to use. If `None`, this is determined using
            `transformers.audio_utils.optimal_fft_length`.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input lenght of the model in seconds. This is used to pad the audio.
        fmin (`float`, *optional*, defaults to 0.0):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*):
            Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
        mel_floor (`float`, *optional*, defaults to 1e-09):
            Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
            different than in [`transformers.audio_utils.spectrogram`].
        center (`bool`, *optional*, defaults to `False`):
            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
            `t` will start at time `t * hop_length`.
        compression_factor (`float`, *optional*, defaults to 1.0):
            The multiplicative compression factor for dynamic range compression during spectral normalization.
        compression_clip_val (`float`, *optional*, defaults to 1e-05):
            The clip value applied to the waveform before applying dynamic range compression during spectral
            normalization.
        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
            The min value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
            The max value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        model_in_channels (`int`, *optional*, defaults to 64):
            The number of input channels to the [`UnivNetModel`] model. This should match
            `UnivNetModel.config.model_in_channels`.
        pad_end_length (`int`, *optional*, defaults to 10):
            If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
            number of appended samples will be `pad_end_length * hop_length`.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
    )input_featuresnoise_sequencepadding_maskfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfilter_lengthmax_length_sfminfmax	mel_floorcentercompression_factorcompression_clip_valnormalize_minnormalize_maxmodel_in_channelspad_end_lengthc           	         t        |   d||||d| || _        || _        || _        || _        || _        |	| _        || _        |t        |      dz  }|| _
        || _        |
| _        |
|z  | _        | j                  t        | j
                        | _        n| j                  | _        | j                  dz  dz   | _        t#        | j
                  | j                  d      | _        t'        | j                   | j                  | j                  | j                  | j(                  dd      | _        || _        || _        || _        || _        || _        || _        || _        y )	N)r   r   r   return_attention_mask      T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__r   r   r   r   r   r   r!   floatr"   r#   r    num_max_samplesr
   n_fftn_freqsr   windowr	   r   mel_filtersr$   r%   r&   r'   r(   r)   r*   )selfr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   kwargs	__class__s                          i/var/www/html/venv/lib/python3.12/site-packages/transformers/models/univnet/feature_extraction_univnet.pyr;   z UnivNetFeatureExtractor.__init__e   sf   2 	 	
%''"7		

 	
 )($$(*	<'!+D	"(+m;%+DOO<DJ++DJ

a1,%DOO$J[J[fjk*#|| --)))),,
 "4$8!**!2,    c                 `    d|| j                   z
  | j                  | j                   z
  z  z  dz
  S )Nr-   r.   r'   r(   rB   r   s     rE   	normalizez!UnivNetFeatureExtractor.normalize   s4    [4#5#55$:L:LtOaOa:abcfgggrF   c                 `    | j                   | j                  | j                   z
  |dz   dz  z  z   S )Nr.   r-   rH   rI   s     rE   denormalizez#UnivNetFeatureExtractor.denormalize   s6    !!T%7%7$:L:L%LR]`aRaefQf$gggrF   waveformreturnc                    t        j                  |t        | j                  | j                  z
  dz        t        | j                  | j                  z
  dz        fd      }t        || j                  | j                  | j                  | j                  d| j                  dd	      }t        j                  t        j                  |      dz  t        j                  |      dz  z   | j                  z         }t        j                  | j                  j                  |      }t        j                  t        j                   || j"                  d      | j$                  z        }|j                  S )a  
        Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
        `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.

        Args:
            waveform (`np.ndarray` of shape `(length,)`):
                The input waveform. This must be a single real-valued, mono waveform.

        Returns:
            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
        r-   reflect)modeN)r@   frame_lengthr   
fft_lengthpowerr$   rA   r#   )a_mina_max)nppadintr>   r   r   r@   r$   sqrtrealimagr#   matmulrA   Tlogclipr&   r%   )rB   rM   complex_spectrogramamplitude_spectrogrammel_spectrogramlog_mel_spectrograms         rE   rc   z'UnivNetFeatureExtractor.mel_spectrogram   s/    66$**t.!34c4::;W[\:\6]^
 *;;zz;;

 !#GG'(A-8K0LPQ0QQTXTbTbb!
 ))D$4$4$6$68MN !ffGGO4+D+DDQTXTkTkk

 #$$$rF   noise_length	generatorc                     |t         j                  j                         }|| j                  f}|j	                  |t         j
                        }|S )a  
        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
        [`UnivNetModel.forward`].

        Args:
            spectrogram_length (`int`):
                The length (dim 0) of the generated noise.
            model_in_channels (`int`, *optional*, defaults to `None`):
                The number of features (dim 1) of the generated noise. This should correspond to the
                `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
                `self.config.model_in_channels`.
            generator (`numpy.random.Generator`, *optional*, defaults to `None`)
                An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
                new generator with fresh entropy will be created.

        Returns:
            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
            model_in_channels)`.
        dtype)rW   randomdefault_rngr)   standard_normalfloat32)rB   re   rf   noise_shapenoises        rE   generate_noisez&UnivNetFeatureExtractor.generate_noise   sJ    0 		--/I#T%;%;<))+RZZ)HrF   c                     |D cg c]<  }|j                         j                         j                         j                         > }}|#t	        |      D cg c]  \  }}|d||     }}}|S c c}w c c}}w )a  
        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
        lengths after removing padding.

        Args:
            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                The batched output waveforms from the [`UnivNetModel`].
            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
                The batched lengths of each waveform before padding.

        Returns:
            `List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
        N)detachclonecpunumpy	enumerate)rB   	waveformswaveform_lengthsrM   is        rE   batch_decodez$UnivNetFeatureExtractor.batch_decode  s~      NWWX__&,,.224::<W	W'LUV_L`a[Q"7$4Q$78aIa X bs   AA/A4
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_noisepad_end
pad_lengthr,   return_tensorsc                 X   ||n| j                   }|O|| j                  k7  rUt        d| j                  j                   d| j                   d| j                   d| d	      t
        j                  d       t        |t        j                        xr t        |j                        dkD  }|r&t        |j                        dkD  rt        d	|        |xs@ t        |t        t        f      xr( t        |d
   t        j                  t        t        f      }|r3|D cg c]'  }t        j                  |t        j                        ) }}n|s@t        |t        j                        s&t        j                  |t        j                        }nht        |t        j                        rN|j                   t        j                   t        j"                        u r|j%                  t        j                        }|s&t        j                  |t        j                        g}|	rN|
|
n| j&                  }
|D cg c]3  }t        j(                  |d
|
| j*                  z  f| j,                        5 }}t/        d|i      }| j)                  ||||n| j0                  |||      }|j3                  d      }|D cg c]  }| j5                  |       }}t        |d
   t6              r7|D cg c]'  }t        j                  |t        j                        ) c}|d<   n0|D cg c]!  }|j%                  t        j                        # c}|d<   |j3                  d      }|6|D cg c]'  }t        j                  |t        j8                        ) c}|d<   |r4|d   D cg c]!  }| j;                  |j                  d
   |      # }}||d<   |r%|d   D cg c]  }| j=                  |       c}|d<   ||j?                  |      }|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w c c}w )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
                padding index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

                If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_noise (`bool`, *optional*, defaults to `True`):
                Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
                An optional `numpy.random.Generator` random number generator to use when generating noise.
            pad_end (`bool`, *optional*, defaults to `False`):
                Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
                generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
                padding will be done before the padding strategy specified in `padding` is performed.
            pad_length (`int`, *optional*, defaults to `None`):
                If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
                will default to `self.config.pad_end_length`.
            do_normalize (`bool`, *optional*):
                Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
                the performance for some models. If not set, this will default to `self.config.do_normalize`.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.np.array` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zIt is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.r.   r-   z2Only mono-channel audio is supported for input to r   rh   )constant_valuesr   )r|   r}   r~   r   r,   attention_maskr   r   ) r   r   
ValueErrorrD   __name__loggerwarning
isinstancerW   ndarraylenshapelisttupleasarrayrm   ri   float64astyper*   rX   r   r   r   r=   getrc   r   int32rp   rJ   convert_to_tensors)rB   r{   r   r|   r}   r~   r   r   rf   r   r   r   r,   r   is_batched_numpy
is_batchedspeechrM   batched_speechpadded_inputsr   mel_spectrogramsmelr   arrayr   ro   s                              rE   __call__z UnivNetFeatureExtractor.__call__  s   X (4'?|TEVEV$ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  NN\
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 MWX6"**V2::>XJXJz2::$FJbjjAJ
BJJ/J4D4DQSQ[Q[H\4\#**2::6J **ZrzzBCJ '1'=4CVCVJ !+ x!Z$//%A!BTXTfTfgJ 
 &'7&DE%/%;zAUAU!1"7 ! 
 '**+;<KYZxD00:ZZnQ'.]m/nVY

3bjj0Q/nN+,Rb/c3

2::0F/cN+, '**+;<%]k-lTYbjjbhh.O-lN>* $22B#C ##K$5$5a$8)DE  05N+,?MN^?_00;{+0N+, %+>>~NNy Y( [ 0o/c
 .m0s0   ,P48P	5P&,P&P ,P&P"P'c                 J    t         |          }g d}|D ]
  }||v s||=  |S )N)r@   rA   r>   r?   r=   )r:   to_dict)rB   outputnamesr0   rD   s       rE   r   zUnivNetFeatureExtractor.to_dict  s;    " Q 	!Dv~4L	! rF   )r.   i]          Fd         hann_windowr   
   r   Ng&.>Fg      ?gh㈵>g    'g    ă@@   r   T)N)NTNTNTNFNNNN)r   
__module____qualname____doc__model_input_namesrY   r<   boolstrr   r;   rJ   rL   rW   r   rc   rj   	Generatorrp   r   rz   r   r   r   r   r   r   r   r   __classcell__)rD   s   @rE   r   r      s   CJ M """)'+ $$'&*21!# "-J-J- J- 	J-
 J- J- J- J- J-  }J- J- J- uoJ- J- J-  "!J-" $#J-$ %J-& 'J-( )J-* +J-Xhh.%

 .%rzz .%f 48 BII//0 
	@RZZ@P 4 (,59$(,0!37$(&*04;?_"**d5k4

3CT$u+EVVW_  }_ tS/12	_
 SM_ _ %SM_ _ BII//0_ _ SM_ sm_  (~_ !sJ!78_ 
_B	c3h 	 	rF   r   )r   typingr   r   r   r   r   ru   rW   audio_utilsr	   r
   r   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerr   r   r   r9   rF   rE   <module>r      sH    0 3 3  \ \ I 4 9 9 
		H	%k6 krF   