
"""
Feature extractor class for Whisper
"""

from typing import List, Optional, Union

import numpy as np

from ... import is_torch_available
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class WhisperFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the Short-Time
    Fourier Transform (STFT), which should match PyTorch's `torch.stft`.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Hop length (in samples) between consecutive overlapping STFT windows used to obtain the mel-frequency
            coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
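
    Example (a minimal usage sketch; the waveform below is synthetic silence, standing in for any 16 kHz mono float
    array):

    ```python
    >>> import numpy as np

    >>> feature_extractor = WhisperFeatureExtractor()
    >>> waveform = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
    >>> features = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
    >>> features["input_features"].shape  # padded to 30 seconds: 80 mel bins over 3000 frames
    (1, 80, 3000)
    ```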
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
        padding_value=0.0,
        return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.sampling_rate = sampling_rate
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=1 + n_fft // 2,
            num_mel_filters=feature_size,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

    def _np_extract_fbank_features(self, waveform_batch: np.array, device: str) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        """
        if device != "cpu":
            raise ValueError(
                f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator "
                "devices requires torch, which is not installed. Either set `device='cpu'`, or install torch "
                "according to the official instructions: https://pytorch.org/get-started/locally/"
            )
        log_spec_batch = []
        for waveform in waveform_batch:
            log_spec = spectrogram(
                waveform,
                window_function(self.n_fft, "hann"),
                frame_length=self.n_fft,
                hop_length=self.hop_length,
                power=2.0,
                mel_filters=self.mel_filters,
                log_mel="log10",
            )
            log_spec = log_spec[:, :-1]
            log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
            log_spec = (log_spec + 4.0) / 4.0
            log_spec_batch.append(log_spec)
        return np.array(log_spec_batch)

    def _torch_extract_fbank_features(self, waveform: np.array, device: str = "cpu") -> np.ndarray:
        """
        Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
        yielding results similar to cpu computing with 1e-5 tolerance.
        """
        waveform = torch.from_numpy(waveform).type(torch.float32)

        window = torch.hann_window(self.n_fft)
        if device != "cpu":
            waveform = waveform.to(device)
            window = window.to(device)
        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
        magnitudes = stft[..., :-1].abs() ** 2

        mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
        if device != "cpu":
            mel_filters = mel_filters.to(device)
        mel_spec = mel_filters.T @ magnitudes

        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
        if waveform.dim() == 2:
            max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
            log_spec = torch.maximum(log_spec, max_val - 8.0)
        else:
            log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0
        if device != "cpu":
            log_spec = log_spec.detach().cpu()
        return log_spec.numpy()

    @staticmethod
    # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
    ) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance
        """
        if attention_mask is not None:
            attention_mask = np.array(attention_mask, np.int32)
            normed_input_values = []

            for vector, length in zip(input_values, attention_mask.sum(-1)):
                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
                if length < normed_slice.shape[0]:
                    normed_slice[length:] = padding_value

                normed_input_values.append(normed_slice)
        else:
            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        return normed_input_values

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        padding: Optional[str] = "max_length",
        max_length: Optional[int] = None,
        sampling_rate: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        device: Optional[str] = "cpu",
        return_token_timestamps: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of lists of Python floats. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and to allow the automatic speech
                recognition pipeline to work correctly.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
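
        Example (a minimal sketch of batched extraction; the two waveforms are synthetic stand-ins for real 16 kHz
        mono recordings):

        ```python
        >>> import numpy as np

        >>> feature_extractor = WhisperFeatureExtractor()
        >>> batch = [np.zeros(8000, dtype=np.float32), np.zeros(24000, dtype=np.float32)]  # 0.5 s and 1.5 s clips
        >>> inputs = feature_extractor(batch, sampling_rate=16000, return_attention_mask=True, return_tensors="np")
        >>> inputs["input_features"].shape  # every clip is padded or truncated to 30 s of audio, i.e. 3000 frames
        (2, 80, 3000)
        >>> inputs["attention_mask"].shape  # one entry per frame after rescaling by `hop_length`
        (2, 3000)
        ```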
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
        if is_batched_numpy and len(raw_speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech, dtype=np.float32)
        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
            raw_speech = raw_speech.astype(np.float32)

        # always return a batch
        if not is_batched:
            raw_speech = [np.asarray([raw_speech]).T]

        batched_speech = BatchFeature({"input_features": raw_speech})

        # convert into the correct format for padding
        padded_inputs = self.pad(
            batched_speech,
            padding=padding,
            max_length=max_length if max_length else self.n_samples,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask or do_normalize,
        )

        # zero-mean and unit-variance normalization
        if do_normalize:
            padded_inputs["input_features"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_features"],
                attention_mask=padded_inputs["attention_mask"],
                padding_value=self.padding_value,
            )
            padded_inputs["input_features"] = np.stack(padded_inputs["input_features"], axis=0)

        # make sure the list is in array format
        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)

        extract_fbank_features = (
            self._torch_extract_fbank_features if is_torch_available() else self._np_extract_fbank_features
        )
        input_features = extract_fbank_features(input_features[0], device)

        if isinstance(input_features[0], List):
            padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
        else:
            padded_inputs["input_features"] = input_features

        if return_attention_mask:
            # rescale the attention mask from sample resolution to frame resolution
            padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length]

        if return_token_timestamps is not None:
            padded_inputs["num_frames"] = [len(raw_speech_i) // self.hop_length for raw_speech_i in raw_speech]

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs