
"""Feature extractor class for Pop2Piano"""

import warnings
from typing import List, Optional, Union

import numpy
import numpy as np

from ...audio_utils import mel_filter_bank, spectrogram
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import (
    TensorType,
    is_essentia_available,
    is_librosa_available,
    is_scipy_available,
    logging,
    requires_backends,
)


if is_essentia_available():
    import essentia
    import essentia.standard

if is_librosa_available():
    import librosa

if is_scipy_available():
    import scipy


logger = logging.get_logger(__name__)


class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Pop2Piano feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed
    to the `RhythmExtractor2013` algorithm, which extracts beat_times and beat positions and estimates their
    confidence as well as the tempo in bpm. beat_times is then interpolated to get beatsteps, from which
    extrapolated_beatstep is computed for use in the tokenizer. In parallel, the audio is resampled to
    self.sampling_rate and preprocessed, and a log-mel spectrogram is then computed from it to be used in the
    transformer model (see the usage sketch under *Example* below).

    Args:
        sampling_rate (`int`, *optional*, defaults to 22050):
            Target sampling rate of the audio signal. This is the sampling rate that is forwarded to the model.
        padding_value (`int`, *optional*, defaults to 0):
            Padding value used to pad the audio. Should correspond to silences.
        window_size (`int`, *optional*, defaults to 4096):
            Length of the window in samples to which the Fourier transform is applied.
        hop_length (`int`, *optional*, defaults to 1024):
            Step size between each window of the waveform, in samples.
        min_frequency (`float`, *optional*, defaults to 10.0):
            Lowest frequency that will be used in the log-mel spectrogram.
        feature_size (`int`, *optional*, defaults to 512):
            The feature dimension of the extracted features.
        num_bars (`int`, *optional*, defaults to 2):
            Determines interval between each sequence.
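
    Example (a minimal usage sketch rather than a canonical recipe; the audio file below is a placeholder, and
    `essentia`, `librosa` and `scipy` must be installed):

    ```python
    import librosa

    from transformers import Pop2PianoFeatureExtractor

    feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
    raw_audio, sr = librosa.load("song.wav", sr=None)  # "song.wav" is a hypothetical input file
    inputs = feature_extractor(audio=raw_audio, sampling_rate=sr, return_tensors="pt")
    ```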
    """

    model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"]

    def __init__(
        self,
        sampling_rate: int = 22050,
        padding_value: int = 0,
        window_size: int = 4096,
        hop_length: int = 1024,
        min_frequency: float = 10.0,
        feature_size: int = 512,
        num_bars: int = 2,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value
        self.window_size = window_size
        self.hop_length = hop_length
        self.min_frequency = min_frequency
        self.feature_size = feature_size
        self.num_bars = num_bars
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=(self.window_size // 2) + 1,
            num_mel_filters=self.feature_size,
            min_frequency=self.min_frequency,
            max_frequency=float(self.sampling_rate // 2),
            sampling_rate=self.sampling_rate,
            norm=None,
            mel_scale="htk",
        )

    def mel_spectrogram(self, sequence: np.ndarray):
        """
        Generates MelSpectrogram.

        Args:
            sequence (`numpy.ndarray`):
                The sequence of which the mel-spectrogram will be computed.
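
        A minimal shape sketch (the silent input is purely illustrative): each row of `sequence` is converted
        independently, so an input of shape `(num_sequences, num_samples)` yields an output of shape
        `(num_sequences, feature_size, num_frames)`, where `num_frames` follows from `window_size` and
        `hop_length` (with a centered STFT, roughly `1 + num_samples // hop_length`).

        ```python
        import numpy as np

        fe = Pop2PianoFeatureExtractor()
        specs = fe.mel_spectrogram(np.zeros((2, 22050), dtype=np.float32))
        # specs.shape == (2, 512, num_frames) with the default feature_size of 512
        ```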
        r    Ng       @)waveformwindowframe_lengthr   powerr+   )nphanningr   appendr   r   r+   array)r,   r1   	mel_specsseqr5   s        r/   mel_spectrogramz)Pop2PianoFeatureExtractor.mel_spectrogramt   s     	 	CZZ 0 01 45cr:F !!%!1!1# $ 0 0		 HHY'	r0   audioc                     t        | dg       t        j                  j                  d      } ||      \  }}}}}|||||fS )a  
        This algorithm(`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as
        tempo in bpm for an audio signal. For more information please visit
        https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .

        Args:
            audio(`numpy.ndarray`):
                raw audio waveform which is passed to the Rhythm Extractor.
        essentiamultifeature)method)r   rA   standardRhythmExtractor2013)r,   r?   essentia_trackerbpm
beat_times
confidence	estimatesessentia_beat_intervalss           r/   extract_rhythmz(Pop2PianoFeatureExtractor.extract_rhythm   sS     	$-#,,@@@WJZ[`JaGZY0GJ
I7NNNr0   rH   steps_per_beatn_extendc                    t        | dg       t        j                  j                  t	        j
                  |j                        |dd      } |t	        j                  d|j                  |z   dz
  |j                  |z  |z               }|S )a  
        This method takes beat_times and interpolates them using `scipy.interpolate.interp1d`; the output is then
        used to convert the raw audio to a log-mel spectrogram.

        Args:
            beat_times (`numpy.ndarray`):
                beat_times is passed into `scipy.interpolate.interp1d` for processing.
            steps_per_beat (`int`):
                Used as a parameter to control the interpolation.
            n_extend (`int`):
                Used as a parameter to control the interpolation.
        """
        requires_backends(self, ["scipy"])
        beat_times_function = scipy.interpolate.interp1d(
            np.arange(beat_times.size),
            beat_times,
            bounds_error=False,
            fill_value="extrapolate",
        )

        ext_beats = beat_times_function(
            np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend)
        )

        return ext_beats

    def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
        """
        Preprocessing for log-mel-spectrogram

        Args:
            audio (`numpy.ndarray` of shape `(audio_length, )` ):
                Raw audio waveform to be processed.
            beatstep (`numpy.ndarray`):
                Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
                the value at beatstep[0].
        """
        if audio is not None and len(audio.shape) != 1:
            raise ValueError(
                f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
            )
        if beatstep[0] > 0.0:
            beatstep = beatstep - beatstep[0]

        num_steps = self.num_bars * 4
        num_target_steps = len(beatstep)
        extrapolated_beatstep = self.interpolate_beat_times(
            beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
        )

        # slice the audio into `num_steps`-beat windows and pad each slice to the longest one
        sample_indices = []
        max_feature_length = 0
        for i in range(0, num_target_steps, num_steps):
            start_idx = i
            end_idx = min(i + num_steps, num_target_steps)
            start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate)
            end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate)
            sample_indices.append((start_sample, end_sample))
            max_feature_length = max(max_feature_length, end_sample - start_sample)
        padded_batch = []
        for start_sample, end_sample in sample_indices:
            feature = audio[start_sample:end_sample]
            padded_feature = np.pad(
                feature,
                ((0, max_feature_length - feature.shape[0]),),
                "constant",
                constant_values=0,
            )
            padded_batch.append(padded_feature)

        padded_batch = np.asarray(padded_batch)

        return padded_batch, extrapolated_beatstep

    def _pad(self, features: np.ndarray, add_zero_line=True):
        features_shapes = [each_feature.shape for each_feature in features]
        attention_masks, padded_features = [], []
        for i, each_feature in enumerate(features):
            # To pad "input_features", which is 3D: pad along the frame axis (dim 1)
            if len(each_feature.shape) == 3:
                features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1]
                attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64)
                feature_padding = ((0, 0), (0, features_pad_value), (0, 0))
                attention_mask_padding = (feature_padding[0], feature_padding[1])

            # To pad "beatsteps" and "extrapolated_beatstep", which are 1D
            else:
                each_feature = each_feature.reshape(1, -1)
                features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0]
                attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1)
                feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value))

            each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value)
            attention_mask = np.pad(
                attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value
            )

            if add_zero_line:
                # if it is batched then we separate each example using a zero array line
                zero_array_len = max([*zip(*features_shapes)][1])

                each_padded_feature = np.concatenate(
                    [each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0
                )
                attention_mask = np.concatenate(
                    [attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0
                )

            padded_features.append(each_padded_feature)
            attention_masks.append(attention_mask)

        padded_features = np.concatenate(padded_features, axis=0).astype(np.float32)
        attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64)

        return padded_features, attention_masks

    def pad(
        self,
        inputs: BatchFeature,
        is_batched: bool,
        return_attention_mask: bool,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ):
        """
        Pads the inputs to the same length and returns attention_mask.

        Args:
            inputs (`BatchFeature`):
                Processed audio features.
            is_batched (`bool`):
                Whether inputs are batched or not.
            return_attention_mask (`bool`):
                Whether to return attention mask or not.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
                If nothing is specified, it will return list of `np.ndarray` arrays.
        Return:
            `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added
            to it:
            - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` --
                Example:
                    1, 1, 1, 0, 0 (audio 1, also here it is padded to the max length of 5, that's why there are 2
                    zeros at the end indicating they are padded)

                    0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)

                    1, 1, 1, 1, 1 (audio 2)

                    0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)

                    1, 1, 1, 1, 1 (audio 3)
            - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
            - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size,
              max_extrapolated_beatstep_seq_length)`
        """
        processed_features_dict = {}
        for feature_name, feature_value in inputs.items():
            if feature_name == "input_features":
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict["attention_mask"] = attention_mask
            else:
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask

        # If only one example is being processed, remove the trailing zero array line since it is not needed
        if not is_batched and not return_attention_mask:
            processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]

        outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors)

        return outputs

    def __call__(
        self,
        audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        sampling_rate: Union[int, List[int]],
        steps_per_beat: int = 2,
        resample: Optional[bool] = True,
        return_attention_mask: Optional[bool] = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model.

        Args:
            audio (`np.ndarray`, `List`):
                The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a
                list of numpy arrays or a list of list of float values.
            sampling_rate (`int`):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            steps_per_beat (`int`, *optional*, defaults to 2):
                This is used in interpolating `beat_times`.
            resample (`bool`, *optional*, defaults to `True`):
                Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True
                during inference.
            return_attention_mask (`bool`, *optional*, defaults to `False`):
                Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as
                output or not. Automatically set to True for batched inputs.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
                If nothing is specified, it will return list of `np.ndarray` arrays.
        """
        requires_backends(self, ["librosa"])
        is_batched = bool(isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list)))
        if is_batched:
            # This enables the user to process files of different sampling rates at the same time
            if not isinstance(sampling_rate, list):
                raise ValueError(
                    "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at the same time. "
                    f"Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]."
                )
            return_attention_mask = True if return_attention_mask is None else return_attention_mask
        else:
            audio = [audio]
            sampling_rate = [sampling_rate]
            return_attention_mask = False if return_attention_mask is None else return_attention_mask

        batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], []
        for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate):
            bpm, beat_times, confidence, estimates, essentia_beat_intervals = self.extract_rhythm(
                audio=single_raw_audio
            )
            beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1)

            if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None:
                if resample:
                    # Change sampling_rate to self.sampling_rate
                    single_raw_audio = librosa.core.resample(
                        single_raw_audio,
                        orig_sr=single_sampling_rate,
                        target_sr=self.sampling_rate,
                        res_type="kaiser_best",
                    )
                else:
                    warnings.warn(
                        f"The sampling_rate of the provided audio is different from the target sampling_rate "
                        f"of the Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. "
                        f"In these cases it is recommended to use `resample=True` in the `__call__` method to "
                        f"get the optimal behaviour."
                    )

            single_sampling_rate = self.sampling_rate
            start_sample = int(beatsteps[0] * single_sampling_rate)
            end_sample = int(beatsteps[-1] * single_sampling_rate)

            input_features, extrapolated_beatstep = self.preprocess_mel(
                single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0]
            )

            mel_specs = self.mel_spectrogram(input_features.astype(np.float32))

            # apply np.log to get log mel-spectrograms
            log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None))

            input_features = np.transpose(log_mel_specs, (0, -1, -2))

            batch_input_features.append(input_features)
            batch_beatsteps.append(beatsteps)
            batch_ext_beatstep.append(extrapolated_beatstep)

        output = BatchFeature(
            {
                "input_features": batch_input_features,
                "beatsteps": batch_beatsteps,
                "extrapolated_beatstep": batch_ext_beatstep,
            }
        )

        output = self.pad(
            output,
            is_batched=is_batched,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
        )

        return output