
    sg                     *    d Z ddlmZ  G d de      Zy)z$Speech processor class for SpeechT5.   )ProcessorMixinc                   B     e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
 xZS )	SpeechT5Processora}  
    Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.

    [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
    the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.

    Args:
        feature_extractor (`SpeechT5FeatureExtractor`):
            An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`SpeechT5Tokenizer`):
            An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
    SpeechT5FeatureExtractorSpeechT5Tokenizerc                 &    t         |   ||       y )N)super__init__)selffeature_extractor	tokenizer	__class__s      c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/speecht5/processing_speecht5.pyr
   zSpeechT5Processor.__init__%   s    *I6    c                 <   |j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }||t        d      ||t        d      ||||t        d	      | | j                  |g|d|i|}n| | j                  |fi |}nd}| | j                  |||d
|}	|	d   }
n| | j                  |fi |}	|	d   }
nd}	||	S |	
|d<   |	j	                  d      }|||d<   |S )a  
        Processes audio and text input, as well as audio and text targets.

        You can process audio by using the argument `audio`, or process audio targets by using the argument
        `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
        [`~SpeechT5FeatureExtractor.__call__`].

        You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
        This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].

        Valid input combinations are:

        - `text` only
        - `audio` only
        - `text_target` only
        - `audio_target` only
        - `text` and `audio_target`
        - `audio` and `audio_target`
        - `text` and `text_target`
        - `audio` and `text_target`

        Please refer to the docstring of the above two methods for more information.
        audioNtexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r   get)r   argskwargsr   r   r   r   r   inputstargetsr   r   s               r   __call__zSpeechT5Processor.__call__(   s   0 

7D)zz&$'jj5zz.$7

?D9!1n  #(?n  =\1dl{GZs  +T++E`D``Y_`F#T^^D3F3FF#,d,,,]juntuG^,F$$dnn[;F;G[)FG>N%F8%,[[1A%B"%13I/0r   c                    |j                  dd      }|j                  dd      }|j                  dd      }||t        d      |||t        d      |! | j                  j                  |g|i |}n"| | j                  j                  |fi |}nd}|d|v st        |t              r*d|d   v r# | j                  j                  |fi |}|d   }nt| j                  j                  }| j                  j                  | j                  _         | j                  j                  |g|i |}|| j                  _        |d   }nd}||S |||d<   |j                  d      }	|	|	|d	<   |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr   
isinstancelistfeature_sizenum_mel_binsr   )
r   r   r    r   r   r   r!   r"   feature_size_hackr   s
             r   r&   zSpeechT5Processor.pado   s   " zz.$7JJ{D1	Hd+#	(=YZZI$5&.l  #/T++//NtNvNF"'T^^''	<V<FFf$FD)AkU[\]U^F^,$..,,V>v> -$($:$:$G$G!6:6L6L6Y6Y&&34$0044VMdMfM6G&&3 0G>N%F8%,[[1A%B"%13I/0r   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder   r   r    s      r   r-   zSpeechT5Processor.batch_decode   s     
 +t~~**D;F;;r   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoder.   s      r   r0   zSpeechT5Processor.decode   s     
 %t~~$$d5f55r   )__name__
__module____qualname____doc__feature_extractor_classtokenizer_classr
   r#   r&   r-   r0   __classcell__)r   s   @r   r   r      s1     9)O7EN:x<6r   r   N)r4   processing_utilsr   r    r   r   <module>r:      s    + .c6 c6r   