"""Processor class for Bark"""

import json
import os
from typing import Optional

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...utils import logging
from ...utils.hub import get_file_from_repo
from ..auto import AutoTokenizer


logger = logging.get_logger(__name__)


class BarkProcessor(ProcessorMixin):
    r"""
    Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`].
        speaker_embeddings (`Dict[Dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g.
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.

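    Examples:

    A minimal usage sketch; the checkpoint name and voice preset below are illustrative and assume they are
    available locally or on the Hugging Face Hub:

    ```python
    >>> from transformers import BarkProcessor

    >>> processor = BarkProcessor.from_pretrained("suno/bark-small")
    >>> inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")
    ```
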
    """

    tokenizer_class = "AutoTokenizer"
    attributes = ["tokenizer"]

    # Expected number of dimensions of each prompt array in a voice preset
    preset_shape = {
        "semantic_prompt": 1,
        "coarse_prompt": 2,
        "fine_prompt": 2,
    }

    def __init__(self, tokenizer, speaker_embeddings=None):
        super().__init__(tokenizer)

        self.speaker_embeddings = speaker_embeddings

    @classmethod
    def from_pretrained(
        cls, pretrained_processor_name_or_path, speaker_embeddings_dict_path="speaker_embeddings_path.json", **kwargs
    ):
        r"""
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_processor_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_processor_name_or_path`. If `None`, no speaker embeddings are loaded.
            **kwargs:
                Additional keyword arguments passed along to
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        """
        if speaker_embeddings_dict_path is not None:
            speaker_embeddings_path = get_file_from_repo(
                pretrained_processor_name_or_path,
                speaker_embeddings_dict_path,
                subfolder=kwargs.pop("subfolder", None),
                cache_dir=kwargs.pop("cache_dir", None),
                force_download=kwargs.pop("force_download", False),
                proxies=kwargs.pop("proxies", None),
                resume_download=kwargs.pop("resume_download", False),
                local_files_only=kwargs.pop("local_files_only", False),
                token=kwargs.pop("use_auth_token", None),
                revision=kwargs.pop("revision", None),
            )
            if speaker_embeddings_path is None:
                logger.warning(
                    f"`{os.path.join(pretrained_processor_name_or_path, speaker_embeddings_dict_path)}` does not"
                    " exist, no preloaded speaker embeddings will be used - Make sure to provide a correct path"
                    " to the json dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`."
                )
                speaker_embeddings = None
            else:
                with open(speaker_embeddings_path) as speaker_embeddings_json:
                    speaker_embeddings = json.load(speaker_embeddings_json)
        else:
            speaker_embeddings = None

        tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs)

        return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings)

    def save_pretrained(
        self,
        save_directory,
        speaker_embeddings_dict_path="speaker_embeddings_path.json",
        speaker_embeddings_directory="speaker_embeddings",
        push_to_hub: bool = False,
        **kwargs,
    ):
        r"""
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contain the speaker_embeddings nested path dictionary, if it
                exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
                The name of the folder in which the speaker_embeddings arrays will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        if self.speaker_embeddings is not None:
            os.makedirs(os.path.join(save_directory, speaker_embeddings_directory, "v2"), exist_ok=True)

            embeddings_dict = {}
            embeddings_dict["repo_or_path"] = save_directory

            for prompt_key in self.speaker_embeddings:
                if prompt_key != "repo_or_path":
                    voice_preset = self._load_voice_preset(prompt_key)

                    tmp_dict = {}
                    for key in self.speaker_embeddings[prompt_key]:
                        # save each prompt array as `<preset>_<key>.npy` inside the speaker embeddings directory
                        np.save(
                            os.path.join(
                                embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}"
                            ),
                            voice_preset[key],
                            allow_pickle=False,
                        )
                        tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy")

                    embeddings_dict[prompt_key] = tmp_dict

            with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w") as fp:
                json.dump(embeddings_dict, fp)

        super().save_pretrained(save_directory, push_to_hub, **kwargs)

    def _load_voice_preset(self, voice_preset: Optional[str] = None, **kwargs):
        voice_preset_paths = self.speaker_embeddings[voice_preset]

        voice_preset_dict = {}
        for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
            if key not in voice_preset_paths:
                raise ValueError(
                    f"Voice preset unrecognized, missing {key} as a key in self.speaker_embeddings[{voice_preset}]."
                )

            path = get_file_from_repo(
                self.speaker_embeddings.get("repo_or_path", "/"),
                voice_preset_paths[key],
                subfolder=kwargs.pop("subfolder", None),
                cache_dir=kwargs.pop("cache_dir", None),
                force_download=kwargs.pop("force_download", False),
                proxies=kwargs.pop("proxies", None),
                resume_download=kwargs.pop("resume_download", False),
                local_files_only=kwargs.pop("local_files_only", False),
                token=kwargs.pop("use_auth_token", None),
                revision=kwargs.pop("revision", None),
            )
            if path is None:
                raise ValueError(
                    f"`{os.path.join(self.speaker_embeddings.get('repo_or_path', '/'), voice_preset_paths[key])}`"
                    " does not exist, no preloaded voice preset will be used - Make sure to provide correct paths"
                    f" to the {voice_preset} embeddings."
                )

            voice_preset_dict[key] = np.load(path)

        return voice_preset_dict

    def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None):
        for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]:
            if key not in voice_preset:
                raise ValueError(f"Voice preset unrecognized, missing {key} as a key.")

            if not isinstance(voice_preset[key], np.ndarray):
                raise TypeError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")

            if len(voice_preset[key].shape) != self.preset_shape[key]:
                raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.")

    def __call__(
        self,
        text=None,
        voice_preset=None,
        return_tensors="pt",
        max_length=256,
        add_special_tokens=False,
        return_attention_mask=True,
        return_token_type_ids=False,
        **kwargs,
    ):
        """
        Main method to prepare for the model one or several sequence(s). This method forwards the `text` and `kwargs`
        arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
        voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
        to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            voice_preset (`str`, `Dict[np.ndarray]`):
                The voice preset, i.e. the speaker embeddings. It can either be a valid voice_preset name, e.g.
                `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
                it can be a valid file name of a local `.npz` single voice preset.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

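        Example:

        A short sketch of the two ways to pass a voice preset; the preset name is illustrative, and raw arrays only
        need the expected number of dimensions (1D semantic prompt, 2D coarse and fine prompts):

        ```python
        >>> import numpy as np

        >>> # `processor` as created in the class-level example; preset resolved by name
        >>> inputs = processor("Hello!", voice_preset="v2/en_speaker_6")

        >>> # or pass a dictionary of arrays directly (dummy zero prompts shown here)
        >>> preset = {
        ...     "semantic_prompt": np.zeros(10, dtype=np.int64),
        ...     "coarse_prompt": np.zeros((2, 10), dtype=np.int64),
        ...     "fine_prompt": np.zeros((8, 10), dtype=np.int64),
        ... }
        >>> inputs = processor("Hello!", voice_preset=preset)
        ```
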
        Returns:
            Tuple([`BatchEncoding`], [`BatchFeature`]): A tuple composed of a [`BatchEncoding`], i.e. the output of the
            `tokenizer`, and a [`BatchFeature`], i.e. the voice preset with the right tensors type.
        """
        if voice_preset is not None and not isinstance(voice_preset, dict):
            if (
                isinstance(voice_preset, str)
                and self.speaker_embeddings is not None
                and voice_preset in self.speaker_embeddings
            ):
                # a known preset name: resolve it through the speaker embeddings dictionary
                voice_preset = self._load_voice_preset(voice_preset)
            else:
                # otherwise treat it as a (possibly extension-less) path to a local `.npz` file
                if isinstance(voice_preset, str) and not voice_preset.endswith(".npz"):
                    voice_preset = voice_preset + ".npz"

                voice_preset = np.load(voice_preset)

        if voice_preset is not None:
            self._validate_voice_preset_dict(voice_preset)
            voice_preset = BatchFeature(data=voice_preset, tensor_type=return_tensors)

        encoded_text = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding="max_length",
            max_length=max_length,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            add_special_tokens=add_special_tokens,
            **kwargs,
        )

        if voice_preset is not None:
            encoded_text["history_prompt"] = voice_preset

        return encoded_text