"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Search the input string for any character outside the ASCII range.
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None
    return has_non_roman


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
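
    Example (illustrative; assumes the English MMS-TTS checkpoint `facebook/mms-tts-eng`, which pairs a
    lower-case character vocabulary with `normalize=True` and `phonemize=False`):

    ```python
    >>> from transformers import VitsTokenizer

    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    >>> inputs = tokenizer("Hello world", return_tensors="pt")
    >>> # "Hello world" is lower-cased to 11 characters; with `add_blank=True`, token id 0 is
    >>> # interleaved between (and around) them, giving 2 * 11 + 1 = 23 input ids
    ```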
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
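
        Example (illustrative; assumes a `tokenizer` configured with `phonemize=False` and `normalize=True`,
        whose vocabulary contains no punctuation):

        ```python
        >>> text, unused_kwargs = tokenizer.prepare_for_tokenization("Hello, World!")
        >>> text  # lower-cased, with out-of-vocabulary punctuation stripped
        'hello world'
        ```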
        """
        normalize = normalize if normalize is not None else self.normalize

        if normalize:
            # normalise for casing
            text = self.normalize_text(text)

        filtered_text = self._preprocess_char(text)

        if has_non_roman_characters(filtered_text) and self.is_uroman:
            if not is_uroman_available():
                logger.warning(
                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman`. "
                    "Note `uroman` requires python version >= 3.10. "
                    "Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
                )
            else:
                uroman = ur.Uroman()
                filtered_text = uroman.romanize_string(filtered_text)

        if self.phonemize:
            if not is_phonemizer_available():
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # strip any chars outside of the vocab (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        return filtered_text, kwargs

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
        tokens = list(text)

        if self.add_blank:
            # e.g. ["a", "b"] -> [blank, "a", blank, "b", blank]
            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2 + 1)
            interspersed[1::2] = tokens
            tokens = interspersed

        return tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        if self.add_blank and len(tokens) > 1:
            tokens = tokens[1::2]
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        return (vocab_file,)