
    sg"                         d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
ZddlmZ ddlmZ dd	lmZ  ej$                  e      Zd
diZ G d de      Zy)z Tokenization class for SpeechT5.    N)copyfile)AnyDictListOptionalTuple   )PreTrainedTokenizer)logging   )EnglishNumberNormalizer
vocab_filezspm_char.modelc            
       J    e Zd ZdZeZddgZ	 	 	 	 	 	 ddeee	e
f      ddf fdZddZed	        Zed
        Zej                   d        Zd Zd Zd Zde	dee	   fdZd Zd Zd Zddee   fdZ	 ddee   deee      dedee   f fdZdde	dee	   dee	   fdZ xZS ) SpeechT5Tokenizera	  
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelled-out English counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set (see the sketch after this list):

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
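
            As an illustrative sketch (the local file name `spm_char.model` is an
            assumption here; any compatible SentencePiece model file works), subword
            regularization could be enabled like this:

            ```python
            tokenizer = SpeechT5Tokenizer(
                "spm_char.model",
                sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
            )
            ```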

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
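
    Example (a minimal sketch; assumes the `microsoft/speecht5_tts` checkpoint is
    available from the Hugging Face Hub):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    >>> input_ids = tokenizer("Hello world")["input_ids"]
    ```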
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.normalize = normalize
        self._normalizer = None

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    @property
    def normalizer(self):
        # Lazily instantiate the number normalizer on first use.
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        # The SentencePiece processor is not picklable; it is rebuilt in __setstate__.
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Make sure that special tokens are not decoded using the SentencePiece model.
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only the trailing eos token counts as special.
        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original file is gone; serialize the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)
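

if __name__ == "__main__":
    # Illustrative usage sketch only (not part of the original module). The path
    # "spm_char.model" is an assumption; substitute any compatible SentencePiece
    # character model. `normalize=True` routes text through EnglishNumberNormalizer,
    # spelling out numeric quantities before tokenization.
    tokenizer = SpeechT5Tokenizer("spm_char.model", normalize=True)
    encoded = tokenizer("I owe you 12 dollars")["input_ids"]
    print(encoded)
    print(tokenizer.decode(encoded))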