
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

SPIECE_UNDERLINE = "▁"


class GemmaTokenizer(PreTrainedTokenizer):
    """
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purposes. It will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
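
    Example (a minimal sketch; the vocabulary path below is illustrative and assumes a local SentencePiece
    `tokenizer.model` file):

    ```python
    >>> from transformers import GemmaTokenizer

    >>> tokenizer = GemmaTokenizer("path/to/tokenizer.model")
    >>> ids = tokenizer.encode("Hello world")  # begins with the bos id because `add_bos_token` defaults to `True`
    >>> text = tokenizer.decode(ids, skip_special_tokens=True)
    ```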
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def __getstate__(self):
        # The SentencePiece processor itself is not picklable; store its serialized proto instead.
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        """
        return super().tokenize(text, **kwargs)

    def _tokenize(self, text, **kwargs):
        """
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        """
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self._added_tokens_encoder:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        # Group ordinary ids so sentencepiece can decode them in one call, and splice added tokens back in verbatim.
        sub_texts = []
        current_sub_text = []
        for ids in token_ids:
            if skip_special_tokens and ids in self.all_special_ids:
                continue
            if ids in self._added_tokens_decoder:
                if current_sub_text:
                    sub_texts.append(self.sp_model.decode(current_sub_text))
                sub_texts.append(self._added_tokens_decoder[ids].content)
                current_sub_text = []
            else:
                current_sub_text.append(ids)
        if current_sub_text:
            sub_texts.append(self.sp_model.decode(current_sub_text))

        if spaces_between_special_tokens:
            sub_texts = " ".join(sub_texts)
        else:
            sub_texts = "".join(sub_texts)

        return sub_texts.replace(SPIECE_UNDERLINE, " ")


__all__ = ["GemmaTokenizer"]
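
# A hedged usage sketch of the special-token helpers above (the path is illustrative and assumes a local
# SentencePiece `tokenizer.model` file; it is not shipped with this module):
#
#     tok = GemmaTokenizer("path/to/tokenizer.model", add_eos_token=True)
#     body = tok.encode("hello", add_special_tokens=False)
#     ids = tok.build_inputs_with_special_tokens(body)  # [bos] + body + [eos]
#     mask = tok.get_special_tokens_mask(ids, already_has_special_tokens=True)  # 1 marks bos/eos, 0 the rest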