
import os
from shutil import copyfile
from typing import Optional, Tuple

from tokenizers import processors

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
from ...utils.versions import require_version


require_version("tokenizers>=0.13.3")

if is_sentencepiece_available():
    from .tokenization_gemma import GemmaTokenizer
else:
    GemmaTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}


class GemmaTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.

    This notably uses ByteFallback and no prefix space. Normalization is applied to replace `" "` with `"▁"`.

    ```python
    >>> from transformers import GemmaTokenizerFast

    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
    >>> tokenizer.encode("Hello this is a test")
    [2, 4521, 736, 603, 476, 2121]
    ```
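
    Because normalization replaces spaces with `"▁"`, that marker shows up directly on the pieces returned by
    `tokenize`. A minimal illustrative sketch (the exact pieces depend on the checkpoint's vocabulary):

    ```python
    >>> pieces = tokenizer.tokenize("Hello this is a test")
    >>> # each space between words resurfaces as a leading "▁" on the piece that follows it
    ```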

    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
    values of the first token and final token of an encoded sequence will not be correct). For more details, check out
    the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
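
    A minimal sketch (reusing the dummy checkpoint above): the flag can be passed at load time, or toggled on an
    existing instance, in which case the `add_eos_token` setter re-runs `update_post_processor()` for you:

    ```python
    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma", add_eos_token=True)
    >>> # or toggle it later; assigning to the property triggers `update_post_processor()`
    >>> tokenizer.add_eos_token = True
    ```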


    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = GemmaTokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        clean_up_tokenization_spaces=False,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        add_bos_token=True,
        add_eos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            **kwargs,
        )
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return self._add_eos_token

    @property
    def add_bos_token(self):
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output