"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_t5 import T5Tokenizer
else:
    T5Tokenizer = None


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}


class T5TokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
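
    Example (a minimal usage sketch; the checkpoint name is illustrative and exact token ids depend on the
    vocabulary of the checkpoint you load):

    ```python
    >>> from transformers import T5TokenizerFast

    >>> tokenizer = T5TokenizerFast.from_pretrained("t5-small")
    >>> input_ids = tokenizer("Hello world!").input_ids  # the EOS token `</s>` is appended automatically
    >>> sentinels = tokenizer.get_sentinel_tokens()  # the `extra_ids` sentinel tokens, e.g. "<extra_id_0>"
    ```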

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. The tokens can be retrieved by
            calling the `get_sentinel_tokens` method, and their ids by calling the `get_sentinel_token_ids` method.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        add_prefix_space (`bool`, *optional*):
            Whether or not the tokenizer should automatically add a prefix space.
        from_slow (`bool`, *optional*, defaults to `False`):
            Whether or not the tokenizer should be converted from a slow one. If `add_prefix_space` is set, this will
            be set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = T5Tokenizer

    prefix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        add_prefix_space=None,
        **kwargs,
    ):
cg c]  }
dt        |
      v s|
 }}
t        |      dk  r!|t        |      D cg c]  }d| d
 c}z  }nC|dkD  r>|t        |      k7  r0t        d| d| d      t        |      D cg c]  }d| d
 }}|}|t        j                  d       d	|	d
<   t        |   d|||||||d|	 || _        || _	        y c c}
w c c}w c c}w )Nz
<extra_id_r   >r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokenszXYou set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizersT	from_slow)r   r   	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens )
strlenrange
ValueErrorloggerwarning_oncesuper__init__r   
_extra_ids)selfr   r   r   r   r   r   r   add_prefix_spacekwargsxextra_tokensi	__class__s                ^/var/www/html/venv/lib/python3.12/site-packages/transformers/models/t5/tokenization_t5_fast.pyr$   zT5TokenizerFast.__init__V   s4    %0'@[!LTWXYTZDZA[L[< 1$)yIY-ZA
1#Q.?-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH!j1-HLH(4%'j #'F; 		
!)&?		
 		
 %#? \-Z Is   CCCCreturnc                 p    | j                   r)t        j                  j                  | j                         S dS )NF)r   ospathisfiler&   s    r-   can_save_slow_tokenizerz'T5TokenizerFast.can_save_slow_tokenizer   s$    26//rww~~doo.LuL    c                     | t         j                  v rEt         j                  |    }|||k7  r|S |'t        j                  d| d|  d| d| d	t               |S )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   max_model_input_sizeswarningswarnFutureWarning)pretrained_model_name_or_pathmax_model_lengthinit_max_model_lengthdeprecated_max_model_lengths       r-   !_eventually_correct_t5_max_lengthz1T5TokenizerFast._eventually_correct_t5_max_length   s    (O,Q,QQ*9*O*OPm*n'$05JN^5^,,&.34 5 66 734 5$$?#@ Agg "  r5   save_directoryfilename_prefixc                    | j                   st        d      t        j                  j	                  |      st
        j                  d| d       y t        j                  j                  ||r|dz   ndt        d   z         }t        j                  j                  | j                        t        j                  j                  |      k7  r.t        | j                  |       t
        j                  d|        |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory- r   zCopy vocab file to )r4   r    r0   r1   isdirr!   errorjoinVOCAB_FILES_NAMESabspathr   r   info)r&   r@   rA   out_vocab_files       r-   save_vocabularyzT5TokenizerFast.save_vocabulary   s    ++ 
 ww}}^,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5KK-n-=>?  r5   token_ids_0token_ids_1c                     || j                   gz   }|| j                  |z   S || j                   gz   }| j                  |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
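
        Example (illustrative; assumes the standard T5 vocabulary, where `eos_token_id` is 1 and `prefix_tokens`
        is empty):

        ```python
        >>> tokenizer = T5TokenizerFast.from_pretrained("t5-small")
        >>> tokenizer.build_inputs_with_special_tokens([8774, 55])
        [8774, 55, 1]
        >>> tokenizer.build_inputs_with_special_tokens([8774], [291])
        [8774, 1, 291, 1]
        ```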
        """
        token_ids_0 = token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0
        else:
            token_ids_1 = token_ids_1 + [self.eos_token_id]
            return self.prefix_tokens + token_ids_0 + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
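
        Example (illustrative; one `0` per input id plus one per appended `</s>`):

        ```python
        >>> tokenizer = T5TokenizerFast.from_pretrained("t5-small")
        >>> tokenizer.create_token_type_ids_from_sequences([8774, 55])
        [0, 0, 0]
        >>> tokenizer.create_token_type_ids_from_sequences([8774], [291])
        [0, 0, 0, 0]
        ```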
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def get_sentinel_tokens(self):
        # Keep only the special tokens that match the sentinel pattern, e.g. "<extra_id_12>".
        return list(
            set(filter(lambda x: re.search(r"<extra_id_\d+>", x) is not None, self.additional_special_tokens))
        )

    def get_sentinel_token_ids(self):
        # Map each sentinel token to its id in the vocabulary.
        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]