"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


class T5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            A number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. They can be retrieved by calling
            `get_sentinel_tokens`, and their token ids by calling `get_sentinel_token_ids` (see the example below
            this argument list).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word.

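    Example (an illustrative sketch, not an exhaustive reference: it assumes the `google-t5/t5-base` checkpoint,
    which ships with the default 100 sentinels, and the `sp_model_kwargs` values shown are arbitrary sample
    settings for subword regularization):

    ```python
    >>> from transformers import T5Tokenizer

    >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
    >>> len(tokenizer.get_sentinel_tokens())  # "<extra_id_0>" ... "<extra_id_99>"
    100
    >>> # enable sampled segmentations (subword regularization) via the SentencePiece kwargs
    >>> sampling_tokenizer = T5Tokenizer.from_pretrained(
    ...     "google-t5/t5-base", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
    ... )
    ```
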
    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=None,
        add_prefix_space=True,
        **kwargs,
    ) -> None:
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is not None:
            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
            if len(extra_tokens) < 1:
                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
        else:
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        # Register the sentinel tokens at the very end of the vocabulary, in reverse order
        # ("<extra_id_0>" gets the highest id), matching the original T5 convention.
        self._added_tokens_decoder = {}
        for i in range(len(extra_tokens)):
            self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
                f"<extra_id_{i}>", single_word=False, lstrip=True, rstrip=True, special=True, normalized=False
            )

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is"
                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes"
                " for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you"
                " understand what it means, and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.vocab_file = vocab_file
        self.add_prefix_space = add_prefix_space

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
            if init_max_model_length is not None and init_max_model_length != max_model_length:
                return init_max_model_length
            elif init_max_model_length is None:
                warnings.warn(
                    "This tokenizer was incorrectly instantiated with a model max length of"
                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
                    f" {pretrained_model_name_or_path} automatically truncating your input to"
                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
                    FutureWarning,
                )

        return max_model_length

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def get_sentinel_tokens(self):
        # Note: the predicate is `bool(re.search(...))`; a trailing `... is not None` comparison on the
        # `bool(...)` result would always be true and would make this filter a no-op.
        return list(
            set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)), self.additional_special_tokens))
        )

    def get_sentinel_token_ids(self):
        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        tokens = super().tokenize(text, **kwargs)

        # drop the artificial prefix token when it is immediately followed by a special token
        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        tokens = self.sp_model.encode(text, out_type=str)
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. Encode string + prefix ex: "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)