
"""Tokenization class for model ByT5."""

import warnings
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class ByT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
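
    Since ByT5 operates directly on UTF-8 bytes, no vocabulary file is required and the tokenizer can be instantiated
    without one. A minimal round trip (byte values are shifted by the 3 leading special tokens `<pad>`, `</s>` and
    `<unk>`, and the `</s>` id is appended):

    ```python
    >>> from transformers import ByT5Tokenizer

    >>> tokenizer = ByT5Tokenizer()
    >>> ids = tokenizer("hi").input_ids  # UTF-8 bytes 104, 105 shifted by 3, plus the eos id 1
    >>> ids
    [107, 108, 1]
    >>> tokenizer.decode(ids, skip_special_tokens=True)
    'hi'
    ```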

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary,
            as in ByT5 preprocessing; see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add the extra_ids to the list of additional special tokens
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that the user-provided tokens already contain the expected number of extra_id sentinels
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        # lstrip and rstrip are forced on the special tokens for backward compatibility
        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token

        # pad (0), eos (1) and unk (2) occupy the first ids of the vocabulary; byte values are shifted by this offset
        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits
        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,  # the extra_ids are passed through additional_special_tokens instead
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
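
        Example (with the default `already_has_special_tokens=False`, the mask covers the sequence plus the `</s>`
        that would be appended):

        ```python
        >>> tokenizer = ByT5Tokenizer()
        >>> # the three sequence ids get 0, the position where `</s>` would be added gets 1
        >>> tokenizer.get_special_tokens_mask([107, 108, 111])
        [0, 0, 0, 1]
        ```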
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
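
        Example (all zeros; the length accounts for the `</s>` appended to each sequence):

        ```python
        >>> tokenizer = ByT5Tokenizer()
        >>> tokenizer.create_token_type_ids_from_sequences([107, 108])
        [0, 0, 0]
        >>> tokenizer.create_token_type_ids_from_sequences([107, 108], [111])
        [0, 0, 0, 0, 0]
        ```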
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
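
        Example (`</s>` has id 1 in the ByT5 vocabulary):

        ```python
        >>> tokenizer = ByT5Tokenizer()
        >>> tokenizer.build_inputs_with_special_tokens([107, 108])
        [107, 108, 1]
        >>> tokenizer.build_inputs_with_special_tokens([107, 108], [111])
        [107, 108, 1, 111, 1]
        ```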
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        string = bstring.decode("utf-8", errors="ignore")
        return string

    # ByT5Tokenizer has no vocab file
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        return ()