
"""Tokenization class for Perceiver."""

from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class PerceiverTokenizer(PreTrainedTokenizer):
    """
    Construct a Perceiver tokenizer. The Perceiver simply uses the raw bytes of the UTF-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        bos_token (`str`, *optional*, defaults to `"[BOS]"`):
            The BOS token (reserved in the vocab, but not actually used).
        eos_token (`str`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token (reserved in the vocab, but not actually used).

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The MASK token, useful for masked language modeling.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The CLS token (reserved in the vocab, but not actually used).
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from two sequences.
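
    Example (an illustrative sketch; the ids shown follow from the byte-plus-six offset
    implemented by this class, where `[CLS]` has id 4 and `[SEP]` has id 5):

    ```python
    >>> from transformers import PerceiverTokenizer

    >>> tokenizer = PerceiverTokenizer()
    >>> tokenizer("hi")
    {'input_ids': [4, 110, 111, 5], 'attention_mask': [1, 1, 1, 1]}

    >>> tokenizer.decode([4, 110, 111, 5], skip_special_tokens=True)
    'hi'
    ```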

    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        pad_token="[PAD]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        mask_token="[MASK]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        model_max_length=2048,
        **kwargs,
    ) -> None:
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        # Byte-level vocabulary: one token per possible UTF-8 byte value.
        self._utf_vocab_size = 2**8

        # The special tokens are not part of the byte vocabulary, so they are registered manually
        # on the first six ids; the byte tokens are shifted up by this amount.
        self._added_tokens_decoder: Dict[int, AddedToken] = {
            0: pad_token,
            1: bos_token,
            2: eos_token,
            3: mask_token,
            4: cls_token,
            5: sep_token,
        }
        self._num_special_tokens = len(self._added_tokens_decoder)
        super().__init__(
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            mask_token=mask_token,
            cls_token=cls_token,
            sep_token=sep_token,
            model_max_length=model_max_length,
            **kwargs,
        )
    def get_vocab(self) -> Dict[str, int]:
        vocab = {}
        for i in range(self._utf_vocab_size):
            token = chr(i)
            vocab[token] = i + self._num_special_tokens
        vocab.update(self.added_tokens_encoder)
        return vocab

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
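
        Example (an illustrative sketch; the ids passed in are arbitrary byte-level ids):

        ```python
        >>> tokenizer = PerceiverTokenizer()
        >>> tokenizer.get_special_tokens_mask([110, 107], [114])
        [1, 0, 0, 1, 0, 1]
        ```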
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks. A sequence has the
        following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
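
        Example (an illustrative sketch; with the default special tokens, `cls_token_id` is 4 and
        `sep_token_id` is 5):

        ```python
        >>> tokenizer = PerceiverTokenizer()
        >>> tokenizer.build_inputs_with_special_tokens([110, 111])
        [4, 110, 111, 5]
        >>> tokenizer.build_inputs_with_special_tokens([110, 111], [112])
        [4, 110, 111, 5, 112, 5]
        ```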
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        else:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]

    def _tokenize(self, text: str) -> List[str]:
        """Takes a string as input and returns a list of single-character tokens, one per UTF-8 byte."""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        if len(token) != 1:
            token_id = self.unk_token_id
        else:
            token_id = ord(token) + self._num_special_tokens
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        token = chr(index - self._num_special_tokens)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_encoder:
                tok_string = str(token).encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        string = bstring.decode("utf-8", errors="replace")
        return string

    # PerceiverTokenizer has no vocabulary file to save.
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        return ()