
"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_xglm import XGLMTokenizer
else:
    XGLMTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}


class XGLMTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
    and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer. The seven placeholder tokens `"<madeupword0>"` through
            `"<madeupword6>"` that XGLM reserves in its vocabulary are always appended to this list.
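
        Example (a minimal usage sketch; `facebook/xglm-564M` is one of the publicly released XGLM checkpoints):

        ```python
        >>> from transformers import XGLMTokenizerFast

        >>> tokenizer = XGLMTokenizerFast.from_pretrained("facebook/xglm-564M")
        >>> tokenizer("Hello world")["input_ids"]  # exact ids depend on the checkpoint vocabulary
        ```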
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = XGLMTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        **kwargs,
    ):
        # Compatibility with the original (slow) tokenizer, which reserves seven
        # "<madeupword0>" ... "<madeupword6>" placeholder tokens in the vocabulary.
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )

        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
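
        Example (an illustrative sketch, not checkpoint output; it assumes `sep_token_id` is 2, the id that the
        released XGLM vocabularies assign to `"</s>"`):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([10, 11])
        [2, 10, 11]
        >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12, 13])
        [2, 10, 11, 2, 2, 12, 13]
        ```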
        """

        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
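
        Example (an illustrative sketch; note that the leading separator token is counted, so two input ids yield
        three zeros):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11])
        [0, 0, 0]
        ```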

        """

        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)