"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}


class XGLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
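
    Example (a minimal usage sketch; the checkpoint name `facebook/xglm-564M` below is illustrative, any XGLM
    checkpoint or local SentencePiece model file works):

    ```python
    from transformers import XGLMTokenizer

    tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    input_ids = tokenizer("Hello world")["input_ids"]
    text = tokenizer.decode(input_ids)

    # `sp_model_kwargs` is forwarded to `SentencePieceProcessor.__init__()`,
    # e.g. to enable subword regularization during training:
    sampling_tokenizer = XGLMTokenizer.from_pretrained(
        "facebook/xglm-564M",
        sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    )
    ```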

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Compatibility with the original fairseq vocabulary: reserve placeholder tokens at the end.
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # The fairseq vocab reserves <s>/<pad>/</s>/<unk> at positions 0-3, so every SentencePiece id is shifted.
        self.fairseq_offset = 1

        # Mimic the fairseq token-to-id alignment for the first 4 tokens.
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        sp_size = len(self.sp_model)
        madeup_words = {f"<madeupword{i}>": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
        self.fairseq_tokens_to_ids.update(madeup_words)

        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def __getstate__(self):
        # The SentencePiece processor is not picklable; store its serialized model proto instead.
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # Backward compatibility with pickles created before `sp_model_kwargs` existed.
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
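
        Example (illustrative token IDs; with the default special tokens, `sep_token_id` is 2 for `</s>`):

        ```python
        tokenizer.build_inputs_with_special_tokens([100, 200])
        # -> [2, 100, 200]
        tokenizer.build_inputs_with_special_tokens([100, 200], [300])
        # -> [2, 100, 200, 2, 2, 300]
        ```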
        """

        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
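
        Example (illustrative token IDs, without special tokens already present):

        ```python
        tokenizer.get_special_tokens_mask([100, 200])
        # -> [1, 0, 0]
        tokenizer.get_special_tokens_mask([100, 200], [300])
        # -> [1, 0, 0, 1, 1, 0]
        ```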
        T)rK   rL   rR   r   r   )r2   get_special_tokens_maskr.   )r4   rK   rL   rR   r<   s       r=   rT   z%XGLMTokenizer.get_special_tokens_mask   s{    & &72'[]a 3   31#K 0011sqcC,,-A61#K@P:PQQrE   c                 z    | j                   g}|t        ||z         dgz  S t        ||z   |z   |z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

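        Example (illustrative token IDs; the output length matches the sequence built with special tokens):

        ```python
        tokenizer.create_token_type_ids_from_sequences([100, 200])
        # -> [0, 0, 0]
        tokenizer.create_token_type_ids_from_sequences([100, 200], [300])
        # -> [0, 0, 0, 0, 0, 0]
        ```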
        """

        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    @property
    def vocab_size(self):
        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return the unknown token id if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)