    sg'                         d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ  e       rd	d
lmZ ndZ ej"                  e      ZdddZdZ G d de      Zy)z'Tokenization classes for RemBERT model.    N)copyfile)ListOptionalTuple   )
AddedToken)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )RemBertTokenizerzsentencepiece.modelztokenizer.json)
vocab_filetokenizer_fileu   ▁c            
           e Zd ZdZeZeZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Ze	de
fd       Z	 ddee   deee      dee   fdZ	 ddee   deee      de
dee   fd	Z	 ddee   deee      dee   fd
Zddedee   dee   fdZ xZS )RemBertTokenizerFasta  
    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
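
    Example (a minimal usage sketch, assuming `"google/rembert"` is an available pretrained checkpoint):

    ```python
    >>> from transformers import RemBertTokenizerFast

    >>> tokenizer = RemBertTokenizerFast.from_pretrained("google/rembert")
    >>> encoded = tokenizer("Hello world", "How are you?")
    >>> # encoded["input_ids"] follows the `[CLS] A [SEP] B [SEP]` pattern built by
    >>> # `build_inputs_with_special_tokens`; encoded["token_type_ids"] marks the second segment with 1s.
    ```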
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = RemBertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # The slow tokenizer can only be saved if the original SentencePiece model file is still available.
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A RemBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
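
        Example (illustrative only; the IDs below are hypothetical, and the real values of `cls_token_id` and
        `sep_token_id` depend on the loaded vocabulary):

        ```python
        # Suppose cls_token_id == 312 and sep_token_id == 313 (made-up values):
        # tokenizer.build_inputs_with_special_tokens([5, 6])          -> [312, 5, 6, 313]
        # tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  -> [312, 5, 6, 313, 7, 8, 313]
        ```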
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Set to `True` if the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
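
        Example (illustrative only; the token IDs are arbitrary values chosen for demonstration):

        ```python
        # For sequences of lengths 2 and 3, the three special-token positions
        # ([CLS], [SEP], [SEP]) are marked with 1, ordinary tokens with 0:
        # tokenizer.get_special_tokens_mask([5, 6], [7, 8, 9])  -> [1, 0, 0, 1, 0, 0, 0, 1]
        ```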
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.r   r   )
ValueErrorr2   r3   len)r#   r/   r0   r8   xs        r&   get_special_tokens_maskz,RemBertTokenizerFast.get_special_tokens_mask   s    & && R  VaaPQt00$2C2CDDA!Kaa"31#K 001QC7A3[AQ;QRVWUXXXsqcC,,-33	 bs   %B c                     | j                   g}| j                  g}|t        ||z   |z         dgz  S t        ||z   |z         dgz  t        ||z         dgz  z   S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
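
        Example (illustrative only; the token IDs are arbitrary values chosen for demonstration):

        ```python
        # With a first sequence of length 2 and a second of length 3, [CLS] + sequence + [SEP]
        # yields four 0s, and the second sequence + its closing [SEP] yields four 1s:
        # tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8, 9])  -> [0, 0, 0, 0, 1, 1, 1, 1]
        ```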
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy the SentencePiece model only if the target path differs from the current one.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)