
    sg"                         d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ  e       rd	d
lmZ ndZ ej"                  e      ZdddZdZ G d de      ZdgZy)z&Tokenization classes for ALBERT model.    N)copyfile)ListOptionalTuple   )
AddedToken)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )AlbertTokenizerzspiece.modelztokenizer.json)
vocab_filetokenizer_fileu   ▁c                        e Zd ZdZeZeZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Ze	de
fd       Z	 ddee   deee      dee   fdZ	 ddee   deee      dee   fdZdd	ed
ee   dee   fdZ xZS )AlbertTokenizerFasta  
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
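
    Example (a minimal usage sketch; `albert/albert-base-v2` is one public checkpoint that ships the
    required tokenizer files, and any other ALBERT checkpoint works the same way):

    ```python
    from transformers import AlbertTokenizerFast

    tokenizer = AlbertTokenizerFast.from_pretrained("albert/albert-base-v2")
    encoding = tokenizer("Hello world!")
    # `encoding["input_ids"]` is laid out as `[CLS] ... [SEP]`, see
    # `build_inputs_with_special_tokens` below.
    ```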
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = AlbertTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it and
        # is kept in the raw text, so there should be a match in a non-normalized sentence.
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. An ALBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
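
        Example (a worked instance of the layout above; the ids `5`-`8` are placeholders, while `2` and
        `3` are the `cls_token_id` and `sep_token_id` of the standard ALBERT vocabularies):

        ```python
        tokenizer.build_inputs_with_special_tokens([5, 6])  # -> [2, 5, 6, 3]
        tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  # -> [2, 5, 6, 3, 7, 8, 3]
        ```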
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
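
        Example (a worked instance of the mask above; the ids `5`-`8` are placeholders):

        ```python
        tokenizer.create_token_type_ids_from_sequences([5, 6])  # -> [0, 0, 0, 0]
        tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8])  # -> [0, 0, 0, 0, 1, 1, 1]
        ```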
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(sep + token_ids_1) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

__all__ = ["AlbertTokenizerFast"]