
    sg&                         d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	  ej                  e      Zddd	d
Z G d de      Zy)    )ListOptionalTuple   )PreTrainedTokenizerFast)logging   )HerbertTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec            
            e Zd ZdZeZeZ	 	 	 	 	 	 	 	 d fd	Z	 dde	e
   dee	e
      de	e
   fdZ	 dde	e
   dee	e
      dede	e
   f fdZ	 dde	e
   dee	e
      de	e
   fd	Zdd
edee   dee   fdZ xZS )HerbertTokenizerFastam  
    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).

    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
    c	           
      6    t        
|   ||f||||||d|	 y )N)r   	cls_token	unk_token	pad_token
mask_token	sep_token)super__init__)selfr   r   r   r   r   r   r   r   kwargs	__class__s             h/var/www/html/venv/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert_fast.pyr   zHerbertTokenizerFast.__init__2   s;     	
	
 *!
	
 
	
    token_ids_0token_ids_1returnc                 f    | j                   g}| j                  g}|||z   |z   S ||z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An HerBERT, like BERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)r   r   r   clsseps        r    build_inputs_with_special_tokensz5HerbertTokenizerFast.build_inputs_with_special_tokensJ   sP    (   !  !$s**[ 3&4s::r   already_has_special_tokensc                     |rt         |   ||d      S |dgdgt        |      z  z   dgz   S dgdgt        |      z  z   dgz   dgt        |      z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   r&   r	   r   )r   get_special_tokens_masklen)r   r   r   r&   r   s       r   r(   z,HerbertTokenizerFast.get_special_tokens_maske   s    $ &72'[]a 3   31#K 001QC77sqcC,,-3sS=M7MNRSQTTTr   c                     | j                   g}| j                  g}|t        ||z   |z         dgz  S t        ||z   |z         dgz  t        ||z         dgz  z   S )a{  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
        BERT sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   r	   )r"   r!   r)   )r   r   r   r$   r#   s        r   $create_token_type_ids_from_sequencesz9HerbertTokenizerFast.create_token_type_ids_from_sequences   st    *   !  !s[(3./1#553$s*+qc1Cc8I4JaS4PPPr   save_directoryfilename_prefixc                 f    | j                   j                  j                  ||      }t        |      S )N)name)
_tokenizermodelsavetuple)r   r,   r-   filess       r   save_vocabularyz$HerbertTokenizerFast.save_vocabulary   s+    %%**>*PU|r   )NNNz<s>z<unk>z<pad>z<mask>z</s>)N)NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesr
   slow_tokenizer_classr   r   intr   r%   boolr(   r+   strr   r5   __classcell__)r   s   @r   r   r      s
   $ *+ 
2 JN;9;3;DI3F;	c;8 sxU9U3;DI3FUkoU	cU8 JNQ9Q3;DI3FQ	cQ8c HSM ]bcf]g r   r   N)typingr   r   r   tokenization_utils_fastr   utilsr   tokenization_herbertr
   
get_loggerr6   loggerr:   r    r   r   <module>rH      sG     ) ( >  2 
		H	%#/`pq B2 Br   