
    sgG$                     (   U d Z ddlmZmZmZ ddlmZmZ ddlm	Z	  e	j                  e      ZdZdZdZdZd	Zd
ZdZedededededediZeeef   ed<   ej1                         D  ci c]  \  } }|| 
 c}} Zeeef   ed<    G d de      Zyc c}} w )z Tokenization classes for CANINE.    )DictListOptional   )
AddedTokenPreTrainedTokenizer)loggingi   i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSSPECIAL_CODEPOINTS_BY_NAMEc            
           e Zd ZdZ ee       ee       ee       ee       ee       ee      ddf fd	Z	e
defd       Zd Zdedee   fd	Zd
edefdZdedefdZd Z	 ddee   deee      dee   fdZ	 ddee   deee      dedee   f fdZ	 ddee   deee      dee   fdZddedee   fdZ xZS )CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fi   c	                    t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}i | _        t        j                         D ]  \  }
}|
| j                  |<    | j                  j                         D 
ci c]  \  }}
|
|
 c}
}| _        t        | _        t        | j                        | _
        t        | 0  d||||||||d|	 y c c}
}w )NF)lstriprstripT)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_length )
isinstancestrr   _special_codepointsr
   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargs	codepointname	__class__s               a/var/www/html/venv/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.pyr$   zCanineTokenizer.__init__H   sp    JTT]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	IST]_bIcJyuEir	 KUU_adJeZ
4Fku
 46 1779 	7OIt-6D$$T*	7
 483K3K3Q3Q3S;
 /iItO;
' $6 #&t'?'?#@  
	
!--
	
 
	
;
s   E%returnc                     | j                   S N)r    )r%   s    r*   
vocab_sizezCanineTokenizer.vocab_sizev   s    '''    c                     t        | j                        D ci c]  }t        |      | }}|j                  | j                         |S c c}w r-   )ranger.   chrupdateadded_tokens_encoder)r%   ivocabs      r*   	get_vocabzCanineTokenizer.get_vocabz   sB    $)$//$:;qQ;;T../ <s   A
textc                     t        |      S )z5Tokenize a string (i.e. perform character splitting).)list)r%   r8   s     r*   	_tokenizezCanineTokenizer._tokenize   s    Dzr/   tokenc                 R    	 t        |      S # t        $ r t        d| d      w xY w)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r%   r<   s     r*   _convert_token_to_idz$CanineTokenizer._convert_token_to_id   s5    	:u: 	:/wa899	:s   
 &indexc                 r    	 |t         v r	t         |   S t        |      S # t        $ r t        d|       w xY w)z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r
   r2   r@   rA   )r%   rC   s     r*   _convert_id_to_tokenz$CanineTokenizer._convert_id_to_token   sF    
	5**)%00u: 	5|E7344	5s    
 6c                 $    dj                  |      S )N )join)r%   tokenss     r*   convert_tokens_to_stringz(CanineTokenizer.convert_tokens_to_string   s    wwvr/   token_ids_0token_ids_1c                 ^    | j                   g}| j                  g}||z   |z   }||||z   z  }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )sep_token_idcls_token_idr%   rK   rL   sepclsresults         r*    build_inputs_with_special_tokensz0CanineTokenizer.build_inputs_with_special_tokens   sI    &   !  !{"S("kC''Fr/   already_has_special_tokensc                     |rt         |   ||d      S dgdgt        |      z  z   dgz   }||dgt        |      z  dgz   z  }|S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rK   rL   rU      r   )r#   get_special_tokens_maskr!   )r%   rK   rL   rU   rS   r)   s        r*   rX   z'CanineTokenizer.get_special_tokens_mask   sp    $ &72'[]a 3   c+../1#5"sS--!44Fr/   c                     | j                   g}| j                  g}t        ||z   |z         dgz  }||t        ||z         dgz  z  }|S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   rW   )rN   rO   r!   rP   s         r*   $create_token_type_ids_from_sequencesz4CanineTokenizer.create_token_type_ids_from_sequences   sa    .   !  !S;&,-3"c++,s22Fr/   save_directoryfilename_prefixc                      y)Nr   r   )r%   r[   r\   s      r*   save_vocabularyzCanineTokenizer.save_vocabulary   s    r/   r-   )NF)__name__
__module____qualname____doc__r2   CLSSEPPADMASKr$   propertyintr.   r7   r   r   r;   rB   rE   rJ   r   rT   boolrX   rZ   r^   __classcell__)r)   s   @r*   r   r   :   sa    c(c(c(c(c(t9,
\ (C ( (
c d3i :# :# :
5# 
5# 
5 JN93;DI3F	c8 sx93;DI3Fko	c: JN93;DI3F	c@c HSM r/   r   N)rb   typingr   r   r   tokenization_utilsr   r   utilsr	   
get_loggerr_   loggerr   re   rc   rd   BOSrf   RESERVEDr
   rh   r   __annotations__r   r   r   )r'   r(   s   00r*   <module>rs      s    ' ' ' A  
		H	%    (l& DcN   VhUmUmUo-p/)TdIo-p DcN pw) w .qs   'B