Ë
    ©sgG$  ã                   ó(  — U d Z ddlmZmZmZ ddlmZmZ ddlm	Z	  e	j                  e«      ZdZdZdZdZd	Zd
ZdZedededededediZeeef   ed<   ej1                  «       D  ci c]  \  } }|| “Œ
 c}} Zeeef   ed<    G d„ de«      Zyc c}} w )z Tokenization classes for CANINE.é    )ÚDictÚListÚOptionalé   )Ú
AddedTokenÚPreTrainedTokenizer)Úloggingi   i à  ià  ià  ià  ià  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]ÚSPECIAL_CODEPOINTSÚSPECIAL_CODEPOINTS_BY_NAMEc            
       ó„  ‡ — e Zd ZdZ ee«       ee«       ee«       ee«       ee«       ee«      ddfˆ fd„	Z	e
defd„«       Zd„ Zdedee   fd	„Zd
edefd„Zdedefd„Zd„ Z	 ddee   deee      dee   fd„Z	 ddee   deee      dedee   fˆ fd„Z	 ddee   deee      dee   fd„Zddedee   fd„Zˆ xZS )ÚCanineTokenizeraé  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fi   c	                 óÖ  •— t        |t        «      rt        |dd¬«      n|}t        |t        «      rt        |dd¬«      n|}t        |t        «      rt        |dd¬«      n|}t        |t        «      rt        |dd¬«      n|}t        |t        «      rt        |dd¬«      n|}t        |t        «      rt        |dd¬«      n|}i | _        t        j                  «       D ]  \  }
}|
| j                  |<   Œ | j                  j                  «       D 
ci c]  \  }}
|
|“Œ
 c}
}| _        t        | _        t        | j                  «      | _
        t        ‰| 0  d||||||||dœ|	¤Ž y c c}
}w )NF)ÚlstripÚrstripT)Ú	bos_tokenÚ	eos_tokenÚ	sep_tokenÚ	cls_tokenÚ	pad_tokenÚ
mask_tokenÚadd_prefix_spaceÚmodel_max_length© )Ú
isinstanceÚstrr   Ú_special_codepointsr
   ÚitemsÚ_special_codepoint_stringsÚUNICODE_VOCAB_SIZEÚ_unicode_vocab_sizeÚlenÚ_num_special_tokensÚsuperÚ__init__)Úselfr   r   r   r   r   r   r   r   ÚkwargsÚ	codepointÚnameÚ	__class__s               €úa/var/www/html/venv/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.pyr$   zCanineTokenizer.__init__H   sp  ø€ ô JTÐT]Ô_bÔIc”J˜y°¸uÕEÐirˆ	ÜISÐT]Ô_bÔIc”J˜y°¸uÕEÐirˆ	ÜISÐT]Ô_bÔIc”J˜y°¸uÕEÐirˆ	ÜISÐT]Ô_bÔIc”J˜y°¸uÕEÐirˆ	ÜISÐT]Ô_bÔIc”J˜y°¸uÕEÐirˆ	ô KUÐU_ÔadÔJe”Z 
°4ÀÕFÐkuˆ
ð 46ˆÔ Ü1×7Ñ7Ó9ò 	7‰OˆItØ-6ˆD×$Ñ$ TÒ*ð	7ð
 48×3KÑ3K×3QÑ3QÓ3S÷;
Ù /  iˆIt‰Oó;
ˆÔ'ô $6ˆÔ Ü#& t×'?Ñ'?Ó#@ˆÔ ä‰Ñð 
	
ØØØØØØ!Ø-Ø-ñ
	
ð ó
	
ùó;
s   ÄE%Úreturnc                 ó   — | j                   S ©N)r    )r%   s    r*   Ú
vocab_sizezCanineTokenizer.vocab_sizev   s   € à×'Ñ'Ð'ó    c                 óž   — t        | j                  «      D ci c]  }t        |«      |“Œ }}|j                  | j                  «       |S c c}w r-   )Úranger.   ÚchrÚupdateÚadded_tokens_encoder)r%   ÚiÚvocabs      r*   Ú	get_vocabzCanineTokenizer.get_vocabz   sB   € Ü$)¨$¯/©/Ó$:Ö;˜q”Q“˜‘Ð;ˆÐ;Ø‰T×.Ñ.Ô/Øˆùò <s   ˜A
Útextc                 ó   — t        |«      S )z5Tokenize a string (i.e. perform character splitting).)Úlist)r%   r8   s     r*   Ú	_tokenizezCanineTokenizer._tokenize   s   € äD‹zÐr/   Útokenc                 óR   — 	 t        |«      S # t        $ r t        d|› d«      ‚w xY w)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: 'ú')ÚordÚ	TypeErrorÚ
ValueError)r%   r<   s     r*   Ú_convert_token_to_idz$CanineTokenizer._convert_token_to_idƒ   s5   € ð	:Üu“:ÐøÜò 	:ÜÐ/°¨w°aÐ8Ó9Ð9ð	:ús   ‚
 &Úindexc                 ór   — 	 |t         v r	t         |   S t        |«      S # t        $ r t        d|› «      ‚w xY w)z˜
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r
   r2   r@   rA   )r%   rC   s     r*   Ú_convert_id_to_tokenz$CanineTokenizer._convert_id_to_tokenŠ   sF   € ð
	5ØÔ*Ñ*Ü)¨%Ñ0Ð0Üu“:ÐøÜò 	5Ü˜|¨E¨7Ð3Ó4Ð4ð	5ús   ‚ “
 ž6c                 ó$   — dj                  |«      S )NÚ )Újoin)r%   Útokenss     r*   Úconvert_tokens_to_stringz(CanineTokenizer.convert_tokens_to_string–   s   € Øw‰wv‹Ðr/   Útoken_ids_0Útoken_ids_1c                 ó^   — | j                   g}| j                  g}||z   |z   }||||z   z  }|S )a˜  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )Úsep_token_idÚcls_token_id©r%   rK   rL   ÚsepÚclsÚresults         r*   Ú build_inputs_with_special_tokensz0CanineTokenizer.build_inputs_with_special_tokens™   sI   € ð& × Ñ Ð!ˆØ× Ñ Ð!ˆà{Ñ" SÑ(ˆØÐ"Øk CÑ'Ñ'ˆFØˆr/   Úalready_has_special_tokensc                 óŽ   •— |rt         ‰|   ||d¬«      S dgdgt        |«      z  z   dgz   }||dgt        |«      z  dgz   z  }|S )aÄ  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rK   rL   rU   é   r   )r#   Úget_special_tokens_maskr!   )r%   rK   rL   rU   rS   r)   s        €r*   rX   z'CanineTokenizer.get_special_tokens_mask´   sp   ø€ ñ$ &Ü‘7Ñ2Ø'°[Ð]að 3ó ð ð ˜˜œc +Ó.Ñ.Ñ/°1°#Ñ5ˆØÐ"Ø˜sœS Ó-Ñ-°!°Ñ4Ñ4ˆFØˆr/   c                 ó’   — | j                   g}| j                  g}t        ||z   |z   «      dgz  }||t        ||z   «      dgz  z  }|S )aÓ  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   rW   )rN   rO   r!   rP   s         r*   Ú$create_token_type_ids_from_sequencesz4CanineTokenizer.create_token_type_ids_from_sequencesÐ   sa   € ð. × Ñ Ð!ˆØ× Ñ Ð!ˆäS˜;Ñ&¨Ñ,Ó-°°Ñ3ˆØÐ"Ø”c˜+¨Ñ+Ó,°¨sÑ2Ñ2ˆFØˆr/   Úsave_directoryÚfilename_prefixc                  ó   — y)Nr   r   )r%   r[   r\   s      r*   Úsave_vocabularyzCanineTokenizer.save_vocabularyð   s   € Ør/   r-   )NF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r2   ÚCLSÚSEPÚPADÚMASKr$   ÚpropertyÚintr.   r7   r   r   r;   rB   rE   rJ   r   rT   ÚboolrX   rZ   r^   Ú__classcell__)r)   s   @r*   r   r   :   sa  ø„ ññ c“(Ùc“(Ùc“(Ùc“(Ùc“(Ùt“9ØØõ,
ð\ ð(˜Cò (ó ð(òð
˜cð  d¨3¡ió ð:¨#ð :°#ó :ð
5¨#ð 
5°#ó 
5òð JNñØ ™9ðØ3;¸DÀ¹IÑ3Fðà	ˆc‰óð8 sxñØ ™9ðØ3;¸DÀ¹IÑ3FðØkoðà	ˆc‰õð: JNñØ ™9ðØ3;¸DÀ¹IÑ3Fðà	ˆc‰óñ@¨cð ÀHÈSÁM÷ r/   r   N)rb   Útypingr   r   r   Útokenization_utilsr   r   Úutilsr	   Ú
get_loggerr_   Úloggerr   re   rc   rd   ÚBOSrf   ÚRESERVEDr
   rh   r   Ú__annotations__r   r   r   )r'   r(   s   00r*   ú<module>rs      sÑ   ðò 'ç 'Ñ 'ç AÝ ð 
ˆ×	Ñ	˜HÓ	%€ð Ð ð €Ø€Ø€Ø€Ø€Ø€ð ˆØˆØˆØˆ(ØˆØˆlð&Ð D˜˜c˜‘Nó ð  Vh×UmÑUmÓUo×-pÁ/À)ÈT¨d°I©oÓ-pÐ ˜D  c ™NÓ pôwÐ)õ wùó .qs   Á'B