"""Fast Tokenization classes for OpenAI GPT."""

from typing import Optional, Tuple

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_openai import OpenAIGPTTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = OpenAIGPTTokenizer

    def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="<unk>", **kwargs):
        super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Delegate to the backing `tokenizers` BPE model, which writes
        # vocab.json and merges.txt into `save_directory`.
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
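
# A minimal usage sketch, assuming the Hub checkpoint name
# "openai-community/openai-gpt" (an assumption; that identifier does not appear
# anywhere in this module). It shows the one behavior peculiar to this
# tokenizer: all inputs are lower-cased before BPE, so casing is lost on decode.
if __name__ == "__main__":
    from transformers import OpenAIGPTTokenizerFast

    tokenizer = OpenAIGPTTokenizerFast.from_pretrained("openai-community/openai-gpt")
    encoding = tokenizer("Hello World")  # lower-cased first, since do_lower_case is True
    print(encoding["input_ids"])
    print(tokenizer.decode(encoding["input_ids"]))  # "hello world" -- original casing is not recoverable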