
"""Tokenization classes for OpenAI GPT."""

from typing import List, Optional, Tuple

from tokenizers import pre_tokenizers

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_clip import CLIPTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class CLIPTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
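
    Example (a minimal usage sketch; `"openai/clip-vit-base-patch32"` is one public CLIP checkpoint, any other CLIP
    checkpoint works the same way):

    ```python
    >>> from transformers import CLIPTokenizerFast

    >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
    >>> enc = tokenizer("a photo of a cat")
    >>> enc["input_ids"][0] == tokenizer.bos_token_id
    True
    >>> enc["input_ids"][-1] == tokenizer.eos_token_id
    True
    ```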
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = CLIPTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

        if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
            raise ValueError(
                "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been"
                " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using"
                " to be compatible with this version. The easiest way to do so is"
                ' `CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo", from_slow=True)`. If you'
                " want to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of"
                " transformers."
            )

        self._wrap_decode_method_backend_tokenizer()

    # Wrap the backend decode method so that the model's end-of-word suffix is turned back into
    # plain spaces, keeping decoded text correct when padding is used.
    def _wrap_decode_method_backend_tokenizer(self):
        orig_decode_method = self.backend_tokenizer.decode
        end_of_word_suffix = self.backend_tokenizer.model.end_of_word_suffix

        def new_decode_method(*args, **kwargs):
            text = orig_decode_method(*args, **kwargs)
            text = text.replace(end_of_word_suffix, " ").strip()
            return text

        self.backend_tokenizer.decode = new_decode_method

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CLIP sequence has the following format:

        - single sequence: `<|startoftext|> X <|endoftext|>`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
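
        Example (sketch, assuming `tokenizer` is a loaded `CLIPTokenizerFast`; 320 and 2368 stand in for arbitrary
        token IDs):

        ```python
        >>> ids = tokenizer.build_inputs_with_special_tokens([320, 2368])
        >>> ids == [tokenizer.bos_token_id, 320, 2368, tokenizer.eos_token_id]
        True
        ```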
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
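
        Example (sketch, assuming `tokenizer` is a loaded `CLIPTokenizerFast`; the mask length matches the full
        sequence, bos and eos included):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([320, 2368])
        [0, 0, 0, 0]
        ```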
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)