
"""Tokenization classes for CLIP."""

import json
import os
import unicodedata
from functools import lru_cache
from typing import List, Optional, Tuple

import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters that the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~      ¡   ¬   ®   ÿNr      )listrangeordappendchrdictzip)bscsnbs       ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip.pybytes_to_unicoder%   %   s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[ B;IIaLIIdQhFA	
 	Q#a&	B	B 
s   C4c                 b    t               }| d   }| dd D ]  }|j                  ||f       |} |S )z


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def whitespace_clean(text):
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This checks the CJK Unified Ideographs Unicode blocks. Hangul, Hiragana and
        # Katakana are written with spaces between words, so they are not treated specially.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class CLIPTokenizer(PreTrainedTokenizer):
    """
    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

        try:
            import ftfy

            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.info("ftfy or spacy is not installed, using custom BasicTokenizer instead of ftfy.")
            self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}

        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CLIP sequence has the following format:

        - single sequence: `<|startoftext|> X <|endoftext|>`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # merge the lowest-ranked (i.e. earliest learned) pair first
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        if self.fix_text is None:
            text = " ".join(self.nlp.tokenize(text))
        else:
            text = whitespace_clean(self.fix_text(text)).lower()

        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        byte_array = bytearray([self.byte_decoder[c] for c in text])
        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive. "
                        "Please check that the tokenizer is not corrupted!".format(merge_file)
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file
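

# A minimal, self-contained sketch of how the pieces above fit together. It only exercises
# the module-level helpers, which need no vocabulary files; the commented lines show the
# usual way to load a full tokenizer and assume network access or a local copy of the
# "openai/clip-vit-base-patch32" checkpoint.
if __name__ == "__main__":
    byte_encoder = bytes_to_unicode()
    # Every possible byte value gets a printable, reversible stand-in.
    assert len(byte_encoder) == 256
    # get_pairs lists the adjacent symbol pairs that the BPE loop considers for merging.
    print(get_pairs(("l", "o", "w", "e", "r</w>")))

    # from transformers import CLIPTokenizer
    # tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    # print(tokenizer("a photo of a cat").input_ids)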