
"""Tokenization class for SigLIP model."""

import os
import re
import string
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging, requires_backends


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


class SiglipTokenizer(PreTrainedTokenizer):
    """
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that `nbest_size` is infinite, samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
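
            For example (illustrative values, not tuned recommendations), passing
            `sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}` turns on subword
            regularization during encoding.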
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
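
    Example (a minimal sketch; `spiece.model` is a placeholder path to a compatible SentencePiece file):

    ```python
    >>> from transformers import SiglipTokenizer

    >>> tokenizer = SiglipTokenizer(vocab_file="spiece.model")
    >>> inputs = tokenizer("a photo of a cat", padding="max_length", return_tensors="pt")
    >>> inputs.input_ids.shape  # padded to `model_max_length`
    torch.Size([1, 64])
    ```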
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="</s>",
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        model_max_length=64,
        do_lower_case=True,
        **kwargs,
    ) -> None:
        requires_backends(self, "protobuf")

        # Wrap plain-string special tokens in `AddedToken` so their stripping behavior is explicit.
        eos_token = (
            AddedToken(eos_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(pad_token, str)
            else pad_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.sp_model = self.get_spm_processor()

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            model_max_length=model_max_length,
            do_lower_case=do_lower_case,
            **kwargs,
        )

    def get_spm_processor(self):
        # Load the serialized SentencePiece model and disable `add_dummy_prefix` in its normalizer spec,
        # so prefix handling is controlled by `tokenize`/`_tokenize` below rather than by SentencePiece.
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf()
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
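
        Example (illustrative; the token IDs are placeholders):

        ```python
        # Three ordinary tokens and no second sequence: the trailing 1 marks the appended </s>.
        mask = tokenizer.get_special_tokens_mask([31, 42, 17])  # -> [0, 0, 0, 1]
        ```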
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Normal case: only the eos appended to each sequence is a special token.
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed, to be used in a sequence-pair classification task. SigLIP does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
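
        Example (illustrative; the token IDs are placeholders and `eos` stands for `self.eos_token_id`):

        ```python
        tokenizer.build_inputs_with_special_tokens([31, 42])    # -> [31, 42, eos]
        tokenizer.build_inputs_with_special_tokens([31], [42])  # -> [31, eos, 42, eos]
        ```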
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        token_ids_1 = self._add_eos_if_not_present(token_ids_1)
        return token_ids_0 + token_ids_1

    def __getstate__(self):
        # Drop the C-backed SentencePiece processor so the tokenizer can be pickled.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # For backward compatibility with tokenizers pickled before `sp_model_kwargs` existed.
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def remove_punctuation(self, text: str) -> str:
        return text.translate(str.maketrans("", "", string.punctuation))

    def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
        """Returns canonicalized `text` (punctuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        """
        if keep_punctuation_exact_string:
            text = keep_punctuation_exact_string.join(
                self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
            )
        else:
            text = self.remove_punctuation(text)
        text = re.sub(r"\s+", " ", text)
        text = text.strip()

        return text

    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens.
        """
        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        # Drop a leading lone underline produced by the manual prefix when it precedes a special token.
        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We deactivated the `add_dummy_prefix` option, thus the SentencePiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
        """
        text = self.canonicalize_text(text, keep_punctuation_exact_string=None)

        # 1. Encode the string prefixed with the unk token, e.g. "<unk> Hey".
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Strip the pieces of the unk token, e.g. ['<', 'unk', '>', '▁Hey'] -> ['▁Hey'].
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Make sure that special tokens are not decoded using the SentencePiece model.
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The original file is gone (e.g. the tokenizer was loaded from a serialized proto),
            # so write the serialized model held in memory instead.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)