
"""Tokenization classes for Qwen2."""

import json
import os
import unicodedata
from functools import lru_cache
from typing import Optional, Tuple

import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}

PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as a tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class Qwen2Tokenizer(PreTrainedTokenizer):
    """
    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
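
    >>> # Round-trip sketch (not in the original docstring): this tokenizer adds no special
    >>> # tokens here, so decoding the ids shown above recovers the input text.
    >>> tokenizer.decode([9707, 1879])
    'Hello world'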
    ```
    This is expected.

    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the special tokens should be split during the tokenization process. The default behavior is
            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>")`
            will return `['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give
            `['<', '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        **kwargs,
    ):
        # The Qwen vocab does not contain control tokens; added tokens need to be marked special.
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(pad_token, str)
            else pad_token
        )

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors when decoding bytes to utf-8
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        bpe_merges = []
        with open(merges_file, encoding="utf-8") as merges_handle:
            for i, line in enumerate(merges_handle):
                line = line.strip()
                # Skip the "#version:" header line and any empty lines in the merges file.
                if (i == 0 and line.startswith("#version:")) or not line:
                    continue
                bpe_merges.append(tuple(line.split()))
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        self.cache = {}
        self.pat = re.compile(PRETOKENIZE_REGEX)

        if kwargs.get("add_prefix_space", False):
            logger.warning_once(
                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
            )

        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )
    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest merge rank; stop when no known merge is left.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            # Map every byte to its printable stand-in before running BPE.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        # `spaces_between_special_tokens` defaults to True in the slow tokenizers' `_decode`,
        # but it should default to False for Qwen2Tokenizer, hence this override.
        return super().decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, **kwargs):
        text = unicodedata.normalize("NFC", text)
        return (text, kwargs)
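

# Usage sketch (illustrative; not part of the original module). The file paths below are
# hypothetical placeholders -- any directory holding a `vocab.json` and a `merges.txt`,
# for example one written by `save_vocabulary`, works the same way:
#
#     tokenizer = Qwen2Tokenizer(vocab_file="path/to/vocab.json", merges_file="path/to/merges.txt")
#     ids = tokenizer("Hello world")["input_ids"]
#     text = tokenizer.decode(ids)
#
# For released checkpoints, prefer `Qwen2Tokenizer.from_pretrained(...)` as shown in the class
# docstring above.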