
import json
import os
import re
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        # skip all characters in the "C*" (control/format/unassigned) categories
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
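
    Example (illustrative; the output shown assumes the default settings described above):

    ```python
    >>> tokenizer = BasicTokenizer(do_lower_case=True)
    >>> tokenizer.tokenize("Hello, WORLD!")  # punctuation is split off and the text is lowercased
    ['hello', ',', 'world', '!']
    ```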
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set combining the instance-level and call-level never_split collections
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode blocks
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class HerbertTokenizer(PreTrainedTokenizer):
    """
    Construct a BPE tokenizer for HerBERT.

    Peculiarities:

    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
      punctuation character will be treated separately.

    - Such pretokenized input is then BPE subtokenized.

    This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.
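
    Example (an illustrative usage sketch; the checkpoint name below is only an assumed placeholder and may not
    match the checkpoint you actually want to load):

    ```python
    >>> from transformers import HerbertTokenizer

    >>> # "allegro/herbert-base-cased" is an assumed example repo that ships a vocab.json and merges.txt
    >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")
    >>> tokens = tokenizer.tokenize("Kto ma lepszą sztukę, ma lepszy rząd.")
    >>> input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ```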
    Nz<s>z<unk>z<pad>z<mask>z</s>F)
z
<special0>z
<special1>z
<special2>z
<special3>z
<special4>z
<special5>z
<special6>z
<special7>z
<special8>z
<special9>c                    	 dd l }|| _        i | _        i | _        h d| _        |
| _        || _        || _        ||t        |      t        |      k(  sJ d | _
        d | _        t        |d      5 }t        j                  |      | _        d d d        | j                  j!                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j%                         j'                  d      d d }d d d        D cg c]  }t)        |j'                         d d         }}t+        t-        |t/        t        |                        | _        i | _        t5        | l  d||	||||||||
d d	| t9        d
| j:                  d
d
      | _        y # t        $ r t        d      w xY w# 1 sw Y   #xY wc c}}w # 1 sw Y   xY wc c}w )Nr   zrYou need to install sacremoses to use HerbertTokenizer. See https://pypi.org/project/sacremoses/ for installation.>   jathzhutf-8encoding
ri      )	unk_token	bos_token	sep_token	pad_token	cls_token
mask_tokenadditional_special_tokenslang2idid2langdo_lowercase_and_remove_accenttokenizer_fileF)rM   rN   rO   rP   rv   )
sacremosesImportErrorsmcache_moses_punct_normalizercache_moses_tokenizerlang_with_custom_tokenizerr   r   r   rk   ja_word_tokenizerzh_word_tokenizeropenjsonloadencoderitemsdecoderreadrF   tupledictziprange	bpe_rankscachesuperrS   rJ   all_special_tokensbert_pre_tokenizer)rR   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsr   vocab_handlekvmerges_handlemergesmerge	__class__s                         r   rS   zHerbertTokenizer.__init__&  s   8	  -/)%'"*<'.L+7#6w<3w<///!%!%*w/ 	3<99\2DL	3)-););)=>A1>+0 	;M"'')//5cr:F	;8>?u%bq)*??c&%F*<=>
 	
!&?+I	
 	
 #1//#(	#
_  	M 	.	3 	3>	; 	;?s/   F 2F.2F;#G#GF+.F8G
c                     | j                   S rL   )r   rR   s    r   rM   zHerbertTokenizer.do_lower_casez  s     222r   c                     || j                   vr,| j                  j                  |      }|| j                   |<   n| j                   |   }|j                  |      S )Nlang)r   r   MosesPunctNormalizerrZ   )rR   r7   r   punct_normalizers       r   moses_punct_normz!HerbertTokenizer.moses_punct_norm  sZ    t888#ww;;;F6FD--d3#@@F))$//r   c                     || j                   vr,| j                  j                  |      }|| j                   |<   n| j                   |   }|j                  |dd      S )Nr   F)
return_strescape)r   r   MosesTokenizerrd   )rR   r7   r   moses_tokenizers       r   moses_tokenizezHerbertTokenizer.moses_tokenize  s_    t111"gg44$4?O/>D&&t,"88>O''u'MMr   c                 V    t        |      }| j                  ||      }t        |      }|S rL   )r8   r   rC   )rR   r7   r   s      r   moses_pipelinezHerbertTokenizer.moses_pipeline  s-    $T*$$T40'-r   c                    | j                   <	 dd l}|j                  dt        j                  j	                  d       d      | _         t        | j                   j                  |            S # t
        t        f$ r t        j                  d       t        j                  d       t        j                  d       t        j                  d       t        j                  d	       t        j                  d
        w xY w)Nr   z-model r,   z/local/share/kytea/model.binzMake sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following stepsz81. git clone git@github.com:neubig/kytea.git && cd kyteaz2. autoreconf -iz#3. ./configure --prefix=$HOME/localz4. make && make installz5. pip install kytea)r   Mykyteaospath
expanduserAttributeErrorr   loggererrorrj   getWS)rR   r7   r   s      r   ja_tokenizezHerbertTokenizer.ja_tokenize  s    !!))0bgg00566RS*& D**00677 #K0 
[ WX/0BC6734
s   ;A- -BC<c                 ,    t        | j                        S rL   )rk   r   r   s    r   
vocab_sizezHerbertTokenizer.vocab_size  s     4<<  r   c                 B    t        | j                  fi | j                  S rL   )r   r   added_tokens_encoderr   s    r   	get_vocabzHerbertTokenizer.get_vocab  s    DLL>D$=$=>>r   c                     t        |d d       |d   dz   fz   }| j                  v r j                  |   S t        |      }|s|dz   S 	 t        | fd      }| j                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d	k(  rd
}| j                  |<   |S # t        $ r |j                  ||d         Y pw xY w)Nri   </w>c                 N    j                   j                  | t        d            S )Ninf)r   getfloat)pairrR   s    r   <lambda>z&HerbertTokenizer.bpe.<locals>.<lambda>  s    1C1CD%PU,1W r   keyr   r   r   rV   z
  </w>z
</w>)r   r   r   minr   rk   indexr]   
ValueErrorr?   r@   )
rR   rb   r   r   bigramfirstsecondnew_wordrm   js
   `         r   bpezHerbertTokenizer.bpe  s   U3BZ E"I$6#88DJJ::e$$$6>!$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : xx~:D 

5/ " OODH-s   E   F ?F c                     | j                   j                  |      }g }|D ]=  }|s|j                  t        | j	                  |      j                  d                   ? |S )NrV   )r   rd   r]   rj   r   rF   )rR   r7   
pre_tokensra   rb   s        r   	_tokenizezHerbertTokenizer._tokenize  s_    ,,55d;
 	FE##D%)>)>s)C$DE	F r   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)r   r   r   )rR   rb   s     r   _convert_token_to_idz%HerbertTokenizer._convert_token_to_id  s,    ||t||'7'7'GHHr   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r   r   r   )rR   r   s     r   _convert_id_to_tokenz%HerbertTokenizer._convert_id_to_token  s    ||t~~66r   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.r;   r   rV   )r@   r4   rE   )rR   rG   
out_strings      r   convert_tokens_to_stringz)HerbertTokenizer.convert_tokens_to_string  s+    WWV_,,VS9??A
r   token_ids_0token_ids_1returnc                 f    | j                   g}| j                  g}|||z   |z   S ||z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
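
        Example (illustrative sketch; `tokenizer` is assumed to be an already loaded `HerbertTokenizer` and the ids
        below are hypothetical placeholders, not real vocabulary entries):

        ```python
        >>> # result is [bos_token_id] + [5, 6] + [sep_token_id] + [7] + [sep_token_id]
        >>> input_ids = tokenizer.build_inputs_with_special_tokens([5, 6], [7])
        ```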

        """
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return bos + token_ids_0 + sep
        return bos + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merges_file

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses