
"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import List, Optional, Tuple

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            # Greedy longest-match: try the longest remaining span first, then shrink.
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens

       L    e Zd ZdZeZddgZdZ	 	 	 	 	 	 	 	 	 d fd	Ze	d        Z
e	d        Ze	d        Ze	d	efd
       Zd Zd Z fdZd Zdee   d	efdZd Zd Zddedee   d	ee   fdZddee   dee   d	ee   fdZ	 ddee   deee      ded	ee   f fdZ xZS )CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
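
    Example (a minimal sketch; it assumes the publicly released `"openbmb/cpm-ant-10b"`
    checkpoint is reachable and that the `jieba` backend is installed):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
    >>> tokenizer.tokenize("今天天气真好！")  # jieba segmentation, then WordPiece
    ```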
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

                  v r)| j
                  d   | j
                  d<   | j
                  d= d| j
                  v r)| j
                  d   | j
                  d<   | j
                  d= t        j                  t        | j
                  j                         d	 
            | _        t        |dd      5 }| j
                  j                         D ]>  \  }}||k7  rt        j                  d| d       |}|j                  |dz          |dz  }@ 	 d d d        |fS # 1 sw Y   |fS xY w)N-r)   r   r   rB   </_>r   </n>c                     | d   S rD   r;   rE   s    r   rG   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>   rH   r'   rI   wr   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r*   )ospathisdirr-   VOCAB_FILES_NAMESrS   r   r   rT   rU   r   loggerwarningwrite)r%   r   r   r   r   writerr   token_indexs           r   save_vocabularyzCpmAntTokenizer.save_vocabulary   sx   77==(/3!6rUfgsUt tJ 4C/C/n\J$,,#'<<#4DLL S!4<<#'<<#5DLL T""..vdll6H6H6JP^/_`*cG4 		&*ll&8&8&: "{K'NN/
| <N N (EUT\*
		 }		 }s   AFFtoken_ids_0token_ids_1c                 h    || j                   g|z   S | j                   g|z   | j                   gz   |z   S )a1  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
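
        Example (illustrative; `1` stands in for whatever `bos_token_id` the loaded
        vocabulary defines):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([5, 6])
        [1, 5, 6]
        ```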
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
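
        Example (illustrative ids; the leading `1` marks the prepended `bos` position):

        ```python
        >>> tokenizer.get_special_tokens_mask([5, 6, 7])
        [1, 0, 0, 0]
        ```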
        T)r   r   r   r*   r   )rX   get_special_tokens_maskr,   )r%   r   r   r   r\   s       r   r   z'CpmAntTokenizer.get_special_tokens_mask   sy    " &72'[]a 3   "31#K 001QC7A3[AQ;QRRsqcC,,--r'   )	z<d>z</d>z<s>z</s>z<pad>r6   r   r   leftr"   )NF)r8   r9   r:   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer&   propertyr_   rb   re   intrh   rl   rs   rx   r}   r   strr   r   r   r   r   r   r   boolr   __classcell__)r\   s   @r   r=   r=   O   si   0 *$&67
 (
T , , , , " " !C ! !?4%tCy S I7c HSM ]bcf]g 6UDI UTXY\T] Uimnqir U& sx.9.3;DI3F.ko.	c. .r'   r=   )r   r   r   typingr   r   r   transformers.utilsr   r   rA   tokenization_utilsr	   utilsr
   
get_loggerr8   r   r   r   r    r=   r;   r'   r   <module>r      sh    '  	 ( ( D  5  
		H	%!;/  @|.) |.r'   