
    sg#3                         d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZ  ej                  e      Zdd	d
Zd Z G d de
      Zy)z Tokenization classes for PhoBERT    N)copyfile)ListOptionalTuple   )PreTrainedTokenizer)loggingz	vocab.txtz	bpe.codes)
vocab_filemerges_filec                 x    t               }| d   }| dd D ]  }|j                  ||f       |} t        |      }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/phobert/tokenization_phobert.py	get_pairsr   #   sO     EEQIQR 		9d#$	 JEL    c            
       *    e Zd ZdZeZ	 	 	 	 	 	 	 d fd	Z	 ddee   de	ee      dee   fdZ
	 ddee   de	ee      dedee   f fdZ	 ddee   de	ee      dee   fd	Zed
        Zd Zd Zd Zd Zd Zd Zddede	e   dee   fdZd Z xZS )PhobertTokenizeraO	  
    Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        bos_token (`st`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    c
                    || _         || _        i | _        d| j                  t        |      <   d| j                  t        |      <   d| j                  t        |      <   d| j                  t        |      <   | j	                  |       | j                  j                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j                         j                  d      d d }d d d        D cg c]  }t        |j                         d d         }}t        t        |t        t        |                        | _        i | _        t#        | H  d
|||||||	d	|
 y c c}}w # 1 sw Y   xY wc c}w )Nr   r      r   utf-8encoding
)	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_token )r
   r   encoderstradd_from_fileitemsdecoderopenreadsplittupledictziprangelen	bpe_rankscachesuper__init__)selfr
   r   r    r!   r#   r$   r"   r%   r&   kwargskvmerges_handlemergesmerge	__class__s                   r   r8   zPhobertTokenizer.__init__f   sY    %&'(S^$'(S^$'(S^$'(S^$:&)-););)=>A1>+0 	;M"'')//5cr:F	;9?@%cr*+@@c&%F*<=>
 		
!		
 		
 ?	; 	;@s   %E'#E-7#E9-E6token_ids_0token_ids_1returnc                     || j                   g|z   | j                  gz   S | j                   g}| j                  g}||z   |z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A PhoBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)r9   rA   rB   clsseps        r    build_inputs_with_special_tokensz1PhobertTokenizer.build_inputs_with_special_tokens   sg    ( %%&48I8I7JJJ  !  ![ 3&,{:S@@r   already_has_special_tokensc                     |rt         |   ||d      S |dgdgt        |      z  z   dgz   S dgdgt        |      z  z   ddgz   dgt        |      z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rA   rB   rJ   r   r   )r7   get_special_tokens_maskr4   )r9   rA   rB   rJ   r@   s       r   rL   z(PhobertTokenizer.get_special_tokens_mask   s    & &72'[]a 3   31#K 001QC77sqcC,,-A61#K@P:PQUVTWWWr   c                     | j                   g}| j                  g}|t        ||z   |z         dgz  S t        ||z   |z   |z   |z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        r   )rF   rE   r4   )r9   rA   rB   rH   rG   s        r   $create_token_type_ids_from_sequencesz5PhobertTokenizer.create_token_type_ids_from_sequences   sm    "   !  !s[(3./1#553$s*S0;>DEKKr   c                 ,    t        | j                        S N)r4   r(   r9   s    r   
vocab_sizezPhobertTokenizer.vocab_size   s    4<<  r   c                 B    t        | j                  fi | j                  S rP   )r1   r(   added_tokens_encoderrQ   s    r   	get_vocabzPhobertTokenizer.get_vocab   s    DLL>D$=$=>>r   c                 $    | j                   v r j                   |   S t        |      }t        t        |d d       |d   dz   gz         }t        |      }|s|S 	 t	        | fd      }| j
                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d d	 }| j                   |<   |S # t        $ r |j                  ||d         Y nw xY w)
Nr   z</w>c                 N    j                   j                  | t        d            S )Ninf)r5   getfloat)pairr9   s    r   <lambda>z&PhobertTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1W r   )keyr   r   r   @@ )r6   r0   listr   minr5   r4   indexextend
ValueErrorappendjoin)
r9   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpezPhobertTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E/ /FFc                     g }t        j                  d|      }|D ]:  }|j                  t        | j	                  |      j                  d                   < |S )zTokenize a string.z\S+\n? )refindallrc   r`   rn   r/   )r9   textsplit_tokenswordsrg   s        r   	_tokenizezPhobertTokenizer._tokenize  sT    

9d+ 	BETXXe_%:%:3%? @A	Br   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)r(   rY   r"   )r9   rg   s     r   _convert_token_to_idz%PhobertTokenizer._convert_token_to_id  s,    ||t||'7'7'GHHr   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r,   rY   r"   )r9   rb   s     r   _convert_id_to_tokenz%PhobertTokenizer._convert_id_to_token!  s    ||t~~66r   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.rp   r^    )rf   replacestrip)r9   tokens
out_strings      r   convert_tokens_to_stringz)PhobertTokenizer.convert_tokens_to_string%  s,    XXf%--eR8>>@
r   save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  r@t         j                  j                  | j                        rt        | j                  |       nit         j                  j                  | j                        s@t        |d      5 }| j                  j                         }|j                  |       d d d        t         j                  j                  | j                        t         j                  j                  |      k7  rt        | j                  |       ||fS # 1 sw Y   lxY w)NzVocabulary path (z) should be a directory-r|   r
   r   wb)ospathisdirloggererrorrf   VOCAB_FILES_NAMESabspathr
   isfiler   r-   sp_modelserialized_model_protowriter   )r9   r   r   out_vocab_fileout_merge_fileficontent_spiece_models          r   save_vocabularyz PhobertTokenizer.save_vocabulary*  sr   ww}}^,LL,^,<<STUo_s22QbcoQpp
 o_s22QbcpQqq
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrT__n50nd+ /r'+}}'K'K'M$-./ 77??4++,0OOT%%~6~--/ /s   ,G11G:c                    t        |t              r*	 t        |dd      5 }| j                  |       ddd       y|j                         }|D ]Z  }|j                         }|j                  d      }|dk(  rt        d	      |d| }t        | j                        | j                  |<   \ y# 1 sw Y   yxY w# t        $ r}|d}~wt
        $ r t        d| d      w xY w)
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr   r   NzIncorrect encoding detected in z, please rebuild the datasetrp   r   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer)   r-   r*   FileNotFoundErrorUnicodeError	Exception	readlinesr~   rfindrd   r4   r(   )	r9   ffdfnfelineslineTmplineidxr   s	            r   r*   zPhobertTokenizer.add_from_fileG  s     ac!S73 +r&&r*+  	3G==?D**S/Cby !XYY:D!$T\\!2DLL	3+ 	 % 
 c"A!D` abbcs3   B7 B+B7 +B40B7 4B7 7	C CC)<s></s>r   r   z<unk>z<pad>z<mask>rP   )NF)__name__
__module____qualname____doc__r   vocab_files_namesr8   r   intr   rI   boolrL   rN   propertyrR   rU   rn   rv   rx   rz   r   r)   r   r   r*   __classcell__)r@   s   @r   r   r   3   s@   .` * *
Z JNA9A3;DI3FA	cA6 sxX9X3;DI3FXkoX	cX: JNL9L3;DI3FL	cL0 ! !?*XI7
.c .HSM .]bcf]g .:3r   r   )r   r   rq   shutilr   typingr   r   r   tokenization_utilsr   utilsr	   
get_loggerr   r   r   r   r   r'   r   r   <module>r      sW     ' 	 	  ( ( 5  
		H	%   i3* i3r   