
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = {
    "base": ["__java__", "__python__", "__en_XX__"],
    "multi": ["__java__", "__python__", "__en_XX__", "__javascript__", "__php__", "__ruby__", "__go__"],
}

FAIRSEQ_LANGUAGE_CODES_MAP = {
    "java": "__java__",
    "python": "__python__",
    "en_XX": "__en_XX__",
    "javascript": "__javascript__",
    "php": "__php__",
    "ruby": "__ruby__",
    "go": "__go__",
}


class PLBartTokenizer(PreTrainedTokenizer):
    """
    Construct a PLBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        src_lang (`str`, *optional*):
            A string representing the source language.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The start of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The cls token, which is a special token used as the first token for all tasks.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masking tasks. This
            is only used in the `"base"` tokenizer type. For `"multi"` tokenizer, masking is never done for the
            downstream tasks.
        language_codes (`str`, *optional*, defaults to `"base"`):
            What language codes to use. Should be one of `"base"` or `"multi"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
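
            For example, passing `sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}` (an
            illustrative configuration, not a library default) turns on subword sampling when encoding.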

    Examples:

    ```python
    >>> from transformers import PLBartTokenizer

    >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
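
    >>> # Illustrative note (not part of the original example): with `src_lang="python"` the encoder input
    >>> # follows the `<tokens> </s> <language code>` layout described above, so `inputs["input_ids"]`
    >>> # ends with the `</s>` token id followed by the id of the `__python__` language code.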
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        language_codes="base",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=True,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        src_lang = self._convert_lang_code_special_format(src_lang)
        tgt_lang = self._convert_lang_code_special_format(tgt_lang)

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        self.language_codes = language_codes
        fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]

        # Mimic fairseq token-to-id alignment for the first 4 special tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The sentencepiece vocab is shifted by one with respect to the fairseq vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}

        if self.language_codes == "base":
            self.fairseq_tokens_to_ids["<mask>"] = (
                len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
            )

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        if self.language_codes == "base":
            self._src_lang = src_lang
            self.cur_lang_code_id = (
                self.lang_code_to_id[self._src_lang] if self._src_lang is not None else self._src_lang
            )
        else:
            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            language_codes=language_codes,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        if self.language_codes == "base":
            # Plus 1 for the mask token
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
        else:
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A PLBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "python",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        src_lang = self._convert_lang_code_special_format(src_lang)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        lang = self._convert_lang_code_special_format(lang)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert Language Codes to format tokenizer uses if required"""
        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP.keys() else lang
        return lang