
    sg7                         d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z
ddlmZmZmZ ddlmZ  ej"                  e      ZdZdd	iZg d
Z G d de      Zy)    N)copyfile)AnyDictListOptionalTuple   )
AddedTokenBatchEncodingPreTrainedTokenizer)loggingu   ▁
vocab_filezsentencepiece.bpe.model)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                   N    e Zd ZU dZeZddgZg Zee	   e
d<   g Zee	   e
d<   	 	 	 	 	 	 	 	 	 	 	 	 d+deeeef      f fdZd	 Zd
 Zed        Zedefd       Zej,                  deddfd       Z	 d,dee	   deee	      dedee	   f fdZ	 d-dee	   deee	      dee	   fdZ	 d-dee	   deee	      dee	   fdZdedee   dee   fdZd Zdedee   fdZd Zd Zd Z d-d ed!ee   de!e   fd"Z"	 	 	 d.d#ee   ded$eee      dede#f
 fd%Z$d& Z%d' Z&d/d(Z'd)eddfd*Z( xZ)S )0MBartTokenizeruT  
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = (
            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Mimic the fairseq token-to-id alignment for the first 4 special tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        _additional_special_tokens = list(self.lang_code_to_id.keys())
        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=None,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility with pickles that predate `sp_model_kwargs`
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[src_lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
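

# A minimal usage sketch, kept out of the class on purpose: it simply replays the example from the
# class docstring. It assumes the `transformers` package is installed and that the
# `facebook/mbart-large-en-ro` checkpoint can be downloaded; nothing below runs on import.
def _example_usage():
    from transformers import MBartTokenizer

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"

    # Source ids end with [</s>, en_XX]; target ids built from `text_target` end with [</s>, ro_RO],
    # following the `<tokens> <eos> <language code>` scheme documented in the class docstring.
    inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian)
    return inputs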