import os
from shutil import copyfile
from typing import List, Optional, Tuple

from tokenizers import processors

from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_mbart import MBartTokenizer
else:
    MBartTokenizer = None


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX",
    "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN",
    "zh_CN",
]


class MBartTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizerFast

    >>> tokenizer = MBartTokenizerFast.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
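    >>> # Illustrative note (not part of the original example): with src_lang="en_XX" the encoder input in
    >>> # `inputs["input_ids"]` ends with the `</s>` token followed by the `en_XX` language-code token, and the
    >>> # labels built from `text_target` end with `</s>` followed by the `ro_RO` code.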
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = MBartTokenizer

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        src_lang=None,
        tgt_lang=None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
        if additional_special_tokens is not None:
            # Only add the tokens that are not already registered as special tokens.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self.lang_code_to_id = {
            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
        }

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. The special tokens depend on the current source/target language setting (see
        `set_src_lang_special_tokens` and `set_tgt_lang_special_tokens`).

        An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids` (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
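
        Example (illustrative sketch, not part of the original docstring; it reuses the `tokenizer` from the
        class-level example, i.e. the `facebook/mbart-large-en-ro` checkpoint with `src_lang="en_XX"`):

        ```python
        >>> ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello"))
        >>> out = tokenizer.build_inputs_with_special_tokens(ids)
        >>> out[-2:] == [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("en_XX")]
        True
        ```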
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # Pairs are not the expected use case, but they are handled without a separator for API consistency.
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
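
        Example (illustrative, not part of the original docstring; `tokenizer` is any loaded `MBartTokenizerFast`
        instance):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 20, 30])
        [0, 0, 0, 0, 0]
        ```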

        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by the translation pipeline to prepare inputs for the generate function."""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        # Rebuild the backend post-processor so that single (`$A`) and pair (`$A`, `$B`) sequences are wrapped
        # with the current source-language prefix/suffix tokens.
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return

        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)