
import json
import os
import re
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

SPIECE_UNDERLINE = "▁"


class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
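
            A typical value for subword regularization at training time (illustrative, not a
            library default) is `{"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}`.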

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
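
    >>> # Illustrative round trip (output omitted): labels decode with the target-side
    >>> # SentencePiece model, so this should recover the German sentences.
    >>> tokenizer.batch_decode(inputs["labels"], skip_special_tokens=True)  # doctest: +SKIP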
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    language_code_re = re.compile(">>.+<<")  # type: re.Pattern

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

        self._setup_normalizer()

        super().__init__(
            # bos_token=bos_token,  unused. Start decoding with config.decoder_start_token_id
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        match = self.language_code_re.match(text)
        code: list = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
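
        Example (an illustrative sketch; assumes the `Helsinki-NLP/opus-mt-en-de` checkpoint
        from the class-level example is available):

        ```python
        >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")  # doctest: +SKIP
        >>> batch = tokenizer(["I am a small frog."], return_tensors="pt")  # doctest: +SKIP
        >>> tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)  # doctest: +SKIP
        ```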
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
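
        Example (an illustrative sketch; `tokenizer` is assumed to be built as in the
        class-level example above):

        ```python
        >>> ids = tokenizer("I am a small frog.").input_ids  # doctest: +SKIP
        >>> tokenizer.decode(ids, skip_special_tokens=True)  # doctest: +SKIP
        ```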
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> Dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> Dict:
        state = self.__dict__.copy()
        state.update(
            {k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"]}
        )
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[Dict, List]:
    with open(path, "r") as f:
        return json.load(f)