
"""Tokenization classes for FSMT."""

import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",
    "tgt_vocab_file": "vocab-tgt.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)
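
# Illustrative behaviour of the helpers above, shown doctest-style in comments rather than as
# module-level code (the exact outputs follow from the functions as written; nothing here is
# executed at import time):
#
#   >>> sorted(get_pairs(("m", "a", "t", "s</w>")))
#   [('a', 't'), ('m', 'a'), ('t', 's</w>')]
#   >>> replace_unicode_punct("２０２４年？")
#   '2024年?'
#   >>> remove_non_printing_char("ab\u200bc")  # U+200B (zero-width space) has category Cf
#   'abc'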
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 d  fd	Zdee	e
f   fdZede
fd       Zd Zd	 Zd
 Zd Zed        Zed        Zd Zd Zd Zd!dZd Zd Zd Z	 d"dee
   deee
      dee
   fdZ	 d#dee
   deee
      dedee
   f fdZ	 d"dee
   deee
      dee
   fdZ d"de	dee	   de!e	   fdZ"d Z#d Z$ xZ%S )$FSMTTokenizera	  
    Construct a FAIRSEQ Transformer tokenizer, based on Byte-Pair Encoding. The tokenization process is the following:

    - Moses preprocessing and tokenization.
    - Normalizing all input text.
    - The arguments `special_tokens` and the function `set_special_tokens` can be used to add additional symbols (like
      "__classify__") to a vocabulary.
    - The argument `langs` defines a pair of languages.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
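
    Example (a minimal usage sketch; `facebook/wmt19-en-ru` is one published FSMT checkpoint and is used here only
    for illustration, so the exact ids depend on its vocabulary):

    ```python
    >>> from transformers import FSMTTokenizer

    >>> tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
    >>> enc = tokenizer("Machine learning is great, isn't it?")
    >>> enc["input_ids"][-1] == tokenizer.sep_token_id  # a single sequence ends with `</s>`
    True
    ```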

    Args:
        langs (`List[str]`, *optional*):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
        src_vocab_file (`str`, *optional*):
            File containing the vocabulary for the source language.
        tgt_vocab_file (`str`, *optional*):
            File containing the vocabulary for the target language.
        merges_file (`str`, *optional*):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        langs=None,
        src_vocab_file=None,
        tgt_vocab_file=None,
        merges_file=None,
        do_lower_case=False,
        unk_token="<unk>",
        bos_token="<s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        self.src_vocab_file = src_vocab_file
        self.tgt_vocab_file = tgt_vocab_file
        self.merges_file = merges_file
        self.do_lower_case = do_lower_case

        # caches of sacremoses helper instances, keyed by language code
        self.cache_moses_punct_normalizer = {}
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        if langs and len(langs) == 2:
            self.src_lang, self.tgt_lang = langs
        else:
            raise ValueError(
                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
                "Usually that means that the tokenizer can't find a mapping for the given model path "
                "in the maps of this tokenizer."
            )

        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
            self.encoder = json.load(src_vocab_handle)
        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
            tgt_vocab = json.load(tgt_vocab_handle)
            self.decoder = {v: k for k, v in tgt_vocab.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            langs=langs,
            src_vocab_file=src_vocab_file,
            tgt_vocab_file=tgt_vocab_file,
            merges_file=merges_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.get_src_vocab()
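
    # Expected on-disk formats read by `__init__` (an illustrative sketch, not taken from a real
    # checkpoint): both vocab files map token -> id, and every line of merges.txt holds one ranked
    # BPE merge as a space-separated symbol pair, with earlier lines applied first, e.g.
    #
    #   vocab-src.json / vocab-tgt.json:  {"<unk>": 0, "<s>": 1, "</s>": 2, "mat</w>": 123, ...}
    #   merges.txt:                       "t h" on one line, "th e</w>" on the next, and so on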

    @property
    def vocab_size(self) -> int:
        return self.src_vocab_size

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        return self.cache_moses_punct_normalizer[lang].normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    @property
    def src_vocab_size(self):
        return len(self.encoder)

    @property
    def tgt_vocab_size(self):
        return len(self.decoder)

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.decoder, **self.added_tokens_decoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). The language should be one of the model's supported
              languages. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # ignore `lang`, which currently isn't explicitly passed in from generation
        lang = self.src_lang

        if self.do_lower_case:
            text = text.lower()

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # strip the BPE end-of-word markers and re-join sub-word units
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        # detokenize with Moses for the target language
        text = self.moses_detokenize(tokens, self.tgt_lang)
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return token_ids_0 + sep
        return token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        src_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"]
        )
        tgt_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(src_vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        with open(tgt_vocab_file, "w", encoding="utf-8") as f:
            tgt_vocab = {v: k for k, v in self.decoder.items()}
            f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return src_vocab_file, tgt_vocab_file, merges_file

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses