
# coding=utf-8
"""Tokenization classes for BioGPT."""

import json
import os
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class BioGptTokenizer(PreTrainedTokenizer):
    """
    Construct a FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
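
    Example (an illustrative usage sketch; it assumes the `microsoft/biogpt` checkpoint and the `sacremoses`
    package are available):

    ```python
    >>> from transformers import BioGptTokenizer

    >>> tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    >>> tokens = tokenizer.tokenize("BioGPT is pre-trained on biomedical literature.")
    >>> inputs = tokenizer("BioGPT is pre-trained on biomedical literature.")
    >>> inputs["input_ids"][0] == tokenizer.sep_token_id  # sequences are prefixed with `</s>`
    True
    ```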
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.lang = "en"
        self.sm = sacremoses
        # cache MosesTokenizer/MosesDetokenizer instances keyed by language
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            self.cache_moses_tokenizer[lang] = self.sm.MosesTokenizer(lang=lang)
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            self.cache_moses_detokenizer[lang] = self.sm.MosesDetokenizer(lang=lang)
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # merge the lowest-ranked bigram until no known merge remains
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, bypass_tokenizer=False):
        """Returns a tokenized string."""
        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_tokenize(text, self.lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        # strip BPE end-of-word markers, then Moses-detokenize
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        return self.moses_detokenize(tokens, self.lang)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BioGPT sequence has the following format:

        - single sequence: `</s> X `
        - pair of sequences: `</s> A </s> B `

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
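
        Example (illustrative IDs, given an instantiated `tokenizer`; actual values depend on its vocabulary):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([5, 6])  # -> [tokenizer.sep_token_id, 5, 6]
        >>> tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  # -> sep + [5, 6] + sep + [7, 8]
        ```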
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
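
        Example (illustrative IDs, given an instantiated `tokenizer`):

        ```python
        >>> tokenizer.get_special_tokens_mask([5, 6], [7, 8])
        [1, 0, 0, 1, 0, 0]
        ```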
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
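
        Example (illustrative IDs, given an instantiated `tokenizer`):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8, 9])
        [0, 0, 0, 1, 1, 1, 1]
        ```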
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        state = self.__dict__.copy()
        # the sacremoses module handle cannot be pickled; it is re-imported in `__setstate__`
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses
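

# Illustrative sketch (assumptions: local `vocab.json`/`merges.txt` files compatible with this tokenizer
# and an already existing output directory). Shown as comments so nothing runs on import.
#
#   tokenizer = BioGptTokenizer(vocab_file="vocab.json", merges_file="merges.txt")
#   ids = tokenizer("left ventricular dysfunction")["input_ids"]
#   text = tokenizer.decode(ids, skip_special_tokens=True)
#   tokenizer.save_vocabulary("exported_vocab")  # the target directory must already exist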