
    sg                         d Z ddlZddlmZmZ ddlmZ ddlmZ  ej                  e
      ZddiZd	 Z G d
 de      Zy)zTokenization classes for ESM.    N)ListOptional   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                     t        | d      5 }|j                         j                         }|D cg c]  }|j                          c}cd d d        S c c}w # 1 sw Y   y xY w)Nr)openread
splitlinesstrip)r   flinesls       [/var/www/html/venv/lib/python3.12/site-packages/transformers/models/esm/tokenization_esm.pyload_vocab_filer      sS    	j#	 *!##%#()a	)* *)* *s   #AAAAA!c            
            e Zd ZdZeZddgZ	 	 	 	 	 d fd	Zdede	fdZ
de	defd	Zd
 Zd Zde	defdZdede	fdZ	 ddee   deee      dee   fdZ	 ddedee   dedee   fdZd Zedefd       Z xZS )EsmTokenizerz&
    Constructs an ESM tokenizer.
    	input_idsattention_maskc           	      V   t        |      | _        t        t        | j                              | _        t        | j                        D 	ci c]  \  }}	|	|
 c}	}| _        t        
|   d|||||d| | j                  | _        | j                  | j                         y c c}	}w )N)	unk_token	cls_token	pad_token
mask_token	eos_token )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r   r   r   r   kwargsindtok	__class__s             r   r&   zEsmTokenizer.__init__+   s     **5 4??!;<6?6PQ(#sS#XQ 	
!	
 	
 '+oo#$556 Rs   B%indexreturnc                 N    | j                   j                  || j                        S Nr#   getr   r)   r.   s     r   _convert_id_to_tokenz!EsmTokenizer._convert_id_to_tokenG         $$UDNN;;    tokenc                     | j                   j                  || j                   j                  | j                              S r1   r$   r3   r   r)   r8   s     r   _convert_token_to_idz!EsmTokenizer._convert_token_to_idJ   0      $$UD,=,=,A,A$..,QRRr7   c                 "    |j                         S r1   )split)r)   textr*   s      r   	_tokenizezEsmTokenizer._tokenizeM   s    zz|r7   c                 p    | j                   j                         }|j                  | j                         |S r1   )r$   copyupdateadded_tokens_encoder)r)   
base_vocabs     r   	get_vocabzEsmTokenizer.get_vocabP   s0    &&++-
$334r7   c                     | j                   j                  || j                   j                  | j                              S r1   r:   r;   s     r   token_to_idzEsmTokenizer.token_to_idU   r=   r7   c                 N    | j                   j                  || j                        S r1   r2   r4   s     r   id_to_tokenzEsmTokenizer.id_to_tokenX   r6   r7   token_ids_0token_ids_1c                     | j                   g}| j                  g}|| j                  ||z   S ||z   |z   S | j                  t        d      ||z   |z   |z   |z   S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r)   rL   rM   clsseps        r    build_inputs_with_special_tokensz-EsmTokenizer.build_inputs_with_special_tokens[   s       !  !  ([(([(3..&\]][ 3&4s::r7   already_has_special_tokensc                     |r-|t        d      |D cg c]  }|| j                  v rdnd c}S dgdgt        |      z  z   dgz   }||dgt        |      z  dgz   z  }|S c c}w )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.   r   )rQ   all_special_idslen)r)   rL   rM   rU   r8   masks         r   get_special_tokens_maskz$EsmTokenizer.get_special_tokens_maski   s    $ && R 
 LWW%$"6"66AA=WWsqcC,,-3"QC#k**aS00D	 Xs   A!c                     t         j                  j                  ||r|dz   nddz         }t        |d      5 }|j	                  dj                  | j
                               d d d        |fS # 1 sw Y   |fS xY w)N- r	   w
)ospathjoinr   writer    )r)   save_directoryfilename_prefixr   r   s        r   save_vocabularyzEsmTokenizer.save_vocabulary   sk    WW\\.O?S3Hacgr2rs
*c" 	0aGGDIIdoo./	0}	0}s   +A--A8c                 ,    t        | j                        S r1   )rY   r    )r)   s    r   
vocab_sizezEsmTokenizer.vocab_size   s    4??##r7   )z<unk>z<cls>z<pad>z<mask>z<eos>r1   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr&   intstrr5   r<   rA   rG   rI   rK   r   r   rT   boolr[   rg   propertyri   __classcell__)r-   s   @r   r   r   #   s    *$&67
 78<# <# <S# S# S
S S S< < < JN;9;3;DI3F;	c; in.6tnae	c> $C $ $r7   r   )rm   ra   typingr   r   tokenization_utilsr   utilsr   
get_loggerrj   loggerrn   r   r   r   r7   r   <module>r{      sI    $ 	 ! 5  
		H	%!;/ *m$& m$r7   