
"""Tokenization class for model MyT5."""

import json
import os
import warnings
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "byte_maps.json"}


class ByteRewriter:
    """
    Byte rewriter class for MyT5 tokenizer.
    This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules.

    Args:
        rewriting_rules (`str` or `Dict[str, str]`):
            A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules.
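
    Example (an illustrative rule, not taken from a real byte map; rules map
    space-separated two-character hex byte sequences to one another):

    ```python
    >>> rewriter = ByteRewriter({"61 62": "ff"})  # rewrite the byte pair 0x61 0x62 to 0xff
    >>> rewriter.rewrite_bytes(["61", "62", "63"])
    ['ff', '63']
    >>> rewriter.rewrite_bytes(["ff", "63"], reverse=True)
    ['61', '62', '63']
    ```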

    z[LEAF]rewriting_rulesc                    t        |t              r+t        |d      5 }t        j                  |      }d d d        n't        |t
              st        dt        |             | j                  |      | _	        |j                         D ci c]  \  }}||
 }}}| j                  |      | _        y # 1 sw Y   YxY wc c}}w )NrzDrewriting_rules should be either a path to json file or a dict, got )
isinstancestropenjsonloaddict
ValueErrortypeconstruct_hash_tree	hash_treeitemsreverse_hash_tree)selfr   fkvreverse_rewriting_ruless         ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/myt5/tokenization_myt5.py__init__zByteRewriter.__init__.   s    os+os+ /q"&))A,/ /OT2VW[\kWlVmn  11/B4C4I4I4K"LDAq1a4"L"L!%!9!9:Q!R/ / #Ms   B3B?3B<r   byte_in_sequencebyte_out_sequencec                     |j                  d      }|j                  d      }|}|D ]  }||vri ||<   ||   } ||| j                  <   y)zL
        Add a leaf with the output byte sequence to the hash tree.
        """
        byte_in_list = byte_in_sequence.split(" ")
        byte_out_list = byte_out_sequence.split(" ")

        tree_pointer = hash_tree
        for b in byte_in_list:
            if b not in tree_pointer:
                tree_pointer[b] = {}
            tree_pointer = tree_pointer[b]

        tree_pointer[self.LEAF] = byte_out_list

    def construct_hash_tree(self, rewriting_rules: Dict[str, str]) -> Dict[str, Union[dict, List[str]]]:
        """
        Construct a hash tree for rewritten byte sequences.
        """
        hash_tree = defaultdict(dict)
        # Every single byte rewrites to itself unless a rule overrides it.
        for b in (f"{x:02x}" for x in range(256)):
            hash_tree[b][self.LEAF] = [b]

        for in_sequence, out_sequence in rewriting_rules.items():
            self.add_leaf(hash_tree, in_sequence, out_sequence)

        return hash_tree

    def search_hash_tree(self, byte_sequence: List[str]) -> Union[None, List[str]]:
        """
        Search the hash tree and return the rewritten byte sequence if found.
        """
        tree_pointer = self.hash_tree
        for b in byte_sequence:
            if b in tree_pointer:
                tree_pointer = tree_pointer[b]
            else:
                return None

        return tree_pointer[self.LEAF]

    def rewrite_bytes(self, in_bytes: List[str], reverse=False) -> List[str]:
        """
        Rewrite a sequence of bytes using the hash tree.

        Args:
            in_bytes (`List[str]`): A list of bytes to be rewritten.
            reverse (`bool`): If True, decoding is performed with the reverse hash tree.
        Returns:
            `List[str]`: The rewritten byte sequence.
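
        Example of the greedy longest-match behavior (hypothetical rules):

        ```python
        >>> rw = ByteRewriter({"61": "aa", "61 62": "ff"})
        >>> rw.rewrite_bytes(["61", "63"])  # only the one-byte rule matches
        ['aa', '63']
        >>> rw.rewrite_bytes(["61", "62"])  # the two-byte rule wins over "61" -> "aa"
        ['ff']
        ```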
        """
        out_bytes = []
        b_start = 0
        b_end = 0

        while b_start < len(in_bytes):
            tree_pointer = self.hash_tree if not reverse else self.reverse_hash_tree
            for j in range(b_start, len(in_bytes)):
                b = in_bytes[j]
                if b in tree_pointer:
                    tree_pointer = tree_pointer[b]
                elif j == b_start:
                    # No rule starts with this byte; emit it unchanged.
                    cur_leaf = [b]
                    b_end = j
                    break
                else:
                    break
                if self.LEAF in tree_pointer:
                    # A full rule matched up to position j; remember its output (longest match wins).
                    cur_leaf = tree_pointer[self.LEAF]
                    b_end = j
            out_bytes.extend(cur_leaf)
            b_start = b_end + 1

        return out_bytes


class MyT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a MyT5 tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
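
    Example (a sketch, assuming a checkpoint that ships the `byte_maps.json` rules file,
    such as `Tomlim/myt5-base` on the Hub):

    ```python
    >>> from transformers import MyT5Tokenizer

    >>> tokenizer = MyT5Tokenizer.from_pretrained("Tomlim/myt5-base")
    >>> input_ids = tokenizer("Life is like a box of chocolates.").input_ids
    >>> text = tokenizer.decode(input_ids)  # round-trips, with a trailing "</s>"
    ```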

    Args:
        vocab_file (`str`): The file containing the byte rewriting rules.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing, see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add the extra_ids to the list of additional special tokens.
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens.
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to MyT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        # Force left/right stripping of the special tokens (matches ByT5 behavior).
        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token

        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits

        # Load the decompose/merge byte maps that drive the morphological rewriting.
        with open(vocab_file, "r") as f:
            self.byte_maps = json.load(f)

        self.decompose_rewriter = ByteRewriter(self.byte_maps["decompose_map"])
        self.merge_rewriter = ByteRewriter(self.byte_maps["merge_map"])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
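
        Example (arbitrary ids, single sequence without special tokens, assuming the
        tokenizer from the class-level example):

        ```python
        >>> tokenizer.get_special_tokens_mask([5, 6, 7])
        [0, 0, 0, 1]
        ```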
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Normal case: one EOS token is appended after each sequence.
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
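
        Example (one zero per input id plus one per appended EOS, assuming the tokenizer
        from the class-level example):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([5, 6])
        [0, 0, 0]
        ```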
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
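
        Example (`eos_token_id` is 1 for this tokenizer, assuming the tokenizer from the
        class-level example):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([5, 6])
        [5, 6, 1]
        ```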
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words.
        Represents tokens in two character hex format."""

        tokens = [f"{i:02x}" for i in text.encode("utf-8")]
        tokens = self.morphological_encode(tokens)
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""

        if len(token) != 2:
            token_id = None
        else:
            # Hex byte tokens occupy ids [offset, offset + 255]; ids below offset are special tokens.
            token_id = int(token, 16) + self.offset

        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = f"{index - self.offset:02x}"
        return token

    def morphological_encode(self, indices: List[str]) -> List[str]:
        # Decompose and then merge the byte sequence according to the loaded byte maps.
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=False)
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=False)
        return indices

    def morphological_decode(self, indices: List[str]) -> List[str]:
        # Invert the merge step first, then the decompose step.
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=True)
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=True)
        return indices

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""

        # Collect tokens, resolving added-token entries to their string form.
        out_tokens = []
        for token in tokens:
            if token in self.added_tokens_decoder:
                out_tokens.append(self.added_tokens_decoder[token])
            elif token in self.added_tokens_encoder:
                out_tokens.append(token)
            else:
                out_tokens.append(token)

        out_tokens = self.morphological_decode(out_tokens)
        _added_tokens = set(self.added_tokens_decoder.values()) | set(self.added_tokens_encoder)
        for token in out_tokens:
            if token in _added_tokens:
                bstring += bytes(token, "utf-8")
            else:
                bstring += bytes.fromhex(token)
        string = bstring.decode("utf-8", errors="ignore")
        return string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            writer.write(json.dumps(self.byte_maps, indent=2, ensure_ascii=False))

        return (vocab_file,)
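

# A hedged end-to-end sketch of the byte pipeline (assumes a checkpoint that ships
# byte_maps.json, e.g. "Tomlim/myt5-base"; `_tokenize` and `_convert_token_to_id` are
# internal helpers and may change):
#
#     tok = MyT5Tokenizer.from_pretrained("Tomlim/myt5-base")
#     hex_tokens = tok._tokenize("hi")  # UTF-8 bytes as 2-char hex, morphologically rewritten
#     ids = [tok._convert_token_to_id(t) for t in hex_tokens]  # hex value + offset of 3 special tokens
#     text = tok.decode(tok("hi").input_ids)  # decodes back to "hi</s>"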