
    sg                        d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ  ej6                  e      Zdd
Zdede fdZ!d Z" G d d      Z# G d de#      Z$de defdZ% G d d      Z& G d de&      Z' G d de&      Z( G d de&      Z) G d de&      Z* G d d e&      Z+ G d! d"e&      Z, G d# d$e&      Z- G d% d&e&      Z. G d' d(e&      Z/ G d) d*e&      Z0 G d+ d,e&      Z1 G d- d.e&      Z2 G d/ d0e2      Z3 G d1 d2e2      Z4 G d3 d4e2      Z5 G d5 d6e2      Z6 G d7 d8e2      Z7 G d9 d:e2      Z8 G d; d<e2      Z9 G d= d>e2      Z: G d? d@e2      Z; G dA dBe2      Z< G dC dDe2      Z= G dE dFe2      Z> G dG dHe2      Z? G dI dJe2      Z@ G dK dLe2      ZA G dM dNe2      ZB G dO dPe&      ZC G dQ dRe2      ZD G dS dTe&      ZE G dU dVe&      ZF G dW dXe&      ZG G dY dZe2      ZH G d[ d\e2      ZI G d] d^e2      ZJ G d_ d`e&      ZK G da dbe2      ZLdc ZM G dd de      ZNi dfe3dge/dhe4die'djeDdkeGdle5dmeEdne,doe'dpe1dqe6dre'dse'dte'due'dve'i dwe3dxe)dye,dze-d{e'd|e'd}e/d~e;de/de/de'deKde7de8de*de'de/i de9de+de@de.de'de=de>de'de/de0de:de'deAdeBdeCde;de<e(eHeJeJeIeJdZOddefdZPy)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)DictListTuple)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                    t               rddlm} |S t               rSdd l}t        j                  |j                  j                        t        j                  d      k  rddl	m} |S ddl	m
} |S t        t        j                  |             )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      V/var/www/html/venv/lib/python3.12/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr%   $   sl    !#9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 4    | rd}t        |dd      sd}|S d}|S )NalwayslegacyTfirstnever)getattr)r'   original_tokenizerprepend_schemes      r$   _get_prepend_schemer1   5   s1    !)8T:$N  !r&   c                     |d u}|rt        |      n }g }|j                         D ]j  \  }}g }t        dt        |            D ]*  }|d | ||d  }	}| v s|	 v s|j	                  ||	|f       , t        | fd      }|j                  |       l t        |d |      }|D 
cg c]  }
|
d   |
d   f }}
|S c c}
w )Nr   c                 $    | d      | d      fS Nr   r    )xvocabs    r$   <lambda>z!generate_merges.<locals>.<lambda>J   s    U1Q4[%!+,F r&   keyc                 B    | d   t        | d         t        | d         fS )N   r   r   )lenvals    r$   r8   z!generate_merges.<locals>.<lambda>M   s!    SVSQ[#c!f+,N r&   r:   reverser   )dictitemsranger=   appendsortedextend)r7   vocab_scoresrA   mergesmergepiece_scorelocalindexpiece_lpiece_rr?   s   `          r$   generate_mergesrP   ?   s    $&G)04%eLF*002 {1c%j) 	>E$Ve}eEFmWG%Gu$4gw<=	> u"FGe F NX_`F*013s1vs1v1F1M 2s   'B<c                   D    e Zd ZdZdefdZddeeeef   e	e   f   fdZ
y)SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 v    t        | d       ddlm}  |       | _        | j                  j	                  |       y )Nr   r   )SentencePieceProcessor)r   r   rU   spLoad)selfrS   rU   s      r$   __init__zSentencePieceExtractor.__init__W   s)    $08(*Ur&   Nr(   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}t	        ||      }||fS c c}w )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        )rV   rD   GetPieceSizeid_to_piecerP   rX   rH   rV   rM   r7   rI   s         r$   extractzSentencePieceExtractor.extract^   sV    
 WW;@AR;ST%&-TT 5f}	 Us   AN)__name__
__module____qualname____doc__strrY   r   r   intr   r_   r5   r&   r$   rR   rR   R   s5    c 
E$sCx.$u+2M,N 
r&   rR   c                   4    e Zd Zddeeeef   ee   f   fdZy)GemmaSentencePieceExtractorNr(   c                     | j                   }t        |j                               D ci c]  }|j                  |      | }}|j	                  d      |d<   t        ||      }||fS c c}w )r[   <0x09>	)rV   rD   r\   r]   getrP   r^   s         r$   r_   z#GemmaSentencePieceExtractor.extractl   sj    
 WW;@AR;ST%&-TT ii)d 5f} Us   A'r`   )	ra   rb   rc   r   r   re   rf   r   r_   r5   r&   r$   rh   rh   k   s$    E$sCx.$u+2M,N r&   rh   piecec                 ^    t        |       dk  xs | d   dk7  xs | d   j                          S )Nr<   ,)r=   isdigit)rm   s    r$   check_number_commars   |   s3    u:>HU2Y#-HU2Y5F5F5H1HHr&   c                       e Zd Zd ZdefdZy)	Converterc                     || _         y r`   )r/   )rX   r/   s     r$   rY   zConverter.__init__   s
    "4r&   r(   c                     t               r`   )NotImplementedErrorrX   s    r$   	convertedzConverter.converted   s    !##r&   N)ra   rb   rc   rY   r	   rz   r5   r&   r$   ru   ru      s    5$9 $r&   ru   c                       e Zd ZdefdZy)BertConverterr(   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr/   r7   r	   r   re   r   hasattrr   tokenize_chinese_charsr   do_lower_caser   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr
   decoder
rX   r7   	tokenizerr   r   r   clssepr   r   s
             r$   rz   zBertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r&   Nra   rb   rc   r	   rz   r5   r&   r$   r|   r|          #9 #r&   r|   c                       e Zd ZdefdZy)SplinterConverterr(   c           
         | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }t	        | j                   j&                        }d}	| j                   j(                  }
| j                   j*                  }| j                   j,                  }| j                   j/                  d      }| j                   j0                  dk(  r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3        j4                  | d| d|||
f||f||f|	|fg      |_        t9        j                  d      |_        |S )Nr~   Fr   Tr   .rightr    r   r   r   r   r   r   )r/   r7   r	   r   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r
   r   )rX   r7   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r$   rz   zSplinterConverter.converted   s"   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*l#l#,-l#		$
	  %..d;	r&   Nr   r5   r&   r$   r   r      s    .9 .r&   r   c                       e Zd ZdefdZy)FunnelConverterr(   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr~   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r$   rz   zFunnelConverter.converted   r   r&   Nr   r5   r&   r$   r   r      r   r&   r   c                       e Zd ZdefdZy)MPNetConverterr(   c                 r   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	| d
||f||	fg      |_        t1        j                  d      |_        |S )Nr~   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r$   rz   zMPNetConverter.converted	  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=l#l#$
	  %..d;	r&   Nr   r5   r&   r$   r   r     r   r&   r   c                       e Zd ZdefdZy)OpenAIGPTConverterr(   c           
         | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d t        |      dd            }|j                  t        |            |j                  t        |      g       t        j                  d      |_        t        j                         |_        t#        j$                  d      |_        |S )N</w>F)r7   rI   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r/   encoderlist	bpe_rankskeysr   r	   r   re   token_to_idadd_special_tokensr   r   r   r   r   r   r
   
BPEDecoderr   rX   r7   rI   r   r   s        r$   rz   zOpenAIGPTConverter.converted0  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r&   Nr   r5   r&   r$   r   r   /  s    9 r&   r   c                   <    e Zd Zddeeef   deeeef      defdZ	y)GPT2ConverterNr7   rI   r(   c           
      N   |s| j                   j                  }|st        | j                   j                        }t	        t        ||d ddd            }t        | j                   dd      }t        j                  |      |_	        t        j                         |_        t        | j                   dd      rT| j                   j                  }| j                   j                  }t        j                  | d| d||fg	      |_        |S t        j                  d
      |_        |S )N Fr7   rI   r   continuing_subword_prefixr   r   r'   r'   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r/   r   r   r   r	   r   r.   r   	ByteLevelr   r
   r   	bos_tokenbos_token_idr   r   r   )rX   r7   rI   r   r'   bosr   s          r$   rz   zGPT2Converter.convertedK  s   ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUC))33C22??L'1'D'DguL),' (I$  (2';';'OI$r&   NN
ra   rb   rc   r   re   rf   r   r   r	   rz   r5   r&   r$   r   r   J  s2    "tCH~ "d5c?>S "_h "r&   r   c                       e Zd ZdefdZy)HerbertConverterr(   c           	         d}d}| j                   j                  }t        | j                   j                  j	                               }||d   d   v r|dd  }t        t        ||d | j                   j                  |            }t        j                  dd      |_
        t        j                         |_        t        j                  |      |_        t#        j$                  | j                   j&                  | j                   j(                  f| j                   j*                  | j                   j,                  f	      |_        |S )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r/   r   r   r   r   r	   r   r   r   r   r   r   r   r   r
   r   r   r   BertProcessingr   r   r   r   r   )rX   tokenizer_info_strtoken_suffixr7   rI   r   s         r$   rz   zHerbertConverter.convertedq  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r&   Nr   r5   r&   r$   r   r   p      9 r&   r   c                   <    e Zd Zddeeef   deeeef      defdZ	y)Qwen2ConverterNr7   rI   r(   c                 0   |s| j                   j                  }|s-t        | j                   j                  j	                               }t        t        ||d d dddd            }t        j                         |_	        t        j                  t        j                  t        d      dd      t        j                  t        | j                   dd      d      g      |_        t#        j                         |_        t'        j                  d	      |_        |S )
Nr   F)r7   rI   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr'   r'   	use_regexr   )r/   r   r   r   r   r	   r   r   NFCr   r   SequenceSplitr   r   r.   r   r
   r   r   r   )rX   r7   rI   r   s       r$   rz   zQwen2Converter.converted  s   ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r&   r   r   r5   r&   r$   r   r     s2    (tCH~ (d5c?>S (_h (r&   r   c                       e Zd ZdefdZy)RobertaConverterr(   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  |j                  |j                   f|j"                  |j$                  f|j                  d      |_        |S )Nr   Fr   r   Tr   r   r'   r   )r/   r   r   r   r   r	   r   r   r   r'   r   r
   r   r   RobertaProcessingr   r   r   r   r   rX   otr7   rI   r   s        r$   rz   zRobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r&   Nr   r5   r&   r$   r   r         9 r&   r   c                       e Zd ZdefdZy)RoFormerConverterr(   c           	      V   ddl m} | j                  j                  }t	        t        |t        | j                  j                                    }d}d}t        | j                  d      r@| j                  j                  j                  }| j                  j                  j                  }t        j                  dd||      |_        t        j                   j#                   ||            |_        t        | j                  j&                        }t        | j                  j(                        }| j                  j*                  }| j                  j,                  }	t/        j0                  | d| d	| d| d
| d||f||	fg      |_        t5        j
                  d      |_        |S )Nr   )JiebaPreTokenizerr~   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r/   r7   r	   r   re   r   r   r   r   r   r   r   r   r   PreTokenizercustomr   r   r   r   r   r   r   r   r
   r   )
rX   r   r7   r   r   r   r   r   r   r   s
             r$   rz   zRoFormerConverter.converted  sy   I''--iT=T=T=^=^9_`a	4**,=> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r&   Nr   r5   r&   r$   r   r     r   r&   r   c                       e Zd ZdefdZy)DebertaConverterr(   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  ddd| j                   j                  d      fd| j                   j                  d      fg	      |_        |S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r/   r   r   r   r   r	   r   r   r   r'   r   r
   r   r   r   r   r   r   s        r$   rz   zDebertaConverter.converted   s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r&   Nr   r5   r&   r$   r   r     r   r&   r   c                   `     e Zd ZdZeZi Z fdZd Zd Z	d Z
d Zd Zd Zd	 Zd
efdZ xZS )SpmConverterFc                    t        | d       t        |   |  t               }|j	                         }t        | j                  j                  d      5 }|j                  |j                                d d d        || _
        | j                  j                  j                  r#| j                  st        j                  d       y y y # 1 sw Y   TxY w)Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrY   r%   
ModelProtoopenr/   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rX   args	model_pb2mf	__class__s        r$   rY   zSpmConverter.__init__#  s    $
+$ $%	  "$))44d; 	(qaffh'	(
::""009R9RMMe :S0		( 	(s    CCc                 l    |j                   D cg c]  }|j                  |j                  f c}S c c}w r`   piecesrm   scorerX   r  rm   s      r$   r7   zSpmConverter.vocab8  s'    8=Euekk*EEEs   1c                 .    |j                   j                  S r`   )r  unk_idrX   r  s     r$   r  zSpmConverter.unk_id;  s    !!(((r&   c                 ~   |j                   j                  }| j                  |      }|dk(  r1t        t	        || j                  |      | j                              }n|dk(  r| j                  | j                  j                        j                  |      \  }}t        |      D 	ci c]  \  }\  }}	|| }
}}}	t        t        |
||j                   j                  d| j                  d             }nt        d      t        |j                        D cg c]I  \  }}|j                   dv r6||j"                  |j                   dk(  xs |j"                  | j$                  v fK }}}|j'                  t)        |d	 
      D cg c]  \  }}}t+        |d|       c}}}       |S c c}	}}w c c}}w c c}}}w )Nr   )r  r   r<   T)r   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithm)      r"  c                     | d   S Nr   r5   )r6   s    r$   r8   z(SpmConverter.tokenizer.<locals>.<lambda>j  s    QRSTQU r&   r9   F)
normalizedspecial)r  
model_typer7   r	   r   r  r  SpmExtractorr/   r  r_   	enumerater   	unk_piece	Exceptionr  typerm   r   
add_tokensrF   r   )rX   r  r(  rH   r   _rI   iwordr  	bpe_vocabidpspm_added_tokenstokenr'  s                   r$   r   zSpmConverter.tokenizer>  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIAv9B<9PQQ%5QuqQIQ!#00::!"&";"; 	I o  #5<<0
Avv !&&A+GD4G4G)GH
 

 	 +11A~*V &Bw 5UGD	
 C R*
s   )F+AF2F8c                     |j                   j                  }t        j                  dd      t        j                  t        d      d      g}|st        j                  |      S t        j                  t        j                  |      g|z         S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrX   r  r<  _normalizerss       r$   r   zSpmConverter.normalizerp  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr&   c                 \    t        || j                        }t        j                  ||      S Nreplacementr0   )r1   r/   r   	MetaspacerX   rE  r'   r0   s       r$   r   zSpmConverter.pre_tokenizer{  s)    ,-=t?V?VW''KP^__r&   c                      y r`   r5   ry   s    r$   r   zSpmConverter.post_processor  s    r&   c                 \    t        || j                        }t        j                  ||      S rC  )r1   r/   r
   rF  rG  s       r$   r   zSpmConverter.decoder  s(    ,-=t?V?VW!!k.YYr&   r(   c                 z   | j                  | j                        }| j                  | j                        }|||_        d}d}t        | j                  d      r| j                  j
                  }| j                  ||      }|||_        | j                  ||      |_        | j                         }|r||_        |S )Nr:  Tr'   )	r   r  r   r   r/   r'   r   r   r   )rX   r   r   rE  r'   r   r   s          r$   rz   zSpmConverter.converted  s    NN4::.	 __TZZ0
!#-I 4**,>?#66GG**;8HI$&3I# LL6FG	,,.'5I$r&   )ra   rb   rc   r  rR   r)  r   rY   r7   r  r   r   r   r   r   r	   rz   __classcell__r  s   @r$   r  r    sL     )LN*F)0d	h`Z9 r&   r  c                       e Zd Zd Zd Zd Zy)AlbertConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w Nd   r  rs   rm   r  r  s      r$   r7   zAlbertConverter.vocab  ^     
 +=U[[*IU[[%++&PUP[P[]b]h]hkn]nOoo
 	
 
   AA!c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S Nz``"z''r9  r   r   r>  r/   keep_accentsrE   NFKDStripAccentsr   	Lowercaser;  r<  r?  r   r   rX   r  list_normalizersr<  s       r$   r   zAlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nr  r  r  r  r   r   r   r/   r   ry   s    r$   r   zAlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
r&   Nra   rb   rc   r7   r   r   r5   r&   r$   rN  rN        
6&
r&   rN  c                       e Zd Zd Zd Zy)BarthezConverterc                 
    d}|S Nr"  r5   rX   r  r  s      r$   r  zBarthezConverter.unk_id      r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rb  ry   s    r$   r   zBarthezConverter.post_processor  R    ,, +//EEeLM00FFvNO
 	
r&   N)ra   rb   rc   r  r   r5   r&   r$   rg  rg    s    
r&   rg  c                       e Zd Zd Zd Zd Zy)CamembertConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )N))z
<s>NOTUSED        z<pad>rt  )z</s>NOTUSEDrt  z<unk>rt  )z<unk>NOTUSEDir   z<mask>rt  r  rX   r  r7   rm   s       r$   r7   zCamembertConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" L   Ac                      yri  r5   r   s     r$   r  zCamembertConverter.unk_id  s    r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rm  rb  ry   s    r$   r   z!CamembertConverter.post_processor  rp  r&   Nra   rb   rc   r7   r  r   r5   r&   r$   rr  rr    s    
r&   rr  c                       e Zd Zd Zd Zd Zy)DebertaV2Converterc                    g }| j                   j                  r%|j                  t        j                  d             t        || j                         }|j                  t        j                  ||             t        j                  |      S )Nr   )r   rD  )r/   split_by_punctrE   r   Punctuationr1   rF  r   )rX   rE  r'   list_pretokenizersr0   s        r$   r   z DebertaV2Converter.pre_tokenizer  sq    ""11%%n&@&@*&UV,-=t?V?VW!!.":":{cq"rs&&'9::r&   c                    g }| j                   j                  r#|j                  t        j                                |j                  t        j
                                |j                  j                  }|r$|j                  t        j                  |             |j                  t        j                  t        d      d             t        j                  |      S )Nr9  r   )r/   r   rE   r   r\  r=  r;  r<  r?  r>  r   r   r]  s       r$   r   zDebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S ra  rb  ry   s    r$   r   z!DebertaV2Converter.post_processor  rc  r&   N)ra   rb   rc   r   r   r   r5   r&   r$   r~  r~    s    ;6
r&   r~  c                       e Zd Zd Zd Zd Zy)MBartConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nrn  rt  ru  ro  rt  rv  r"  )ar_ARrt  cs_CZrt  de_DErt  en_XXrt  es_XXrt  et_EErt  fi_FIrt  fr_XXrt  gu_INrt  hi_INrt  it_ITrt  ja_XXrt  kk_KZrt  ko_KRrt  lt_LTrt  lv_LVrt  my_MMrt  ne_NPrt  nl_XXrt  ro_ROrt  ru_RUrt  si_LKrt  tr_TRrt  vi_VNrt  zh_CNrt  rw  r  rx  s       r$   r7   zMBartConverter.vocab  sa    
 	%,,qr:JK5;;,KK 
 	
6 	/""; L   A
c                      yri  r5   r   s     r$   r  zMBartConverter.unk_id9      r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A </s> en_XXz$A $B </s> en_XXr  ro  r   rb  ry   s    r$   r   zMBartConverter.post_processor<  R    ,,"#$11GGPQ00FFvNO
 	
r&   Nr|  r5   r&   r$   r  r    s    $L
r&   r  c                       e Zd Zd Zd Zd Zy)MBart50Converterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|dgz  }|S c c}w )Nr  r"  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArt  )az_AZrt  )bn_INrt  )fa_IRrt  )he_ILrt  )hr_HRrt  )id_IDrt  )ka_GErt  )km_KHrt  )mk_MKrt  )ml_INrt  )mn_MNrt  )mr_INrt  )pl_PLrt  )ps_AFrt  )pt_XXrt  )sv_SErt  )sw_KErt  )ta_INrt  )te_INrt  )th_THrt  )tl_XXrt  )uk_UArt  )ur_PKrt  )xh_ZArt  )gl_ESrt  )sl_SIrt  rw  r  rx  s       r$   r7   zMBart50Converter.vocabH  sa    
 	%,,qr:JK5;;,KK  R  	R/"" Lr  c                      yri  r5   r   s     r$   r  zMBart50Converter.unk_idT  r  r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzen_XX $A </s>zen_XX $A $B </s>r  ro  r   rb  ry   s    r$   r   zMBart50Converter.post_processorW  r  r&   Nr|  r5   r&   r$   r  r  G  s    

r&   r  c                       e Zd Zd Zd Zd Zy)NllbConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr  r"  r  rx  s       r$   r7   zNllbConverter.vocabc  C    
 	%,,qr:JK5;;,KK L   =c                      yri  r5   r   s     r$   r  zNllbConverter.unk_idm  r  r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnro  r   rb  ry   s    r$   r   zNllbConverter.post_processorp  sR    ,,%&T44JJ:VW00FFvNO
 	
r&   Nr|  r5   r&   r$   r  r  b  s    
r&   r  c                       e Zd Zd Zd Zd Zy)SeamlessM4TConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )N)ru  rv  r  r  r"  r  rx  s       r$   r7   zSeamlessM4TConverter.vocab|  r  r  c                 .    | j                   j                  S r`   )r/   unk_token_idr   s     r$   r  zSeamlessM4TConverter.unk_id  s    &&333r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__ro  r   rb  ry   s    r$   r   z#SeamlessM4TConverter.post_processor  sR    ,,$%D33II)TU00FFvNO
 	
r&   Nr|  r5   r&   r$   r  r  {  s    4
r&   r  c                       e Zd Zd Zd Zd Zy)XLMRobertaConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|dgz  }|S c c}w )Nr  r"  rw  r  rx  s       r$   r7   zXLMRobertaConverter.vocab  sP    
 	%,,qr:JK5;;,KK/"" Lry  c                 
    d}|S ri  r5   rj  s      r$   r  zXLMRobertaConverter.unk_id  rk  r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S rm  rb  ry   s    r$   r   z"XLMRobertaConverter.post_processor  rp  r&   Nr|  r5   r&   r$   r  r        	
r&   r  c                       e Zd Zd Zd Zd Zy)XLNetConverterc                     |j                   D cg c]J  }t        |j                        r|j                  |j                  fn|j                  |j                  dz
  fL c}S c c}w rP  rR  r  s      r$   r7   zXLNetConverter.vocab  rS  rT  c                    t        j                  dd      t        j                  dd      g}| j                  j                  sF|j	                  t        j
                                |j	                  t        j                                | j                  j                  r#|j	                  t        j                                |j                  j                  }|r$|j	                  t        j                  |             |j	                  t        j                  t        d      d             t        j                  |      S rV  rX  r]  s       r$   r   zXLNetConverter.normalizer  r_  r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rb  ry   s    r$   r   zXLNetConverter.post_processor  rc  r&   Nrd  r5   r&   r$   r  r    re  r&   r  c                       e Zd Zy)ReformerConverterNra   rb   rc   r5   r&   r$   r  r        r&   r  c                       e Zd Zd Zd Zy)RemBertConverterc                 b   t        j                  dd      t        j                  dd      t        j                  t        d      d      g}| j                  j                  sF|j                  t        j                                |j                  t        j                                | j                  j                  r#|j                  t        j                                |j                  j                  }|r$|j                  t        j                  |             t        j                  |      S rV  )r   r>  r   r/   rY  rE   rZ  r[  r   r\  r;  r<  r?  r   r]  s       r$   r   zRemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S ra  rb  ry   s    r$   r   zRemBertConverter.post_processor  rc  r&   N)ra   rb   rc   r   r   r5   r&   r$   r  r    s    6&
r&   r  c                       e Zd Zy)BertGenerationConverterNr  r5   r&   r$   r  r    r  r&   r  c                   $    e Zd Zd Zd Zd Zd Zy)PegasusConverterc                 v   | j                   j                  df| j                   j                  dfg}| j                   j                  || j                   j                  dfgz  }| j                   j                  I| j                   j
                  | j                   j                  k  r|| j                   j                  dfgz  }|t        d| j                   j                        D cg c]
  }d| ddf c}z  }||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w c c}w )Nrt  r<   z<unk_>g      Y)r/   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrD   r  rm   r  )rX   r  r7   r0  rm   s        r$   r7   zPegasusConverter.vocab  s%   $$..4$$..4

 ""22>t..>>DEEE ##..:''558O8O8V8VVt..993?@@E%4;R;R;Y;Y2Z[QU1#Q<([[%,,qr:JK5;;,KK \Ks   %D1D6c                 \    |j                   j                  | j                  j                  z   S r`   )r  r  r/   r  r   s     r$   r  zPegasusConverter.unk_id  s%    !!((4+B+B+I+IIIr&   c                     t        || j                        }t        j                  t        j                         t        j
                  ||      g      S rC  )r1   r/   r   r   WhitespaceSplitrF  rG  s       r$   r   zPegasusConverter.pre_tokenizer  sJ    ,-=t?V?VW&&..0(([Q_`
 	
r&   c                     | j                   j                  }|| j                   j                  fg}t        j                  d|gdd|g|      S )N$A$Br   )r/   r  eos_token_idr   r   )rX   eosr   s      r$   r   zPegasusConverter.post_processor  sR    %%//$))667
 ,,T3KtTSVFWhvwwr&   N)ra   rb   rc   r7   r  r   r   r5   r&   r$   r  r    s    &J
xr&   r  c                       e Zd Zd Zd Zy)T5Converterc                     | j                   j                  }|j                  D cg c]  }|j                  |j                  f }}|t        |dz
  dd      D cg c]
  }d| ddf c}z  }|S c c}w c c}w )Nr   ro   z
<extra_id_r	  rt  )r/   
_extra_idsr  rm   r  rD   )rX   r  num_extra_idsrm   r7   r0  s         r$   r7   zT5Converter.vocab%  sw    //::9>F%++u{{+FFE-!:KRQS4TUqZs!$c*UU GUs   A/A4c                 r    t        j                  ddgg dd| j                  j                  d      fg      S Nr  ro  )r  ro  r  ro  r   rb  ry   s    r$   r   zT5Converter.post_processor+  =    ,,&>-00FFvNO
 	
r&   N)ra   rb   rc   r7   r   r5   r&   r$   r  r  $  s    
r&   r  c                       e Zd Zd Zy)UdopConverterc                 r    t        j                  ddgg dd| j                  j                  d      fg      S r  rb  ry   s    r$   r   zUdopConverter.post_processor6  r  r&   Nra   rb   rc   r   r5   r&   r$   r!  r!  5  s    
r&   r!  c                       e Zd ZdefdZy)WhisperConverterr(   c           
         | j                   j                  }t        | j                   j                  j	                               }t        t        ||d ddd            }t        j                  | j                   j                        |_
        t        j                         |_        | j                   j                  }| j                   j                  |      }| j                   j                  }| j                   j                   }dj#                  |D cg c]  }| d	 c}      }	t%        j&                  |	 d| d|	 d| d	||fgt)        ||      
      |_        |S c c}w )Nr   Fr   r   r   r   z $A:0 z $A:0 $B:1 r   r   )r/   r   r   r   r   r	   r   r   r   r'   r   r
   r   prefix_tokensconvert_ids_to_tokensr  r  joinr   r   zipr   )
rX   r7   rI   r   prefix_token_idsprefixesr  r  r6  prefix_templates
             r$   rz   zWhisperConverter.convertedA  sR   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GUugRL#GH#-#@#@%&fSE4#$KuB7l#X/0$
	   $Hs   ENr   r5   r&   r$   r%  r%  @  s     9  r&   r%  c                       e Zd Zd Zy)BigBirdConverterc           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S ra  rb  ry   s    r$   r   zBigBirdConverter.post_processore  rc  r&   Nr#  r5   r&   r$   r/  r/  d  s    
r&   r/  c                       e Zd ZdefdZy)CLIPConverterr(   c                 p   | j                   j                  }t        | j                   j                  j	                               }| j                   j
                  }t        t        ||d dddt        |                  }t        j                  t        j                         t        j                  t        d      d      t        j                         g      |_        t!        j                  t!        j"                  t        d      dd	
      t!        j$                  d      g      |_        t)        j$                         |_        t-        j.                  | j                   j0                  | j                   j2                  f| j                   j4                  | j                   j6                  fdd      |_        |S )Nr   r   Fr7   rI   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r/   r   r   r   r   r   r	   r   re   r   r   r   r>  r   r\  r   r   r   r   r   r
   r   r   r   r  r  r   r   r   r   s        r$   rz   zCLIPConverter.convertedq  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r&   Nr   r5   r&   r$   r2  r2  p  s    '9 'r&   r2  c                       e Zd ZdefdZy)LayoutLMv2Converterr(   c           	      l   | j                   j                  }t        t        |t	        | j                   j
                                    }d}d}d}t        | j                   d      r`| j                   j                  j                  }| j                   j                  j                  }| j                   j                  j                  }t        j                  d|||      |_        t        j                         |_        t	        | j                   j"                        }t	        | j                   j$                        }| j                   j&                  }| j                   j(                  }	t+        j,                  | d| d| d| d| d	||f||	fg
      |_        t1        j                  d      |_        |S )Nr~   FTr   r   r   r   r   r   r   r   r   r   r   s
             r$   rz   zLayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5l#l#$
	  %..d;	r&   Nr   r5   r&   r$   r7  r7    r   r&   r7  c                       e Zd ZdefdZy)BlenderbotConverterr(   c           
         | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd            }t        j                  |j                        |_
        t        j                         |_        t        j                  d|j                   d|j                  |j                   fg      |_        |S )Nr   Fr   r   z$A:0 r   )r   r   )r/   r   r   r   r   r	   r   r   r   r'   r   r
   r   r   r   r  r  r   r   s        r$   rz   zBlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r&   Nr   r5   r&   r$   r:  r:    r   r&   r:  c                       e Zd Zd Zd Zd Zy)XGLMConverterc                     g d}||j                   dd  D cg c]  }|j                  |j                  f c}z  }|g dz  }|S c c}w )Nr  r"  ))z<madeupword0>rt  )z<madeupword1>rt  )z<madeupword2>rt  )z<madeupword3>rt  )z<madeupword4>rt  )z<madeupword5>rt  )z<madeupword6>rt  r  rx  s       r$   r7   zXGLMConverter.vocab  sT    
 	%,,qr:JK5;;,KK  z  	z Ls   Ac                 
    d}|S ri  r5   rj  s      r$   r  zXGLMConverter.unk_id  rk  r&   c           	          t        j                  ddd| j                  j                  d      fd| j                  j                  d      fg      S )Nz</s> $Az</s> $A </s> </s> $Brn  ro  r   rb  ry   s    r$   r   zXGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r&   Nr|  r5   r&   r$   r=  r=    r  r&   r=  c                   <    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zy	)
GemmaConvertTz<start_of_turn>z<end_of_turn>c                 .    t        j                  dd      S Nr   r:  )r   r>  r   s     r$   r   zGemmaConvert.normalizer
  s    ""3..r&   c                 8   | j                   j                  df| j                   j                  df| j                   j                  dfg}|j                  dd  D ]@  }|j
                  dk(  r|d|j                  fgz  }%||j
                  |j                  fgz  }B |S )Nrt  r"  rj   rk   )r/   r
  r  r   r  rm   r  rx  s       r$   r7   zGemmaConvert.vocab  s    $$..4$$..4$$..4

 \\!"% 	6E{{h&4-..5;;455		6 r&   c                 .    t        j                  dd      S )Nr   merged_with_previous)r   r   rX   rE  r'   s      r$   r   zGemmaConvert.pre_tokenizer  s    ##C)?@@r&   c                 
    d}|S ri  r5   rj  s      r$   r  zGemmaConvert.unk_id  rk  r&   c                     t        j                  t        j                  dd      t        j                         t        j                         g      S )Nr:  r   )r
   r   r>  ByteFallbackFuserH  s      r$   r   zGemmaConvert.decoder"  s?        ,%%'
 	
r&   N)ra   rb   rc   r  rh   r)  r   r   r7   r   r  r   r5   r&   r$   rB  rB    s6    .L'9N/A
r&   rB  c                   4    e Zd ZdZd Zd Zd Zd Zd Zd Z	y)	LlamaConverterTc                 (   | j                   j                  d      df| j                   j                  d      df| j                   j                  d      dfg}||j                  dd  D cg c]  }|j                  |j                  f c}z  }|S c c}w )Nr   rt  r   r<   r"  )r/   r(  r  rm   r  rx  s       r$   r7   zLlamaConverter.vocab/  s    $$::1=sC$$::1=sC$$::1=sC

 	%,,qr:JK5;;,KK Ls   )Bc                 
    d}|S r%  r5   rj  s      r$   r  zLlamaConverter.unk_id8  rk  r&   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S Nr:  r   r   )contentr8  r
   r>  rK  rL  r=  r   rX   rE  r'   sequences       r$   r   zLlamaConverter.decoder<  \    UC(!!#MMO

 !<==H  **r&   c                     t        | j                  dd      rcg }t        | j                  dd      r|t        j                  d      gz  }|t        j                  dd      gz  }t        j
                  |      S y )Nr+   Tr'   r:  )prependr   )patternrS  )r.   r/   r   Prependr>  r   )rX   r  rV  s      r$   r   zLlamaConverter.normalizerF  sr    4**Hd;Ht..0BDI[00?@@,,S%HIIH''11r&   c                     t        | j                  dd      s.t        || j                        }t        j                  ||d      S y )Nr+   TFrE  r0   split)r.   r/   r1   r   rF  rG  s       r$   r   zLlamaConverter.pre_tokenizerO  sA    t..$?01A4CZCZ[N!++Tbjoppr&   c                      y r`   r5   ry   s    r$   r   zLlamaConverter.post_processorU  s    r&   N)
ra   rb   rc   r  r7   r  r   r   r   r   r5   r&   r$   rN  rN  ,  s&    +r&   rN  c                       e Zd ZdefdZy)MarkupLMConverterr(   c                    | j                   }|j                  }t        |j                  j	                               }t        t        ||d ddd| j                   j                              }t        j                  |j                        |_        t        j                         |_        t        | j                   j                        }t        | j                   j                         }| j                   j"                  }| j                   j$                  }t'        j(                  | d| | d| d| ||f||fg      |_        |S )Nr   Fr4  r   z $A z $B r   )r/   r   r   r   r   r	   r   r   r   r   r'   r   r
   r   re   r   r   r   r   r   r   r   )	rX   r   r7   rI   r   r   r   r   r   s	            r$   rz   zMarkupLMConverter.converted[  s,   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+l#l#$
	  r&   Nr   r5   r&   r$   ra  ra  Z  s    "9 "r&   ra  c                   *    e Zd ZdZddZd Zd Zd Zy)MoshiConverterTNc                    t        | d       t        j                  | |       t               }|j	                         }t        |d      5 }|j                  |j                                d d d        || _        y # 1 sw Y   || _        y xY w)Nr   r  )	r   ru   rY   r%   r
  r  r  r  r  )rX   r  model_max_lengthkwargsr  r  r  s          r$   rY   zMoshiConverter.__init__  sr    $
+4, $%	  "*d# 	(qaffh'	(
	(
s   	 A99B	c                     |j                   j                  }t        j                  dd      g}|st        j                  |      S t        j                  t        j
                  |      g|z         S rD  )r;  r<  r   r>  r   r?  r@  s       r$   r   zMoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr&   c                     t        j                  dd      t        j                         t        j                         g}|r|t        j                  dd      gz  }t        j
                  |      S rR  rT  rU  s       r$   r   zMoshiConverter.decoder  rW  r&   c                 6    d}t        j                  ||d      S )Nr,   Fr]  )r   rF  rG  s       r$   r   zMoshiConverter.pre_tokenizer  s     ''KP^fkllr&   r`   )ra   rb   rc   r  rY   r   r   r   r5   r&   r$   rd  rd    s    h+mr&   rd  c            	         t        t        t        d      t        d      dz               t        t        t        d      t        d      dz               z   t        t        t        d      t        d      dz               z   } | dd }d	}t        d
      D ]1  }|| vs| j                  |       |j                  d
|z          |dz  }3 |D cg c]  }t	        |       }}t        t        | |            S c c}w )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      )r   rD   ordrE   chrrB   r*  )bscsnbs       r$   bytes_to_unicodery    s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[ B;IIaLIIdQhFA	
 	Q#a&	B	B 
s   C4c                   J     e Zd ZdZ	 	 	 	 d fd	ZdefdZd ZdefdZ	 xZ
S )	TikTokenConverterz'
    A general tiktoken converter.
    c                 V    t        |   |  || _        || _        || _        || _        y r`   )r	  rY   r  rZ  r'   additional_special_tokens)rX   r  rZ  r'   r}  r  rg  r  s          r$   rY   zTikTokenConverter.__init__  s0     	$$ 0)B&r&   tiktoken_urlc                 0   	 ddl m}  ||      t	               fd}g }i }j                         D ]  \  }}|| ||      <   t        |      dk(  r g }t        dt        |            D ]2  }	|d |	 ||	d  }}
|
v s|v s|
|z   v s|j                  |
||f       4 t        |fdd      }|j                  |        t        |d	 d      }|D cg c]  } ||d          ||d         f }}||fS # t        $ r t        d      w xY wc c}w )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	          dj                  | j                  d      D cg c]  }t        |          c}      S c c}w )Nr   zlatin-1)r)  decoders  )rx  charbyte_encoders     r$   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s2    77@STLT3TUUTs   <r   c                 $    | d      | d      fS r4   r5   )r6   r   s    r$   r8   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0R r&   Fr@   c                     | d   S )Nr<   r5   r>   s    r$   r8   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s
    A r&   )tiktoken.loadr  r,  
ValueErrorry  rC   r=   rD   rE   rF   rG   )rX   r~  r  r  rI   r7   r6  rankrL   rM   rN   rO   r?   r   r  s                @@r$   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model  sY   	7 &l3	')	V $??, 
	!KE426E'./5zQEq#e*- ;#(%=%-i'Gy,@gPWFW\eEeLL'7D!9:; 5&R\abEMM% 
	! $6F\bcUX(Q02GA2OPccf}5  	n 	2 ds   C; D;Dc                     | j                  | j                        \  }}t        t        ||d            }t	        |j
                  d      rd|j
                  _        |S )NF)r   ignore_mergesT)r  r  r	   r   r   rS   r  )rX   rH   rI   r   s       r$   r   zTikTokenConverter.tokenizer  sN    #CCDOOTfc,GH	9??O4,0IOO)r&   r(   c                    | j                         }t        j                  t        j                  t	        | j
                        dd      t        j                  | j                  d      g      |_        t        j                         |_
        |j                  | j                         t        j                  d      |_        |S )Nr   Fr   r   r   )r   r   r   r   r   rZ  r   r'   r   r
   r   r   r}  r   r   )rX   r   s     r$   rz   zTikTokenConverter.converted  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$T%C%CD#-#7#7U#K	 r&   )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)ra   rb   rc   rd   rY   re   r  r   r	   rz   rK  rL  s   @r$   r{  r{    s<      K"&CC >9 r&   r{  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                 v   | j                   j                  }|t        v r!|st        |   } ||       j                         S 	 t        j                  d       t        | j                  | j                        j                         S # t        $ r* t        dt        t        j                                      w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r  r}  zConverting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: )r  ra   SLOW_TO_FAST_CONVERTERSrz   loggerinfor{  r  r}  r,  r  r   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r$   convert_slow_tokenizerr  I  s      1::CC66}12FG45??AA	KK23$0;;*?*Y*Y ik  	>>BCZC_C_Ca>b=ce 	s   AB 3B8)r   )F)Qrd   r  typingr   r   r   	packagingr   
tokenizersr   r   r	   r
   r   r   r   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerra   r  r%   boolre   r1   rP   rR   rh   rs   ru   r|   r   r   r   r   r   r   r   r   r   r   r  rN  rg  rr  r~  r  r  r  r  r  r  r  r  r  r  r  r!  r%  r/  r2  r7  r:  r=  rB  rN  ra  rd  ry  r{  r  r  r5   r&   r$   <module>r     s^    $ $  f f f 5 5 ` ` 5 
		H	%G"$ s & 2"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6#I #Ly >)Y )Xy :$	 $Ny >~9 ~B"
l "
J
| 
 
 
:
 
B2
\ 2
j
| 
6
L 
2
< 
2
, 
6"
\ "
J	 	
| 
@	l 	%x| %xP
, 
"
L 
!y !H	
| 	
(I (V$) $N) :
L 
6/
< /
d+\ +\#	 #L&m\ &mT0G GT::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #:$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E:F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng:h +"$("#s: z!) !r&   