
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)AnyDictIterableListOptionalTupleUnion)Encoding)	Tokenizer)Decoder)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainer   )convert_slow_tokenizer)convert_gguf_tokenizer)load_gguf_checkpoint)PreTrainedTokenizer)
INIT_TOKENIZER_DOCSTRING
AddedTokenBatchEncodingPreTokenizedInputPreTokenizedInputPairPreTrainedTokenizerBaseSpecialTokensMixin	TextInputTextInputPairTruncationStrategy)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)BPEUnigram	WordLevel	WordPiece)tokenizer_file
vocab_filec            )       *    e Zd ZU dZeZdZeed<    fdZ	e
defd       Ze
defd       Ze
defd       Zdeeef   fd	Ze
deeef   fd
       Ze
deeef   fd       Ze
deeef   fd       Zdeeef   fdZdefdZe
defd       Ze
defd       Z	 	 	 	 	 	 	 dEdedee   dee   dedededededeeee f   e!e   f   fdZ"de#ee$e   f   de#ee!e   f   fdZ%dedefdZ&d edee   fd!Z'dFd"e!e#eef      defd#Z(dFd$edefd%Z)	 dFd&e#ee!e   f   d'ede#ee!e   f   fd(Z*dGd)ed$ee   d*ede!e   fd+Z+d,e,d-e-d.ed/ed0ee   d1ee   fd2Z.de,j^                  e-j`                  dd3ddddddddddddfd4e#e!e1   e!e2   e!e3   e!e4   f   d*ed,e,d-e-d.ee   d/ed5ed0ee   d1ee   d6ee   dee   dee   dededededed7ede5f&d8Z6dde,j^                  e-j`                  dd3ddddddddddddfd)e#e1e3f   d9ee#e1e3f      d*ed,e,d-e-d.ee   d/ed5ed0ee   d1ee   d6ee   dee   dee   dededededed7ede5f(d:Z7de!e   defd;Z8	 	 dHd<e#ee!e   f   d'ed=edefd>Z9	 	 dId?e#ee:jv                  f   d@ee   dAee   dBee   dee   f
dCZ<	 	 	 dJdDZ= xZ>S )KPreTrainedTokenizerFastaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        gguf_file = kwargs.pop("gguf_file", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from tokenizers which lets us directly build the backend
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif gguf_file is not None:
            # We need to convert a GGUF checkpoint's tokenizer to build the backend
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            tokenizer_config = gguf_param["tokenizer_config"]
            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
            kwargs.update(tokenizer_config)
            if len(additional_kwargs) > 0:
                kwargs.update(additional_kwargs)
        elif self.slow_tokenizer_class is not None and slow_tokenizer is not False:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif not slow_tokenizer:
            # We tried loading a slow tokenizer with sentencepiece and failed, try to load with tiktoken
            self.vocab_file = kwargs.get("vocab_file", None)
            self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
            fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
            slow_tokenizer = None
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        _truncation = self._tokenizer.truncation

        if _truncation is not None:
            self._tokenizer.enable_truncation(**_truncation)
            kwargs.setdefault("max_length", _truncation["max_length"])
            kwargs.setdefault("truncation_side", _truncation["direction"])
            kwargs.setdefault("stride", _truncation["stride"])
            kwargs.setdefault("truncation_strategy", _truncation["strategy"])
        else:
            self._tokenizer.no_truncation()

        _padding = self._tokenizer.padding
        if _padding is not None:
            self._tokenizer.enable_padding(**_padding)
            kwargs.setdefault("pad_token", _padding["pad_token"])
            kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
            kwargs.setdefault("padding_side", _padding["direction"])
            kwargs.setdefault("max_length", _padding["length"])
            kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

        # We call this after having initialized the backend tokenizer because we update it.
        super().__init__(**kwargs)
        self._tokenizer.encode_special_tokens = self.split_special_tokens

        # Add any tokens passed via `added_tokens_decoder` that the backend does not know about yet,
        # then make sure every special token is registered as an added token.
        added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
        tokens_to_add = [
            token
            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
            if hash(repr(token)) not in added_tokens_decoder_hash
        ]
        encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
        # If some of the special tokens are strings, check that we don't already have a token for them
        tokens_to_add += [
            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
        ]

        if len(tokens_to_add) > 0:
            tokens = []
            special_tokens = self.all_special_tokens
            for token in tokens_to_add:
                is_special = (
                    (token.special or str(token) in special_tokens)
                    if isinstance(token, AddedToken)
                    else str(token) in special_tokens
                )
                if isinstance(token, str):
                    token = AddedToken(token, special=is_special)
                else:
                    token.special = is_special
                tokens.append(token)
            if tokens:
                self.add_tokens(tokens)

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> Dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation and padding on the backend tokenizer
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # The backend may report more keys than the ones managed here; compare only the managed
            # keys so we do not trigger unnecessary updates across `tokenizers` versions.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy and restore the initial configuration afterwards
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each low-level Encoding into a python dict plus the list of (overflowing) encodings
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Convert the output to have dict[list] instead of list[dict] and flatten the extra overflow
        # dimension: from (batch, overflows, sequence length) to roughly (batch * overflows, sequence length).
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # If returning overflowing tokens, also return a mapping from the flattened batch index
        # back to the original sample index.
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

        # If return_tensors is None, unpack the output to the single-example format, i.e. remove the
        # leading batch axis. Overflowing tokens are returned as a batch, so keep them batched in that case.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return (
            self.backend_tokenizer.decoder.decode(tokens)
            if self.backend_tokenizer.decoder is not None
            else " ".join(tokens)
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # Only tokens added on top of the base vocabulary go into added_tokens.json
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names
    def train_new_from_iterator(
        self, text_iterator, vocab_size, length=None, new_special_tokens=None, special_tokens_map=None, **kwargs
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (they use IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (it uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove the vocabulary of the current model
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # The trainer needs to know the end-of-word / continuing-subword affixes for BPE
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None and (
            tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
            or (
                tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
                and any(
                    pretokenizer["type"] == "ByteLevel"
                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
                )
            )
        ):
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    for token in tokens:
                        token_id = tokenizer.token_to_id(token)
                        if token_id is None:
                            raise ValueError(
                                "Attempted to set a token in the post processor that does not exist in the mapping"
                            )

                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    if token_id is None:
                        raise ValueError(
                            "Attempted to set a token in the post processor that does not exist in the mapping"
                        )
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map pad/cls/mask tokens at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            if getattr(self, token) is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = self._special_tokens_map.get(token, None)
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                        special=True,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
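# Usage sketch (illustrative only; file names and the toy corpus below are placeholders):
#
#     tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
#     enc = tok(["hello world"], padding=True)
#     text = tok.decode(enc["input_ids"][0], skip_special_tokens=True)
#
#     # `save_pretrained` writes tokenizer.json (fast serialization) plus, when a slow class exists,
#     # the legacy vocabulary files and added_tokens.json via `_save_pretrained` above.
#     tok.save_pretrained("./my-tokenizer")
#
#     # Re-train the same pipeline on a new corpus; the trainer class is picked from
#     # MODEL_TO_TRAINER_MAPPING based on the backend model type ("BPE", "Unigram", "WordLevel", "WordPiece").
#     corpus = (["some text", "more text"] for _ in range(10))
#     new_tok = tok.train_new_from_iterator(corpus, vocab_size=1000)
#     new_tok.save_pretrained("./retrained-tokenizer")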