
    sgy                        d Z ddlZddlZddlmZmZ ddlZddlm	Z	 ddl
mZ  ej                  e      Zddd	Zi d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+i d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMi dNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtZdu Z G dv dwe	      Zy)xz)Tokenization classes for Salesforce CTRL.    N)OptionalTuple   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 x    t               }| d   }| dd D ]  }|j                  ||f       |} t        |      }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairsrJ   ^   sO     EEQIQR 		9d#$	 JEL    c                        e Zd ZdZeZeZd fd	Ze	d        Z
d Zd Zd Zd Zd Zd	 Zdd
edee   dee   fdZ xZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    c           
      V   t        |d      5 }t        j                  |      | _        d d d        | j                  j	                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j                         j                  d      dd }	d d d        	D 
cg c]  }
t        |
j                                }	}
t        t        |	t        t        |	                        | _        i | _        t        | @  dd|i| y # 1 sw Y   xY wc c}}w # 1 sw Y   xY wc c}
w )Nutf-8encoding
rB   	unk_token )openjsonloadencoderitemsdecoderreadsplittupledictziprangelen	bpe_rankscachesuper__init__)selfr   r	   rT   kwargsvocab_handlekvmerges_handlemergesmerge	__class__s              rI   rf   zCTRLTokenizer.__init__   s    *w/ 	3<99\2DL	3)-););)=>A1>+0 	<M"'')//5a;F	<4:;5%&;;c&%F*<=>
7977	3 	3>	< 	<;s#   DD2#D! D&DD#c                 ,    t        | j                        S N)rb   rY   rg   s    rI   
vocab_sizezCTRLTokenizer.vocab_size   s    4<<  rK   c                 B    t        | j                  fi | j                  S rq   )r_   rY   added_tokens_encoderrr   s    rI   	get_vocabzCTRLTokenizer.get_vocab   s    DLL>D$=$=>>rK   c                 $    | j                   v r j                   |   S t        |      }t        t        |d d       |d   dz   gz         }t        |      }|s|S 	 t	        | fd      }| j
                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d d	 }| j                   |<   |S # t        $ r |j                  ||d         Y nw xY w)
NrS   z</w>c                 N    j                   j                  | t        d            S )Ninf)rc   getfloat)pairrg   s    rI   <lambda>z#CTRLTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1W rK   keyr   rB      @@ )rd   r^   listrJ   minrc   rb   indexextend
ValueErrorappendjoin)
rg   tokenrE   rF   bigramfirstsecondnew_wordijs
   `         rI   bpezCTRLTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E/ /FFc                     g }t        j                  d|      }|D ]:  }|j                  t        | j	                  |      j                  d                   < |S )zTokenize a string.z\S+\n? )refindallr   r   r   r]   )rg   textsplit_tokenswordsr   s        rI   	_tokenizezCTRLTokenizer._tokenize   sT    

9d+ 	BETXXe_%:%:3%? @A	BrK   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)rY   rz   rT   )rg   r   s     rI   _convert_token_to_idz"CTRLTokenizer._convert_token_to_id   s,    ||t||'7'7'GHHrK   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r[   rz   rT   )rg   r   s     rI   _convert_id_to_tokenz"CTRLTokenizer._convert_id_to_token   s    ||t~~66rK   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.r   r    )r   replacestrip)rg   tokens
out_strings      rI   convert_tokens_to_stringz&CTRLTokenizer.convert_tokens_to_string   s,    XXf%--eR8>>@
rK   save_directoryfilename_prefixreturnc           	      P   t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  ||r|dz   ndt        d   z         }t        |dd	      5 }|j                  t        j                  | j                  d
dd      dz          d d d        d}t        |dd	      5 }|j                  d       t        | j                  j                         d       D ]M  \  }}	||	k7  rt        j                  d| d       |	}|j                  dj                  |      dz          |dz  }O 	 d d d        ||fS # 1 sw Y   xY w# 1 sw Y   ||fS xY w)NzVocabulary path (z) should be a directory-r   r   r	   wrO   rP   r   TF)indent	sort_keysensure_asciirR   r   z#version: 0.2
c                     | d   S )NrB   rU   )kvs    rI   r}   z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>   s    Y[\]Y^ rK   r~   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r   rB   )ospathisdirloggererrorr   VOCAB_FILES_NAMESrV   writerW   dumpsrY   sortedrc   rZ   warning)
rg   r   r   r   
merge_filefr   writer
bpe_tokenstoken_indexs
             rI   save_vocabularyzCTRLTokenizer.save_vocabulary   s   ww}}^,LL,^,<<STUWW\\o_s22QbcoQpp

 WW\\o_s22QbcpQqq

 *cG4 	cGGDJJt||ATYZ]aab	c *cG4 
	LL*++1$..2F2F2HN^+_ '
KK'NN/
| <M M (ESXXj1D89

	 :%%!	c 	c
	 :%%s   *6F8BFFF%)z<unk>rq   )__name__
__module____qualname____doc__r   vocab_files_namesCONTROL_CODEScontrol_codesrf   propertyrs   rv   r   r   r   r   r   strr   r   r   __classcell__)ro   s   @rI   rM   rM   n   ss      *!M	8 ! !?*XI7
&c &HSM &]bcf]g &rK   rM   )r   rW   r   typingr   r   regexr   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   rJ   rM   rU   rK   rI   <module>r      s   0  	 "  5  
		H	%  88D8 v8 u	8
 e8 
58 
58 F8 8 8 e8 8 u8 v8 u8  !8" u#8$ U%8& U'8( e)8* T+8, T-8. U/80 E182 U384 d586 
5788 e98: e;8< u=8> t?8@ eA8B %C8D uE8F G8H VI8J uK8L EM8N uO8P UQ8R uS8T fU8V W8X TY8Z u[8\ 6]8^ %_8` Ua8b c8d Ee8f Vg8h o8v D&' D&rK   