
"""Tokenization classes for OpenAI GPT."""

import json
import os
import re
import unicodedata
from typing import Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens
dZddZd ZddZd Zd Z	d	 Z
y)BasicTokenizera  
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
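
    Example (an illustrative sketch; within this module, `BasicTokenizer` is only the fallback pre-tokenizer used
    when `ftfy` and `spacy` are not installed):

    ```python
    >>> tokenizer = BasicTokenizer(do_lower_case=True)
    >>> tokenizer.tokenize("Hello, y'all!")
    ['hello', ',', 'y', "'", 'all', '!']
    ```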
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()


class OpenAIGPTTokenizer(PreTrainedTokenizer):
    """
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
      `BasicTokenizer` if not.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
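
    Example (a minimal usage sketch; assumes the standard `openai-gpt` checkpoint files are available, e.g. from the
    Hugging Face Hub):

    ```python
    >>> from transformers import OpenAIGPTTokenizer

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> tokens = tokenizer.tokenize("Hello world!")  # lower-cased BPE pieces ending in "</w>"
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```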
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        try:
            import ftfy
            from spacy.lang.en import English

            _nlp = English()
            self.nlp = _nlp.tokenizer
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(list(self.bpe(token).split(" ")))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id in a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file