
    sgji              	       J   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlmZ ddlmZ  ej                  e      Zdd	d
Zd Z G d de      Z	 dZdZededdddddf	Z e
j0                  ddj3                  e      z  e
j4                  e
j6                  z  e
j8                  z        Z e
j0                  d      Z e
j0                  ee
j4                  e
j6                  z  e
j8                  z        Z e
j0                  d      Z d"dZ!d#dZ" G d d      Z#d Z$d  Z%d$d!Z&y)%z!Tokenization classes for BERTweet    N)copyfile)ListOptionalTuple   )PreTrainedTokenizer)loggingz	vocab.txtz	bpe.codes)
vocab_filemerges_filec                 x    t               }| d   }| dd D ]  }|j                  ||f       |} t        |      }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       e/var/www/html/venv/lib/python3.12/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairsr   &   sO     EEQIQR 		9d#$	 JEL    c            
       8    e Zd ZdZeZ	 	 	 	 	 	 	 	 d fd	Z	 ddee   de	ee      dee   fdZ
	 ddee   de	ee      dedee   f fdZ	 ddee   de	ee      dee   fd	Zed
        Zd Zd Zd Zd Zd Zd Zd Zd Zddede	e   dee   fdZd Z xZS )BertweetTokenizera	  
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization preprocess.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    c                    	 ddl m} || _        || _        || _        i | _        d| j                  t        |      <   d| j                  t        |	      <   d| j                  t        |      <   d| j                  t        |      <   | j                  |       | j                  j                         D ci c]  \  }}||
 c}}| _        t        |d      5 }|j                         j                  d	      d d
 }d d d        D cg c]  }t!        |j                         d d
         }}t#        t%        |t'        t)        |                        | _        i | _        || _        t1               | _        ddd| _        t7        | p  d|||||||	|
d| y # t        $ r  t        j                  d       d | _        Y w xY wc c}}w # 1 sw Y   xY wc c}w )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   utf-8encoding
'z...)u   ’u   …)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_token )emojir   	demojizerImportErrorloggerwarningr
   r   encoderstradd_from_fileitemsdecoderopenreadsplittupledictziprangelen	bpe_rankscacher"   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr
   r   r"   r#   r$   r%   r&   r'   r(   r)   kwargsr   kvmerges_handlemergesmerge	__class__s                     r   rC   zBertweetTokenizer.__init__k   s   		"&%DN %&'(S^$'(S^$'(S^$'(S^$:&)-););)=>A1>+0 	;M"'')//5cr:F	;9?@%cr*+@@c&%F*<=>
*!/!1&)%8 
	
'!
	
 
	
=  	"NN( "DN	"$ ?	; 	;@s)   F 3G#G#G%F?>F?Gtoken_ids_0token_ids_1returnc                     || j                   g|z   | j                  gz   S | j                   g}| j                  g}||z   |z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERTweet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)rD   rL   rM   clsseps        r    build_inputs_with_special_tokensz2BertweetTokenizer.build_inputs_with_special_tokens   sg    ( %%&48I8I7JJJ  !  ![ 3&,{:S@@r   already_has_special_tokensc                     |rt         |   ||d      S |dgdgt        |      z  z   dgz   S dgdgt        |      z  z   ddgz   dgt        |      z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rL   rM   rU   r   r   )rB   get_special_tokens_maskr<   )rD   rL   rM   rU   rK   s       r   rW   z)BertweetTokenizer.get_special_tokens_mask   s    & &72'[]a 3   31#K 001QC77sqcC,,-A61#K@P:PQUVTWWWr   c                     | j                   g}| j                  g}|t        ||z   |z         dgz  S t        ||z   |z   |z   |z   |z         dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        r   )rQ   rP   r<   )rD   rL   rM   rS   rR   s        r   $create_token_type_ids_from_sequencesz6BertweetTokenizer.create_token_type_ids_from_sequences   sm    "   !  !s[(3./1#553$s*S0;>DEKKr   c                 ,    t        | j                        S N)r<   r0   rD   s    r   
vocab_sizezBertweetTokenizer.vocab_size   s    4<<  r   c                 B    t        | j                  fi | j                  S r[   )r9   r0   added_tokens_encoderr\   s    r   	get_vocabzBertweetTokenizer.get_vocab   s    DLL>D$=$=>>r   c                 $    | j                   v r j                   |   S t        |      }t        t        |d d       |d   dz   gz         }t        |      }|s|S 	 t	        | fd      }| j
                  vrn|\  }}g }d}|t        |      k  r	 |j                  ||      }	|j                  |||	        |	}||   |k(  r6|t        |      dz
  k  r%||dz      |k(  r|j                  ||z          |dz  }n|j                  ||          |dz  }|t        |      k  rt        |      }|}t        |      dk(  rnt        |      }dj                  |      }|d d	 }| j                   |<   |S # t        $ r |j                  ||d         Y nw xY w)
Nr    z</w>c                 N    j                   j                  | t        d            S )Ninf)r=   getfloat)pairrD   s    r   <lambda>z'BertweetTokenizer.bpe.<locals>.<lambda>  s    1C1CD%PU,1W r   )keyr   r   r   @@ )r>   r8   listr   minr=   r<   indexextend
ValueErrorappendjoin)
rD   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpezBertweetTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"ME6HAc$i-

5!,A
 OOD1I.A7e#CIM(9d1q5kV>SOOEFN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E/ /FFc                     | j                   r| j                  |      }g }t        j                  d|      }|D ]:  }|j	                  t        | j                  |      j                  d                   < |S )zTokenize a string.z\S+\n? )r"   normalizeTweetrefindallrn   rk   ry   r7   )rD   textsplit_tokenswordsrr   s        r   	_tokenizezBertweetTokenizer._tokenize(  sm    &&t,D

9d+ 	BETXXe_%:%:3%? @A	Br   c                 (   | j                   D ]!  }|j                  || j                   |         }# | j                  j                  |      }dj	                  |D cg c]  }| j                  |       c}      }|j                  dd      j                  dd      j                  dd      j                  dd      j                  d	d
      }|j                  dd      j                  dd      j                  dd      j                  dd      j                  dd      j                  dd      }|j                  dd      j                  dd      j                  dd      j                  dd      }dj	                  |j                               S c c}w )z'
        Normalize a raw Tweet
        r{   zcannot zcan not zn't z n't zn 't zca n'tzcan'tzai n'tzain'tz'm z 'm z're z 're z's z 's z'll z 'll z'd z 'd z've z 've z p . m .z  p.m.z p . m z p.m z a . m .z a.m.z a . m z a.m )rA   replacer@   tokenizerq   normalizeTokenr7   )rD   tweetpuncttokensrr   	normTweets         r   r|   z BertweetTokenizer.normalizeTweet3  se    (( 	EEMM%)<)<U)CDE	E ''007HHfMUd11%8MN	 i4WVW%WWg&WXw'WXw' 	 eV,WVW%WUF#WVW%WUF#WVW% 	 j(3WY(WZ)WY(	 	 xx	)**1 Ns   Fc                 $   |j                         }|j                  d      ry|j                  d      s|j                  d      ryt        |      dk(  r<|| j                  v r| j                  |   S | j                  | j	                  |      S |S |S )z-
        Normalize tokens in a Tweet
        @z@USERhttpwwwHTTPURLr   )lower
startswithr<   rA   r,   )rD   rr   lowercased_tokens      r   r   z BertweetTokenizer.normalizeTokenU  s     !;;=C ((04D4O4OPU4VZ1_+++**511~~)~~e,,Lr   c                     | j                   j                  || j                   j                  | j                              S )z0Converts a token (str) in an id using the vocab.)r0   rd   r'   )rD   rr   s     r   _convert_token_to_idz&BertweetTokenizer._convert_token_to_idh  s,    ||t||'7'7'GHHr   c                 N    | j                   j                  || j                        S )z=Converts an index (integer) in a token (str) using the vocab.)r4   rd   r'   )rD   rm   s     r   _convert_id_to_tokenz&BertweetTokenizer._convert_id_to_tokenl  s    ||t~~66r   c                 d    dj                  |      j                  dd      j                         }|S )z:Converts a sequence of tokens (string) in a single string.r{   ri    )rq   r   strip)rD   r   
out_strings      r   convert_tokens_to_stringz*BertweetTokenizer.convert_tokens_to_stringp  s,    XXf%--eR8>>@
r   save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  r@t         j                  j                  | j                        rt        | j                  |       nit         j                  j                  | j                        s@t        |d      5 }| j                  j                         }|j                  |       d d d        t         j                  j                  | j                        t         j                  j                  |      k7  rt        | j                  |       ||fS # 1 sw Y   lxY w)NzVocabulary path (z) should be a directory-r   r
   r   wb)ospathisdirr.   errorrq   VOCAB_FILES_NAMESabspathr
   isfiler   r5   sp_modelserialized_model_protowriter   )rD   r   r   out_vocab_fileout_merge_fileficontent_spiece_models          r   save_vocabularyz!BertweetTokenizer.save_vocabularyu  sr   ww}}^,LL,^,<<STUo_s22QbcoQpp
 o_s22QbcpQqq
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrT__n50nd+ /r'+}}'K'K'M$-./ 77??4++,0OOT%%~6~--/ /s   ,G11G:c                    t        |t              r*	 t        |dd      5 }| j                  |       ddd       y|j                         }|D ]Z  }|j                         }|j                  d      }|dk(  rt        d	      |d| }t        | j                        | j                  |<   \ y# 1 sw Y   yxY w# t        $ r}|d}~wt
        $ r t        d| d      w xY w)
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr   r   NzIncorrect encoding detected in z, please rebuild the datasetr{   r    z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer1   r5   r2   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindro   r<   r0   )	rD   ffdfnfelineslineTmplineidxr   s	            r   r2   zBertweetTokenizer.add_from_file  s     ac!S73 +r&&r*+  	3G==?D**S/Cby !XYY:D!$T\\!2DLL	3+ 	 % 
 c"A!D` abbcs3   B7 B+B7 +B40B7 4B7 7	C CC)F<s></s>r   r   z<unk>z<pad>z<mask>r[   )NF)__name__
__module____qualname____doc__r   vocab_files_namesrC   r   intr   rT   boolrW   rY   propertyr]   r`   ry   r   r|   r   r   r   r   r1   r   r   r2   __classcell__)rK   s   @r   r   r   6   sN   0d * :
z JNA9A3;DI3FA	cA6 sxX9X3;DI3FXkoX	cX: JNL9L3;DI3FL	cL0 ! !?*X	 +D&I7
.c .HSM .]bcf]g .:3r   r   ac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);c                 R    |d}t        | t              r| j                  ||      S | S )Nr   )r   bytesdecode)r   r   errorss      r   _str_to_unicoder   \  s-    ${{8V,,Kr   c                 R    fd}t         j                  |t        | |            S )u  
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".

    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```c                     | j                  d      }| j                  d      rU	 | j                  d      rt        |d      }nt        |d      }d|cxk  rdk  rn nt        |f      j                  d      S n>|v r| j                  d	      S t
        j                  j                  j                  |      }|	 t        |      S rd
S | j                  d	      S # t        $ r d }Y 0w xY w# t        t        f$ r Y 7w xY w)Nr   r   r      
         cp1252r   r   )groupr   r   r   ro   htmlentitiesname2codepointrd   chrOverflowError)matchentity_bodynumberkeepremove_illegals      r   _convert_entityz/_replace_html_entities.<locals>._convert_entity  s    kk!n;;q>;;q> b1F b1F
 6)T) &+228<< d"{{1~%5599+F6{" $r7Q7   . s$   AC :
C+ C('C(+C=<C=)ENT_REsubr   )r   r   r   r   r   s    ``  r   _replace_html_entitiesr   d  s"    <8: ::otX'FGGr   c                       e Zd ZdZddZd Zy)r?   a  
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```c                 .    || _         || _        || _        y r[   preserve_case
reduce_lenstrip_handles)rD   r   r   r   s       r   rC   zTweetTokenizer.__init__  s    *$*r   c                 X   t        |      }| j                  rt        |      }| j                  rt	        |      }t
        j                  d|      }t        j                  |      }| j                  s4|D cg c])  }t        j                  |      r|n|j                         + }}|S c c}w )z
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        \1\1\1)r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_REr~   r   EMOTICON_REsearchr   )rD   r   	safe_textr   xs        r   r   zTweetTokenizer.tokenize  s     &d+!$'D??%d+DKK	40		*!!HMN1+,,Q/QQWWY>NEN Os   5.B'NTFF)r   r   r   r   rC   r   r*   r   r   r?   r?     s    &+
r   r?   c                 P    t        j                  d      }|j                  d|       S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    z	(.)\1{2,}r   regexcompiler   r   patterns     r   r   r     s#     mmL)G;;y$''r   c                 P    t        j                  d      }|j                  d|       S )z4
    Remove Twitter username handles from text.
    zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)r{   r   r   s     r   r   r     s+     mm 	BG ;;sD!!r   c                 <    t        |||      j                  |       S )z:
    Convenience function for wrapping the tokenizer.
    r   )r?   r   )r   r   r   r   s       r   casual_tokenizer     s$     *\ijss r   )Nstrict)r*   Tr   r   )'r   r   r   r}   shutilr   typingr   r   r   r   tokenization_utilsr   utilsr	   
get_loggerr   r.   r   r   r   	EMOTICONSURLSREGEXPSr   rq   VERBOSEIUNICODEr   r   r   r   r   r   r?   r   r   r   r*   r   r   <module>r     sP    (  	 	  ( (  5  
		H	%   q3+ q3~L		$)\ 		  (.
A+` %--chhw&779PSXS`S`9`
a %--/
0 emmIu}}uww'>'NO 
.	/;HB0 0p(" r   