
import os
from typing import List, Union

import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs

from ...modeling_tf_utils import keras
from .tokenization_bert import BertTokenizer


class TFBertTokenizer(keras.layers.Layer):
    """
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`int`, *optional*):
            The id of the classifier token, used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of a sequence built with special
            tokens. If not provided, the id of `"[CLS]"` in `vocab_list` is used.
        sep_token_id (`int`, *optional*):
            The id of the separator token, used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or a text and a question for question answering. It is also the last token of
            a sequence built with special tokens. If not provided, the id of `"[SEP]"` in `vocab_list` is used.
        pad_token_id (`int`, *optional*):
            The id of the token used for padding, for example when batching sequences of different lengths. If not
            provided, the id of `"[PAD]"` in `vocab_list` is used.
        padding (`str`, *optional*, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length"`, to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If `True`, will use the `FastBertTokenizer` class from TensorFlow Text. If `False`, will use the
            `BertTokenizer` class from TensorFlow Text instead. `BertTokenizer` supports some additional options, but
            is slower and cannot be exported to TFLite.
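
    Example (a minimal sketch of eager usage; the checkpoint name is just an illustration):

    ```python
    import tensorflow as tf

    from transformers import TFBertTokenizer

    tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")

    # The layer consumes raw strings in-graph and returns a dict of dense int64 tensors.
    batch = tokenizer(tf.constant(["hello world", "this is a test"]))
    print(batch["input_ids"].shape, batch["attention_mask"].shape)

    # Call-time options mirror the constructor arguments, e.g. fixed-length padding:
    fixed = tokenizer(tf.constant(["hello world"]), padding="max_length", max_length=128)
    ```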
    
    """

    def __init__(
        self,
        vocab_list: List,
        do_lower_case: bool,
        cls_token_id: int = None,
        sep_token_id: int = None,
        pad_token_id: int = None,
        padding: str = "longest",
        truncation: bool = True,
        max_length: int = 512,
        pad_to_multiple_of: int = None,
        return_token_type_ids: bool = True,
        return_attention_mask: bool = True,
        use_fast_bert_tokenizer: bool = True,
        **tokenizer_kwargs,
    ):
        super().__init__()
        if use_fast_bert_tokenizer:
            self.tf_tokenizer = FastBertTokenizer(
                vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case, **tokenizer_kwargs
            )
        else:
            lookup_table = tf.lookup.StaticVocabularyTable(
                tf.lookup.KeyValueTensorInitializer(
                    keys=vocab_list,
                    key_dtype=tf.string,
                    values=tf.range(tf.size(vocab_list, out_type=tf.int64), dtype=tf.int64),
                    value_dtype=tf.int64,
                ),
                num_oov_buckets=1,
            )
            self.tf_tokenizer = BertTokenizerLayer(
                lookup_table, token_out_type=tf.int64, lower_case=do_lower_case, **tokenizer_kwargs
            )

        self.vocab_list = vocab_list
        self.do_lower_case = do_lower_case
        self.cls_token_id = vocab_list.index("[CLS]") if cls_token_id is None else cls_token_id
        self.sep_token_id = vocab_list.index("[SEP]") if sep_token_id is None else sep_token_id
        self.pad_token_id = vocab_list.index("[PAD]") if pad_token_id is None else pad_token_id
        # Leave room for the special tokens added to paired inputs ([CLS] plus two [SEP])
        self.paired_trimmer = ShrinkLongestTrimmer(max_length - 3, axis=1)
        self.max_length = max_length
        self.padding = padding
        self.truncation = truncation
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_token_type_ids = return_token_type_ids
        self.return_attention_mask = return_attention_mask

    @classmethod
    def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs):  # noqa: F821
        """
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        """
        do_lower_case = kwargs.pop("do_lower_case", None)
        do_lower_case = tokenizer.do_lower_case if do_lower_case is None else do_lower_case
        cls_token_id = kwargs.pop("cls_token_id", None)
        cls_token_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id
        sep_token_id = kwargs.pop("sep_token_id", None)
        sep_token_id = tokenizer.sep_token_id if sep_token_id is None else sep_token_id
        pad_token_id = kwargs.pop("pad_token_id", None)
        pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id

        # Sort the vocab by token id so that list index == token id
        vocab = tokenizer.get_vocab()
        vocab = sorted(vocab.items(), key=lambda x: x[1])
        vocab_list = [entry[0] for entry in vocab]
        return cls(
            vocab_list=vocab_list,
            do_lower_case=do_lower_case,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        """
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        """
        try:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        except:  # noqa: E722
            from .tokenization_bert_fast import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        return cls.from_tokenizer(tokenizer, **kwargs)

    def unpaired_tokenize(self, texts):
        if self.do_lower_case:
            texts = case_fold_utf8(texts)
        tokens = self.tf_tokenizer.tokenize(texts)
        return tokens.merge_dims(1, -1)

    def call(
        self,
        text,
        text_pair=None,
        padding=None,
        truncation=None,
        max_length=None,
        pad_to_multiple_of=None,
        return_token_type_ids=None,
        return_attention_mask=None,
    ):
        if padding is None:
            padding = self.padding
        if padding not in ("longest", "max_length"):
            raise ValueError("Padding must be either 'longest' or 'max_length'!")
        if max_length is not None and text_pair is not None:
            # Because we have to instantiate a Trimmer to do it properly
            raise ValueError("max_length cannot be overridden at call time when truncating paired texts!")
        if max_length is None:
            max_length = self.max_length
        if truncation is None:
            truncation = self.truncation
        if pad_to_multiple_of is None:
            pad_to_multiple_of = self.pad_to_multiple_of
        if return_token_type_ids is None:
            return_token_type_ids = self.return_token_type_ids
        if return_attention_mask is None:
            return_attention_mask = self.return_attention_mask
        if not isinstance(text, tf.Tensor):
            text = tf.convert_to_tensor(text)
        if text_pair is not None and not isinstance(text_pair, tf.Tensor):
            text_pair = tf.convert_to_tensor(text_pair)
        if text_pair is not None:
            if text.shape.rank > 1:
                raise ValueError("text argument should not be multidimensional when a text pair is supplied!")
            if text_pair.shape.rank > 1:
                raise ValueError("text_pair should not be multidimensional!")
        if text.shape.rank == 2:
            text, text_pair = text[:, 0], text[:, 1]

        text = self.unpaired_tokenize(text)
        if text_pair is None:  # Unpaired text
            if truncation:
                text = text[:, : max_length - 2]  # Allow room for special tokens
            input_ids, token_type_ids = combine_segments(
                (text,), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )
        else:  # Paired text
            text_pair = self.unpaired_tokenize(text_pair)
            if truncation:
                text, text_pair = self.paired_trimmer.trim([text, text_pair])
            input_ids, token_type_ids = combine_segments(
                (text, text_pair), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id
            )
        if padding == "longest":
            pad_length = input_ids.bounding_shape(axis=1)
            if pad_to_multiple_of is not None:
                # No ceiling division in tensorflow, so we negate floordiv instead
                pad_length = pad_to_multiple_of * (-tf.math.floordiv(-pad_length, pad_to_multiple_of))
        else:
            pad_length = max_length

        input_ids, attention_mask = pad_model_inputs(input_ids, max_seq_length=pad_length, pad_value=self.pad_token_id)
        output = {"input_ids": input_ids}
        if return_attention_mask:
            output["attention_mask"] = attention_mask
        if return_token_type_ids:
            token_type_ids, _ = pad_model_inputs(
                token_type_ids, max_seq_length=pad_length, pad_value=self.pad_token_id
            )
            output["token_type_ids"] = token_type_ids
        return output

    def get_config(self):
        return {
            "vocab_list": self.vocab_list,
            "do_lower_case": self.do_lower_case,
            "cls_token_id": self.cls_token_id,
            "sep_token_id": self.sep_token_id,
            "pad_token_id": self.pad_token_id,
        }