
    sg                     Z    d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ  G d de      Zy	)
z
Processor class for MarkupLM.
    )OptionalUnion   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategyc                        e Zd ZdZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedeee	e
f   deee	ef   d	ee   d
edee   dee   dee   dedededededeee	ef      defdZd Zd Zed        Zy)MarkupLMProcessoraJ  
    Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
    processor.

    [`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
    Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
    `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

    Args:
        feature_extractor (`MarkupLMFeatureExtractor`):
            An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
            An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
        parse_html (`bool`, *optional*, defaults to `True`):
            Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
    MarkupLMFeatureExtractor)MarkupLMTokenizerMarkupLMTokenizerFastTNadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    | j                   r:|t        d      |||t        d      | j                  |      }|d   }|d   }n|t        d      ||t        d      || j                   rt        |t              r|g} | j
                  di d||n|d	||ndd|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|S )a  
        This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
        passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
        returns the output.

        Optionally, one can also provide a `text` argument which is passed along as first sequence.

        Please refer to the docstring of the above two methods for more information.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`text	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r    )
parse_html
ValueErrorfeature_extractor
isinstancestr	tokenizer)selfhtml_stringsr    r!   r$   	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsfeaturesencoded_inputss                          c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/markuplm/processing_markuplm.py__call__zMarkupLMProcessor.__call__2   s   B ??# !ghh F$6+:Q k  --l;HW%Eh'F' !cdd} !lmm  T__)S)&K	' 
'3
(4e$
 
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
,     c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r+   batch_decoder,   argsr/   s      r2   r6   zMarkupLMProcessor.batch_decode   s     
 +t~~**D;F;;r4   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r+   decoder7   s      r2   r:   zMarkupLMProcessor.decode   s     
 %t~~$$d5f55r4   c                 2    | j                   j                  }|S )N)r+   model_input_names)r,   tokenizer_input_namess     r2   r<   z#MarkupLMProcessor.model_input_names   s     $ @ @$$r4   )NNNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____doc__feature_extractor_classtokenizer_classr&   boolr   r*   r	   r
   r   intr   r   r3   r6   r:   propertyr<   r%   r4   r2   r   r      sU   & 9DOJ #'5:;?$(,00404*/+0',#;?)N !N tS/12N $%778N SMN N %SMN  (~N  (~N $(N  %)!N" !%#N$ %N& 'N( !sJ!78)N, 
-N`<6 % %r4   r   N)rA   typingr   r   
file_utilsr   processing_utilsr   tokenization_utils_baser   r	   r
   r   r%   r4   r2   <module>rK      s)    # $ . Y Yy% y%r4   