
    sg                         d Z ddlZddlmZmZ ddlmZmZmZ  e       r
ddl	Z	ddl	m
Z
  ej                  e      Z G d de      Zy)	z'
Feature extractor class for MarkupLM.
    N   )BatchFeatureFeatureExtractionMixin)is_bs4_availableloggingrequires_backends)BeautifulSoupc                   @     e Zd ZdZ fdZd Zd Zd ZdefdZ	 xZ
S )MarkupLMFeatureExtractorao  
    Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
    strings.

    This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
    of the main methods. Users should refer to this superclass for more information regarding those methods.

    c                 >    t        | dg       t        |   di | y )Nbs4 )r   super__init__)selfkwargs	__class__s     k/var/www/html/venv/lib/python3.12/site-packages/transformers/models/markuplm/feature_extraction_markuplm.pyr   z!MarkupLMFeatureExtractor.__init__+   s    $("6"    c           
         g }g }|j                   r|n|j                  j                  D ]y  }|j                  j                   d      }|j	                  j                          |j	                  dt        |      k(  rdnt        fdt        |d      D                     |{ |j                          |j                          ||fS )NF)	recursive   r   c              3   2   K   | ]  \  }}|u s|  y w)Nr   ).0ischilds      r   	<genexpr>z6MarkupLMFeatureExtractor.xpath_soup.<locals>.<genexpr>7   s     1e1Z[_dZd!1es   )	nameparentparentsfind_allappendlennext	enumeratereverse)r   element
xpath_tagsxpath_subscriptsr    siblingsr   s         @r   
xpath_soupz#MarkupLMFeatureExtractor.xpath_soup/   s    
"<<W^^mm 	FuzzUCHejj)###h-'T1e	(TU@V1e-e E	 	  "+++r   c                 ^   t        |d      }g }g }g }|j                  D ]  }t        |t        j                  j
                        s(t        |j                        t        j                  j                  urXt        j                  |      j                         }|s~|j                  |       | j                  |      \  }}	|j                  |       |j                  |	        t        |      t        |      k7  rt        d      t        |      t        |      k7  rt        d      |||fS )Nzhtml.parserz3Number of doc strings and xtags does not correspondz3Number of doc strings and xsubs does not correspond)r	   descendants
isinstancer   r(   NavigableStringtyper    Taghtmlunescapestripr#   r,   r$   
ValueError)
r   html_string	html_codeall_doc_stringsstring2xtag_seqstring2xsubs_seqr(   text_in_this_tagr)   r*   s
             r   get_three_from_singlez.MarkupLMFeatureExtractor.get_three_from_single>   s   !+}=	 ,, 	:G'3;;#>#>?'s{{>#'==#9#?#?#A '&&'78/3w/G,
,&&z2 ''(89	: 3#77RSS3'7#88RSS1AAAr   c                 `    d}t        ||      D ]  \  }}|d| z  }|dk7  s|d| dz  } |S )N /r   [])zip)r   r)   r*   xpathtagnamesubss         r   construct_xpathz(MarkupLMFeatureExtractor.construct_xpath[   sR     -=> 	%MGTq	]"Eqy1TF!$	% r   returnc                 f   d}t        |t              rd}n9t        |t        t        f      r#t	        |      dk(  st        |d   t              rd}|st        dt        |       d      t        t        |t        t        f      xr t        |d   t                    }|s|g}g }g }|D ]t  }| j                  |      \  }}}	|j                  |       g }
t        |||	      D ])  \  }}}| j                  ||      }|
j                  |       + |j                  |
       v ||d}t        |d      }|S )	a\  
        Main method to prepare for the model one or several HTML strings.

        Args:
            html_strings (`str`, `List[str]`):
                The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **nodes** -- Nodes.
            - **xpaths** -- Corresponding xpaths.

        Examples:

        ```python
        >>> from transformers import MarkupLMFeatureExtractor

        >>> page_name_1 = "page1.html"
        >>> page_name_2 = "page2.html"
        >>> page_name_3 = "page3.html"

        >>> with open(page_name_1) as f:
        ...     single_html_string = f.read()

        >>> feature_extractor = MarkupLMFeatureExtractor()

        >>> # single example
        >>> encoding = feature_extractor(single_html_string)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])

        >>> # batched example

        >>> multi_html_strings = []

        >>> with open(page_name_2) as f:
        ...     multi_html_strings.append(f.read())
        >>> with open(page_name_3) as f:
        ...     multi_html_strings.append(f.read())

        >>> encoding = feature_extractor(multi_html_strings)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])
        ```FTr   zQHTML strings must of type `str`, `List[str]` (batch of examples), but is of type .)nodesxpathsN)datatensor_type)r/   strlisttupler$   r6   r1   boolr=   r#   rC   rG   r   )r   html_stringsvalid_strings
is_batchedrK   rL   r7   r9   r:   r;   xpath_stringsnodetag_listsub_listxpath_stringrM   encoded_inputss                    r   __call__z!MarkupLMFeatureExtractor.__call__c   sU   `  lC( MtUm4< A%LOS)I $""&|"4!5Q8 
 *\D%=AhzR^_`RacfGgi
(>L ' 	)KAEA[A[\gAh>O_.>LL)M,/Rb,c 3(h#33HhG$$\23 MM-(	) &1%4TBr   )__name__
__module____qualname____doc__r   r,   r=   rG   r   r\   __classcell__)r   s   @r   r   r   !   s+    #,B:T Tr   r   )r`   r3   feature_extraction_utilsr   r   utilsr   r   r   r   r	   
get_loggerr]   loggerr   r   r   r   <module>rf      sI     L A A ! 
		H	%V5 Vr   