
    sg%                         d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZmZ dd	lmZ  G d
 ded      Z ej*                  e      Z G d de      Zy)z
Processor class for Donut.
    N)contextmanager)ListOptionalUnion   )
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)loggingc                       e Zd Zi Zy)DonutProcessorKwargsN)__name__
__module____qualname__	_defaults     ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/donut/processing_donut.pyr   r      s    Ir   r   F)totalc            
            e Zd ZdZddgZdZdZd fd	Z	 	 	 	 ddede	e
eee   eef      d	ee   fd
Zd Zd Zed        ZddZed        Zed        Z xZS )DonutProcessora  
    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
    processor.

    [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
    [`~DonutProcessor.decode`] for more information.

    Args:
        image_processor ([`DonutImageProcessor`], *optional*):
            An instance of [`DonutImageProcessor`]. The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    image_processor	tokenizerAutoImageProcessorAutoTokenizerc                     d }d|v r+t        j                  dt               |j                  d      }||n|}|t	        d      |t	        d      t
        |   ||       | j                  | _        d| _	        y )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.F)
warningswarnFutureWarningpop
ValueErrorsuper__init__r   current_processor_in_target_context_manager)selfr   r   kwargsr    	__class__s        r   r'   zDonutProcessor.__init__9   s     &(MM
 !'

+> ?-<-H/N_"HIIABB)4!%!5!5*/'r   imagestextr+   c                    |j                  dd      }|rt        j                  d       | j                  r | j                  ||fi |S ||t        d       | j                  t        fd| j                  j                  i|}| | j                  |fi |d   }|/|s||d   j                  dd	        | j                  |fi |d   }	|S |	S 	d
   d<   |	d
   |d
<   |S )a  
        When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
        [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
        [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
        [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
        legacyTa_  Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, if both images and text are provided, the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. To test the new behavior, set `legacy=False`as a processor call argument.zBYou need to specify either an `images` or `text` input to process.tokenizer_init_kwargsimages_kwargstext_kwargsadd_special_tokensF	input_idslabels)r$   loggerwarning_oncer)   r(   r%   _merge_kwargsr   r   init_kwargsr   
setdefault)
r*   r-   r.   audiovideosr+   r0   output_kwargsinputs	encodingss
             r   __call__zDonutProcessor.__call__M   s0    Hd+\ **)4))&$A&AA>dlabb*** 
"&.."<"<
 
 )T))&SM/4RSFf0m,778LeT&tL}]/KLI<M^(5F8"+K"8F;Mr   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder*   argsr+   s      r   rC   zDonutProcessor.batch_decode   s     
 +t~~**D;F;;r   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r   decoderD   s      r   rG   zDonutProcessor.decode   s     
 %t~~$$d5f55r   c              #      K   t        j                  d       d| _        | j                  | _        d | j
                  | _        d| _        yw)z
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
        z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your images inputs, or in a separate call.TNF)r!   r"   r)   r   r(   r   r*   s    r   as_target_processorz"DonutProcessor.as_target_processor   sH     
 	9	

 +/'!%!%!5!5*/'s   AAc                    || j                   j                         }i }|r4t        j                  d|t        j                        }|n|j                  d      }t        j                  |      }t        j                  d| d|t        j                        }|j                         }||j                  |d      }n|j                         }t        j                  |      }	t        j                  |      }
t        j                  |	 d|
 |t        j                  t        j                  z        }||j                  d      j                         }d|v r3d|v r/| j                  |d	|
      }|rt        |      dk(  r|d   }|||<   ntg ||<   |j                  d      D ]?  }|j                         }||v r|d   dk(  r|dd dk(  r|dd }||   j                  |       A t        ||         dk(  r||   d   ||<   ||j                  |      t        |      z   d j                         }|dd dk(  r|g| j                  |dd d	|
      z   S |r4t        |      r|r|gS |S |rg S d|iS )zS
        Convert a (generated) token sequence into an ordered JSON format.
        Nz	<s_(.*?)>   z</s_> z(.*?)z<s_T)is_inner_valueadded_vocabr   z<sep/><z/>   text_sequence)r   get_added_vocabresearch
IGNORECASEgroupescapereplaceDOTALLstrip
token2jsonlensplitappendfind)r*   tokensrO   rP   outputstart_tokenkeykey_escaped	end_tokenstart_token_escapedend_token_escapedcontentvalueleafs                 r   r^   zDonutProcessor.token2json   s{    ..88:K))L&"--HK"##A&C))C.K		T+a"8&"--PI%++-K R8%OO-	&(ii&<#$&IIi$8!))*+51B0CDfbmm^`^g^gNg &%mmA.446G(W-? $Ze f "5zQ(-a*/F3K&(s$+MM)$< 5D#'::<D#{2tAw#~$rs)W[J['+Abz"3K..t4	5
 vc{+q0*0+a.F3KI 6Y G IJPPR"1:*"8doofQRjQUcno&oooM P v;-F8969'2Fov-FFr   c                 N    t        j                  dt               | j                  S )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r!   r"   r#   image_processor_classrI   s    r   feature_extractor_classz&DonutProcessor.feature_extractor_class   s"    u	
 )))r   c                 N    t        j                  dt               | j                  S )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r!   r"   r#   r   rI   s    r   r    z DonutProcessor.feature_extractor   s"    i	
 ###r   )NN)NNNN)FN)r   r   r   __doc__
attributesro   tokenizer_classr'   r   r   r   strr   r   r   r   r   rA   rC   rG   r   rJ   r^   propertyrp   r    __classcell__)r,   s   @r   r   r   %   s     $[1J0%O0, "NR33 uS$s)Y8IIJK3 -.3j<6 0 04Gl * * $ $r   r   )rr   rV   r!   
contextlibr   typingr   r   r   image_utilsr   processing_utilsr	   r
   r   tokenization_utils_baser   r   utilsr   r   
get_loggerr   r7   r   r   r   r   <module>r      s[    
  % ( ( % H H C +5  
		H	%$^ $r   