
    sg`                     `   d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	Z
ddlmZ ddlmZ ddlmZ d	d
lmZmZmZmZ  e       rddlmZ  e       rddlZ ej2                  e      Zedz  ZddiZdedefdZd ZddefdZ ddede!defdZ"d Z#d Z$defdZ% ee       G d de             Z&y)z"
Fast tokenizer class for Nougat.
    N)partial)Pool)ListUnion)INIT_TOKENIZER_DOCSTRING)PreTrainedTokenizerFast)add_end_docstrings   )is_levenshtein_availableis_nltk_availableloggingrequires_backends)ratiou  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
tokenizer_fileztokenizer.jsontextreturnc                    t        j                  dd| t         j                        } t        j                  dd| t         j                        } t        j                  dd| t         j                        } | j                  dd	      } | j                  d
d      j                  dd      } t        j                  dd|       } t        j                  dd|       } t        j                  dd| t         j                        } | S )a1  
    Make text compatible with Markdown formatting.

    This function makes various text formatting adjustments to make it compatible with Markdown.

    Args:
        text (`str`):
            The input text to be made Markdown-compatible.

    Returns:
        `str`: The Markdown-compatible text.
    z%^\(([\d.]+[a-zA-Z]?)\) \\\[(.+?)\\\]$z\[\2 \\tag{\1}\]flagsz%^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\)$z\[\1 \\tag{\2}\]z3^\\\[(.+?)\\\] \(([\d.]+[a-zA-Z]?)\) (\\\[.+?\\\])$z\[\1 \\tag{\2}\] \3z\. z. z\bm{z\mathbf{z{\\bm z\\mbox{ ?\\boldmath\$(.*?)\$}z\\mathbf{\1}z^((?:http|ftp|https):\/\/(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))z[\1](\1)z```\s*(.+?)\s*```z```\n\1\n```)resubMreplaceS)r   s    f/var/www/html/venv/lib/python3.12/site-packages/transformers/models/nougat/tokenization_nougat_fast.pymarkdown_compatibler   8   s     66:<OQU]_]a]abD66:<OQU]_]a]abD66>dd	D <<%D<<-55iMD662OTJD66iD 66&RTTJDK    c           
      4   d}t        t        t        j                  || t        j                  t        j
                  z                    D ]H  }|j                         \  }}|j                  d      dz   }|j                  d      j                  |      }d}|j                  d      |dd }|j                  d      dz   }nd}{| d| | |d }
}	t        |      D ]  \  }}d}|j                         j                  d      \  }}}|s.t        j                  d	|t        j                  t        j
                  z        r|j                  d
      }||dkD  rdndd|z  z   |dkD  s|dk(  r|n|z   |j                         z   z  } |
dk(  rd}
|	|z   |
z   } K | S )a  
    Normalize lines in the given text that resemble list items. The function looks for lines that start optionally with
    '-' or '*', possibly followed by Roman numerals or digits indicating nesting levels. The function reformats such
    lines to make them more structured.

    Args:
        generation (str): The input text containing lines that need to be normalized.

    Returns:
        str: The input text with the list-like lines normalized.

    Note:
        The function uses regular expressions to identify and reformat the list-like lines. The patterns capture
        optional bullet points, nesting levels indicated by numerals, and the actual list item content. The
        normalization adjusts the bullet point style and nesting levels based on the captured patterns.
    zX(?:^)(-|\*)?(?!-|\*) ?((?:\d|[ixv])+ )?.+? (-|\*) (((?:\d|[ixv])+)\.(\d|[ixv]) )?.*(?:$)r   r
    r       Nz^[\dixv]+((?:\.[\dixv])?)+$.
	)reversedlistr   finditerIr   spangroupsplit	enumeratestrip	partitionmatchcount)
generationpatternr/   startstopdelimsplitsreplacementdelim1prepostiitemlevelpotential_numeral_rests                    r   normalize_list_like_linesrA   b   s   , jG$r{{7JbddRTTkRST .jjltA$Q%%e,;;q>%ABZF[[^c)FFv&
45(9T ( 	GAtE)-)?)?)D&q$xx68IQSQUQUXZX\X\Q\])//4QB4%<8QUeWXjE^dehlhrhrhttK	 2:D;&-
?.B r   c                 N    t        |t        |             D ]  }| |   dv s|c S  y)z
    Find the index of the next punctuation mark.

    Args:
        text (`str`):
            String to examine
        start_idx (`int`, *optional*)
            Index where to start
    )r"   ?!r#   N)rangelen)r   	start_idxr;   s      r   find_next_punctuationrH      s5     9c$i( 7++H r   min_lenc                 B   | j                         }t        |      }|d|z  k  r| S d}t        |t        |dz              D ]8  }d}t        d|      D ]   }|||z
  |z
  dz
     |||z
  dz
     k7  sd} n |s7|}: || S || d }| }	|}
|
j	                  |      r|	d|  }	|
d|  }
|
j	                  |      r|t        |
      d }|
}	 t        |t        |            }t        |ddd   t        |            }|r|r||| }|d|dz    }||v rnnH| dt        |       }|S )a  
    Attempt to truncate repeating segments in the input string.

    This function looks for the longest repeating substring at the end of the input string and truncates it to appear
    only once. To be considered for removal, repetitions need to be continuous.

    Args:
        text (`str`):
            The input raw prediction to be truncated.
        min_len (int):
            The minimum length of the repeating segment.

    Returns:
        `str`: The input string with repeated segments truncated.
       NTr   r!   F)lowerrF   rE   intendswithrH   )r   rI   
text_lowertext_lengthmax_repetition_lengthrepetition_lengthsamer;   lcssubstituted_textsubstituted_text_lowerrepeating_tailsubstituted_text_lower_outsentence_endsentence_startsentencetext_outs                    r   truncate_repetitionsr^      s     Jj/KQ[  !"7Ca,@A 	6q+, 	A+(99A=ABjQ\_`Q`cdQdFee	
 $5!	6 $
++,
-C '
 
)
)#
.+,C.C-CD!78O:O9O!P !
)
)#
.
  $: ; =>N "8
,Z=W9XY.z$B$/?E_A`aN!.>H)34FlQ6F)G&>)  5c456HOr   c                 |    d }t        | t              r ||       S g }| D ]  }|j                   ||              |S )Nc                 L    t        j                  dd|       j                         S )Nz(?:[\d_]|\*\*)r    )r   r   r-   )ss    r   _cleanzremove_numbers.<locals>._clean   s    vv'Q/5577r   )
isinstancestrappend)linesrb   outls       r   remove_numbersri      sF    8 %e}
C 

6!9Jr   c                 \   t        j                  t        |             }t        t        |       dz
        D ]  }|dz   }||   s-|t        |       dz
  k  r|dz  }||   s|t        |       dz
  k  rt        ||         dk  sLt        ||         dkD  s^t        ||         dk  spt        ||         dkD  s||   j	                  d      r||   ||   k(  st        ||   ||         dkD  sd|||  t        j                  |      d   }g }t        |      dk(  r|S d}t        t        j                  |      dkD        D ])  \  }}|s	|j                  ||   ||   dz   f       |dz   }+ |j                  ||   |d   dz   f       |D 	cg c]  }	|	d   |	d   z
  d	kD  s|	 c}	S c c}	w )
a5  
    Get slices of text based on specific criteria within the lines.

    This function identifies and returns slices of text from the input lines based on certain conditions.

    These conditions were chosen by the Nougat authors:
    - The slice is less than 200 characters long.
    - The slice is more than 3 characters long.
    - The slice does not start with "[MISSING_PAGE".
    - The slice is either the same as the next slice or the ratio of the two in terms of Levensthein distance is
      greater than 0.9.

    Args:
        lines (`List[str]`):
            The list of lines containing the text.
        clean_lines (`List[str]`):
            A cleaned version of the text (without numbers).

    Returns:
        `List[tuple]`: A list of tuples representing the start and end indices of text slices.
    r!      r
   z[MISSING_PAGE?r   rK   rL      )
npzerosrF   rE   
startswithr   wherer,   diffre   )
rf   clean_linesindicesr;   jidsslicesj0xslis
             r   
get_slicesr{      s   , hhs5z"G3u:>" Ea.QUa%7FA a.QUa%7 A#%KN#a'KN#c)KN#a'N--o>Q;q>1U;q>;WX>5Z]`5`GAaL ((7
A
CF
3x1}	
B"''#,*+ 1MM3r7CFQJ/0QB MM3r7CGaK()!:CSVc!f_r%9C:::s   F)"F)c           	      L   ||d      }t        |      }d}t        t        d|d   dz
        t        d|d   dz
        d      D ]  }| |   s	| |   dk(  r||d<    nt        |t	        | |               dk  s4|dz   |d<   t	        | t        d|dz
           j                  d      d         }t        |      d	t        |      z  k\  rt        ||      dk  r||d<   d
} n t        t        t        |       |d         t        t        |       |d   dz               D ]$  }t        |t	        | |               dk  s||d<    n t        |       |d   k  rt        |       dz
  |d<   dj                  | |d   |d   dz          }t        | |d   dz
           t        | |d            }
}		 	 t        |	      \  }}|j                         rt        |	      \  }}|j                         rt        |
      \  }}|j                         rt        |
      \  }}|j                         r||k7  rn	 |rd|v rd|j                  d      d   z   }	 t        | |d            z
  dz
  }|dkD  r|d|  }|j                         S # t        $ r Y [w xY w# t        $ r Y |j                         S w xY w)a  
    Remove a slice of text from the lines based on specific criteria.

    This function identifies a slice of text within the lines and removes it based on certain conditions.

    Args:
        lines (list of str): The list of lines containing the text.
        clean_text (list of str): A cleaned version of the text (without numbers).
        slice (tuple): A tuple representing the start and end indices of the slice to be removed.

    Returns:
        str: The removed slice of text as a single string.
    r   Fr!      rL   z## Referencesrl   z* [g      ?Tr#   N)r&   rE   maxr   ri   r.   rF   minjoinr,   next	isnumericStopIterationUnboundLocalErrorr-   )rf   
clean_textslicebasesectioncheck_start_flagline_idxpotential_ref	to_deleteiteraiterbiaaibbdeltas                   r   remove_slice_from_linesr   1  s    eAhD5kG#aqA.AuQx!|0DbI X?o-!GAJ4h89C?!AGAJ*5Q11E+F+P+PQV+WXZ+[\M=!TCI%55%m:TWZ:Z%
# #c%j%(3SUU1XPQ\5RS ~eHo67#=!GAJ 5zWQZZ!^
		%
WQZ!^<=IU71:>23YuWQZ?P5Q5E

	5kGR++-u+Q ++-5kGR++-u+Q ++-Av   EY.I//6r::	E'!*%&+a/19!'E6*I ??  		  ??s0   '<I9 $<I9 !I9 "J 9	JJ	J#"J#c                        e Zd ZdZeZddgZdZ	 	 	 	 	 	 	 d fd	Zde	de	fdZ
d	e	de	fd
Zdd	e	dede	fdZ	 	 dd	ee	ee	   f   dededee	ee	   f   fdZ xZS )NougatTokenizerFasta  
    Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods. This class mainly adds Nougat-specific
    methods for postprocessing the generated text.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.

        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
            Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
            spaces.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.

        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    	input_idsattention_maskNc                 B    t        	|   d|||||||d| || _        y )N)
vocab_filer   clean_up_tokenization_spaces	unk_token	bos_token	eos_token	pad_token )super__init__r   )
selfr   r   r   r   r   r   r   kwargs	__class__s
            r   r   zNougatTokenizerFast.__init__  s?     	 		
!))E		
 		
 %r   r   r   c                 0   |j                  d      }t        |      dk(  ryt        |      }t        ||      }g }|D ]  }|j	                  t        |||               t        |      D ]  }|j                  |d      } t        j                  dd|      }|S )av  
        Remove hallucinated or missing references from the text.

        This function identifies and removes references that are marked as missing or hallucinated from the input text.

        Args:
            text (`str`):
                The input text containing references.

        Returns:
            `str`: The text with hallucinated references removed.
        r#   r   r    z

[MISSING_PAGE_POST]

z,## References\n+\[MISSING_PAGE_POST(:\d+)?\]z

[MISSING_PAGE_POST\1])
r+   rF   ri   r{   re   r   r%   r   r   r   )r   r   rf   rs   rw   r   r   s          r   remove_hallucinated_referencesz2NougatTokenizerFast.remove_hallucinated_references  s     

4 u:?$U+E;/	 	QE4UKOP	Q!), 	JI<<	+HID	Jvv;(

 r   r1   c                    |j                  d      D ]Q  }|j                  d      dkD  s)|j                  d      dkD  s|j                  d      dkD  s@|j                  |d      }S |j                  d	d
      }|j                  dd      }|j                  dd      }t        j                  dd|t        j
                        }|j                  dd      }|j                  dd      }|S )a  
        Takes a generated string and fixes tables/tabulars to make them match the markdown format needed.

        Args:
            generation (str): The generated text to be postprocessed.

        Returns:
            str: The postprocessed text.

        Example:

        ```python
        correct_tables("\begin{table} \begin{tabular}{l l} & \ \end{tabular} \end{table}")
        "\begin{table}
\begin{tabular}{l l} & \ \end{tabular}
\end{table}"
        ```
        r#   z\begin{tabular}rm   z\multicolumn<   &i  r    z\begin{table} \begin{tabular}z\begin{table}
\begin{tabular}z\end{tabular} \end{table}z\end{tabular}
\end{table}z\end{table} Tabz\end{table}
Tabz(^.+)\\begin{tabz\1\n\\begin{tabr   z(\begin{tabular}{l l}  & \\ \end{tabular}z \begin{tabular}{}

\end{tabular})r+   r0   r   r   r   r   )r   r1   rh   s      r   correct_tablesz"NougatTokenizerFast.correct_tables  s    $ !!$' 	7Aww)*R/177?3Kb3PTUT[T[\_T`cfTf'//26
	7
  ''(IKmn
''(EGef
''(:<OP
VV/1CZWYW[W[\
  ''(SUWX
''(NPRS
r   fix_markdownc                    t        j                  dd|      }|j                         }|j                  dd      }t        j                  dd|t         j                        }|j                  d      }|d   j                  d	      r`|d   j                  d	      j                  d
      r=t        |      dkD  r/t        j                  d|d   z          dj                  |dd       }t        |      }| j                  |      }t        j                  dd|t         j                        }t        j                  dd|t         j                        }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        |      }|j                  d      r|dz  }t        j                   d|      r|d
z  }n|j                  d      rd|z   }nm|j                  d      d   j                  d      r|dz   }nD	 |j                  d
      d   }|t"        j$                  j&                  j'                         v r|d
z  }| j+                  |      }|j                  dd      }t        j                  d d|      }t        j                  d!d|      }t        j                  d"d|t         j                        }t        j                  d#d|t         j                        }t        j                  d$d|      }|rt-        |      S |S # t(        $ r |d
z  }Y w xY w)%a  
        Postprocess a single generated text. Regular expressions used here are taken directly from the Nougat article
        authors. These expressions are commented for clarity and tested end-to-end in most cases.

        Args:
            generation (str): The generated text to be postprocessed.
            fix_markdown (bool, optional): Whether to perform Markdown formatting fixes. Default is True.

        Returns:
            str: The postprocessed text.
        z(?:\n|^)#+ \d*\W? ?(.{100,})z\n\1z
* [leftmargin=*]
r#   z'^#+ (?:\.?(?:\d|[ixv])+)*\s*(?:$|\n\s*)r    r   rL   #r   r!   z2Likely hallucinated title at the end of the page: Nz#^\* \[\d+\](\s?[A-W]\.+\s?){10,}.*$z^(\* \[\d+\])\[\](.*)$z\1\2z(^\w\n\n|\n\n\w$)z8([\s.,()])_([a-zA-Z0-9])__([a-zA-Z0-9]){1,3}_([\s.,:()])z\1\(\2_{\3}\)\4z$([\s.,\d])_([a-zA-Z0-9])_([\s.,\d;])z
\1\(\2\)\3z;(\nFootnote .*?:) (?:footnotetext|thanks):\W*(.*(?:\n\n|$))z\1 \2z$\[FOOTNOTE:.+?\](.*?)\[ENDFOOTNOTE\])r"   }z

z[A-Z0-9,;:]$)r   z**z\begin)r   FigureTablez\begin{array}[]{z\begin{array}{z?\\begin{tabular}{([clr ]){2,}}\s*[& ]*\s*(\\\\)? \\end{tabular}z(\*\*S\. A\. B\.\*\*\n+){2,}z^#+( [\[\d\w])?$z^\.\s*$z\n{3,})r   r   r-   r   r   r+   rp   lstriprF   loggerinfor   r^   r   rA   rO   r/   nltkcorpuswordsLookupErrorr   r   )r   r1   r   rf   	last_words        r   post_process_singlez'NougatTokenizerFast.post_process_single  s    VV+Wj

  %%'
''(>E
 VVFJ^`^b^bc
  &9$r)9)9#)>)I)I#)NSVW\S]`aSaKKLuUWyXY5":.J)*5
88D
VVBB
Z\Z^Z^_
VV5w
RTRVRVW
VV0"jA
VVG


 VVC]T^_
VVJ

 VVCRT
.z:
z*& J88OZ0#J""#9:*,Jd#B'223KL#f,J"&,,S1"5	 1 1 7 7 99#%J ((4
''(;=NO
VVN

 VV;RL
VV/ZrttL
VVJJbddC
VVIvz:
&z223  "c!
"s   AM	 	MMnum_workersc                 d   t        | ddg       t        |t              ro|Lt        |t              r<t	        |      5 }|j                  t        | j                  |      |      cddd       S |D cg c]  }| j                  ||       c}S | j                  ||      S # 1 sw Y   yxY wc c}w )aP  
        Postprocess a generated text or a list of generated texts.

        This function can be used to perform postprocessing on generated text, such as fixing Markdown formatting.

        Postprocessing is quite slow so it is recommended to use multiprocessing to speed up the process.

        Args:
            generation (Union[str, List[str]]):
                The generated text or a list of generated texts.
            fix_markdown (`bool`, *optional*, defaults to `True`):
                Whether to perform Markdown formatting fixes.
            num_workers (`int`, *optional*):
                Optional number of workers to pass to leverage multiprocessing (postprocessing several texts in
                parallel).

        Returns:
            Union[str, List[str]]: The postprocessed text or list of postprocessed texts.
        r   levenshteinN)r   )r   rc   r&   rN   r   mapr   r   )r   r1   r   r   pra   s         r   post_process_generationz+NougatTokenizerFast.post_process_generationP  s    2 	$ 78j$'&:k3+G+& k!55)A)AP\!]_ijk k YccST000Ncc++J\+RRk k ds   'B!1B-!B*)NNFz<unk>z<s>z</s>z<pad>)T)TN)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesslow_tokenizer_classr   rd   r   r   boolr   r   r   rN   r   __classcell__)r   s   @r   r   r   r  s    B *$&67 %*%.3 3 <! ! !F]c ] ]QT ]D "	"S#tCy.)"S "S 	"S
 
sDI~	"Sr   r   )r   )   )'r   r   	functoolsr   multiprocessingr   typingr   r   numpyrn   $transformers.tokenization_utils_baser   $transformers.tokenization_utils_fastr   transformers.utilsr	   utilsr   r   r   r   Levenshteinr   r   
get_loggerr   r   r   rd   r   rA   rH   rN   r^   ri   r{   r   r   r   r   r   <module>r      s    
      I H 1 \ \ ! 
		H	%     &'78 'c 'c 'T9x $As AS A# AH	.;b> >B ,-S1 S .Sr   