
    sg\                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ  ej&                  e      ZdZ G d	 d
e      Z G d de      Z G d de      Z G d de      Z G d de      Zy)    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c            
       Z    e Zd ZdZ	 	 ddedededee   fdZd Z	d	e
j                  fd
Zy)TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    N	tokenizer	file_path
block_size	cache_dirc           
         t        j                  t        j                  d      t               t
        j                  j                  |      du rt        d| d      ||j                  d      z
  }t
        j                  j                  |      \  }}t
        j                  j                  ||n|d|j                  j                   d| d|       }|dz   }	t        |	      5  t
        j                  j                  |      rv|stt!        j                          }
t#        |d	      5 }t%        j&                  |      | _        d d d        t*        j-                  d
| dt!        j                          |
z
         nAt*        j-                  d|        g | _        t#        |d      5 }|j/                         }d d d        |j1                  |j3                              }t5        dt7        |      |z
  dz   |      D ]2  }| j(                  j9                  |j;                  ||||z                 4 t!        j                          }
t#        |d      5 }t%        j<                  | j(                  |t$        j>                         d d d        t*        j-                  d| dt!        j                          |
z
  dd       d d d        y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   ^xY w# 1 sw Y   y xY w)Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_textis                   _/var/www/html/venv/lib/python3.12/site-packages/transformers/data/datasets/language_modeling.py__init__zTextDataset.__init__-   s    	&&u 		
 77>>)$-/	{*EFF)"E"E5"E"QQ
 ggmmI6	8!ww||".II,,556a
|1XJO 
 )72	i  	ww~~23O		.5 8$*KK$7DM889M8Nn]_c_h_h_jmr_r
 Ei[QR ")g6 $!668D$ "+!@!@ASASTXAY!Zq#n"5
"BQ"F
S AMM((!BB>RSVWZdVdCef 		.5 YKKv@W@WXY78L7MWUYU^U^U`chUhilTmmpq;	 	8 8$ $Y Y7	 	sW   &AK,(KA%K,(K9BK,1K :K,K	K,K	K, K)	%K,,K5c                 ,    t        | j                        S NrD   r=   rI   s    rU   __len__zTextDataset.__len__j       4==!!    returnc                 f    t        j                  | j                  |   t         j                        S )Ndtype)torchtensorr=   longrI   rT   s     rU   __getitem__zTextDataset.__getitem__m   s     ||DMM!,EJJ??r]   )FN)r7   
__module____qualname____doc__r	   strintr   rV   r[   rb   Tensorrf    r]   rU   r   r   (   sV     #';&; ; 	; C=;z"@ @r]   r   c                   T    e Zd ZdZdededefdZd Zde	ee
j                  f   fdZy	)
LineByLineTextDatasetr   r   r   r   c                    t        j                  t        j                  d      t               t
        j                  j                  |      du rt        d| d      t        j                  d|        t        |d      5 }|j                         j                         D cg c]$  }t        |      dkD  s|j                         r#|& }}d d d         |d	d	|
      }|d   | _        | j                   D cg c])  }dt#        j$                  |t"        j&                        i+ c}| _        y c c}w # 1 sw Y   ixY wc c}w )Nr   Fr   r   r   r   r    r   Tadd_special_tokens
truncation
max_length	input_idsr`   )r*   r+   r,   r-   r.   r/   r0   r1   r2   r>   r?   r:   r@   
splitlinesrD   isspacer=   rb   rc   rd   )	rI   r   r   r   rQ   linelinesbatch_encodinges	            rU   rV   zLineByLineTextDataset.__init__v   s   &&u 		
 77>>)$-/	{*EFF 	=i[IJ)g. 	g!&'ffh&9&9&;fdD	AVZVbVbVdTfEf	g #5Td_ij&{3SWS`S`aa+u||AUZZ'HIa	 g	g 	g
 bs0   !D7$D28D2	D2D7;.E2D77E c                 ,    t        | j                        S rX   rY   rZ   s    rU   r[   zLineByLineTextDataset.__len__   r\   r]   r^   c                      | j                   |   S rX   r=   re   s     rU   rf   z!LineByLineTextDataset.__getitem__       }}Qr]   Nr7   rg   rh   ri   r	   rj   rk   rV   r[   r   rb   rc   rf   rm   r]   rU   ro   ro   q   sF    b"5 b# bSV b*" S%,,%6 7  r]   ro   c                   X    e Zd ZdZdedededefdZd Zde	ee
j                  f   fd	Zy
)LineByLineWithRefDatasetr   r   r   r   ref_pathc                 l   t        j                  t        j                  d      t               t
        j                  j                  |      du rt        d| d      t
        j                  j                  |      du rt        d| d      t        j                  d|        t        j                  d|        t        |d	      5 }|j                         }d d d        D cg c]2  }t        |      d
kD  s|j                         r#|j                         4 }}t        |d	      5 }|j!                         j#                         D cg c]7  }t        |      d
kD  s|j                         r#t%        j&                  |      9 }}d d d        t        |      t              k7  r)t        d| dt        |       d| dt        |              ||dd|      }	|	d   | _        | j(                  D 
cg c])  }
dt+        j,                  |
t*        j.                        i+ c}
| _        t        | j(                        }t1        |      D ]:  }t+        j,                  ||   t*        j.                        | j(                  |   d<   < y # 1 sw Y   xY wc c}w c c}w # 1 sw Y   xY wc c}
w )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r    r   zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trq   ru   r`   chinese_ref)r*   r+   r,   r-   r.   r/   r0   r1   r2   r>   r?   r:   	readlinesrD   rw   stripr@   rv   jsonloadsr=   rb   rc   rd   rC   )rI   r   r   r   r   rQ   datarx   refrz   r{   nrT   s                rU   rV   z!LineByLineWithRefDataset.__init__   sQ   &&y 		
 77>>)$-/	{*EFF77>>(#u,~i[
CDD 	=i[IJ1(<=)g. 	!!;;=D	!)-VTQt||~

VV(W- 	q010C0C0Ep#d)VW-`d`l`l`n4::d#pCp	qt9C VW`Vaaefijnfoep q##+*DS
< 
 #4DT^hi&{3SWS`S`aa+u||AUZZ'HIaq 	UA.3ll3q6.TDMM!]+	U#	! 	!V q	q 	q bsN   J(J<JJ.!J$J#J4JJ$9.J1JJ$$J.c                 ,    t        | j                        S rX   rY   rZ   s    rU   r[   z LineByLineWithRefDataset.__len__   r\   r]   r^   c                      | j                   |   S rX   r~   re   s     rU   rf   z$LineByLineWithRefDataset.__getitem__   r   r]   Nr   rm   r]   rU   r   r      sP    "U"5 "U# "USV "Ube "UH" S%,,%6 7  r]   r   c                   \    e Zd ZdZdededefdZddZd Z	de
eej                  f   fd	Zy
)LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    r   file_dirr   c                    t        j                  t        j                  d      t               t
        j                  j                  |      du rt        | d      t        j                  d|        g | _        t        j                  |      D ]$  }t
        j                  j                  ||      }t
        j                  j                  |      du rt        | d      d}t        |d      5 }|j!                         }g }	|D ]  }
d|
v rd	}
d
|
v rd}|	dd  D 
cg c]@  }
t#        |
      dkD  r0|
j%                         s |j'                  |j)                  |
            B }}
| j+                  |||      }| j                  j-                  |       g }	|s|	j/                  |
        	 d d d        ' t        j                  d       y c c}
w # 1 sw Y   MxY w)Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r    z<doc id=Tz</doc>r"   r   zDataset parse finished.)r*   r+   r,   r-   r.   r/   r0   isdirr2   r>   r?   r=   listdirr5   r1   r:   r   rD   rw   rA   rB   create_examples_from_documentextendrE   )rI   r   r   r   	file_namer   article_openrQ   original_linesarticle_linesrx   documentr=   s                rU   rV   z%LineByLineWithSOPTextDataset.__init__   s   &&u 		
 77=="e+z)<=>>DXJOP H- 	7IXy9Iww~~i(E1 I;n!=>> Li'2 7a!" "* 7D!T)'+!T)', )6ab(9$ $ #D	Adlln &;;I<N<Nt<TU$ $ $(#E#EhPZ\e#f,,X6(*')006!77 7	74 	-.$7 7s%   3,GAG
$5GGGG	c                    ||j                  d      z
  }|}t        j                         |k  rt        j                  d|      }g }g }d}	d}
|
t        |      k  r%||
   }|s|
dz  }
|j	                  |       |	t        |      z  }	|
t        |      dz
  k(  s|	|k\  r|rd}t        |      dk\  r"t        j                  dt        |      dz
        }g }t        |      D ]  }|j                  ||           g }t        |t        |            D ]  }|j                  ||           t        |      dk(  st        |      dk(  rt        j                         dk  rd}||}}nd}d } ||||       t        |      dk\  st        d	t        |       d
      t        |      dk\  st        dt        |       d
      |j                  ||      }|j                  ||      }t        j                  |t        j                        t        j                  |t        j                        t        j                  |rdndt        j                        d}|j	                  |       g }d}	|
dz  }
|
t        |      k  r%|S )'Creates examples for a single document.Tr      r   r"         ?Fc                     	 t        |       t        |      z   }||k  ryt        |       t        |      kD  r| n|}t        |      dk\  st        d      t        j                         dk  r|d= n|j                          })z;Truncates a pair of sequences to a maximum sequence length.r"   z8Sequence length to be truncated must be no less than oner   r   N)rD   r2   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokenss        rU   truncate_seq_pairzULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pair-  sz    "+.x=3x=+HL+~= %7:8}s8}7T8ZbL$'$5$:&01k&l l  &}}4$0O , 0 0 2 #r]   Length of sequence a is  which must be no less than 1Length of sequence b is r`   )ru   token_type_idssentence_order_label)r3   r   randintrD   rE   rC   r   r2   rF   $create_token_type_ids_from_sequencesrb   rc   rd   )rI   r   r   r   short_seq_probr   target_seq_lengthr=   current_chunkcurrent_lengthrT   segmenta_endr   jr   is_nextr   ru   r   examples                        rU   r   z:LineByLineWithSOPTextDataset.create_examples_from_document   s    $i&I&It&I&TT +==?^+ &q. A #h-qkGQ  )c'l*NCMA%%;L)L E=)Q. &q#m2Dq2H I!H"5\ : a(89:  "H"5#m*<= : a(89: 8})S]a-?  }},"'-5x("&3  &h.IMQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !* J J8U] ^I%.%S%ST\^f%gN &+\\)5::%N*/,,~UZZ*X05'QqX]XbXb0cG
 OOG, "!"FAM #h-N r]   c                 ,    t        | j                        S rX   rY   rZ   s    rU   r[   z$LineByLineWithSOPTextDataset.__len__S  r\   r]   r^   c                      | j                   |   S rX   r~   re   s     rU   rf   z(LineByLineWithSOPTextDataset.__getitem__V  r   r]   N)皙?)r7   rg   rh   ri   r	   rj   rk   rV   r   r[   r   rb   rc   rf   rm   r]   rU   r   r      sJ    '/"5 '/ '/RU '/RaF" S%,,%6 7  r]   r   c                   X    e Zd ZdZ	 	 	 ddededefdZdeee      dedefdZ	d	 Z
d
 Zy)$TextDatasetForNextSentencePredictionr   r   r   r   c           	         t        j                  t        j                  d      t               t
        j                  j                  |      st        d| d      || _	        || _
        t
        j                  j                  |      \  }}t
        j                  j                  |d|j                  j                   d| d|       }	|| _        |	dz   }
t!        |
      5  t
        j                  j#                  |	      rv|stt%        j$                         }t'        |	d      5 }t)        j*                  |      | _        d d d        t.        j1                  d|	 d	t%        j$                         |z
         nt.        j1                  d
|        g g| _        t'        |d      5 }	 |j5                         }|sn|j7                         }|s6t9        | j2                  d         dk7  r| j2                  j;                  g        |j=                  |      }|j?                  |      }|r| j2                  d   j;                  |       	 d d d        t.        j1                  dt9        | j2                         d       g | _        tA        | j2                        D ]  \  }}| jC                  |||        t%        j$                         }t'        |	d      5 }t)        jD                  | j,                  |t(        jF                         d d d        t.        j1                  d|	 dt%        j$                         |z
  dd       d d d        y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   ^xY w# 1 sw Y   y xY w)Nr   r   r   cached_nsp_r   r   r   r   r   r   r   r    r   zCreating examples from z documents.r#   r$   r&   r'   r(   r)   )$r*   r+   r,   r-   r.   r/   r0   r1   r2   short_seq_probabilitynsp_probabilityr4   r5   r6   r7   r   r   r8   r9   r:   r;   r<   r=   r>   r?   	documentsreadliner   rD   rE   rB   rA   	enumerater   rG   rH   )rI   r   r   r   rJ   r   r   rK   rL   rM   rN   rO   rP   rQ   rx   tokens	doc_indexr   s                     rU   rV   z-TextDatasetForNextSentencePrediction.__init___  s    	&&u 		
 ww~~i(/	{*EFF%:". ggmmI6	8!ww||)--667qAhZP 

 # )72	 i  %	ww~~23O		.5 8$*KK$7DM889M8Nn]_c_h_h_jmr_r Ei[QR"$)g6 >! zz|#!#zz|  $DNN2,>(?1(D NN11"5!*!3!3D!9!*!@!@!H! NN2.55f=  "	> 5c$..6I5J+VW "+4T^^+D X'Ix66xJWX 		.5 YKKv@W@WXY78L7MWUYU^U^U`chUhilTmmpqG%	 %	8 8> >*Y YC%	 %	sX    AM)"M=A&M)#B MBM)1M :M)M	M)M	M)M&	"M))M2r   r   c                    || j                   j                  d      z
  }|}t        j                         | j                  k  rt        j                  d|      }g }d}d}|t        |      k  r||   }	|j                  |	       |t        |	      z  }|t        |      dz
  k(  s||k\  r|rd}
t        |      dk\  r"t        j                  dt        |      dz
        }
g }t        |
      D ]  }|j                  ||           g }t        |      dk(  s!t        j                         | j                  k  rd}|t        |      z
  }t        d      D ]5  }t        j                  dt        | j                        dz
        }||k7  s5 n | j                     }t        j                  dt        |      dz
        }t        |t        |            D ]&  }|j                  ||          t        |      |k\  s& n t        |      |
z
  }||z  }n0d}t        |
t        |            D ]  }|j                  ||           t        |      dk\  st        dt        |       d	      t        |      dk\  st        d
t        |       d	      | j                   j                  ||      }| j                   j                  ||      }t        j                  |t        j                         t        j                  |t        j                         t        j                  |rdndt        j                         d}| j"                  j                  |       g }d}|dz  }|t        |      k  ryy)r   Tr   r   r   r"   
   Fr   r   r   r`   )ru   r   next_sentence_labelN)r   r3   r   r   r   rD   rE   rC   r   r   r   r2   rF   r   rb   rc   rd   r=   )rI   r   r   r   r   r   r   r   rT   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsru   r   r   s                           rU   r   zBTextDatasetForNextSentencePrediction.create_examples_from_document  s/    $dnn&N&NTX&N&YY +==?T777 &q. A#h-qkG  )c'l*NCMA%%;L)L  E=)Q. &q#m2Dq2H I!H"5\ : a(89:  "H=)Q.&--/DDXDX2X)-*;c(m*K "'r &A4:NN1c$..FY\]F]4^14	A %&
 +/..9N*O'-~~a_9MPQ9Q'R!&|S5I!J &A$OOOA,>?"8}? %& /2-.@5.H+00 */!&uc-.@!A >A$OOM!,<=>  MQ.(+CCM?Ro)pqqMQ.(+CCM?Ro)pqq !% O OPXZb cI%)^^%X%XYack%lN &+\\)5::%N*/,,~UZZ*X/4||AUV^c^h^h/iG MM((1 "!"FAI #h-r]   c                 ,    t        | j                        S rX   rY   rZ   s    rU   r[   z,TextDatasetForNextSentencePrediction.__len__  r\   r]   c                      | j                   |   S rX   r~   re   s     rU   rf   z0TextDatasetForNextSentencePrediction.__getitem__  r   r]   N)Fr   r   )r7   rg   rh   ri   r	   rj   rk   rV   r   r   r[   rf   rm   r]   rU   r   r   Z  sk     !S&S S 	SjXd49o XRU Xcf Xt" r]   r   )r   r/   r;   r   r9   r*   typingr   r   r   rb   filelockr   torch.utils.datar   tokenization_utilsr	   utilsr
   
get_loggerr7   r>   r,   r   ro   r   r   r   rm   r]   rU   <module>r      s     	     ' '   $ 5  
		H	%L F@' F@R G  B- w - `U 7 U px 7 x r]   