
    sg$                     ^   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ  ej8                  e      Z e ej@                               Z! e"d e!D              Z#e G d d             Z$ G d de      Z% G d de      Z&y)    N)	dataclassfield)Enum)DictListOptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)logging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   4   K   | ]  }|j                     y wN)
model_type).0confs     S/var/www/html/venv/lib/python3.12/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s     EDOOEs   c                      e Zd ZU dZ eddddj                  e      z   i      Zee	d<    edddi      Z
ee	d	<    ed
ddi      Zee	d<    ed
ddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    eddd i      Zee	d!<    ed"dd#i      Zee	d$<   y)%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r!   r#   intr$   r&   r(   r)   boolr*   r+   floatr-   r.   r0        r   r   r   %   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmr=   r   c                       e Zd ZdZdZy)SplittraindevN)r1   r2   r3   r@   rA   r<   r=   r   r?   r?   h   s    E
Cr=   r?   c                       e Zd ZU dZeed<   ee   ed<   eed<   e	ed<   dej                  dddfded	ed
ee   deeef   dee	   dee   dee   fdZd Zdeeej(                  f   fdZy)SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    || _         || _        |j                  r
t               n	t	               | _        t        |t              r
	 t        |   }|| _
        |j                  rdnd}t        j                  j                  ||n|j                  d|j                   d|j                   j"                   d|j$                   d|       }	|	dz   }
t'        |
      5  t        j                  j)                  |	      r|j*                  st-        j,                         }t/        j0                  |	      | _        | j2                  d   | _        | j2                  j7                  dd       | _        | j2                  j7                  d	d       | _        t<        j?                  d
|	 dt-        j,                         |z
         | j8                  | j:                  dt<        jA                  d|	 d       nI|t        jB                  k(  r+| j
                  jE                  |j                        | _        n*| j
                  jG                  |j                        | _        tI        | j:                  ||j$                  |jJ                  |jL                  |t        jN                  k(  |jP                  |      \  | _        | _        t-        j,                         }t/        jR                  | j4                  | j8                  | j:                  d|	       t<        j?                  d|	 dt-        j,                         |z
  dd       d d d        y # t        $ r t        d      w xY w# 1 sw Y   y xY w)Nzmode is not a valid split namev2v1cached__z.lockrE   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rS   rI   r#   r$   r&   is_trainingr0   return_dataset)rE   rR   rS   z!Saving features into cached file z [took z.3fz s])*rD   rG   r*   r   r   	processor
isinstancer7   r?   KeyErrorrF   ospathr5   r!   value	__class__r1   r#   r
   existsr)   timetorchloadold_featuresrE   getrR   rS   loggerinfowarningrA   get_dev_examplesget_train_examplesr   r$   r&   r@   r0   save)selfrD   rI   rJ   rF   rG   rK   rL   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__w   s    	%:"/3/K/K)+QaQcdC AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i  ,	ww~~23D<P<P		$)JJ/C$D! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqU,	 ,	  A?@@A,	 ,	s   	L< IM<MMc                 ,    t        | j                        S r   )lenrE   )ri   s    r   __len__zSquadDataset.__len__   s    4==!!r=   returnc                 (   | j                   |   }t        j                  |j                  t        j                        }t        j                  |j
                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }|||d}	| j                  j                  dv r|	d= | j                  j                  dv r|	j                  ||d       | j                  j                  r|	j                  d|i       | j                  rW|	j                  dt        j                   |j"                  t        j$                        | j                  j&                  z  i       | j(                  t*        j,                  k(  rrt        j                  |j.                  t        j                        }
t        j                  |j0                  t        j                        }|	j                  |
|d	       |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertrw   )xlnetrx   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rE   r_   tensorru   longrv   rw   r}   r~   r;   r   rD   r   updater*   rG   onesshapeint64r.   rF   r?   r@   start_positionend_position)ri   ifeatureru   rv   rw   r}   r~   r   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	VDEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMoP]^_r=   )r1   r2   r3   r4   r   r8   r   r   r?   r:   r@   r   r   r9   r	   r7   rn   rq   r   r_   Tensorr   r<   r=   r   rC   rC   m   s     %$=!!
K '+"'++05#'(,I(I 'I sm	I
 CJI  (~I C=I !IV" S%,,%6 7  r=   rC   )'rY   r^   dataclassesr   r   enumr   typingr   r   r   r	   r_   filelockr
   torch.utils.datar   models.auto.modeling_autor   tokenization_utilsr   utilsr   processors.squadr   r   r   r   
get_loggerr1   rc   listkeysMODEL_CONFIG_CLASSEStupler6   r   r?   rC   r<   r=   r   <module>r      s    
  (  . .   $ M 5  t t 
		H	%E@EEGH E0DEE ?n ?n ?nDD 
x7 xr=   