
    sg6                         d dl Z d dlZd dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
mZ  ej                  e      Ze G d d             Z ed	       G d
 d             Z G d d      Z G d de      Zy)    N)	dataclass)ListOptionalUnion   )is_tf_availableis_torch_availableloggingc                   T    e Zd ZU dZeed<   eed<   dZee   ed<   dZee   ed<   d Z	y)InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 \    t        j                  t        j                  |       d      dz   S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictselfs    U/var/www/html/venv/lib/python3.12/site-packages/transformers/data/processors/utils.pyto_json_stringzInputExample.to_json_string1   s#    zz+,,T21=DD    )
__name__
__module____qualname____doc__str__annotations__r   r   r   r    r   r   r   r      s5     IK FHSM E8C=Er   r   T)frozenc                   z    e Zd ZU dZee   ed<   dZeee      ed<   dZ	eee      ed<   dZ
eeeef      ed<   d Zy)InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 X    t        j                  t        j                  |             dz   S )r   r   r   r   s    r   r   zInputFeatures.to_json_stringK   s!    zz+,,T23d::r   )r    r!   r"   r#   r   intr%   r+   r   r,   r   r   floatr   r&   r   r   r)   r)   6   sV     Cy*.NHT#Y'.*.NHT#Y'.)-E8E#u*%&-;r   r)   c                   F    e Zd ZdZd Zd Zd Zd Zd Zd Z	e
d
d	       Zy)DataProcessorzEBase class for data converters for sequence classification data sets.c                     t               )z
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   tensor_dicts     r   get_example_from_tensor_dictz*DataProcessor.get_example_from_tensor_dictS   s     "##r   c                     t               )z8Gets a collection of [`InputExample`] for the train set.r3   r   data_dirs     r   get_train_examplesz DataProcessor.get_train_examples]       !##r   c                     t               )z6Gets a collection of [`InputExample`] for the dev set.r3   r8   s     r   get_dev_exampleszDataProcessor.get_dev_examplesa   r;   r   c                     t               )z7Gets a collection of [`InputExample`] for the test set.r3   r8   s     r   get_test_exampleszDataProcessor.get_test_examplese   r;   r   c                     t               )z*Gets the list of labels for this data set.r3   r   s    r   
get_labelszDataProcessor.get_labelsi   r;   r   c                     t        | j                               dkD  r+| j                         t        |j                           |_        |S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenrA   r.   r   )r   examples     r   tfds_mapzDataProcessor.tfds_mapm   s9    
 t !A% OO-c'--.@AGMr   Nc                     t        |dd      5 }t        t        j                  |d|            cddd       S # 1 sw Y   yxY w)z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openlistcsvreader)cls
input_filerL   fs       r   	_read_tsvzDataProcessor._read_tsvv   s@     *cK8 	LA

1	JK	L 	L 	Ls	   !:AN)r    r!   r"   r#   r6   r:   r=   r?   rA   rF   classmethodrT   r&   r   r   r1   r1   P   s9    O$$$$$ L Lr   r1   c                   |    e Zd ZdZddZd Zd Ze	 dd       Zedd       Z		 	 	 	 	 	 	 ddZ
	 dd	Z	 	 	 	 	 dd
Zy)%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.Nc                 L    |g n|| _         |g n|| _        || _        || _        y rU   )labelsexamplesmodeverbose)r   rZ   r[   r\   r]   s        r   __init__z.SingleSentenceClassificationProcessor.__init__   s+    "Nb&.H	r   c                 ,    t        | j                        S rU   )rD   r[   r   s    r   __len__z-SingleSentenceClassificationProcessor.__len__   s    4==!!r   c                     t        |t              r$t        | j                  | j                  |         S | j                  |   S )N)rZ   r[   )
isinstanceslicerX   rZ   r[   )r   idxs     r   __getitem__z1SingleSentenceClassificationProcessor.__getitem__   s9    c5!8VZVcVcdgVhii}}S!!r   c           
      H     | di |}|j                  ||||||dd       |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr&   )add_examples_from_csv)	rQ   	file_namerg   rh   ri   rj   rk   kwargs	processors	            r   create_from_csvz5SingleSentenceClassificationProcessor.create_from_csv   sB     M&M	''!%#)!# 	( 		
 r   c                 <     | di |}|j                  ||       |S )N)rZ   r&   )add_examples)rQ   texts_or_text_and_labelsrZ   rp   rq   s        r   create_from_examplesz:SingleSentenceClassificationProcessor.create_from_examples   s'    M&M	7Gr   c	                 X   | j                  |      }	|r|	dd  }	g }
g }g }t        |	      D ]i  \  }}|
j                  ||          |j                  ||          ||j                  ||          E|r| d| n
t        |      }|j                  |       k | j	                  |
||||      S )NrC   -)rl   rm   )rT   	enumerateappendr$   rt   )r   ro   rg   rh   ri   rj   rk   rl   rm   linestextsrZ   idsiliner   s                   r   rn   z;SingleSentenceClassificationProcessor.add_examples_from_csv   s     y)!"IE ' 	!GAtLLk*+MM$|,-$

4	?+.8*Qqc*c!f

4 	!   631AVh ! 
 	
r   c           	         |:t        |      t        |      k7  r#t        dt        |       dt        |             |:t        |      t        |      k7  r#t        dt        |       dt        |             |d gt        |      z  }|d gt        |      z  }g }t               }t        |||      D ]U  \  }}	}
t	        |t
        t        f      r|	|\  }}	n|}|j                  |	       |j                  t        |
|d |	             W |r|| _
        n| j                  j                  |       |rt        |      | _        | j                  S t        t        | j                        j                  |            | _        | j                  S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r   r   )rD   
ValueErrorsetziprb   tuplerN   addrz   r   r[   extendrZ   union)r   ru   rZ   r}   rl   rm   r[   added_labelstext_or_text_and_labelr   r   texts               r   rt   z2SingleSentenceClassificationProcessor.add_examples   s    #&>"?3v;"N:3?W;X:YY^_bci_j^kl  ?s#;<CHDSIaEbDcchilmpiqhrstt;&3788C>Vc":;;Fu367OQWY\3] 	\/"E405$-@U]4e-U#OOLd4TYZ[	\ $DMMM  * |,DK }} s4;;/55lCDDK}}r   c                 T
   ||j                   }t        | j                        D ci c]  \  }}||
 }	}}g }
t        | j                        D ]h  \  }}|dz  dk(  rt        j                  d|        |j                  |j                  dt        ||j                               }|
j                  |       j t        d |
D              }g t        t        |
| j                              D ]?  \  }\  }}|dz  dk(  r.t        j                  d| d	t        | j                                |rd
ndgt        |      z  }|t        |      z
  }|r|g|z  |z   }|rdnd
g|z  |z   }n||g|z  z   }||rdnd
g|z  z   }t        |      |k7  rt        dt        |       d|       t        |      |k7  rt        dt        |       d|       | j                  dk(  r|	|j                     }n:| j                  dk(  rt!        |j                        }nt        | j                        |dk  r| j"                  rt        j                  d       t        j                  d|j$                          t        j                  ddj'                  |D cg c]  }t)        |       c}              t        j                  ddj'                  |D cg c]  }t)        |       c}              t        j                  d|j                   d| d       j                  t+        |||             B |S |dk(  rt-               st/        d      ddl}fd}|j2                  j4                  j7                  ||j8                  |j8                  d|j:                  f|j=                  dg      |j=                  dg      d|j=                  g       f      }|S |dk(  rt?               st/        d      ddl }ddl!m"} |jG                  D cg c]  }|jH                   c}|jJ                         }
|jG                  D cg c]  }|jL                   c}|jJ                         }| j                  dk(  r6|jG                  D cg c]  }|j                   c}|jJ                         }nD| j                  dk(  r5|jG                  D cg c]  }|j                   c}|j                          } ||
|      }|S t        d!      c c}}w c c}w c c}w c c}w c c}w c c}w c c}w )"a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
            `InputFeatures` which can be fed to the model.

        Ni'  r   zTokenizing example T)add_special_tokens
max_lengthc              3   2   K   | ]  }t        |        y wrU   )rD   ).0r*   s     r   	<genexpr>zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>  s     Ii3y>Is   zWriting example /rC   zError with input length z vs classification
regression   z*** Example ***zguid: zinput_ids:  zattention_mask: zlabel: z (id = )r*   r+   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc               3   j   K   D ])  } | j                   | j                  d| j                  f + y w)Nr*   r+   r   )exfeaturess    r   genz?SingleSentenceClassificationProcessor.get_features.<locals>.genC  s9     " gB)+IZIZ[]_]e]effgs   03r   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDataset)dtypez,return_tensors should be one of 'tf' or 'pt')'max_lenry   rZ   r[   loggerinfoencoder   minrz   maxr   rD   r   r\   r   r/   r]   r   joinr$   r)   r   RuntimeError
tensorflowdataDatasetfrom_generatorint32int64TensorShaper	   torchtorch.utils.datar   tensorr*   longr+   )r   	tokenizerr   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensorsr~   r   	label_mapall_input_idsex_indexrE   r*   batch_lengthr+   padding_lengthxr   r   datasetr   r   rS   all_attention_mask
all_labelsr   s                             @r   get_featuresz2SingleSentenceClassificationProcessor.get_features   s   6 "**J.7.DE(!UUAXE	E!*4==!9 		,Hg%1$1(<=!((#'z9+<+<= ) I
   +		, I=II.7M4==8Y.Z #	l*H*y'%1$.xj#dmm:L9MNO $:aqAC	NRN *C	N:N'[>9YF	(>1A"F"W[i!i%)~)EF	!/9OAUV3WZh3h!i9~- #;C	N;K4P\~!^__>"l2 #;C<O;PPTUaTb!cddyy,,!'--0l*gmm, ++!|-.fW\\N34k#((I3NqCF3N*O)PQR.sxx8XAQ8X/Y.Z[\ggmm_GE7!DEOOMIndijkG#	lJ !Ot#"$"#dee#g ggoo44!xx288DbhhO!~~tf5Y]X^I_`bdbpbpqsbtuG
 Nt#%'"#]^^6!LLx)H!!++)HPUPZPZL[M!&.RAq/?/?.RZ_ZdZd!eyy,,"\\H*Eq177*EUZZ\X
l*"\\H*Eq177*EU[[\Y
#M3EzRGNKLLo F` 4O8X8 *I.R*E*Es)   TTTT TT 	T%)NNr   F) r   rC   NFrU   )r   r   rC   NFFF)NNFF)NFr   TN)r    r!   r"   r#   r^   r`   re   rV   rr   rv   rn   rt   r   r&   r   r   rX   rX   }   s    J""
 ej      
> kp#P #uMr   rX   )rO   r   r   r   typingr   r   r   utilsr   r	   r
   
get_loggerr    r   r   r)   r1   rX   r&   r   r   <module>r      s   "    ! ( ( A A 
		H	% E E E0 $; ; ;2*L *LZ`MM `Mr   