
    +sg\                        d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
  ej                  e      Z G d de      Zy)    )annotationsN)Dataset)SentenceTransformer)InputExamplec                      e Zd ZdZ	 	 d	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 d	 	 	 	 	 	 	 ddZd Zd Zd Z	d	 Z
d
 Zy)ParallelSentencesDatasetu  
    This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
    sentence in different languages. For example, the file can look like this (EN	DE	ES):
    hello world     hallo welt  hola mundo
    second sentence zweiter satz    segunda oración

    The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
    embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
    mapped to this English sentence embedding.

    When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

    teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
    returns a list of sentence embeddings
    c                    || _         || _        g | _        g | _        g | _        g | _        g | _        g | _        || _        || _	        i | _
        d| _        y)a+  
        Parallel sentences dataset reader to train student model given a teacher model

        Args:
            student_model (SentenceTransformer): The student sentence embedding model that should be trained.
            teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
            batch_size (int, optional): The batch size for training. Defaults to 8.
            use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
        r   N)student_modelteacher_modeldatasetsdatasets_iteratordatasets_tokenizeddataset_indicescopy_dataset_indicescache
batch_sizeuse_embedding_cacheembedding_cachenum_sentences)selfr
   r   r   r   s        j/var/www/html/venv/lib/python3.12/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__z!ParallelSentencesDataset.__init__    sa      +*!#"$!$&!
$#6 !    Nc           
        t         j                  d|z          g }|j                  d      rt        j                  |dd      nt	        |d      5 }d}|D ]t  }|j                         j                  d      }	|+|dkD  r&t        |	D 
cg c]  }
t        |
       c}
      |kD  rO|j                  |	       |d	z  }|h|dkD  sn||k\  st n ddd       | j                  ||||
       yc c}
w # 1 sw Y   $xY w)a  
        Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

        Args:
            filepath (str): Filepath to the file.
            weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
            max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
            max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

        Returns:
            None
        zLoad z.gzrtutf8)encodingr   	N   )weightmax_sentencesmax_sentence_length)loggerinfoendswithgzipopenstripsplitmaxlenappendadd_dataset)r   filepathr    r!   r"   parallel_sentencesfIncountline	sentencessents              r   	load_dataz"ParallelSentencesDataset.load_data=   s     	Gh&'   ' IIhv6h0	 58E  JJL..t4	'3+a/9=4SY=>ATT")))4
 ,1BuP]G]	& 	v]`s 	 	
 >	 	s0   7C0C+#C0>C0C0
C0+C00C9c                   i }|D ]y  }|+|dkD  r&t        |D cg c]  }t        |       c}      |kD  r0|d   }||vrt               ||<   |D ]  }||   j                  |        |d|dkD  sjt        |      |k\  sy n t        |      dk(  ry | xj                  t        |D cg c]  }t        ||          c}      z  c_        t        | j                        }	| j                  j                  t        |j                                      | j                  j                  d       | j                  j                  |	g|z         y c c}w c c}w Nr   )r*   r+   setaddr   sumr   r,   listitemsr   r   extend)
r   r/   r    r!   r"   sentences_mapr3   r4   source_sentence
dataset_ids
             r   r-   z$ParallelSentencesDataset.add_datasetg   sI    + 	I#/'!+y9tT9:=PP'lOm314o.! 9o.22489 (]Q->3}CUYfCf!	$ }"c"V3}T':#;"VWW'
T-"5"5"789%%a(##ZL6$9:- :" #Ws   E

%Ec                ~   g }g }| j                   D ]8  }| j                  |      \  }}|j                  |       |j                  |       : | j                  |      }t	        ||      D ]3  \  }}|D ])  }| j
                  j                  t        |g|             + 5 t        j                  | j
                         y )N)textslabel)	r   
next_entryr,   get_embeddingszipr   r   randomshuffle)	r   source_sentences_listtarget_sentences_listdata_idxsrc_sentencetrg_sentencessrc_embeddingssrc_embeddingtrg_sentences	            r   generate_dataz&ParallelSentencesDataset.generate_data   s     " ",, 	8H*.//(*C'L-!((6!((7	8 ,,-BC,/@U,V 	[(M= - [

!!,l^="YZ[	[ 	tzz"r   c                .   | j                   |   | j                  |      \  }}| j                  |xx   dz  cc<   | j                  |   t        | j                   |         k\  r1d| j                  |<   t        j                  | j                   |          ||fS )Nr   r   )r   r   r+   rG   rH   )r   rK   sourcetarget_sentencess       r   rD   z#ParallelSentencesDataset.next_entry   s    #'==#:4;Q;QRZ;[#\  x(A-(!!(+s4==3J/KK/0D""8,NN4==23'''r   c                   | j                   s)| j                  j                  || j                  dd      S g }|D ]"  }|| j                  vs|j                  |       $ t        |      dkD  rL| j                  j                  || j                  dd      }t        ||      D ]  \  }}|| j                  |<    |D cg c]  }| j                  |    c}S c c}w )NFT)r   show_progress_barconvert_to_numpyr   )r   r   encoder   r   r,   r+   rF   )r   r3   new_sentencesr4   new_embeddings	embeddings         r   rE   z'ParallelSentencesDataset.get_embeddings   s    ''%%,,dooae -  
  	+D4///$$T*	+ }!!//66$//Uei 7 N $'}n#E 7i-6$$T*7 8AAt$$T*AAAs   =Cc                    | j                   S )N)r   )r   s    r   __len__z ParallelSentencesDataset.__len__   s    !!!r   c                    t        | j                        dk(  r| j                          | j                  j                         S r7   )r+   r   rQ   pop)r   idxs     r   __getitem__z$ParallelSentencesDataset.__getitem__   s/    tzz?a zz~~r   )   T)r
   r   r   r   r   intr   bool)d   N   )
r.   strr    rc   r!   rc   r"   rc   returnNone)r/   zlist[list[str]]r    rc   r!   rc   r"   rc   )__name__
__module____qualname____doc__r   r5   r-   rQ   rD   rE   r]   ra    r   r   r   r      s    ( $(* + 	
 "< gj(
(
%((
?B(
`c(
	(
Z !#&";+"; "; 	";
 !";H#"(B*" r   r   )
__future__r   r&   loggingrG   torch.utils.datar   sentence_transformersr   sentence_transformers.readersr   	getLoggerrj   r#   r   rn   r   r   <module>ru      s:    "    $ 5 6			8	$s w s r   