
    +sg2                    V    d dl mZ d dlZd dlmZ d dlmZmZ d dl	m
Z
  G d de      Zy)    )annotationsN)Dataset)NLTK_IMPORT_ERRORis_nltk_available)InputExamplec                  <    e Zd ZdZd fddZd Zd Zed	d       Zy)
DenoisingAutoEncoderDataseta  
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    c                ,    t         j                  |       S N)r	   delete)ss    m/var/www/html/venv/lib/python3.12/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py<lambda>z$DenoisingAutoEncoderDataset.<lambda>   s    @[@b@bcd@e     c                    t               s2t        t        j                  | j                  j
                              || _        || _        y r   )r   ImportErrorr   format	__class____name__	sentencesnoise_fn)selfr   r   s      r   __init__z$DenoisingAutoEncoderDataset.__init__   s7     "/66t~~7N7NOPP" r   c                Z    | j                   |   }t        | j                  |      |g      S )N)texts)r   r   r   )r   itemsents      r   __getitem__z'DenoisingAutoEncoderDataset.__getitem__   s)    ~~d#4==#6"=>>r   c                ,    t        | j                        S r   )lenr   )r   s    r   __len__z#DenoisingAutoEncoderDataset.__len__!   s    4>>""r   c                N   ddl m} ddlm}  ||       }t	        |      }|dk(  r| S t
        j                  j                  |      |kD  }t        |      dk(  r"d|t
        j                  j                  |      <    |       j                  t        j                  |      |         }|S )Nr   )word_tokenize)TreebankWordDetokenizerT)nltkr#   nltk.tokenize.treebankr$   r    nprandomrandsumchoice
detokenizearray)text	del_ratior#   r$   wordsnkeep_or_notwords_processeds           r   r   z"DenoisingAutoEncoderDataset.delete%   s    &Bd#J6KiinnQ')3{q /3K		((+,13>>rxx{?[\r   N)r   z	list[str])g333333?)	r   
__module____qualname____doc__r   r   r!   staticmethodr    r   r   r	   r	   
   s0    	 7f !?#  r   r	   )
__future__r   numpyr'   torch.utils.datar   transformers.utils.import_utilsr   r   *sentence_transformers.readers.InputExampler   r	   r8   r   r   <module>r>      s     "  $ P C)' )r   