from __future__ import annotations

from collections.abc import Iterable

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from sentence_transformers import SentenceTransformer, util


class MegaBatchMarginLoss(nn.Module):
    def __init__(
        self,
        model: SentenceTransformer,
        positive_margin: float = 0.8,
        negative_margin: float = 0.3,
        use_mini_batched_version: bool = True,
        mini_batch_size: int = 50,
    ) -> None:
        """
        Given a large batch (like 500 or more examples) of (anchor_i, positive_i) pairs, find for each pair in the batch
        the hardest negative, i.e. find j != i such that cos_sim(anchor_i, positive_j) is maximal. Then create from this a
        triplet (anchor_i, positive_i, positive_j) where positive_j serves as the negative for this triplet.

        Then train as with the triplet loss.
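
        The following is a rough, illustrative sketch of that hard-negative selection on
        precomputed embeddings (the tensor names ``anchor_emb`` and ``positive_emb`` are
        placeholders, not part of this implementation)::

            import torch

            # anchor_emb, positive_emb: (batch_size, dim) tensors of sentence embeddings
            scores = torch.nn.functional.cosine_similarity(
                anchor_emb.unsqueeze(1), positive_emb.unsqueeze(0), dim=-1
            )  # scores[i, j] = cos_sim(anchor_i, positive_j)
            scores.fill_diagonal_(float("-inf"))  # exclude j == i
            hardest_negative_ids = scores.argmax(dim=1)  # j maximizing cos_sim(anchor_i, positive_j)
            negative_emb = positive_emb[hardest_negative_ids]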

        Args:
            model: SentenceTransformer model
            positive_margin: Positive margin, cos(anchor, positive)
                should be > positive_margin
            negative_margin: Negative margin, cos(anchor, negative)
                should be < negative_margin
            use_mini_batched_version: As large batch sizes require a lot
                of memory, we can use a mini-batched version. We break
                down the large batch into smaller batches with fewer
                examples.
            mini_batch_size: Size for the mini-batches. Should be a
                divisor of the batch size in your data loader.

        References:
            - This loss function was inspired by the ParaNMT paper: https://www.aclweb.org/anthology/P18-1042/

        Requirements:
            1. (anchor, positive) pairs
            2. Large batches (500 or more examples)

        Inputs:
            +---------------------------------------+--------+
            | Texts                                 | Labels |
            +=======================================+========+
            | (anchor, positive) pairs              | none   |
            +---------------------------------------+--------+

        Recommendations:
            - Use ``BatchSamplers.NO_DUPLICATES`` (:class:`docs <sentence_transformers.training_args.BatchSamplers>`) to
              ensure that no in-batch negatives are duplicates of the anchor or positive samples.
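
              For example, a minimal sketch of that setting via the ``batch_sampler`` option of
              :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`::

                  from sentence_transformers import SentenceTransformerTrainingArguments
                  from sentence_transformers.training_args import BatchSamplers

                  args = SentenceTransformerTrainingArguments(
                      output_dir="output",
                      per_device_train_batch_size=250,
                      batch_sampler=BatchSamplers.NO_DUPLICATES,
                  )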

        Example:
            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses
                from datasets import Dataset

                train_batch_size = 250
                train_mini_batch_size = 32

                model = SentenceTransformer('all-MiniLM-L6-v2')
                train_dataset = Dataset.from_dict({
                    "anchor": [f"This is sentence number {i}" for i in range(500)],
                    "positive": [f"This is sentence number {i}" for i in range(1, 501)],
                })
                loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)

                args = SentenceTransformerTrainingArguments(
                    output_dir="output",
                    per_device_train_batch_size=train_batch_size,
                )
                trainer = SentenceTransformerTrainer(
                    model=model,
                    args=args,
                    train_dataset=train_dataset,
                    loss=loss,
                )
                trainer.train()
        """
        super().__init__()
        self.model = model
        self.positive_margin = positive_margin
        self.negative_margin = negative_margin
        self.mini_batch_size = mini_batch_size
        self.forward = self.forward_mini_batched if use_mini_batched_version else self.forward_non_mini_batched

    def forward_mini_batched(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        anchor, positive = sentence_features
        feature_names = list(anchor.keys())

        # Embed all positives once without gradients; they are only needed to mine hard negatives
        with torch.no_grad():
            self.model.eval()
            all_positive_emb = self.model(positive)["sentence_embedding"].detach()
            self.model.train()

        diagonal_matrix = torch.eye(len(all_positive_emb), len(all_positive_emb), device=all_positive_emb.device)

        # Process the large batch in mini-batches to keep the memory requirement manageable
        for start_idx in range(0, len(all_positive_emb), self.mini_batch_size):
            end_idx = start_idx + self.mini_batch_size
            anchor_emb = self.model({key: anchor[key][start_idx:end_idx] for key in feature_names})[
                "sentence_embedding"
            ]

            # Hard-negative mining: for each anchor_i, pick the positive_j (j != i) with the highest cosine similarity
            hard_negative_features = {key: [] for key in feature_names}
            with torch.no_grad():
                cos_scores = util.pytorch_cos_sim(anchor_emb, all_positive_emb)
                # Mask the anchor's own positive (the diagonal) so it cannot be selected as a negative
                negative_scores = cos_scores - 2 * diagonal_matrix[start_idx:end_idx]
                negatives_max, negatives_ids = torch.max(negative_scores, dim=1)

            for hard_negative_id in negatives_ids:
                for key in feature_names:
                    hard_negative_features[key].append(positive[key][hard_negative_id])

            for key in feature_names:
                hard_negative_features[key] = torch.stack(hard_negative_features[key])

            # Re-embed the positives and the mined hard negatives with gradients enabled
            positive_emb = self.model({key: positive[key][start_idx:end_idx] for key in feature_names})[
                "sentence_embedding"
            ]
            negative_emb = self.model(hard_negative_features)["sentence_embedding"]

            assert positive_emb.shape == anchor_emb.shape
            assert negative_emb.shape == anchor_emb.shape

            # Margin loss: push cos(anchor, positive) above positive_margin and cos(anchor, negative) below negative_margin
            pos_cosine = F.cosine_similarity(anchor_emb, positive_emb)
            neg_cosine = F.cosine_similarity(anchor_emb, negative_emb)
            losses = F.relu(self.positive_margin - pos_cosine) + F.relu(neg_cosine - self.negative_margin)
            losses = losses.mean()

            # Backpropagate every mini-batch except the last one; the last loss is returned and
            # backpropagated by the surrounding training loop
            if end_idx < len(all_positive_emb):
                losses.backward()

        return losses

    def forward_non_mini_batched(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
        embeddings_a, embeddings_b = reps

        cos_scores = util.pytorch_cos_sim(embeddings_a, embeddings_b)
        positive_scores = torch.diagonal(cos_scores)
        # Mask the diagonal so an anchor's own positive is never chosen as its hardest negative
        negative_scores = cos_scores - 2 * torch.eye(*cos_scores.shape, device=cos_scores.device)
        negatives_max, _ = torch.max(negative_scores, dim=1)
        losses = F.relu(self.positive_margin - positive_scores) + F.relu(negatives_max - self.negative_margin)
        return losses.mean()

    @property
    def citation(self) -> str:
        return """
@inproceedings{wieting-gimpel-2018-paranmt,
    title = "{P}ara{NMT}-50{M}: Pushing the Limits of Paraphrastic Sentence Embeddings with Millions of Machine Translations",
    author = "Wieting, John and Gimpel, Kevin",
    editor = "Gurevych, Iryna and Miyao, Yusuke",
    booktitle = "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2018",
    address = "Melbourne, Australia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P18-1042",
    doi = "10.18653/v1/P18-1042",
    pages = "451--462",
}
"""