from __future__ import annotations

import math
import os
from pathlib import Path

import numpy as np
import torch
from safetensors.torch import load_file as load_safetensors_file
from safetensors.torch import save_file as save_safetensors_file
from tokenizers import Tokenizer
from torch import nn
from transformers import PreTrainedTokenizerFast

from sentence_transformers.util import get_device_name


class StaticEmbedding(nn.Module):
    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.array | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.array | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda (Ailuropoda melanoleuca; Chinese: 大熊猫; pinyin: dàxióngmāo), also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.9177]]) (If you use the distilled bge-base)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()

        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)

            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer.no_padding()

        self.base_model = kwargs.get("base_model", None)

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

        # nn.EmbeddingBag consumes one flat tensor of token ids plus the offsets
        # at which each text's tokens start within that flat tensor.
        offsets = torch.from_numpy(np.cumsum([0] + [len(token_ids) for token_ids in encodings_ids[:-1]]))
        input_ids = torch.tensor(
            [token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long
        )
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    def get_config_dict(self) -> dict[str, float]:
        return {}

    @property
    def max_seq_length(self) -> int:
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        return self.embedding_dim

    def save(self, save_dir: str, safe_serialization: bool = True, **kwargs) -> None:
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(save_dir, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
        self.tokenizer.save(str(Path(save_dir) / "tokenizer.json"))

    @staticmethod
    def load(load_dir: str, **kwargs) -> StaticEmbedding:
        tokenizer = Tokenizer.from_file(str(Path(load_dir) / "tokenizer.json"))
        if os.path.exists(os.path.join(load_dir, "model.safetensors")):
            weights = load_safetensors_file(os.path.join(load_dir, "model.safetensors"))
        else:
            weights = torch.load(
                os.path.join(load_dir, "pytorch_model.bin"), map_location=torch.device("cpu"), weights_only=True
            )
        weights = weights["embedding.weight"]
        return StaticEmbedding(tokenizer, embedding_weights=weights)

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        use_subword: bool = True,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
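
        Example::

            # A minimal usage sketch; assumes `model2vec[distill]` is installed and
            # the base model is reachable (mirrors the class-level example above):
            static_embedding = StaticEmbedding.from_distillation(
                "BAAI/bge-base-en-v1.5", device="cuda", pca_dims=256
            )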
        """
        try:
            from model2vec.distill import distill
        except ImportError:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            )

        # Auto-detect the strongest available device if none was specified,
        # as documented above.
        if device is None:
            device = get_device_name()
        static_model = distill(
            model_name,
            vocabulary=vocabulary,
            device=device,
            pca_dims=pca_dims,
            apply_zipf=apply_zipf,
            use_subword=use_subword,
        )
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                of the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
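
        Example::

            # A minimal usage sketch; assumes the `model2vec` package is installed.
            # The model id mirrors the class-level example above:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")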
        """
        try:
            from model2vec import StaticModel
        except ImportError:
            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")

        static_model = StaticModel.from_pretrained(model_id_or_path)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)