
    +sgE                        d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dlmZm	Z	 ddl
mZ  ej                  e      Z G d de	j                        Zy)	    )annotationsN)Literal)Tensornn   )WhitespaceTokenizerc                       e Zd ZdZi ddf	 	 	 	 	 	 	 d fdZddZddZd Z	 d	 	 	 	 	 ddZd	 Z	d
 Z
ed        Z xZS )BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   Tc                   t         |           t        t        |            }g d| _        || _        || _        || _        || _        g | _	        d}|D ]T  }|}||v r||   }n+|j                         |v r||j                            }n|dz  }| j                  j                  |       V t        j                  | dt        |       d|        t        |t               d      | _        t        |      | _        y )N)vocabword_weightsunknown_word_weightcumulative_term_frequencyr   r   z out of z0 words without a weighting value. Set weight to F)
stop_wordsdo_lower_case)super__init__listsetconfig_keysr   r   r   r   weightslowerappendloggerinfolenr   	tokenizersentence_embedding_dimension)	selfr   r   r   r   num_unknown_wordswordweight	__class__s	           S/var/www/html/venv/lib/python3.12/site-packages/sentence_transformers/models/BoW.pyr   zBoW.__init__   s     	SZ h
(#6 )B&  	(D(F|#%d+-%djjl3!Q&!LL'	( 	 !#e*5efyez{	
 -UsuTYZ,/J)    c                    |S N )r   featuress     r$   forwardzBoW.forward9   s    r%   c                    |D cg c]  } | j                   j                  |fi |! }}| j                  |      S c c}w r'   )r   tokenizeget_sentence_features)r   textskwargstext	tokenizeds        r$   r,   zBoW.tokenize=   sC    INO,T^^,,T<V<O	O)))44 Ps   $<c                    | j                   S r'   )r   )r   s    r$    get_sentence_embedding_dimensionz$BoW.get_sentence_embedding_dimensionA   s    000r%   c                J   g }|D ]  }t        j                  | j                         t         j                        }|D ];  }| j                  r||xx   | j
                  |   z  cc<   *| j
                  |   ||<   = |j                  |        dt        j                  |      iS )N)dtypesentence_embedding)torchzerosr3   float32r   r   r   stack)r   tokenized_textspad_seq_lengthvectorstokensvectortokens          r$   r-   zBoW.get_sentence_featuresD   s     % 	#F[[!F!F!HPUP]P]^F 8115MT\\%%88M$(LL$7F5M	8
 NN6"	# %ekk'&:;;r%   c                \    | j                   D ci c]  }|| j                  |    c}S c c}w r'   )r   __dict__)r   keys     r$   get_config_dictzBoW.get_config_dictT   s*    373C3CDCT]]3''DDDs   )c                    t        t        j                  j                  |d      d      5 }t	        j
                  | j                         |d       d d d        y # 1 sw Y   y xY w)Nconfig.jsonw   )indent)openospathjoinjsondumprD   )r   output_pathfOuts      r$   savezBoW.saveW   sK    "'',,{M:C@ 	>DIId**,d1=	> 	> 	>s   'AA$c                    t        t        j                  j                  | d            5 }t	        j
                  |      }d d d        t        di S # 1 sw Y   xY w)NrF   r(   )rJ   rK   rL   rM   rN   loadr
   )
input_pathfInconfigs      r$   rT   zBoW.load[   sK    "'',,z=9: 	$cYYs^F	$ }V}	$ 	$s   AA)r   	list[str]r   zdict[str, float]r   floatr   bool)r)   zdict[str, Tensor])r.   rX   returnz	list[int])r   )r;   zlist[list[int]]r<   intr[   z1dict[Literal['sentence_embedding'], torch.Tensor])__name__
__module____qualname____doc__r   r*   r,   r3   r-   rD   rR   staticmethodrT   __classcell__)r#   s   @r$   r
   r
      s     *,%&*.!7!7 '!7 #	!7
 $(!7F51 GH<.<@C<	:< E>  r%   r
   )
__future__r   rN   loggingrK   typingr   r7   r   r   r   r   	getLoggerr]   r   Moduler
   r(   r%   r$   <module>rh      sA    "   	    *			8	$P")) Pr%   