from __future__ import annotations

import collections
import json
import os
import string
from collections.abc import Iterable

from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer


class WhitespaceTokenizer(WordTokenizer):
    """
    Simple and fast white-space tokenizer. Splits a sentence on white space.
    Punctuation is stripped from tokens.
    """

    def __init__(
        self,
        vocab: Iterable[str] = [],
        stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
        do_lower_case: bool = False,
    ):
        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.set_vocab(vocab)

    def get_vocab(self):
        return self.vocab

    def set_vocab(self, vocab: Iterable[str]):
        self.vocab = vocab
        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])

    def tokenize(self, text: str, **kwargs) -> list[int]:
        if self.do_lower_case:
            text = text.lower()

        tokens = text.split()

        tokens_filtered = []
        for token in tokens:
            # First pass: match the raw token against stop words and the vocabulary.
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Second pass: retry with surrounding punctuation stripped.
            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Third pass: retry with the stripped token lowercased.
            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered

    def save(self, output_path: str):
        with open(os.path.join(output_path, "whitespacetokenizer_config.json"), "w") as fOut:
            json.dump(
                {
                    "vocab": list(self.word2idx.keys()),
                    "stop_words": list(self.stop_words),
                    "do_lower_case": self.do_lower_case,
                },
                fOut,
            )

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, "whitespacetokenizer_config.json")) as fIn:
            config = json.load(fIn)

        return WhitespaceTokenizer(**config)
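

# --- Usage sketch (illustrative; not part of the library module) ---
# A minimal example, under the assumption that this file is importable as
# shown above: build a vocabulary, tokenize a sentence into vocabulary
# indices, and round-trip the configuration through save()/load(). The
# vocabulary and sentences below are made up for demonstration.
if __name__ == "__main__":
    import tempfile

    vocab = ["hello", "world", "sentence", "transformers"]
    tokenizer = WhitespaceTokenizer(vocab=vocab, do_lower_case=True)

    # Stop words ("this", "is", "a") and out-of-vocabulary tokens are
    # dropped; punctuation is stripped before the second lookup, so
    # "world!" still maps to "world".
    print(tokenizer.tokenize("Hello, world! This is a sentence."))  # [0, 1, 2]

    # Round-trip the configuration through a temporary directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tokenizer.save(tmp_dir)
        restored = WhitespaceTokenizer.load(tmp_dir)
        assert restored.tokenize("hello world") == tokenizer.tokenize("hello world")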