
from __future__ import annotations

import collections
import json
import logging
import os
import string
from collections.abc import Iterable

from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available

from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer

logger = logging.getLogger(__name__)


class PhraseTokenizer(WordTokenizer):
    """Tokenizes the text with respect to existing phrases in the vocab.

    This tokenizer respects phrases that are in the vocab. Phrases are joined with 'ngram_separator'; for example,
    in the Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in
    text and merged as one special token. (New York is the ... => [New_York, is, the])
    """

    def __init__(
        self,
        vocab: Iterable[str] = [],
        stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
        do_lower_case: bool = False,
        ngram_separator: str = "_",
        max_ngram_length: int = 5,
    ):
        if not is_nltk_available():
            raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))

        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.ngram_separator = ngram_separator
        self.max_ngram_length = max_ngram_length
        self.set_vocab(vocab)

    def get_vocab(self):
        return self.vocab

    def set_vocab(self, vocab: Iterable[str]):
        self.vocab = vocab
        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])

        # Collect the phrases (ngrams) in the vocab and the set of ngram lengths that occur,
        # so tokenize() only scans for lengths that can actually match. Words containing a
        # doubled separator or exceeding max_ngram_length are skipped.
        self.ngram_lookup = set()
        self.ngram_lengths = set()
        for word in vocab:
            if self.ngram_separator is not None and self.ngram_separator in word:
                ngram_count = word.count(self.ngram_separator) + 1
                if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length:
                    self.ngram_lookup.add(word)
                    self.ngram_lengths.add(ngram_count)

        if len(vocab) > 0:
            logger.info(f"PhraseTokenizer - Phrase ngram lengths: {self.ngram_lengths}")
            logger.info(f"PhraseTokenizer - Num phrases: {len(self.ngram_lookup)}")

    def tokenize(self, text: str, **kwargs) -> list[int]:
        from nltk import word_tokenize

        tokens = word_tokenize(text, preserve_line=True)

        # Phrase detection: try the longest ngram lengths first, merging any window of tokens
        # that (exactly or lowercased) matches a vocab phrase into a single token.
        for ngram_len in sorted(self.ngram_lengths, reverse=True):
            idx = 0
            while idx <= len(tokens) - ngram_len:
                ngram = self.ngram_separator.join(tokens[idx : idx + ngram_len])
                if ngram in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram]
                elif ngram.lower() in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram.lower()]
                idx += 1

        # Map tokens to vocab indices, dropping stop words. Each token is tried as-is,
        # then lowercased, then with surrounding punctuation stripped.
        tokens_filtered = []
        for token in tokens:
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered

    def save(self, output_path: str):
        with open(os.path.join(output_path, "phrasetokenizer_config.json"), "w") as fOut:
            json.dump(
                {
                    "vocab": list(self.word2idx.keys()),
                    "stop_words": list(self.stop_words),
                    "do_lower_case": self.do_lower_case,
                    "ngram_separator": self.ngram_separator,
                    "max_ngram_length": self.max_ngram_length,
                },
                fOut,
            )

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, "phrasetokenizer_config.json")) as fIn:
            config = json.load(fIn)

        return PhraseTokenizer(**config)
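
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the shipped module): a minimal
# demonstration of the phrase merging described in the class docstring.
# The vocab below is hypothetical; NLTK must be installed for word_tokenize.
if __name__ == "__main__":
    vocab = ["New_York", "is", "the", "largest", "city"]
    tokenizer = PhraseTokenizer(vocab, stop_words=[])

    # "New York" is merged into the single vocab phrase "New_York", and each
    # surviving token is mapped to its vocab index: [0, 1, 2, 3, 4].
    print(tokenizer.tokenize("New York is the largest city"))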