
     sgv                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ ddl ddlmZ dd	l m!Z!m"Z"m#Z#m$Z$ dd
l%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.Z/d dl0mZ1 d dl2m3Z4 ddl5m6Z6 ddl7m8Z8  G d d      Z9 G d d      Z:ee1jv                  e/jx                     e1jv                  e/jz                     ge1jv                  e/jz                     f   Z> G d dee>         Z?ee1jv                  e/jx                     e1jv                  e/jz                     ge@f   ZA G d deeA         ZB G d de>      ZCy)    )annotationsN)AnyListLiteralOptionalUnion	GeneratorSequenceIteratorDequeCallableDict)deque)Path   )*)LlamaGrammar)BaseLlamaCache
LlamaCacheLlamaDiskCacheLlamaRAMCache)BaseLlamaTokenizerLlamaTokenizer)LlamaDraftModel)set_verbose)suppress_stdout_stderrc            *         e Zd ZdZdZdej                  dddddddej                  dddddej                  ej                  dddd	d
d	dddddddd	ddddddddddd)	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d=dZ
ed>d       Zed?d       Zed@d       ZedAd       ZedBd       ZedCd       Z	 dD	 	 	 	 	 	 	 dEdZ	 	 dF	 	 	 	 	 	 	 dGdZdHdZdIdZd ZdJdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dLd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dM	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dNd!Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dO	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dPd"Z	 dQ	 	 	 	 	 dRd#Z	 	 	 dS	 	 	 	 	 	 	 dTd$Zdd%dddd	ddg ddd	dddd	ddddddddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dUd&Zdd%dddd	ddg ddd	dddd	ddddddddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dVd'Zdd%dddd	ddg ddd	dddd	ddddddddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dWd(Zddddd)dddd	dg dddddd	d	dddddddddf	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dXd*Z 	 	 	 	 dYd+Z!d, Z"d- Z#dZd.Z$d[d/Z%d\d0Z&d\d1Z'd\d2Z(d]d3Z)d\d4Z*d\d5Z+d\d6Z,d^d7Z-d_d8Z.d_d9Z/e0	 d`	 	 	 	 	 dad:       Z1e0dbd;       Z2e3	 	 	 	 dc	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddd<       Z4y)eLlamaz0High-level Python wrapper for a llama.cpp model.Fr   NTi           g            ?g      @@@   ))n_gpu_layers
split_modemain_gputensor_splitrpc_servers
vocab_onlyuse_mmap	use_mlockkv_overridesseedn_ctxn_batchn_ubatch	n_threadsn_threads_batchrope_scaling_typepooling_typerope_freq_baserope_freq_scaleyarn_ext_factoryarn_attn_factoryarn_beta_fastyarn_beta_slowyarn_orig_ctx
logits_all	embeddingoffload_kqv
flash_attnlast_n_tokens_size	lora_base
lora_scale	lora_pathnumachat_formatchat_handlerdraft_model	tokenizertype_ktype_v
spm_infillverbosec       )   
     $    |* _         t        j                          _        t	        |*       t
        j                  s4t        |*      5  t        j                          ddd       dt
        _        t        |"t              r(|"rt        j                  nt        j                   _        n|" _         j                  t        j                  k7  r4t        |*      5  t        j                   j                         ddd       | _        t        j"                          _        |dk(  rdn| j$                  _        | j$                  _        | j$                  _        |(|j-                  d       j$                  _        | _        nd _        | _        d _         j2                  t7         j2                        t        j8                  kD  rt;        dt        j8                         t<        j>                  t        j8                  z  }, |,|  _         j4                   j$                  _        | j$                  _         |!|nd j$                  _!        |	 j$                  _"        |
 _#        |
t7        |
      d	z   }-t        jH                  |-z          _%        tM        |
jO                               D ]]  \  }.\  }/}0|/j-                  d       jJ                  |.   _(        t        |0t              rAt        jR                   jJ                  |.   _*        |0 jJ                  |.   jV                  _,        ~t        |0tZ              rAt        j\                   jJ                  |.   _*        |0 jJ                  |.   jV                  _/        t        |0t`              rBt        jb                   jJ                  |.   _*        |0 jJ                  |.   jV                  _2        !t        |0tf              r|0j-                  d      }1t7        |1      d
kD  rt;        d|/ d|0       |1ji                  d
d      }1t        jj                   jJ                  |.   _*        tm        jn                  tZ        t=        jp                   jJ                  |.   jV                        t        jr                  jt                  jv                  z         }2t=        jn                  |2t=        jx                  t<        jz                              }3t=        j|                  |3|1d
       Ot;        d|/ d|0        d jJ                  d   _(         jJ                   j$                  _#        t        ||       _@        |xs! t        t        j                         dz  d	       _D        |xs t        j                          _E        |xs t        j                   _G        t        j                          _I        | j                  _J         j                   j                  _@        t         j                  |       j                  _K         j                   j                  _D         j                   j                  _E        ||nt        j                   j                  _M        | j                  _N        |dk7  r|nd j                  _O        |dk7  r|nd j                  _P        |dk7  r|nd j                  _Q        |dk7  r|nd j                  _R        |dk7  r|nd j                  _S        |dk7  r|nd j                  _T        |dk7  r|nd j                  _U        |%|nd j                  _V        | j                  _W        | j                  _X        | j                  _Y        |'|' j                  _Z        |(|( j                  _[        | _\        d _]        | _^        |  __        |! _`        |) _a        t        j                  j                  |      st;        d|        j                  j                  t        j                  t        j                   j                    j$                   j                                      _i        |&xs t                _k        |dk(  r j                  j                         }t        ||       _@         j                  j                          j                  _J         j                   j                  _@        t         j                  |       j                  _K         j                  j                  t        j                  t        j                   j                   j                   j                                      _n         j                  j                  t        j                  t        j                   j                  d j                  j                   j                                      _p        d _q         j                  rt        j                   j                  j                   j                  j-                  d             _q         j                  t        d j                          fd}4 j                  j                  |4       t        j                   j                  j                   j                   j                        rt        d j                          j                   r<t        t        j                         j                  d      t        j                         |# _}        |$ _~        i  _        |% _         j                          _         j                          _         j	                          _         j                          _        t        j                   j                         _        d _        t        j                  |ft        j                         _        t        j                  |dk(  r|n| j                  ft        j                         _        t=        j>                  d       _        	  j                  j%                          _         j                   r)t        d j$                   t        j                          j                         }6 j)                         }7|6dk7  r j                  j+                  |6      nd }8|7dk7  r j                  j+                  |7      nd }9t-        d!  j$                  jO                         D              }:d" j$                  v r j$                  d"   |:d#<    j                   r?|:r=t        d$d%j/                  |:j1                                t        j                         |:jO                         D ]=  \  };}<t3        j4                  |<|8|9|6g&      j7                          j                  |;<   ?  j                  ։ j                  d#|:v rƐt3        j8                   j$                        }#|#2|# _}         j                   rt        d'|# t        j                         np j                   r]t        d(|:d#    t        j                         t        d)|8 t        j                         t        d*|9 t        j                         d# _}         j                  G j                  ;d+ _}         j                   r(t        d, j                   t        j                         d _        y# 1 sw Y   xY w# 1 sw Y   ExY w# t&        $ r<}5i  _         j                   rt        d|5 t        j                         Y d}5~5d}5~5ww xY w)-a
  Load a llama.cpp model from `model_path`.

        Examples:
            Basic usage

            >>> import llama_cpp
            >>> model = llama_cpp.Llama(
            ...     model_path="path/to/model",
            ... )
            >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"])
            the lazy dog

            Loading a chat model

            >>> import llama_cpp
            >>> model = llama_cpp.Llama(
            ...     model_path="path/to/model",
            ...     chat_format="llama-2",
            ... )
            >>> print(model.create_chat_completion(
            ...     messages=[{
            ...         "role": "user",
            ...         "content": "what is the meaning of life?"
            ...     }]
            ... ))

        Args:
            model_path: Path to the model.
            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
            rpc_servers: Comma separated list of RPC servers to use for offloading
            vocab_only: Only load the vocabulary no weights.
            use_mmap: Use mmap if possible.
            use_mlock: Force the system to keep the model in RAM.
            kv_overrides: Key-value overrides for the model.
            seed: RNG seed, -1 for random
            n_ctx: Text context, 0 = from model
            n_batch: Prompt processing maximum batch size
            n_ubatch: Physical batch size
            n_threads: Number of threads to use for generation
            n_threads_batch: Number of threads to use for batch processing
            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
            pooling_type: Pooling type, from `enum llama_pooling_type`.
            rope_freq_base: RoPE base frequency, 0 = from model
            rope_freq_scale: RoPE frequency scaling factor, 0 = from model
            yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
            yarn_attn_factor: YaRN magnitude scaling factor
            yarn_beta_fast: YaRN low correction dim
            yarn_beta_slow: YaRN high correction dim
            yarn_orig_ctx: YaRN original context size
            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
            embedding: Embedding mode only.
            offload_kqv: Offload K, Q, V to GPU.
            flash_attn: Use flash attention.
            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
            lora_path: Path to a LoRA file to apply to the model.
            numa: numa policy
            chat_format: String specifying the chat format to use when calling create_chat_completion.
            chat_handler: Optional chat handler to use when calling create_chat_completion.
            draft_model: Optional draft model to use for speculative decoding.
            tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
            verbose: Print verbose output to stderr.
            type_k: KV cache data type for K (default: f16)
            type_v: KV cache data type for V (default: f16)
            spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

        Raises:
            ValueError: If the model path does not exist.

        Returns:
            A Llama instance.
        )disableNTiutf-8zZAttempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES=Fr      z
Value for z is too long:     zUnknown value type for z:    r   r   zModel path does not exist: )
path_modelparamsrJ   )modelrS   rJ   )n_tokensembd	n_seq_maxrJ   z2Failed to initialize LoRA adapter from lora path: c                 l     j                   y t        j                   j                          d  _         y N)_lora_adapter	llama_cppllama_lora_adapter_freeselfs   B/var/www/html/venv/lib/python3.12/site-packages/llama_cpp/llama.pyfree_lora_adapterz)Llama.__init__.<locals>.free_lora_adapter  s.    %%-11$2D2DE%)"    z+Failed to set LoRA adapter from lora path: file)n_vocabdtypeg      $@zFailed to load metadata: zModel metadata:  c              3  R   K   | ]  \  }}|j                  d       r	|dd |f ! yw)ztokenizer.chat_template.
   N)
startswith).0nametemplates      r_   	<genexpr>z!Llama.__init__.<locals>.<genexpr>  s4       
h9: "#Y! 
s   %'ztokenizer.chat_templatezchat_template.defaultz&Available chat formats from metadata: z, )rm   	eos_token	bos_tokenstop_token_idszGuessed chat format: zUsing gguf chat template: zUsing chat eos_token: zUsing chat bos_token: zllama-2zUsing fallback chat format: )rJ   
contextlib	ExitStack_stackr   r   _Llama__backend_initializedr   r[   llama_backend_init
isinstanceboolGGML_NUMA_STRATEGY_DISTRIBUTEGGML_NUMA_STRATEGY_DISABLEDrB   llama_numa_init
model_pathllama_model_default_paramsmodel_paramsr"   r#   r$   encoder&   _rpc_serversr%   _c_tensor_splitlenLLAMA_MAX_DEVICES
ValueErrorctypesc_floatr'   r(   r)   r*   llama_model_kv_override_kv_overrides_array	enumerateitemskeyLLAMA_KV_OVERRIDE_TYPE_BOOLtagvalueval_boolintLLAMA_KV_OVERRIDE_TYPE_INTval_i64floatLLAMA_KV_OVERRIDE_TYPE_FLOATval_f64strljustLLAMA_KV_OVERRIDE_TYPE_STRtypingcast	addressofllama_model_kv_override_valueval_stroffsetPOINTERc_charmemmoveminr-   maxmultiprocessing	cpu_countr/   r0   LLAMA_DEFAULT_SEED_seedllama_context_default_paramscontext_paramsr,   r.   #LLAMA_ROPE_SCALING_TYPE_UNSPECIFIEDr1   r2   r3   r4   r5   r6   r7   r8   r9   r:   
embeddingsr<   r=   rG   rH   r>   cacher?   r@   rA   rI   ospathexistsenter_contextclosing	internals
LlamaModel_modelr   
tokenizer_n_ctx_trainLlamaContext_ctx
LlamaBatch_batchrZ   llama_lora_adapter_initrT   RuntimeErrorcallbackllama_lora_adapter_setctxprintllama_print_system_infodecodesysstderrrC   rD   _chat_handlersrE   rd   _n_vocab_n_ctxtoken_nl	_token_nl	token_eos
_token_eosLlamaTokenDataArray_candidatesrU   npndarrayintc	input_idssinglescores_mirostat_mumetadata	Exception	token_bostoken_get_textdictjoinkeysllama_chat_formatJinja2ChatFormatterto_chat_handler$guess_chat_format_from_gguf_metadata_sampler)=r^   r|   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   kwargs
FloatArraykvo_array_lenikvv_bytesaddressbuffer_startr`   eeos_token_idbos_token_idro   rp   template_choicesrl   rm   s=   `                                                            r_   __init__zLlama.__init__<   s   N  **,G**'8 /,,./*.E'dD!  77:: I DI99	==='8 5))$))45 % &@@B&",J, 	& (2$%-"",7,>,>w,GD) +D $D(#(4$$%	(C(CC pqz  rM  rM  qN  O   )*E*EEJ#-$D  .2-A-AD*'1$1:1BX"&/# )#-1M11MA(D$ '|'9'9';< &I	6Aq23((72C((+/a& &AA ,,ABD,,Q/55>3' &@@ ,,@AD,,Q/55=5) &BB ,,@AD,,Q/55=3'hhw/G7|c)(:aSqc)JKK%mmC7G &@@ ,,$kk(()A)A!)D)J)JK#AAIIPPQG
 $*;;wv}}8U#VLNN$ %'>qcA3%GHHM&IT  $$-1-E-ED*5'*"Nc/*C*C*E*JA&N.M/2K2K2M 9Y99
 (DDF$)!&*ll#'*4<<'B$(,%.2.B.B+ !, >> 	-
 ,8(,3N 	*  /#5O1 	+  /#5O1 	+ !1C 7Q 	, -3N 	* -3N 	* >Ka=OMUV)%-J4 	& *3&*5')3&)/D&)/D&"4/3
"$"$ww~~j)::,GHHkk//$$#,, LL
 $;~d'; A:KK++-Eug.DL(,(?(?(AD%*.,,D'+.t||X+FD(KK--&&++.. LL
	 kk//$$!\\"1177 LL		
 HL>>!*!B!B!!%%g."D !!)"HHXY * KK  !23//		t114?? #A$..AQR  <<)335<<WECJJW&(  	 'jjl..*$88O/1zz5("''/R.0jj D(Ugt}}ERYY/
 #NN
	H KK002DM <<$T]]O43::F~~'~~' 9E8JDKK&&|4PR 	 9E8JDKK&&|4PR 	
    
"&--"5"5"7 
 
 %58<)945 <<,8CSCXCXCZ9[8\]ZZ
 /446 	 ND((9(M(M!## ,~	)
 o %	  $!!)'+;;+PPK &#. <<1+?cjjQ<<45EF]5^4_` ZZ 29+>SZZP29+>SZZP#: #(9(9(A(D||243C3C2DECJJ k
/ /5 5x  	HDM||1!5CJJG	Hs0   	|/ |< }	 /|9<}	
~1~

~c                .    | j                   j                  S rY   )r   r   r]   s    r_   r   z	Llama.ctx   s    yy}}ra   c                .    | j                   j                  S rY   )r   rT   r]   s    r_   rT   zLlama.model$  s    {{   ra   c                4    | j                   d | j                   S rY   )r   rU   r]   s    r_   
_input_idszLlama._input_ids(  s    ~~o..ra   c                @    | j                   d | j                  d d f   S rY   )r   rU   r]   s    r_   _scoreszLlama._scores,  s    {{?T]]?A-..ra   c                z    t        | j                  d | j                   j                         | j                        S )Nmaxlen)r   r   rU   tolistr   r]   s    r_   eval_tokenszLlama.eval_tokens0  s+    T^^Odmm4;;=dkkRRra   c                    t        | j                  d | j                  d d f   j                         | j                  j
                  r| j                        S d      S )Nr   r   )r   r   rU   r   r   r:   r   r]   s    r_   eval_logitszLlama.eval_logits4  sS    KK$--*+224"&"5"5"@"@4;;
 	
FG
 	
ra   c                <    | j                   j                  |||      S )aU  Tokenize a string.

        Args:
            text: The utf-8 encoded string to tokenize.
            add_bos: Whether to add a beginning of sequence token.
            special: Whether to tokenize special tokens.

        Raises:
            RuntimeError: If the tokenization failed.

        Returns:
            A list of tokens.
        )r   tokenize)r^   textadd_bosspecials       r_   r   zLlama.tokenize;  s      ''gw??ra   c                >    | j                   j                  |||      S )a?  Detokenize a list of tokens.

        Args:
            tokens: The list of tokens to detokenize.
            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
            special: Whether to detokenize special tokens.

        Returns:
            The detokenized string.
        )prev_tokensr   )r   
detokenize)r^   tokensr  r   s       r_   r  zLlama.detokenizeM  s'      ))W * 
 	
ra   c                    || _         y)zKSet the cache.

        Args:
            cache: The cache to set.
        N)r   )r^   r   s     r_   	set_cachezLlama.set_cachea  s     
ra   c                    || _         y)zOSet the random seed.

        Args:
            seed: The random seed.
        N)r   )r^   r+   s     r_   set_seedzLlama.set_seedi  s     
ra   c                    d| _         y)zReset the model state.r   N)rU   r]   s    r_   resetzLlama.resetq  s	    ra   c                .   | j                   j                  d| j                  d       t        dt	        |      | j
                        D ]J  }||t        t	        |      || j
                  z          }| j                  }t	        |      }| j                  j                  ||| j                  j                         | j                   j                  | j                         || j                  |||z    | j                  j                  rv|}| j                  }t        j                  j!                  | j                   j#                         ||z  f      }|| j$                  |||z   ddf   j'                  d      dd n	 | xj                  |z  c_        M y)zfEvaluate a list of tokens.

        Args:
            tokens: The list of tokens to evaluate.
        rM   r   )batchn_pastr:   )shapeN)r   kv_cache_seq_rmrU   ranger   r-   r   r   	set_batchr   r:   r   r   r   r   	ctypeslibas_array
get_logitsr   reshape)	r^   r  r   r  r  rU   rowscolslogitss	            r_   evalz
Llama.evalu  sY    			!!"dmmR8q#f+t||4 	&A1s3v;DLL0@ABE]]F5zHKK!!Ft7J7J7U7U "  IIT[[)9>DNN6FX$56""--}}..II((*4$;. /  NTFVh%669:BB2FrJ MMX%M9	&ra   (   ffffff?皙?皙?皙?      @c                f    t        j                         }d	 fd}|j                  |       |j                   j                   j
                   j                   j                  ||||d	       ||j                   j                  |       |dk  r-|j                          |j                   j                         |S |dk(  r|j                          |S |
dk(  r-d}|j                   j                   j                  |||       |S |
dk(  r|j                   j                  ||       |S d}t!        d|      }|j#                  |       |j%                  ||       |j'                  ||       |j)                  ||       |j+                  |       |j                   j                         |S )
Nc           	         | j                   j                  }| j                   j                  }t        j                  |j                         }t        j                  |ft        j                  dt
        j                  fdt
        j                  fdt
        j                  fgd      t        j                  |z  j                  |            }D ],  } |j                  |j                        |j                  d d  . y )NidlogitpT)align)r  rf   buf)contentssizedatar   r   r   recarrayrf   r   r   r[   llama_token_datafrom_addressr   r#  )token_data_arrayr(  data_soadata_soa_addressr*  logit_processorlogits_processorr^   s         r_   
apply_funcz'Llama._init_sampler.<locals>.apply_func  s    '0055+4499#)#3#3H4E4E#F ;;'((7BII*>bii@PQ" #33d:HH(	 (8 YO(7(XHNN1%Yra   F)	rd   special_eos_idlinefeed_idpenalty_last_npenalty_repeatpenalty_freqpenalty_presentpenalize_nl
ignore_eosr   r   d   rQ   r   )r-  z"llama_cpp.llama_token_data_array_p)r   LlamaSampler
add_customadd_penaltiesr   r   r   r>   add_grammarr   add_softmaxadd_distr   
add_greedyadd_mirostatadd_mirostat_v2r   	add_top_kadd_typical	add_top_p	add_min_padd_temp)r^   top_ktop_pmin_p	typical_ptemprepeat_penaltyfrequency_penaltypresence_penaltytfs_zmirostat_modemirostat_etamirostat_taur9  r1  grammarsamplerr2  
mirostat_mn_probsmin_keeps   `             `      r_   _init_samplerzLlama._init_sampler  s   $ ((*'Y$ z*MM??22)*,# 	 
	
 W5#:!TZZ(8 7 S[ 4 1 ! 
$$MMJJ  ,  !#''JJ    q'?!!%(##Ix8!!%2!!%2  &  ,ra   c                2   | j                   dkD  sJ d}| j                  'd}| j                  |||||||||	|
|||||      | _        ||| j                   z
  nd}| j                  J | j                  j	                  | j
                  |      }|rd| _        |S )a0  Sample a token from the model.

        Args:
            top_k: The top-k sampling parameter.
            top_p: The top-p sampling parameter.
            temp: The temperature parameter.
            repeat_penalty: The repeat penalty parameter.

        Returns:
            The sampled token.
        r   FNTrJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  rU  rT  r9  r1  rV  rM   )rU   r   r[  r   sampler   )r^   rJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  rT  rU  r9  r1  rV  idxtmp_samplerridxtokens                       r_   r^  zLlama.sample  s    < }}q   == K ..#-"3!1+))'!1 / DM$ '*osT]]"2xx###$$TYY5 DMra   c              #    K   t        j                  d|z        | _        | j                  |||||||	|
|||||||      | _        |r| j
                  dkD  ryd}t        | j                  |dd       D ]  \  }}||k(  r|dz  } n |dkD  rEd}||d }|| _        | j                  r+t        d| d	t        |       d
t        j                         |r| j                          | j
                  t        |      z   dz
  }t        |      }	 | j                  |       || j
                  k  r| j!                  |||||||	|
||||||||      }|dz  }|5 || j                  d| | j"                  || j
                  z
  ddf         ry|}|j%                          |j'                  |       ||j)                  |       || j
                  k  rA|| j                  |   k7  r/|| _        | j*                  j-                  d| j
                  d       n|| j
                  k  r| j.                  || j0                  | j
                  | j
                  t        |      z    | j/                  | j0                  d| j
                  t        |      z          }|j)                  |j3                  t4              d| j6                  | j
                  z
  t        |      z
          Эw)a  Create a generator of tokens from a prompt.

        Examples:
            >>> llama = Llama("models/ggml-7b.bin")
            >>> tokens = llama.tokenize(b"Hello, world!")
            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
            ...     print(llama.detokenize([token]))

        Args:
            tokens: The prompt tokens.
            top_k: The top-k sampling parameter.
            top_p: The top-p sampling parameter.
            temp: The temperature parameter.
            repeat_penalty: The repeat penalty parameter.
            reset: Whether to reset the model state.

        Yields:
            The generated tokens.
        g       @r]  r   NrM   r   FzLlama.generate: z prefix-match hit, remaining z prompt tokens to evalrb   )rJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  rU  rT  r1  rV  r9  r_  )r   r   r   r[  r   rU   zipr   rJ   r   r   r   r   r
  listr  r^  r   clearappendextendr   r  rE   r   astyper   r   )r^   r  rJ  rK  rL  rM  rN  rO  r
  rP  rQ  rR  rS  rU  rT  r9  r1  stopping_criteriarV  longest_prefixab
sample_idxrb  tokens_or_nonedraft_tokenss                             r_   generatezLlama.generate1  s    R #NN3+=>**)/-'%%#- + 
& T]]Q&NDOOVCR[9 16"a'N	
 !0 .<<*>*: ;%%([M1GI ZZ JJL ]]S[014
f IIft}},'#1&7%5"/!-!-%5# +"! $ & a
$05FOOLj14<<
T]]@Z\]@]3^6 ',e$!-MM.1-%4??:;V2V$.DMII--b$--DE t}},H +NTt}}t}}s6{/JK#//NN#@T]]S[%@A   '',C$++5FCW s   HKB;Kc                    ||n| j                   }t        |t              r|n|g}| j                  |d      \  }}t	        |      D cg c]  \  }}d||d }}}d||||ddS c c}}w )zEmbed a string.

        Args:
            input: The utf-8 encoded string to embed.

        Returns:
            An embedding object.
        T)return_countr;   )objectr;   indexre  )prompt_tokenstotal_tokens)rt  r)  rT   usage)r|   rw   re  embedr   )	r^   inputrT   
model_nameembedsrw  r_  embr)  s	            r_   create_embeddingzLlama.create_embedding  s     $)#4%$//
#E40ug
  $zz%dzC &f-!
 S	 & !
 !
 !- ,	
 	
!
s   
A'c                0     j                          j                  } j                         t        j                  k(  } j
                  j                  du rt        d       j                  r)t        j                   j                  j                         t        |t              r|g}n|} j                  j                          g d
 fd}d}	g }
d}d}|D ]  } j!                  |j#                  d            }|r|d| }t%        |      }|	|z  }	||kD  rt'        d| d|       ||z   |kD  r ||
       g }
d}d} j                  j)                  |||       |
j+                  |       ||z  }|d	z  }  ||
        j                  r)t        j,                   j                  j                         t        |t              rd   n}t        j.                   j                  j                          j                          |r||	fS |S )zEmbed a string.

        Args:
            input: The utf-8 encoded string to embed.

        Returns:
            A list of embeddings
        FzCLlama model must be created with embedding=True to call this methodc           	     6   t        j                  j                  j                         j                  j	                  j
                         j
                  j                          t         j                  k(  rd}t        |       D ]  \  }}t        j                  j                  j                        }t        |      D cg c]  }|||	z  z   ||dz   	z  z     }}
r"|D cg c]  }t        j                  |       }}j                  |       ||z  } y t        t        |             D ]Y  }t        j                  j                  j                  |      }|d 	 }
rt        j                  |      }j                  |       [ y c c}w c c}w Nr   r   )r[   llama_kv_cache_clearr   r   r   r   r
  LLAMA_POOLING_TYPE_NONEr   llama_get_embeddingsr  r   normalize_embeddingrg  r   llama_get_embeddings_seq)	seq_sizesposr   r(  ptrjr;   r   r)  n_embd	normalizer2   r^   s           r_   decode_batchz!Llama.embed.<locals>.decode_batch  sn   **499==9IIT[[)KK y@@@(3  GAt#88GC "'t4 C!f*,sa!ev5E/EF4I 4 !FO%ABI99!<%	 % KK	*4KC  s9~. +A#<<TYY]]ANC-0&\I $-$A$A)$L	KK	*+4
%s   F)Fr   rN   NRequested tokens (z) exceed batch size of r   )r  	List[int])r  r-   r2   r[   r  r   r   r   rJ   llama_perf_context_resetr   r   rw   r   r   r
  r   r   r   r   add_sequencerg  llama_perf_context_printr  )r^   rz  r  truncaters  r-   r:   inputsr  rw  s_batcht_batchp_batchr   r  rU   outputr)  r  r2   s   ` `              @@@r_   ry  zLlama.embed  s    ,, ((*!Y%F%FF
))U2U  <<..tyy}}=eS!WFF 	 CE	+ 	+:   	D]]4;;w#78F)6{HH$L '! (
2I'S 
 !G+W% KK$$VWjA NN8$xGqLG7	< 	W<<..tyy}}=&uc2a&&tyy}}5

<''Mra      c              #  0!  ]^K   ||j                   t        u sJ dt        t        j                                }t	        t        j
                               }| j                         }| j                  j                         }| j                  j                         }| j                  j                         }| j                  j                         } | j                  j                         }!| j                  j                  dd      dk(  }"|dk7  r|n|g}#|dk7  r|n| j                         g}$t!        |t"              r|#| j                  j%                         r	|#d d dgk(  rg }#t!        |t"              r|| j                  j'                         s|dk(  rg }$d}%|"r|!dk\  r	|rd|z   }d}%t)        |      dkD  rg n|g}&|dk\  r||gng t!        |t              r3|d	k7  r,| j+                  |j-                  d
      d|dk  xs |d u       ng n|z   }'|!dk\  r0|.|!g|r&| j+                  |j-                  d
      dd      |%d  ng z   ng }(| dk\  r|| gng })|#| j.                  r|(|'z   |)z   n|'|(z   |)z   z   |$z   }*d}+d},t!        |
t"              r|
nt!        |
t              r|
gng }
||n| j0                  }-|*d d | j                         gdz  k(  rEt3        j4                  d| j                  j7                  | j                                dt8               |d|j;                         D ./ci c]  \  }.}/t	        |.      t=        |/       c}/}.]	 	 	 	 	 	 d,]fd}0t?        |0g      }1||1}n|jA                  |1      }| jB                  r| jD                  jG                          t)        |*      | jH                  k\  r7tK        dt)        |*       dtM        jN                  | jP                               ||dk  r| jH                  t)        |*      z
  }|t)        |*      z   | jH                  k  r|n| jH                  t)        |*      z
  }|
g k7  r|
D 2cg c]  }2|2j-                  d
       }3}2ng }3|#| jR                  jT                  du rtK        d      | jV                  r	 | jV                  |*   }4tX        j[                  |4j\                  j_                         |*      }5tX        j[                  | j`                  j_                         |*      }6|5|6kD  r8| jc                  |4       | jB                  rte        dtf        jh                         || jm                  |       n>| jm                  to        jp                  | jr                        ju                  dd             d}7d}8| jw                  |*|||||||||||||||      D ]-  }9tM        jx                  | j                  jz                  |9      r| j}                  |&|*      }+d}7 n|&j                  |9       | j}                  |&|*      }:t        |:dd        D ](  \  }.};d|.z
  }.dD ]  \  }<}=|<|.kD  s|=|;z  |=k(  s|<|.z
  }8 * |8dkD  r|8dz  }8|3D 2cg c]	  }2|2|:v s|2 }>}2t)        |>      dkD  r|>d   }?|:d |:j                  |?       }+d}7 nC|r|&|,d  }@| j}                  |@|*|&d |, z         ^t)        ^      }Ad}B|3D ]D  }2t        t        t)        |2      A      dd      D ]  }C^j                  |2d |C       sCBkD  rC}B D F d}D|@D ]  }9|9|k(  r
Dt)        | j}                  |9g|*|&d |, z               z  }D|DABz
  kD  r nb| j}                  |9g|*|&d |, z         j                  d
d       }Et)        |      t)        | j}                  |&d |, |*|&d |, z         j                  d
d             z   }Ft)        |*      |,z   }G| j                  |Gdz
  d d f   }HtX        j                  |H      j_                         }It#        t        t        |It        t)        |I                  d!"            }J|Jd | D KCci c])  \  }K}C| j}                  |Cg      j                  d
d       |K+ }L}K}C|Lj                  EIt	        |9         i       | j}                  |9g|*|&d |, z         j                  d
d       gFg|It	        |9         g|Lgd#}M|,dz  },|d$||-| j}                  |9g|*|&d |, z         j                  d
d       d|Md d%gd&  nt)        @      dkD  rd}Nt        dt)        @      dz         D ]3  }C	 | j}                  @d C |*|&d |, z         }O|Oj                  d
      }Pd!}N n nCNsn@Dt)        O      z  }D|DABz
  kD  rn)@Cd  }@|,|Cz  },|d$||-Pdd d d%gd& t)        |@      dkD  rt)        |&      |k\  s| j}                  |&|*      }+d}7 n |9 || j`                  | j                  dd d f         r| j}                  |&|*      }+d}7| jB                  r| jD                  j                          |r|&|,d  }@| j}                  |@|*|&d |, z         ^|3D 2cg c]	  }2|2^v s|2 }>}2t)        |>      dkD  rt        ^fd'|>D              }Qnt)        ^      }Qd}D@D ]"  }9Dt)        | j}                  |9g|*|&d |, z               z  }Dd }M|c|9|k(  r5| j}                  |9g      j                  d
d       }Et)        |      t)        | j}                  |&d |, |*|&d |, z               z   }Ft)        |*      |,z   dz
  }G| j                  |Gd d f   }HtX        j                  |H      j_                         }It#        t        t        |It        t)        |I                  d!"            }J|Jd | D KCci c])  \  }K}C| j}                  |Cg      j                  d
d       |K+ }L}K}C|Lj                  EIt	        |9         i       | j}                  |9g      j                  d
d       gFg|It	        |9         g|Lgd#}MDQk\  rV| j}                  |9g      }RDQdz
  k(  r ns|,dz  },|d$||-Rd t)        |R      DQz
  z
   j                  d
d       dMd d%gd&  n9|,dz  },|d$||-| j}                  |9g      j                  d
d       dMd d%gd& % |d$||-d	dd |7d%gd& | jV                  rn| jB                  rte        d(tf        jh                         | j                         | jV                  |*|&z   <   | jB                  rte        d)tf        jh                         y | jV                  rG| jB                  rte        d(tf        jh                         | j                         | jV                  |*|&z   <   |+j                  d
d       }S|	r|Sz   }S|!dk  r|S|z   }Sd }M||	rdn
t)        |      }F|	rdnt)        |*dd        }Gg }Tg }Ug }Vg }W|	r!|*|*d   | j                         k(  rdndd  |&z   }Xn|&}Xt        X      D C9cg c]-  \  }C}9| j}                  |9gXd |C       j                  d
d       / }Y}C}9tX        j                  | j                        Gd  }Zt        t        X|Y|Z            D ]  \  }[\  }9}E}\|9|k(  rTj                  Ft)        | j}                  Xd [       j                  d
d             z          Vj                  E       t#        t        t        \t        t)        |\                  d!"            }JUj                  |\t	        |9                |Jd | D KCci c].  \  }K}C| j}                  |CgXd [       j                  d
d       |K0 }L}K}C|Lj                  E\t	        |9         i       Wj                  |L       ! |	rt)        X      dkD  r
d Ud<   d Wd<   VTUWd#}M|d$||-SdM|7d%gt)        |*      t)        |&      t)        |*      t)        |&      z   d*d+ y c c}/}.w c c}2w # tj        $ r+ | jB                  rte        dtf        jh                         Y &w xY wc c}2w c c}C}Kw # t        $ r Y ;w xY wc c}2w c c}C}Kw c c}9}Cw c c}C}Kw w)-Nzcmpl-ztokenizer.ggml.add_space_prefixtruerM   r   r   u   ☺rQ   rg   rN   F)r   r   ra   zDetected duplicate leading "zN" in prompt, this will likely reduce response quality, consider removing it...c                x    t        j                  |      }j                         D ]  \  }}|||   z   ||<    |S rY   )r   copyr   )r   r   
new_scoresinput_idscorelogit_bias_maps        r_   logit_bias_processorz6Llama._create_completion.<locals>.logit_bias_processor  sP      WW
 (6';';'= DOHe+06(3C+CJx(D!!ra   r  z) exceed context window of zBlogprobs is not supported for models created with logits_all=Falsez#Llama._create_completion: cache hitrb   z$Llama._create_completion: cache missl        length)rJ  rK  rL  rM  rN  rR  rS  rU  rT  rP  rQ  rO  rj  r1  rV  )r  stop   ))rQ      )r     )      ignore)errorsT)reverse)r  text_offsettoken_logprobstop_logprobstext_completion)r   ru  logprobsfinish_reason)r"  rt  createdrT   choicesc              3  @   K   | ]  }j                  |        y wrY   )ru  )rk   r  remaining_texts     r_   rn   z+Llama._create_completion.<locals>.<genexpr>  s     J...t4Js   z$Llama._create_completion: cache savez%Llama._create_completion: cache saved)rv  completion_tokensrw  )r"  rt  r  rT   r  rx  r   npt.NDArray[np.intc]r   npt.NDArray[np.single]returnr  )N	__class__r   uuiduuid4r   timer   r   	token_cls	token_septoken_prefixtoken_middletoken_suffixr   getr   rw   re  add_bos_tokenadd_eos_tokenr   r   r   rI   r|   warningswarnr   RuntimeWarningr   r   LogitsProcessorListrh  rJ   r   reset_timingsr   r   r[   llama_n_ctxr   r   r:   r   r   longest_token_prefixr   r   r   
load_stater   r   r   KeyErrorr  randomRandomr   randintrq  llama_token_is_eogrT   r  rg  r   ru  r  r   endswithr   r   logits_to_logprobssortedrd  updateUnicodeErrorprint_timings
save_state)_r^   promptsuffix
max_tokenstemperaturerK  rL  rM  r  echor  rP  rQ  rO  rJ  streamr+   rR  rS  rU  rT  rT   rj  r1  rV  
logit_biascompletion_idr  r   cls_token_idsep_token_idprefix_token_idmiddle_token_idsuffix_token_idadd_space_prefix
bos_tokens
eos_tokenssuffix_space_prefixr  prefix_tokenssuffix_tokensmiddle_tokensrv  r   returned_tokensr{  r   r   r  _logit_bias_processorsstop_sequences
cache_itemcache_prefix_leneval_prefix_lenr  multibyte_fixrb  all_textcharnumpatternany_stop
first_stopremaining_tokensremaining_lengthfirst_stop_positionr   token_end_position	token_strr  token_offsetr  current_logprobssorted_logprobslogprobtop_logproblogprobs_or_nonedecode_successbstsend	last_texttext_strtext_offsetsr  r  r  
all_tokensall_token_strsall_logprobsr_  logprobs_tokenr  r  s_                                                                                                @@r_   _create_completionzLlama._create_completion^  s    < ~!1!1S!888$S%6$78499;' NN, KK113 KK113#{{779#{{779#{{779MM?HFR 	 2>1C V
(B.LDNN4D!


 %&.;;,,."1~"%Jvt$))+0BJ#$1 4V^F"# .1[1_r<. "1A!5&:L_RT &#& R< MM'*!,q0BFdN    $
4 !#(: !!
  MM&--"8%QVMW+,   	 "1A!5&:L_RT 	  ?? .>#m3mC	  	  tT*D*T3:OUW 	 $)#4%$//
!!1 2Q 66MM.t{{/I/I$..JZ/[.\  ]k  l !;E;K;K;MN41ac!feAh.NN	"/	".	" (	" %89M8N$O!'#8 #3#:#:;P#Q <<II##%},$S%7$88ST]TiTijnjrjrTsStu  qs='99J
 C..< ++M 22 	 2:9=>Aahhw/>N>ND$7$7$B$Be$KT  ::S!ZZ6
#(#=#=((//1=$  #("<"<OO**,m# $o5OOJ/||C#**U
 MM$MM&--

3;;AwGH ]]'%%/-)/-! # 
 	E$ ++DKK,=,=uE'8mT &$$U+'8mTH %Xbc]3 04E$B 0LCQw7T>W#<(+a00 q "#1CaQ(]CHC8}q %a[
 <(.."<= &#4_5E#F !%$ -0ABR?0S S "1 " $'~#6 
 '(#' "A"3s1v/?#@!RH ")221Ra59 #6667 3!	"" &'"' "2 I L0$*c OO!&,9"34D_"E-F , / * .,/BB "$(OO"G(5/0@A)B %4 % !&&:	 "
 '*&kC OO 12B? C,9"34D_"E-F ,  %fWXf>5 ' (+='9O'K!%lQ.>.A!B+0+C+CF+K+R+R+T(*." #$4eC@P<Q6R S(,+ />ix.H	' !+ !OOQC077 ' 8 &'' ' $**I7GE
7S+TU !%%*G0=&78H&I1J !0 !" #)&&"B' -8=/?E
/K.L-8M,( (1,"/&7'.%/ -1OO).4A*;<L_*M5N -< -& '-fWXf&F-.0@59	!"( qIV ./!3).!&q#.>*?!*C!D "A
%%)__$4Ra$80=&78H&I1J &5 &"
 &(YYw%715 %" "-!*c"g5*-,/BB "+;AB+?('1, #0&7'.%/ -/-.0459	!"( 7 ./!3T $%3'8mT (	B (->OOT\\"a%0.
 ??#4-?PD"M<<II##%01AB!__ ),=>N,OO - N $2IaQ.5HIHI8}q JJJ.)!") S"cOO$14EFV4W$W $ ' " BF ',  $ 8 ? ? !@ !I #&f+-.>?(5/0@A)B ( 1 #K $'}#5#G!#KL!\\,/:F','?'?'G'N'N'P$&* 0%<L8M2NO$('O +:)8*D#&GQ ,33GH3MwV#K #  &&	3CCJ3O'PQ !OOUG4;;GH;U# )4}+;CJ+G*H)4($ &, $ 8I)S1W4#q(O+"3#*!+ )2$Qc)n8JS8P&Q)""(&&"B)*,<15	$   1$'/&' %)OOUG$<$C$C ' %D % &'(8-1	  ISj $+"# !#!"$()6	  zz<<@szzR@D@Q

=+<<=<<A

S::||<3::N<@OO<MDJJ}'889;;wx;8(HQ6#5&(H9=#!VK $1#mAB.?*@L&(L46N "F=?L "}Q'74>>;K'K!QR"TU'( 
 /
 !** 5	 Au Z^DKKH L N  !33DLLA,-PL;DJ=< 177eY L(##
4C(89@@#H A  i("&NE#n2E,FGQU#
 %%nSZ&@A
 '6ix&@	; # OOQCZ5EOFMM N ; ; ""I~c%j/I#JK##K071> J!+$(q!"&Q +"0 ,	   ' % 0%2	 "%]!3%():%; #M 2S9J5K K
 	
s ON ?,  S<<@szzRSf DF'f $0 % $%d JL#T8;s  LAB A@"?DABA@(6ABB(A@- :DABABAB#	AA$-AA$1BAB
D<AB.AA)
4CAB/AA/3AAB:AB
BAB(	AA?2AA?6D>AB4.AB"IAB52AB
'C4AB3ABBAB@-0AA!AABA AA!A!ABA/	AA<A8ABA;AA<A<ABc                    | j                  |||dn|||||||	|
|||||||||||||||      }|r|}|S t        |      }|S )  Generate text from a prompt.

        Args:
            prompt: The prompt to generate text from.
            suffix: A suffix to append to the generated text. If None, no suffix is appended.
            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
            logprobs: The number of logprobs to return. If None, no logprobs are returned.
            echo: Whether to echo the prompt.
            stop: A list of strings to stop generation when encountered.
            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
            repeat_penalty: The penalty to apply to repeated tokens.
            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            stream: Whether to stream the results.
            seed: The seed to use for sampling.
            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
            mirostat_mode: The mirostat sampling mode.
            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
            model: The name to use for the model in the completion object.
            stopping_criteria: A list of stopping criteria to use.
            logits_processor: A list of logits processors to use.
            grammar: A grammar to use for constrained sampling.
            logit_bias: A logit bias to use.

        Raises:
            ValueError: If the requested tokens exceed the context window.
            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.

        Returns:
            Response object containing the generated text.
        rM   r  r  r  r  rK  rL  rM  r  r  r  rP  rQ  rO  rJ  r  r+   rR  rS  rU  rT  rT   rj  r1  rV  r  )r  next)r^   r  r  r  r  rK  rL  rM  r  r  r  rP  rQ  rO  rJ  r  r+   rR  rS  rU  rT  rT   rj  r1  rV  r  completion_or_chunkschunks
completions                                r_   create_completionzLlama.create_completion  s    @  $66'/rZ#/-)'%%/-!3  7  
6 ?SFM!%&:!;
ra   c                V    | j                  |||||||||	|
|||||||||||||||      S )r  r  )r  )r^   r  r  r  r  rK  rL  rM  r  r  r  rP  rQ  rO  rJ  r  r+   rR  rS  rU  rT  rT   rj  r1  rV  r  s                             r_   __call__zLlama.__call__+  sh    @ %%!#/-)'%%/-!3 & 
 	
ra   g?c                d   | j                   xsF | j                  j                  | j                        xs t	        j
                  | j                        } |di d| d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|S )a	  Generate a chat completion from a list of messages.

        Args:
            messages: A list of messages to generate a response for.
            functions: A list of functions to use for the chat completion.
            function_call: A function call to use for the chat completion.
            tools: A list of tools to use for the chat completion.
            tool_choice: A tool choice to use for the chat completion.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
            stream: Whether to stream the results.
            stop: A list of strings to stop generation when encountered.
            seed: The seed to use for sampling.
            response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json.
            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
            repeat_penalty: The penalty to apply to repeated tokens.
            tfs_z: The tail-free sampling parameter.
            mirostat_mode: The mirostat sampling mode.
            mirostat_tau: The mirostat sampling tau parameter.
            mirostat_eta: The mirostat sampling eta parameter.
            model: The name to use for the model in the completion object.
            logits_processor: A list of logits processors to use.
            grammar: A grammar to use.
            logit_bias: A logit bias to use.

        Returns:
            Generated chat completion or a stream of chat completion chunks.
        llamamessages	functionsfunction_calltoolstool_choicer  rK  rJ  rL  rM  r  r  r  r  r+   response_formatr  rQ  rP  rO  rR  rS  rU  rT  rT   r1  rV  r   )rD   r   r  rC   r   get_chat_completion_handler)r^   r"  r#  r$  r%  r&  r  rK  rJ  rL  rM  r  r  r+   r'  r  rQ  rP  rO  rR  rS  rU  rT  rT   r1  rV  r  r  r  handlers                                 r_   create_chat_completionzLlama.create_chat_completion  sz   F  O""&&t'7'78O <<T=M=MN 	
  


  
 (	

 
 $
 $
 
 
 
  
 
 &
 
 
  !
" ,#
$ "%
& .'
( 0)
* *+
, -
. (/
0 &1
2 &3
4 5
6 .7
8 9
: ";
 	
ra   c           	         	 ddl m}m |j                  dd      }t	        |t
              sJ |rfd | j                  |i |D        S  |di  | j                  |i |S # t        $ r t        d      w xY w)a  Generate a chat completion with return type based on the the OpenAI v1 API.

        OpenAI python package is required to use this method.

        You can install it with `pip install openai`.

        Args:
            *args: Positional arguments to pass to create_chat_completion.
            **kwargs: Keyword arguments to pass to create_chat_completion.

        Returns:
            Generated chat completion or a stream of chat completion chunks.
        r   )ChatCompletionChatCompletionChunkr  Fc              3  .   K   | ]  } di |  y wNr(  r(  )rk   chunkr.  s     r_   rn   z9Llama.create_chat_completion_openai_v1.<locals>.<genexpr>  s     o+4e4os   zzTo use create_chat_completion_openai_v1, you must install the openai package.You can install it with `pip install openai`.r(  )openai.types.chatr-  r.  r  rw   rx   r+  ImportError)r^   argsr   r-  r  r.  s        @r_    create_chat_completion_openai_v1z&Llama.create_chat_completion_openai_v1  s    $	MZZ%0Ffd+++oB]$B]B]_cBngmBnoo%U(C(C(CT(TV(TUU 	@ 	s   AA$ A$ $A9c                   t        d)i d| j                  d| j                  j                  d| j                  j                  d| j                  j
                  d| j                  d| j                  j                  d| j                  j                  d| j                  j                  d	| j                  d
| j                  d| j                  j                  d| j                  d| j                  j                  d| j                  j                   d| j                  j"                  d| j                  j$                  d| j                  j&                  d| j                  j(                  d| j                  j*                  d| j                  j,                  d| j                  j.                  d| j                  j0                  d| j                  j2                  d| j                  j4                  d| j                  j6                  d| j                  j8                  d| j                  j:                  d| j                  j<                  d| j>                  d| j@                  d| jB                  d | jD                  d!| jF                  d"| jH                  d#| jJ                  d$| jL                  d%| j                  jN                  d&| j                  jP                  d'| jR                  d(| jT                  S )*Nr|   r"   r#   r$   r%   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rG   rH   rI   rJ   r(  )+r   r|   r~   r"   r#   r$   r%   r'   r(   r)   r*   r   r   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rG   rH   rI   rJ   r]   s    r_   __getstate__zLlama.__getstate__  s)    2
2
 **772
 ((33	2

 &&//2
 **2
 ((332
 &&//2
 ''112
 **2
 2
 %%++2
 LL2
 ((112
  ))33!2
" !//??#2
$ #11CC%2
& ,,99'2
(  ..==)2
* !//??+2
, !//??-2
. "00AA/2
0  ..==12
2  ..==32
4 --;;52
6 **5572
8 ))4492
: ++77;2
< **55=2
@  $66A2
D nnE2
F G2
H nnI2
L M2
P ((Q2
R **S2
V ((W2
Z &&--[2
\ &&--]2
` a2
b LLc2
 2	
ra   c                (     | j                   di | y r0  )r   )r^   states     r_   __setstate__zLlama.__setstate__D  s    ra   c                   | j                   rt        dt        j                         t	        j
                  | j                  j                        }| j                   rt        d| t        j                         t        j                  t        |      z         }| j                   rt        dt        j                         t	        j                  | j                  j                  |      }| j                   rt        d| t        j                         t        |      t        |      kD  rt        d      t        j                  t        |      z         }t        j                  j                  ||t        |             | j                   rt        d| dt        j                         t        | j                  j!                         | j"                  j!                         | j$                  t'        |      || j(                  	      S )
Nz$Llama.save_state: saving llama staterb   z"Llama.save_state: got state size: z!Llama.save_state: allocated statez&Llama.save_state: copied llama state: zFailed to copy llama state datazLlama.save_state: saving z bytes of llama state)r   r   rU   llama_statellama_state_sizer+   )rJ   r   r   r   r[   llama_get_state_sizer   r   r   c_uint8r   llama_copy_state_datar   r   
LlamaStater   r  r   rU   bytesr   )r^   
state_sizer<  n_bytesllama_state_compacts        r_   r  zLlama.save_stateG  sj   <<8szzJ33DIIMMB
<<6zlC#**U~~J7:<<5CJJG11$))--M<<:7)D3::Vw<#j/)@AA%~~G<?  !4k3w<P<<+G94IJZZ <<$$&nn))+]]12$
 	
ra   c                &   |j                   j                         | j                   d |j                  d d f<   | j                   |j                  d d d f   }d||dkD  <   |j                  j                         | _        |j                  | _        |j                  | _        |j                  }t        j                  |z  }|j                  |j                        }t        j                  | j                  j                  |      |k7  rt        d      y )Nr   r   zFailed to set llama state data)r   r  rU   r   r+   r   r=  r   r?  from_buffer_copyr<  r[   llama_set_state_datar   r   r   )r^   r9  restrC  LLamaStateArrayTyper<  s         r_   r  zLlama.load_statee  s    +0<<+<+<+>$enn$a'({{5>>+Q./TAX--/ZZ
++
$nnz9)::5;L;LM))$))--ES?@@ Tra   c                6    | j                   j                         S )zReturn the context window size.)r   r,   r]   s    r_   r,   zLlama.n_ctxt  s    yy  ra   c                6    | j                   j                         S )zReturn the embedding size.)r   r  r]   s    r_   r  zLlama.n_embdx  s    {{!!##ra   c                6    | j                   j                         S )zReturn the vocabulary size.)r   rd   r]   s    r_   rd   zLlama.n_vocab|  s    {{""$$ra   c                    t        |       S )z*Return the llama tokenizer for this model.)r   r]   s    r_   rF   zLlama.tokenizer  s    d##ra   c                6    | j                   j                         S )z!Return the end-of-sequence token.)r   r   r]   s    r_   r   zLlama.token_eos      {{$$&&ra   c                6    | j                   j                         S )z'Return the beginning-of-sequence token.)r   r   r]   s    r_   r   zLlama.token_bos  rP  ra   c                6    | j                   j                         S )zReturn the newline token.)r   r   r]   s    r_   r   zLlama.token_nl  s    {{##%%ra   c                6    | j                   j                         S )zReturn the pooling type.)r   r2   r]   s    r_   r2   zLlama.pooling_type  s    yy%%''ra   c                8    | j                   j                          y)z&Explicitly free the model from memory.N)rt   closer]   s    r_   rU  zLlama.close  s    ra   c                $    | j                          y rY   )rU  r]   s    r_   __del__zLlama.__del__  s    

ra   c                   t        j                  | |d      }|j                  dkD  rd|t        j                  |       <   nt        j                  |      sd}t        j                  | |t         j
                        }t        j                  |      }t        j                  d      5  t        j                  ||d      }t        j                  |      }d d d        ||z
  S # 1 sw Y   |z
  S xY w)NT)axiskeepdimsr   re   r  )divide)
r   amaxndimisfinitesubtractr   experrstatesumlog)r  rY  logits_maxssubtract_maxsr`  summedouts          r_   r  zLlama.logits_to_logprobs  s    
 #%''&td"Ka56K[112[)KFKryyIff]#[[) 	!VVCdT:F&&.C	! s""	! s""s   *.C%%C2c                L    d}t        | |      D ]  \  }}||k(  r|dz  } |S  |S r  )rd  )rl  rm  rk  _a_bs        r_   r  zLlama.longest_token_prefix  sB    !Qi 	FBRx!#	
 ra   c                   	 ddl m}m}	 ddlm}
  |
|        |	       }|j                  |d      D cg c]  }t        |t              r|d   n| }}g }|D ]6  }t        |      j                  |      }|j                  t        |             8 |D cg c]  }t        j                  ||      s| }}t        |      dk(  r't        d| d	| d
t!        j"                  |             t        |      dkD  r't        d| d| d
t!        j"                  |             |\  }t        t        |      j$                        }t        |      j&                  } |||||||       |r|D ]  }|D cg c]  }t        j                  ||      s| }}t        |      dk(  r't        d| d	| d
t!        j"                  |             t        |      dkD  r't        d| d| d
t!        j"                  |             |\  } |||||||        | |||||||d      }n t(        j*                  j-                  ||      } | dd|i|S # t
        $ r t        d      w xY wc c}w c c}w c c}w )a  Create a Llama model from a pretrained model name or path.
        This method requires the huggingface-hub package.
        You can install it with `pip install huggingface-hub`.

        Args:
            repo_id: The model repo id.
            filename: A filename or glob pattern to match the model file in the repo.
            additional_files: A list of filenames or glob patterns to match additional model files in the repo.
            local_dir: The local directory to save the model to.
            local_dir_use_symlinks: Whether to use symlinks when downloading the model.
            **kwargs: Additional keyword arguments to pass to the Llama constructor.

        Returns:
            A Llama model.r   )hf_hub_downloadHfFileSystem)validate_repo_idzrLlama.from_pretrained requires the huggingface-hub package. You can install it with `pip install huggingface-hub`.T)	recursiverl   zNo file found in z that match z

Available Files:
r   zMultiple files found in z
 matching )repo_idfilename	subfolder	local_dirlocal_dir_use_symlinks	cache_dir)rp  rq  rr  rs  rt  ru  local_files_onlyr|   r(  )huggingface_hubrl  rm  huggingface_hub.utilsrn  r3  lsrw   r   r   relative_torg  r   fnmatchr   r   jsondumpsparentrl   r   r   r   )clsrp  rq  additional_filesrs  rt  ru  r   rl  rm  rn  hffsrc   files	file_listrel_pathmatching_filesmatching_filerr  additonal_file_namematching_additional_filesmatching_additional_filer|   s                          r_   from_pretrainedzLlama.from_pretrained  s   2	E> 	!~ 48
 'tT2DL<
 
  "	 	,DDz--g6HS]+	,
 ,5X4h8W$XX~!##G9L
 C%%)ZZ	%:$;= 
 ~"*7):hZ H%%)ZZ%6$79 
 *]+223	&++ 	#9	
 '7 #>G,vd7??[_atKuT,v),v01Q6$+G9LAT@U V--1ZZ	-B,CE 
 01A5$27):FYEZ [--1ZZ->,?A 
 /H+)  #5''+A''8 (!##'=#!%J i:J  
!

 	
}  	I 	
 Y@ -ws(   I I%I*1I*1I/I/I")Tr|   r   r"   r   r#   r   r$   r   r%   zOptional[List[float]]r&   Optional[str]r'   rx   r(   rx   r)   rx   r*   z1Optional[Dict[str, Union[bool, int, float, str]]]r+   r   r,   r   r-   r   r.   r   r/   Optional[int]r0   r  r1   r  r2   r   r3   r   r4   r   r5   r   r6   r   r7   r   r8   r   r9   r   r:   rx   r;   rx   r<   rx   r=   rx   r>   r   r?   r  r@   r   rA   r  rB   zUnion[bool, int]rC   r  rD   z6Optional[llama_chat_format.LlamaChatCompletionHandler]rE   zOptional[LlamaDraftModel]rF   zOptional[BaseLlamaTokenizer]rG   r  rH   r  rI   rx   rJ   rx   )r  zllama_cpp.llama_context_p)r  zllama_cpp.llama_model_p)r  r  )r  r  )r  z
Deque[int])r  zDeque[List[float]])TF)r   rB  r   rx   r   rx   r  r  )NF)r  r  r  zOptional[List[int]]r   rx   r  rB  )r   zOptional[BaseLlamaCache])r+   r   )r  Sequence[int])r  r  r  r    r  r    r   r   r    r   r  r  TNN)rJ  r   rK  r   rL  r   rM  r   rN  r   rO  r   rP  r   rQ  r   rR  r   rS  r   rT  r   rU  r   r9  rx   r1  Optional[LogitsProcessorList]rV  Optional[LlamaGrammar])r  r  r  r    r  r    r   r   r    r   r  r  TNNN) rJ  r   rK  r   rL  r   rM  r   rN  r   rO  r   rP  r   rQ  r   rR  r   rS  r   rT  r   rU  r   r9  rx   r1  r  rV  r  r_  r  )r  r  r  r    r  r    Tr   r   r    r   r  r  TNNN)&r  r  rJ  r   rK  r   rL  r   rM  r   rN  r   rO  r   r
  rx   rP  r   rQ  r   rR  r   rS  r   rU  r   rT  r   r9  rx   r1  r  rj  Optional[StoppingCriteriaList]rV  r  r  z-Generator[int, Optional[Sequence[int]], None]rY   )rz  Union[str, List[str]]rT   r  r  CreateEmbeddingResponse)FTF)rz  r  r  rx   r  rx   rs  rx   )4r  Union[str, List[int]]r  r  r  r  r  r   rK  r   rL  r   rM  r   r  r  r  rx   r  Optional[Union[str, List[str]]]rP  r   rQ  r   rO  r   rJ  r   r  rx   r+   r  rR  r   rS  r   rU  r   rT  r   rT   r  rj  r  r1  r  rV  r  r  Optional[Dict[int, float]]r  zSUnion[Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]])4r  r  r  r  r  r  r  r   rK  r   rL  r   rM  r   r  r  r  rx   r  r  rP  r   rQ  r   rO  r   rJ  r   r  rx   r+   r  rR  r   rS  r   rU  r   rT  r   rT   r  rj  r  r1  r  rV  r  r  r  r  IUnion[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]])4r  r   r  r  r  r  r  r   rK  r   rL  r   rM  r   r  r  r  rx   r  r  rP  r   rQ  r   rO  r   rJ  r   r  rx   r+   r  rR  r   rS  r   rU  r   rT  r   rT   r  rj  r  r1  r  rV  r  r  r  r  r  ):r"  z"List[ChatCompletionRequestMessage]r#  z&Optional[List[ChatCompletionFunction]]r$  z+Optional[ChatCompletionRequestFunctionCall]r%  z"Optional[List[ChatCompletionTool]]r&  z(Optional[ChatCompletionToolChoiceOption]r  r   rK  r   rJ  r   rL  r   rM  r   r  rx   r  r  r+   r  r'  z-Optional[ChatCompletionRequestResponseFormat]r  r  rQ  r   rP  r   rO  r   rR  r   rS  r   rU  r   rT  r   rT   r  r1  r  rV  r  r  r  r  zOptional[bool]r  r  r  zQUnion[CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]])r4  r   r   r   )r  rA  )r9  rA  r  None)r  r   )r  r   )r  r   )r  r  )rM   )r  z#Union[npt.NDArray[np.single], List]rY  r   r  r  )rl  r  rm  r  )NNautoN)rp  r   rq  r  r  zOptional[List]rs  &Optional[Union[str, os.PathLike[str]]]rt  zUnion[bool, Literal['auto']]ru  r  r   r   r  z'Llama')5__name__
__module____qualname____doc__ru   r[   LLAMA_SPLIT_MODE_LAYERr   r   LLAMA_POOLING_TYPE_UNSPECIFIEDr   propertyr   rT   r   r   r   r   r   r  r  r  r
  r  r[  r^  rq  r~  ry  r  r  r  r+  r5  r7  r:  r  r  r,   r  rd   rF   r   r   r   r2   rU  rW  staticmethodr  r  classmethodr  r(  ra   r_   r   r   7   s   :! #::.2%) JN00#')- 99%DD #!$!%"% $ #   "$#'#'!&%)OS1526 $ $ qbb
 b b b ,b #b b b b Hb b  !b" #b$ %b& !'b( ')b*
+b0 1b2 3b4 5b6 7b8  9b: ;b< =b> ?b@ AbB CbD EbF GbJ  KbN !ObP QbR !SbV WbZ #[b\ M]b` /abd 0ebh ibj kbn obp qbH   ! ! / / / / S S 
 
 BG@@$(@:>@	@* ,0	

 )
 	

 

(#&N  ##&"%!! :>*.!WW W 	W
 W W W !W  W W W W W W 8W  (!Wv  ##&"%!! :>*.!#<< < 	<
 < < < !<  < < < < < < 8<  (!<" #<B  ##&"%!! :><@*.'JJ J 	J
 J J J J J !J  J J J J J  !J" 8#J$ :%J& ('J( 
7)JZ DH&
*&
3@&
	 &
V  "w$w w 	w
 wx !%$& "&02#&"% #"!!#<@:>*.155j	
%j	
 j	
 "	j	

 j	
 j	
 j	
 j	
  j	
 j	
 .j	
 !j	
  j	
 j	
 j	
  !j	
" #j	
$ %j	
& 'j	
( )j	
* +j	
, -j	
. :/j	
0 81j	
2 (3j	
4 /5j	
6
7j	
^ !%$& "&02#&"% #"!!#<@:>*.155_%_ _ "	_
 _ _ _ _  _ _ ._ !_  _ _ _  !_" #_$ %_& '_( )_* +_, -_. :/_0 81_2 (3_4 /5_6 
S7_H !%$& "&02#&"% #"!!#<@:>*.155Z
Z
 Z
 "	Z

 Z
 Z
 Z
 Z
  Z
 Z
 .Z
 !Z
  Z
 Z
 Z
  !Z
" #Z
$ %Z
& 'Z
( )Z
* +Z
, -Z
. :/Z
0 81Z
2 (3Z
4 /5Z
6 
S7Z
~ =AEI48@D 02"IM$("%#& #!!#:>*.15#'&*;e
4e
 :e
 C	e

 2e
 >e
 e
 e
 e
 e
 e
 e
 .e
 e
 Ge
  "!e
"  #e
$ !%e
& 'e
( )e
* +e
, -e
. /e
0 1e
2 83e
4 (5e
6 /7e
8 !9e
: $;e
<
=e
N B3
j
<A!$%$''&( AC#3#;>#	# #"   
 ,0<@?E<@|
|
  |
 )	|

 :|
 !=|
 :|
 |
 
|
 |
ra   r   c                  ,    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 ddZy)rA  c                X    || _         || _        || _        || _        || _        || _        y rY   )r   r   rU   r<  r=  r+   )r^   r   r   rU   r<  r=  r+   s          r_   r   zLlamaState.__init__8	  s0     # & 0	ra   N)r   r  r   r  rU   r   r<  rB  r=  r   r+   r   )r  r  r  r   r(  ra   r_   rA  rA  7	  sA    ' ' 	
   ra   rA  c                       e Zd Z	 	 	 	 	 	 ddZy)r  c                &    | D ]  } |||      } |S rY   r(  )r^   r   r   	processors       r_   r  zLogitsProcessorList.__call__O	  s$      	2Iy&1F	2ra   Nr  r  r  r  r  r(  ra   r_   r  r  N	  s    -7M	ra   r  c                       e Zd Z	 	 	 	 	 	 ddZy)StoppingCriteriaListc           	     L    t        | D cg c]  } |||       c}      S c c}w rY   )any)r^   r   r  rj  s       r_   r  zStoppingCriteriaList.__call__[	  s'     RVW=N%i8WXXWs   !N)r   r  r  r  r  rx   r  r(  ra   r_   r  r  Z	  s#    Y-Y7MY	Yra   r  c                  (    e Zd ZddZ	 	 	 	 	 	 ddZy)MinTokensLogitsProcessorc                .    || _         || _        d | _        y rY   )
min_tokensr   rv  )r^   r  r   s      r_   r   z!MinTokensLogitsProcessor.__init__b	  s    $"!ra   c                    | j                   t        |      | _         t        |      | j                   z
  | j                  k  rt        j                   || j
                  <   |S rY   )rv  r   r  r   infr   )r^   r   r   s      r_   r  z!MinTokensLogitsProcessor.__call__g	  sP     %!$YDy>D...@&(ffWF4>>"ra   N)r  r   r   r   r  )r  r  r  r   r  r(  ra   r_   r  r  a	  s$    "
-7M	ra   r  )D
__future__r   r   r   r  r  r|  r   r   r  r{  r  rr   r   r   r   r   r   r   r	   r
   r   r   r   r   collectionsr   pathlibr   llama_typesllama_grammarr   llama_cacher   r   r   r   llama_tokenizerr   r   llama_cpp.llama_cppr[   llama_cpp.llama_chat_formatr   llama_cpp.llama_speculativer   numpyr   numpy.typingnptllama_cpp._internals
_internalsr   _loggerr   _utilsr   r   rA  NDArrayr   r   LogitsProcessorr  rx   StoppingCriteriar  r  r(  ra   r_   <module>r     s2   " 	 
                 '  @ ' 7 7   (   *}#
 }#
@H $ [[3;;ryy12CKK		4JJ
$/  S[[13;;ryy3IJDPQ Y4 01 Y ra   