import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import psutil

## Download the GGUF model
model_name = "jackangel/LLama_3_Instruct_SPFx_Docs_Unsloth"
model_file = "Llama_3_Instruct_SPFx_Docs_Unsloth.Q4_K_M.gguf" # this is the specific model file we'll use in this example. It's a 4-bit quant, but other levels of quantization are available in the model repo if preferred
model_path = hf_hub_download(model_name, filename=model_file)
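## hf_hub_download caches the file locally (under ~/.cache/huggingface/hub by
## default) and returns its path, so repeated launches skip the download.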


## Leave one physical core free for the rest of the system; psutil may return
## None if it cannot determine the core count, so fall back to a single thread.
physical_cores = psutil.cpu_count(logical=False)
cpu_count: int = max(physical_cores - 1, 1) if physical_cores else 1

## Instantiate model from downloaded file
llm = Llama(
    model_path=model_path,
    n_ctx=1024,           # Context length to use
    n_threads=cpu_count,  # Number of CPU threads to use
    n_gpu_layers=0,       # Number of model layers to offload to GPU
)
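## Note: n_gpu_layers=0 keeps inference entirely on the CPU. With a GPU-enabled
## build of llama-cpp-python (CUDA or Metal), n_gpu_layers=-1 offloads all layers.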

## Generation kwargs
generation_kwargs = {
    "max_tokens": 1000,  # Cap on the number of tokens to generate
    "stop": ["</s>"],    # Stop generating at the end-of-sequence token
    "temperature": 0.2,
    "echo": False,       # Don't echo the prompt back in the output
    "top_k": 20,
    "top_p": 0.7,
}
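## Optional smoke test (a minimal sketch, not part of the original app):
## uncomment to confirm the model loads and generates before wiring up the UI.
# result = llm("INSTRUCTION: You are a helpful assistant\nINPUT: Hello\nOUTPUT:",
#              **generation_kwargs)
# print(result["choices"][0]["text"].strip())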


def chatbot(message, history):
    # Wrap the new message in the instruction format the model was tuned on
    prompt = "INSTRUCTION: You are a helpful assistant\nINPUT: " + message + "\nOUTPUT:"
    # Replay earlier turns so the model keeps the conversation context
    airemember = ""
    for human, assistant in history:
        airemember += "USER: " + human + "\nASSISTANT:" + assistant + "\n\n"
    sendtoai = airemember + prompt
    result = llm(sendtoai, **generation_kwargs)
    text = result["choices"][0]["text"].strip()
    return text


app = gr.ChatInterface(chatbot)
app.launch()
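## Optional tweaks, not in the original file: app.launch(share=True) creates a
## temporary public link, and app.launch(server_name="0.0.0.0") binds beyond
## localhost so the app is reachable from other machines on the network.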