print("START: BEFORE IMPORTS")

import time

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

print("START: AFTER IMPORTS")

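# Download the GGUF weights from the Hugging Face Hub and load them with
# llama-cpp-python; any failure is printed so it shows up in the startup logs.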
try:
    print("START: BEFORE MODEL DOWNLOAD")
    start_load_time = time.time()
    model_path = hf_hub_download(
        repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
        filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
    )
    print(f"START: AFTER MODEL DOWNLOAD -- {time.time() - start_load_time}s")
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_gpu_layers=-1,  # change n_gpu_layers if you have more or less VRAM
        verbose=True
    )
    
    print(f"START: AFTER LLAMA-CPP SETUP -- {time.time() - start_load_time}s")

except Exception as e:
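    # Print the error so it appears in the logs; note that `llm` stays undefined
    # if loading fails, so generate_text would raise a NameError when called.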
    print(e)


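# Convert the Gradio chat history into OpenAI-style messages and stream the
# model's reply back to the UI.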
def generate_text(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""
    
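    # Stream chunks from llama.cpp, accumulating the text and yielding the
    # partial response so the chat box updates token by token.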
    for chunk in llm.create_chat_completion(
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
    ):
        part = chunk["choices"][0]["delta"].get("content", None)
        if part:
            response += part
        yield response
        
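
# Gradio chat UI: the textbox sets the system prompt and the sliders control
# the max tokens, temperature and top-p values passed to generate_text.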
demo = gr.ChatInterface(
    generate_text,
    title="llama-cpp-python on GPU",
    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
    examples=[
        ["How to setup a human base on Mars? Give short answer."],
        ["Explain theory of relativity to me like I’m 8 years old."],
        ["What is 9,000 * 9,000?"],
        ["Write a pun-filled happy birthday message to my friend Alex."],
        ["Justify why a penguin might make a good king of the jungle."],
    ],
    cache_examples=False,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


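# Start the Gradio server when the script is executed directly.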
if __name__ == "__main__":
    demo.launch()