# Importing libraries
from llama_cpp import Llama
from time import time
import gradio as gr
import psutil

# Initializing the model
print("! INITIALIZING LLAMA MODEL !")
llm = Llama(model_path="./model.bin")                              # llama.cpp model in GGUF format
llama_model_name = "TheBloke/openchat_3.5-GGUF"
print("! INITIALIZING DONE !")

# Preparing things to work
title = "llama.cpp API"
desc = '''<h1>Hello, world!</h1>
This is showcase how to make own server with Llama2 model.<br>
I'm using here 7b model just for example. Also here's only CPU power.<br>
But you can use GPU power as well!<br><br>
<h1>How to GPU?</h1>
Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS`</code> in Dockerfile on <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. Also you can try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br><br>
<h1>How to test it on own machine?</h1>
You can install Docker, build image and run it. I made <code>`run-docker.sh`</code> for ya. To stop container run <code>`docker ps`</code>, find name of container and run <code>`docker stop _dockerContainerName_`</code><br>
Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.<br>
<br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br><br>'''

# Loading the prompt template and the default system message
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
with open('system.message', 'r', encoding='utf-8') as f:
    system_message = f.read()
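# system.prompt is expected to contain the {prompt} and {system_message} placeholders
# that generate_answer() substitutes below; the actual template text lives in those files.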

def generate_answer(request: str, max_tokens: int = 256, custom_prompt: str = None):
    t0 = time()
    logs = f"Request: {request}\nMax tokens: {max_tokens}\nCustom prompt: {custom_prompt}\n"
    try:
        # Clamp the token budget to the allowed range; fall back to 64 if it is out of bounds
        maxTokens = int(max_tokens) if 16 <= max_tokens <= 256 else 64
        userPrompt = prompt.replace("{prompt}", request)
        # Use the custom system prompt when a non-empty one is provided, otherwise the default
        userPrompt = userPrompt.replace(
            "{system_message}",
            custom_prompt if isinstance(custom_prompt, str) and len(custom_prompt.strip()) > 1 else system_message
        )
        logs += f"\nFinal prompt: {userPrompt}\n"
    except Exception:
        return "Not enough data! Check that you passed all needed data.", logs
    
    try:
        # Workaround: the model occasionally returns an empty completion, so retry up to 3 times
        counter = 1
        while counter <= 3:
            logs += f"Attempt {counter} to generate answer...\n"
            output = llm(userPrompt, max_tokens=maxTokens, stop=["<|im_end|>", "<|end_of_turn|>"], echo=False)
            text = output["choices"][0]["text"]
            if len(text.strip()) > 1:
                break
            counter += 1
        logs += f"Final attempt: {counter}\n"
        if len(text.strip()) <= 1:
            logs += f"Generated and aborted: {text}"
            text = "Sorry, but something went wrong while generating the answer. Try again or fix the code. If you are the maintainer of this space, check the logs."
        
        logs += f"\nFinal: '''{text}'''"
        logs += f"\n\nTime spent: {time()-t0}"
        return text, logs
    except Exception as e:
        logs += str(e)
        logs += f"\n\nTime spent: {time()-t0}"
        return "Oops! Internal server error. Check the logs of space/instance.", logs

print("! LOAD GRADIO INTERFACE !")
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(label="Max tokens", value=256),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=[
        gr.components.Textbox(label="Output"),
        gr.components.Textbox(label="Logs")
    ],
    title=title,
    description=desc,
    allow_flagging='never'
)
demo.queue()
print("! LAUNCHING GRADIO !")
demo.launch(server_name="0.0.0.0")
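
# server_name="0.0.0.0" exposes the app on all interfaces; Gradio listens on port 7860 by default.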