# Importing libraries
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
from llama_cpp import Llama
import gradio as gr
import psutil

# Initializing models
llm = Llama(model_path="./model.bin")                              # Llama model (local GGUF/GGML weights)
llama_model_name = "TheBloke/Llama-2-13B-chat-GGUF"                 # source repo of the weights (informational)
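# ./model.bin is expected to be a local llama.cpp-compatible (GGUF/GGML) file. A hedged
# sketch of fetching one quantization from the repo named above (the filename is an
# assumption; check the repo for the files it actually provides):
#   from huggingface_hub import hf_hub_download
#   hf_hub_download(repo_id=llama_model_name, filename="llama-2-13b-chat.Q4_K_M.gguf", local_dir=".")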
translator_tokenizer = M2M100Tokenizer.from_pretrained(            # tokenizer for translator
    "facebook/m2m100_1.2B", cache_dir="translator/"
)
translator_model = M2M100ForConditionalGeneration.from_pretrained( # translator model
    "facebook/m2m100_1.2B", cache_dir="translator/"
)
print("! SETTING MODEL IN EVALUATION MODE !")
translator_model.eval()
print("! DONE !")

# Preparing the translator source language, page title and description
translator_tokenizer.src_lang = "en"
title = "llama.cpp API"
desc = '''<style>a:visited{color:black;}</style>
<h1>Hello, world!</h1>
This is a showcase of how to build your own server around a Llama 2 model.<br>
I'm using a 7B model here just as an example, and CPU-only inference.<br>
But you can use GPU power as well!<br>
<h1>How to GPU?</h1>
Change <code>CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"</code> in the Dockerfile to <code>CMAKE_ARGS="-DLLAMA_CUBLAS=on"</code>. You can also try <code>-DLLAMA_CLBLAST</code> or <code>-DLLAMA_METAL</code>.<br>
Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a> (which is served by <a href="https://www.uvicorn.org/">Uvicorn</a>).<br>
<h1>How to test it on your own machine?</h1>
You can install Docker, build the image and run it; <code>run-docker.sh</code> does this for you. To stop the container, run <code>docker ps</code>, find the container name and run <code>docker stop _dockerContainerName_</code>.<br>
Or you can follow the steps from the Dockerfile once and run it directly on your machine, without Docker.<br>
<br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
<script>document.write("<b>URL of space:</b> "+window.location.href);</script>'''
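
# A hedged sketch of the GPU switch described above, assuming llama-cpp-python is built
# from source inside the Dockerfile (the real Dockerfile is not shown here, so the exact
# line may differ):
#   Dockerfile, CPU (OpenBLAS) build:
#     RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
#   Dockerfile, GPU (cuBLAS) build:
#     RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python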

# Loading prompt
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()

def generate_answer(request: str, max_tokens: int = 256, language: str = "en", custom_prompt: str = None):
    try:
        # Clamp the token budget to a sane range (Gradio passes numbers in as floats).
        maxTokens = int(max_tokens) if 16 <= max_tokens <= 256 else 64
        # A non-empty custom prompt overrides the system prompt loaded from system.prompt.
        if isinstance(custom_prompt, str) and custom_prompt.strip():
            userPrompt = custom_prompt + "\n\nUser: " + request + "\nAssistant: "
        else:
            userPrompt = prompt + "\n\nUser: " + request + "\nAssistant: "
    except Exception:
        return "Not enough data! Check that you passed all needed data."
    
    try:
        output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False)
        text = output["choices"][0]["text"]
        # Translation is limited to a few popular target languages:
        # Russian (ru), Ukrainian (uk) and Chinese (zh).
        if language in ["ru", "uk", "zh"]:
            encoded_input = translator_tokenizer(text, return_tensors="pt")
            generated_tokens = translator_model.generate(
                **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(language)
            )
            translated_text = translator_tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )[0]
            return translated_text
        return text
    except Exception as e:
        print(e)
        return "Oops! Internal server error. Check the logs of space/instance."

print("! LOAD GRADIO INTERFACE !")
demo = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.components.Textbox(label="Input"),
        gr.components.Number(label="Max tokens", value=256),
        gr.components.Dropdown(label="Target Language", value="en", choices=["en", "ru", "uk", "zh"]),
        gr.components.Textbox(label="Custom system prompt"),
    ],
    outputs=["text"],
    title=title,
    description=desc
).queue()
if __name__ == "__main__":
    print("! LAUNCHING GRADIO !")
    demo.launch()
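
# A minimal sketch of querying the running app from another process, assuming the default
# local address http://127.0.0.1:7860 and Gradio's default "/predict" endpoint for a
# single gr.Interface (inputs are passed positionally: request, max_tokens, language,
# custom system prompt):
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict("Hello!", 64, "en", "", api_name="/predict"))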