toaster61 committed on
Commit 021692e
1 Parent(s): 1391fc1

first real gradio commit

Files changed (3):
  1. Dockerfile +1 -1
  2. gradio_app.py +117 -0
  3. app.py → quart_app.py +1 -1
Dockerfile CHANGED
@@ -29,4 +29,4 @@ RUN python3 -m pip install -U --no-cache-dir pip setuptools wheel
  RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt

  # Now it's time to run Quart app using uvicorn! (It's faster, trust me.)
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ CMD ["python", "gradio_app.py"]
gradio_app.py ADDED
@@ -0,0 +1,117 @@
+ # Importing libraries
+ from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
+ from llama_cpp import Llama
+ import gradio as gr
+ import psutil
+
+ # Initializing things
+ llm = Llama(model_path="./model.bin")  # LLaMA model
+ llama_model_name = "TheBloke/Llama-2-13B-chat-GGUF"  # model repo name (not used directly below)
+ translator_tokenizer = M2M100Tokenizer.from_pretrained(  # tokenizer for the translator
+     "facebook/m2m100_1.2B", cache_dir="translator/"
+ )
+ translator_model = M2M100ForConditionalGeneration.from_pretrained(  # translator model
+     "facebook/m2m100_1.2B", cache_dir="translator/"
+ )
+ translator_model.eval()
+
+ # Preparing things to work
+ translator_tokenizer.src_lang = "en"
+ title = "llama.cpp API"
+ desc = '''<style>a:visited{color:black;}</style>
+ <h1>Hello, world!</h1>
+ This is a showcase of how to make your own server with a Llama 2 model.<br>
+ I'm using a 7B model here just as an example, and only CPU power is used.<br>
+ But you can use GPU power as well!<br>
+ <h1>How to GPU?</h1>
+ Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`</code> in the Dockerfile to <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. You can also try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br>
+ Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br>
+ <h1>How to test it on your own machine?</h1>
+ You can install Docker, build the image and run it. I made <code>`run-docker.sh`</code> for you. To stop the container, run <code>`docker ps`</code>, find the container's name and run <code>`docker stop _dockerContainerName_`</code>.<br>
+ Or you can follow the steps from the Dockerfile once and try it on your machine directly, without Docker.<br>
+ <br>''' + f"Memory used: {psutil.virtual_memory()[2]}%<br>" + '''
+ <script>document.write("<b>URL of space:</b> "+window.location.href);</script>'''
+
+ # Loading prompt
+ with open('system.prompt', 'r', encoding='utf-8') as f:
+     prompt = f.read()
+
+ # This NLLB model (from https://hf.co/models) is never used below, and the imports it needs
+ # (AutoModelForSeq2SeqLM, AutoTokenizer, torch) are missing, so it is kept commented out.
+ # model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+ # tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+ # device = 0 if torch.cuda.is_available() else -1
+ # LANGS = ["ace_Arab", "eng_Latn", "fra_Latn", "spa_Latn"]
+
+ # Dict-based handler in the style of the Quart API version; not wired into the Gradio interface below.
+ def t1ranslate(data: dict):
+     try:
+         maxTokens = data.get("max_tokens", 64)
+         if isinstance(data.get("system_prompt"), str):
+             userPrompt = data.get("system_prompt") + "\n\nUser: " + data['request'] + "\nAssistant: "
+         else:
+             userPrompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
+     except Exception:
+         return {"error": "Not enough data", "output": "Oops! An error occurred! If you're a developer using this API, check the 'error' key."}, 400
+     try:
+         output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False)
+         text = output["choices"][0]["text"]
+         # Only certain target languages are allowed:
+         # Russian (ru), Ukrainian (uk), Chinese (zh)
+         if isinstance(data.get("target_lang"), str) and data.get("target_lang").lower() in ["ru", "uk", "zh"]:
+             encoded_input = translator_tokenizer(text, return_tensors="pt")  # translate the generated text, not the raw output dict
+             generated_tokens = translator_model.generate(
+                 **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(data.get("target_lang"))
+             )
+             translated_text = translator_tokenizer.batch_decode(
+                 generated_tokens, skip_special_tokens=True
+             )[0]
+             return {"output": text, "translated_output": translated_text}
+
+         return {"output": text}
+     except Exception as e:
+         print(e)
+         return {"error": str(e), "output": "Oops! Internal server error. Check the logs. If you're a developer using this API, check the 'error' key."}, 500
+
+ def translate(request: str, max_tokens: int = 256, language: str = "en", custom_prompt: str = None):
+     try:
+         maxTokens = int(max_tokens) if 16 <= max_tokens <= 256 else 64  # gr.Number passes a float, so cast it
+         if isinstance(custom_prompt, str):
+             userPrompt = custom_prompt + "\n\nUser: " + request + "\nAssistant: "
+         else:
+             userPrompt = prompt + "\n\nUser: " + request + "\nAssistant: "
+     except Exception:
+         return "Not enough data! Check that you passed all needed data."
+
+     try:
+         output = llm(userPrompt, max_tokens=maxTokens, stop=["User:", "\n"], echo=False)
+         text = output["choices"][0]["text"]
+         # Only certain target languages are allowed (it's not discrimination, these are just the other most popular languages, in my opinion!):
+         # Russian (ru), Ukrainian (uk), Chinese (zh)
+         if language in ["ru", "uk", "zh"]:
+             encoded_input = translator_tokenizer(text, return_tensors="pt")  # translate the generated text, not the raw output dict
+             generated_tokens = translator_model.generate(
+                 **encoded_input, forced_bos_token_id=translator_tokenizer.get_lang_id(language)
+             )
+             translated_text = translator_tokenizer.batch_decode(
+                 generated_tokens, skip_special_tokens=True
+             )[0]
+             return translated_text
+         return text
+     except Exception as e:
+         print(e)
+         return "Oops! Internal server error. Check the logs of the Space/instance."
+
+
+ demo = gr.Interface(
+     fn=translate,
+     inputs=[
+         gr.components.Textbox(label="Input"),
+         gr.components.Number(label="Max tokens", value=256),
+         gr.components.Dropdown(label="Target Language", value="en", choices=["en", "ru", "uk", "zh"]),
+         gr.components.Textbox(label="Custom system prompt"),
+     ],
+     outputs=["text"],
+     title=title,
+     description=desc
+ )
+ demo.queue()
+ demo.launch(server_name="0.0.0.0", server_port=7860)  # bind to all interfaces so the Dockerized Space is reachable on port 7860
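Once this Gradio app is running (locally via `python gradio_app.py` or inside the Docker Space), the `translate` function can also be called programmatically. A minimal sketch, assuming the app is reachable at localhost:7860 and that the default `gr.Interface` endpoint name `/predict` applies; `gradio_client` is a separate package and is not part of this commit:

# Hypothetical client-side sketch; requires `pip install gradio-client`.
from gradio_client import Client

client = Client("http://localhost:7860")  # assumed local URL of the running app
result = client.predict(
    "Hello! Who are you?",   # Input textbox
    64,                      # max tokens (clamped to 16..256 by translate())
    "ru",                    # target language dropdown
    "",                      # custom system prompt (a string here, so it replaces system.prompt)
    api_name="/predict",     # default endpoint name for a gr.Interface
)
print(result)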
app.py → quart_app.py RENAMED
@@ -68,5 +68,5 @@ Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-pytho
  <h1>How to test it on own machine?</h1>
  You can install Docker, build image and run it. I made <code>`run-docker.sh`</code> for ya. To stop container run <code>`docker ps`</code>, find name of container and run <code>`docker stop _dockerContainerName_`</code><br>
  Or you can once follow steps in Dockerfile and try it on your machine, not in Docker.<br>
- <br>''' + f"Memory free: {psutil.virtual_memory()[2]}" + '''
+ <br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
  <script>document.write("<b>URL of space:</b> "+window.location.href);</script>'''
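The label change above ("Memory free" to "Memory used") matches what index 2 of psutil.virtual_memory() actually reports: the percentage of RAM currently in use. A quick sketch to confirm, using nothing repo-specific:

import psutil

vm = psutil.virtual_memory()
print(vm.percent)            # e.g. 41.3 -- percentage of RAM in use
print(vm[2] == vm.percent)   # True: index 2 of the named tuple is the 'percent' field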