Update app.py
app.py CHANGED
@@ -1,27 +1,86 @@
-
+model_name = "gemma2:27b"
+
+import os
+
+os.system("sudo apt install lshw")
+os.system("curl https://ollama.ai/install.sh | sh")
+
+import nest_asyncio
+nest_asyncio.apply()
+
+import os
+import asyncio
+
+# Run Async Ollama
+# Taken from: https://stackoverflow.com/questions/77697302/how-to-run-ollama-in-google-colab
+# NB: You may need to set these depending and get cuda working depending which backend you are running.
+# Set environment variable for NVIDIA library
+# Set environment variables for CUDA
+os.environ['PATH'] += ':/usr/local/cuda/bin'
+# Set LD_LIBRARY_PATH to include both /usr/lib64-nvidia and CUDA lib directories
+os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia:/usr/local/cuda/lib64'
+
+async def run_process(cmd):
+    print('>>> starting', *cmd)
+    process = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE
+    )
+
+    # define an async pipe function
+    async def pipe(lines):
+        async for line in lines:
+            print(line.decode().strip())
+
+        await asyncio.gather(
+            pipe(process.stdout),
+            pipe(process.stderr),
+        )
+
+    # call it
+    await asyncio.gather(pipe(process.stdout), pipe(process.stderr))
+
+import asyncio
+import threading
+
+async def start_ollama_serve():
+    await run_process(['ollama', 'serve'])
+
+def run_async_in_thread(loop, coro):
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(coro)
+    loop.close()
+
+# Create a new event loop that will run in a new thread
+new_loop = asyncio.new_event_loop()
+
+# Start ollama serve in a separate thread so the cell won't block execution
+thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
+thread.start()
+
+# Load up model
+
+os.system(f"ollama pull {model_name}")
+
+
import copy
import gradio as gr
import spaces
-from
-import
-import
-from huggingface_hub import hf_hub_download
+from llama_index.llms.ollama import Ollama
+import llama_index
+from llama_index.core.llms import ChatMessage


HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
-REPO_ID = "bartowski/gemma-2-27b-it-GGUF"
+MODEL_ID_LIST = ["google/gemma-2-27b-it"]
MODEL_NAME = MODEL_ID.split("/")[-1]
-MODEL_FILE = "gemma-2-27b-it-Q4_K_M.gguf"

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

-
-
-
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
-    verbose=False,
-)
+
+gemma2 = Ollama(model=model_name, request_timeout=30.0)
+

TITLE = "<h1><center>Chatbox</center></h1>"

@@ -49,31 +108,33 @@ h3 {


@spaces.GPU(duration=90)
-def stream_chat(message: str, history: list, temperature: float,
+def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
-        conversation.extend([
-
+        conversation.extend([
+            ChatMessage(
+                role="user", content=prompt
+            ),
+            ChatMessage(role="assistant", content=answer),
+        ])
+    messages = [ChatMessage(role="user", content=message)]

    print(f"Conversation is -\n{conversation}")
-
-
-    messages
-
+
+    resp = gemma2.stream_chat(
+        message = messages,
+        chat_history = conversation,
        top_p=top_p,
+        top_k=top_k,
        repeat_penalty=penalty,
-
-        stream =True,
-        temperature=temperature,
+        context_window=context_window,
    )
-
-    for out in output:
-        stream = copy.deepcopy(out)
-        temp += stream["choices"][0]["text"]
-        yield temp
+

+    for r in resp:
+        yield r.delta


chatbot = gr.Chatbot(height=600)
@@ -101,7 +162,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
            maximum=2048,
            step=1,
            value=1024,
-            label="
+            label="Context window",
            render=False,
        ),
        gr.Slider(
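The new app.py has to bring up its own Ollama server before the Gradio app can answer anything. As a point of comparison with the asyncio/threading helper in the diff, here is a minimal sketch of the same startup step using subprocess.Popen instead; this is an illustrative alternative, not what the commit ships, and it assumes the install script has already put the ollama binary on PATH.

import subprocess
import time

model_name = "gemma2:27b"  # same tag the diff uses

# Start the Ollama server in the background; it serves requests on
# localhost:11434 for the lifetime of this process.
server = subprocess.Popen(["ollama", "serve"])

# Give the server a moment to come up before issuing commands against it.
time.sleep(5)

# Pre-pull the weights so the first chat request does not block on a download.
subprocess.run(["ollama", "pull", model_name], check=True)

Either way the pull happens once at startup, which is why the diff calls os.system(f"ollama pull {model_name}") right after the server thread starts.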
|
41 |
+
# call it
|
42 |
+
await asyncio.gather(pipe(process.stdout), pipe(process.stderr))
|
43 |
+
|
44 |
+
import asyncio
|
45 |
+
import threading
|
46 |
+
|
47 |
+
async def start_ollama_serve():
|
48 |
+
await run_process(['ollama', 'serve'])
|
49 |
+
|
50 |
+
def run_async_in_thread(loop, coro):
|
51 |
+
asyncio.set_event_loop(loop)
|
52 |
+
loop.run_until_complete(coro)
|
53 |
+
loop.close()
|
54 |
+
|
55 |
+
# Create a new event loop that will run in a new thread
|
56 |
+
new_loop = asyncio.new_event_loop()
|
57 |
+
|
58 |
+
# Start ollama serve in a separate thread so the cell won't block execution
|
59 |
+
thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
|
60 |
+
thread.start()
|
61 |
+
|
62 |
+
# Load up model
|
63 |
+
|
64 |
+
os.system(f"ollama pull {model_name}")
|
65 |
+
|
66 |
+
|
67 |
import copy
|
68 |
import gradio as gr
|
69 |
import spaces
|
70 |
+
from llama_index.llms.ollama import Ollama
|
71 |
+
import llama_index
|
72 |
+
from llama_index.core.llms import ChatMessage
|
|
|
73 |
|
74 |
|
75 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
76 |
+
MODEL_ID_LIST = ["google/gemma-2-27b-it"]
|
|
|
77 |
MODEL_NAME = MODEL_ID.split("/")[-1]
|
|
|
78 |
|
79 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
80 |
|
81 |
+
|
82 |
+
gemma2 = Ollama(model=model_name, request_timeout=30.0)
|
83 |
+
|
|
|
|
|
|
|
84 |
|
85 |
TITLE = "<h1><center>Chatbox</center></h1>"
|
86 |
|
|
|
108 |
|
109 |
|
110 |
@spaces.GPU(duration=90)
|
111 |
+
def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
|
112 |
print(f'message is - {message}')
|
113 |
print(f'history is - {history}')
|
114 |
conversation = []
|
115 |
for prompt, answer in history:
|
116 |
+
conversation.extend([
|
117 |
+
ChatMessage(
|
118 |
+
role="user", content=prompt
|
119 |
+
),
|
120 |
+
ChatMessage(role="assistant", content=answer),
|
121 |
+
])
|
122 |
+
messages = [ChatMessage(role="user", content=message)]
|
123 |
|
124 |
print(f"Conversation is -\n{conversation}")
|
125 |
+
|
126 |
+
resp = gemma2.stream_chat(
|
127 |
+
message = messages,
|
128 |
+
chat_history = conversation,
|
129 |
top_p=top_p,
|
130 |
+
top_k=top_k,
|
131 |
repeat_penalty=penalty,
|
132 |
+
context_window=context_window,
|
|
|
|
|
133 |
)
|
134 |
+
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
for r in resp:
|
137 |
+
yield r.delta
|
138 |
|
139 |
|
140 |
chatbot = gr.Chatbot(height=600)
|
|
|
162 |
maximum=2048,
|
163 |
step=1,
|
164 |
value=1024,
|
165 |
+
label="Context window",
|
166 |
render=False,
|
167 |
),
|
168 |
gr.Slider(
|
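On the chat side, llama_index's Ollama client streams ChatResponse chunks whose delta attribute carries the newly generated text, which is what stream_chat in app.py yields back to Gradio. Below is a self-contained sketch of that loop, assuming the server started above is reachable and the gemma2:27b tag has been pulled; it uses the positional stream_chat(messages) call from the standard llama_index chat interface, whereas the diff passes message= and chat_history= keywords as committed.

from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage

# Mirrors the constructor call in the diff.
llm = Ollama(model="gemma2:27b", request_timeout=30.0)

# Prior turns plus the new user message, oldest first (illustrative content).
messages = [
    ChatMessage(role="user", content="Hi there."),
    ChatMessage(role="assistant", content="Hello! How can I help?"),
    ChatMessage(role="user", content="Summarise our conversation so far."),
]

# stream_chat yields ChatResponse objects; each .delta is the latest text chunk.
for chunk in llm.stream_chat(messages):
    print(chunk.delta, end="", flush=True)

In the Gradio handler it is common to accumulate the deltas into a growing string and yield that, since the chatbot widget shows whatever was last yielded as the full reply.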