Update app.py
app.py CHANGED
@@ -1,27 +1,86 @@
-
+model_name = "gemma2:27b"
+
+import os
+
+os.system("sudo apt install lshw")
+os.system("curl https://ollama.ai/install.sh | sh")
+
+import nest_asyncio
+nest_asyncio.apply()
+
+import os
+import asyncio
+
+# Run Async Ollama
+# Taken from: https://stackoverflow.com/questions/77697302/how-to-run-ollama-in-google-colab
+# NB: You may need to set these depending and get cuda working depending which backend you are running.
+# Set environment variable for NVIDIA library
+# Set environment variables for CUDA
+os.environ['PATH'] += ':/usr/local/cuda/bin'
+# Set LD_LIBRARY_PATH to include both /usr/lib64-nvidia and CUDA lib directories
+os.environ['LD_LIBRARY_PATH'] = '/usr/lib64-nvidia:/usr/local/cuda/lib64'
+
+async def run_process(cmd):
+    print('>>> starting', *cmd)
+    process = await asyncio.create_subprocess_exec(
+        *cmd,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE
+    )
+
+    # define an async pipe function
+    async def pipe(lines):
+        async for line in lines:
+            print(line.decode().strip())
+
+        await asyncio.gather(
+            pipe(process.stdout),
+            pipe(process.stderr),
+        )
+
+    # call it
+    await asyncio.gather(pipe(process.stdout), pipe(process.stderr))
+
+import asyncio
+import threading
+
+async def start_ollama_serve():
+    await run_process(['ollama', 'serve'])
+
+def run_async_in_thread(loop, coro):
+    asyncio.set_event_loop(loop)
+    loop.run_until_complete(coro)
+    loop.close()
+
+# Create a new event loop that will run in a new thread
+new_loop = asyncio.new_event_loop()
+
+# Start ollama serve in a separate thread so the cell won't block execution
+thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
+thread.start()
+
+# Load up model
+
+os.system(f"ollama pull {model_name}")
+
+
import copy
import gradio as gr
import spaces
-from
-import
-import
-from huggingface_hub import hf_hub_download
+from llama_index.llms.ollama import Ollama
+import llama_index
+from llama_index.core.llms import ChatMessage


HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
-REPO_ID = "bartowski/gemma-2-27b-it-GGUF"
+MODEL_ID_LIST = ["google/gemma-2-27b-it"]
MODEL_NAME = MODEL_ID.split("/")[-1]
-MODEL_FILE = "gemma-2-27b-it-Q4_K_M.gguf"

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

-
-
-
-    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(MODEL_ID),
-    verbose=False,
-)
+
+gemma2 = Ollama(model=model_name, request_timeout=30.0)
+

TITLE = "<h1><center>Chatbox</center></h1>"

@@ -49,31 +108,33 @@ h3 {


@spaces.GPU(duration=90)
-def stream_chat(message: str, history: list, temperature: float,
+def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
-        conversation.extend([
-
+        conversation.extend([
+            ChatMessage(
+                role="user", content=prompt
+            ),
+            ChatMessage(role="assistant", content=answer),
+        ])
+    messages = [ChatMessage(role="user", content=message)]

    print(f"Conversation is -\n{conversation}")
-
-
-    messages
-
+
+    resp = gemma2.stream_chat(
+        message = messages,
+        chat_history = conversation,
        top_p=top_p,
+        top_k=top_k,
        repeat_penalty=penalty,
-
-        stream =True,
-        temperature=temperature,
+        context_window=context_window,
    )
-
-    for out in output:
-        stream = copy.deepcopy(out)
-        temp += stream["choices"][0]["text"]
-        yield temp
+

+    for r in resp:
+        yield r.delta


chatbot = gr.Chatbot(height=600)
@@ -101,7 +162,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
            maximum=2048,
            step=1,
            value=1024,
-            label="
+            label="Context window",
            render=False,
        ),
        gr.Slider(
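The new app.py has to bring up its own Ollama server before the Gradio app can answer anything. As a point of comparison with the asyncio/threading helper in the diff, here is a minimal sketch of the same startup step using subprocess.Popen instead; this is an illustrative alternative, not what the commit ships, and it assumes the install script has already put the ollama binary on PATH.

import subprocess
import time

model_name = "gemma2:27b"  # same tag the diff uses

# Start the Ollama server in the background; it serves requests on
# localhost:11434 for the lifetime of this process.
server = subprocess.Popen(["ollama", "serve"])

# Give the server a moment to come up before issuing commands against it.
time.sleep(5)

# Pre-pull the weights so the first chat request does not block on a download.
subprocess.run(["ollama", "pull", model_name], check=True)

Either way the pull happens once at startup, which is why the diff calls os.system(f"ollama pull {model_name}") right after the server thread starts.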
|
41 |
+
# call it
|
42 |
+
await asyncio.gather(pipe(process.stdout), pipe(process.stderr))
|
43 |
+
|
44 |
+
import asyncio
|
45 |
+
import threading
|
46 |
+
|
47 |
+
async def start_ollama_serve():
|
48 |
+
await run_process(['ollama', 'serve'])
|
49 |
+
|
50 |
+
def run_async_in_thread(loop, coro):
|
51 |
+
asyncio.set_event_loop(loop)
|
52 |
+
loop.run_until_complete(coro)
|
53 |
+
loop.close()
|
54 |
+
|
55 |
+
# Create a new event loop that will run in a new thread
|
56 |
+
new_loop = asyncio.new_event_loop()
|
57 |
+
|
58 |
+
# Start ollama serve in a separate thread so the cell won't block execution
|
59 |
+
thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
|
60 |
+
thread.start()
|
61 |
+
|
62 |
+
# Load up model
|
63 |
+
|
64 |
+
os.system(f"ollama pull {model_name}")
|
65 |
+
|
66 |
+
|
67 |
import copy
|
68 |
import gradio as gr
|
69 |
import spaces
|
70 |
+
from llama_index.llms.ollama import Ollama
|
71 |
+
import llama_index
|
72 |
+
from llama_index.core.llms import ChatMessage
|
|
|
73 |
|
74 |
|
75 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
76 |
+
MODEL_ID_LIST = ["google/gemma-2-27b-it"]
|
|
|
77 |
MODEL_NAME = MODEL_ID.split("/")[-1]
|
|
|
78 |
|
79 |
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
80 |
|
81 |
+
|
82 |
+
gemma2 = Ollama(model=model_name, request_timeout=30.0)
|
83 |
+
|
|
|
|
|
|
|
84 |
|
85 |
TITLE = "<h1><center>Chatbox</center></h1>"
|
86 |
|
|
|
108 |
|
109 |
|
110 |
@spaces.GPU(duration=90)
|
111 |
+
def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
|
112 |
print(f'message is - {message}')
|
113 |
print(f'history is - {history}')
|
114 |
conversation = []
|
115 |
for prompt, answer in history:
|
116 |
+
conversation.extend([
|
117 |
+
ChatMessage(
|
118 |
+
role="user", content=prompt
|
119 |
+
),
|
120 |
+
ChatMessage(role="assistant", content=answer),
|
121 |
+
])
|
122 |
+
messages = [ChatMessage(role="user", content=message)]
|
123 |
|
124 |
print(f"Conversation is -\n{conversation}")
|
125 |
+
|
126 |
+
resp = gemma2.stream_chat(
|
127 |
+
message = messages,
|
128 |
+
chat_history = conversation,
|
129 |
top_p=top_p,
|
130 |
+
top_k=top_k,
|
131 |
repeat_penalty=penalty,
|
132 |
+
context_window=context_window,
|
|
|
|
|
133 |
)
|
134 |
+
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
for r in resp:
|
137 |
+
yield r.delta
|
138 |
|
139 |
|
140 |
chatbot = gr.Chatbot(height=600)
|
|
|
162 |
maximum=2048,
|
163 |
step=1,
|
164 |
value=1024,
|
165 |
+
label="Context window",
|
166 |
render=False,
|
167 |
),
|
168 |
gr.Slider(
|
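On the chat side, llama_index's Ollama client streams ChatResponse chunks whose delta attribute carries the newly generated text, which is what stream_chat in app.py yields back to Gradio. Below is a self-contained sketch of that loop, assuming the server started above is reachable and the gemma2:27b tag has been pulled; it uses the positional stream_chat(messages) call from the standard llama_index chat interface, whereas the diff passes message= and chat_history= keywords as committed.

from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage

# Mirrors the constructor call in the diff.
llm = Ollama(model="gemma2:27b", request_timeout=30.0)

# Prior turns plus the new user message, oldest first (illustrative content).
messages = [
    ChatMessage(role="user", content="Hi there."),
    ChatMessage(role="assistant", content="Hello! How can I help?"),
    ChatMessage(role="user", content="Summarise our conversation so far."),
]

# stream_chat yields ChatResponse objects; each .delta is the latest text chunk.
for chunk in llm.stream_chat(messages):
    print(chunk.delta, end="", flush=True)

In the Gradio handler it is common to accumulate the deltas into a growing string and yield that, since the chatbot widget shows whatever was last yielded as the full reply.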