Mistral-Nemo

Running on Zero

App Files Files Community

vilarin committed 17 days ago

Commit

6386510

•

1 Parent(s): 2b81f89

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -47

app.py CHANGED Viewed

@@ -2,49 +2,27 @@ import os
 import threading
 import time
 import subprocess
-OLLAMA = os.path.expanduser("~/ollama")
-if not os.path.exists(OLLAMA):
-    subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
-    os.chmod(OLLAMA, 0o755)
-def ollama_service_thread():
-    subprocess.run("~/ollama serve", shell=True)
-OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
-OLLAMA_SERVICE_THREAD.start()
-print("Giving ollama serve a moment")
-time.sleep(10)
-# Modify the model to what you want
-model = "gemma2"
-subprocess.run(f"~/ollama pull {model}", shell=True)
-import copy
 import gradio as gr
-from ollama import Client
-client = Client(host='http://localhost:11434', timeout=120)
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = os.environ.get("MODEL_ID", "google/gemma-2-9b-it")
 MODEL_NAME = MODEL_ID.split("/")[-1]
-TITLE = "<h1><center>ollama-Chat</center></h1>"
 DESCRIPTION = f"""
 <h3>MODEL: <a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a></h3>
 <center>
-<p>Feel free to test models with ollama.
-<br>
-Easy to modify and running models you want.
-</p>
 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -57,6 +35,13 @@ h3 {
 }
 """
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
@@ -70,28 +55,29 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
     print(f"Conversation is -\n{conversation}")
-    response = client.chat(
-        model=model,
-        messages=conversation,
-        stream=True,
-        options={
-            'num_predict': max_new_tokens,
-            'temperature': temperature,
-            'top_p': top_p,
-            'top_k': top_k,
-            'repeat_penalty': penalty,
-            'low_vram': True,
-        },
     )
     buffer = ""
-    for chunk in response:
-        buffer += chunk["message"]["content"]
         yield buffer
-chatbot = gr.Chatbot(height=600)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)

 import threading
 import time
 import subprocess
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+MODEL_ID = os.environ.get("MODEL_ID", None)
 MODEL_NAME = MODEL_ID.split("/")[-1]
+TITLE = "<h1><center>internlm2.5-7b-chat</center></h1>"
 DESCRIPTION = f"""
 <h3>MODEL: <a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a></h3>
+"""
+PLACEHOLDER = """
 <center>
+<p>Feel free to test models <b>without</b> any logs.</p>
 </center>
 """
 CSS = """
 .duplicate-button {
     margin: auto !important;
 }
 """
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    trust_remote_code=True).cuda()
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = model.eval()
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     print(f"Conversation is -\n{conversation}")
+    input_ids = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, **{"skip_special_tokens": True, "skip_prompt": True, 'clean_up_tokenization_spaces':False,})
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id = [2,92542],
     )
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
     buffer = ""
+    for new_text in streamer:
+        buffer += new_text
         yield buffer
+chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)