vilarin committed
Commit 00adabe
1 Parent(s): 5c3a975

Update app.py

Files changed (1)
  1. app.py +70 -128
app.py CHANGED
@@ -1,59 +1,24 @@
 import os
-import signal
-import threading
 import time
-import subprocess
-import asyncio
-
-OLLAMA = os.path.expanduser("~/ollama")
-process = None
-OLLAMA_SERVICE_THREAD = None
-
-if not os.path.exists(OLLAMA):
-    subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
-    os.chmod(OLLAMA, 0o755)
-
-def ollama_service_thread():
-    global process
-    process = subprocess.Popen("~/ollama serve", shell=True, preexec_fn=os.setsid)
-    process.wait()
-
-def terminate():
-    global process, OLLAMA_SERVICE_THREAD
-    if process:
-        os.killpg(os.getpgid(process.pid), signal.SIGTERM)
-    if OLLAMA_SERVICE_THREAD:
-        OLLAMA_SERVICE_THREAD.join()
-    process = None
-    OLLAMA_SERVICE_THREAD = None
-    print("Ollama service stopped.")
-
-# Uncomment and modify the model to what you want locally
-# model = "moondream"
-# model = os.environ.get("MODEL")
-
-# subprocess.run(f"~/ollama pull {model}", shell=True)
-
-import ollama
+import spaces
+import torch
+from transformers import OlmoeForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
-from ollama import AsyncClient
-client = AsyncClient(host='http://localhost:11434', timeout=120)
+from threading import Thread
 
+MODEL_LIST = ["allenai/OLMoE-1B-7B-0924-Instruct"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+MODEL = os.environ.get("MODEL_ID")
 
-TITLE = "<h1><center>ollama-Chat</center></h1>"
+TITLE = "<h1><center>OLMoE</center></h1>"
 
-DESCRIPTION = f"""
+PLACEHOLDER = """
 <center>
-<p>Feel free to test models with ollama.
-<br>
-First run please type <em>/init</em> to launch process.
-<br>
-Type <em>/pull model_name</em> to pull model.
-</p>
+<p>Fully open, state-of-the-art Mixture of Expert model with 1.3 billion active and 6.9 billion total parameters.</p>
 </center>
 """
 
+
 CSS = """
 .duplicate-button {
     margin: auto !important;
@@ -65,86 +30,68 @@ h3 {
     text-align: center;
 }
 """
-INIT_SIGN = ""
-
-def init():
-    global OLLAMA_SERVICE_THREAD
-    OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
-    OLLAMA_SERVICE_THREAD.start()
-    print("Giving ollama serve a moment")
-    time.sleep(10)
-    global INIT_SIGN
-    INIT_SIGN = "FINISHED"
-
-def ollama_func(command):
-    if " " in command:
-        c1, c2 = command.split(" ")
-    else:
-        c1 = command
-        c2 = ""
-    function_map = {
-        "/init": init,
-        "/pull": lambda: ollama.pull(c2),
-        "/list": ollama.list,
-        "/bye": terminate,
-    }
-    if c1 in function_map:
-        function_map.get(c1)()
-        return "Running..."
-    else:
-        return "No supported command."
-
-def launch():
-    global OLLAMA_SERVICE_THREAD
-    OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
-    OLLAMA_SERVICE_THREAD.start()
-
-
-async def stream_chat(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
-    print(f"message: {message}")
+
+device = "cuda" # for GPU usage or "cpu" for CPU usage
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+model = OlmoeForCausalLM.from_pretrained(
+    MODEL,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    ignore_mismatched_sizes=True)
+
+@spaces.GPU()
+def stream_chat(
+    message: str,
+    history: list,
+    temperature: float = 0.3,
+    max_new_tokens: int = 1024,
+    top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.2,
+):
+    print(f'message: {message}')
+    print(f'history: {history}')
+
     conversation = []
     for prompt, answer in history:
         conversation.extend([
             {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
         ])
+
     conversation.append({"role": "user", "content": message})
-
-    print(f"Conversation is -\n{conversation}")
+
+    input_text=tokenizer.apply_chat_template(conversation, tokenize=False)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
 
-    if message.startswith("/"):
-        resp = ollama_func(message)
-        yield resp
-    else:
-        if not INIT_SIGN:
-            yield "Please initialize Ollama"
-        else:
-            if not process:
-                launch()
-                print("Giving ollama serve a moment")
-                time.sleep(10)
-
-            buffer = ""
-            async for part in await client.chat(
-                model=model,
-                stream=True,
-                messages=conversation,
-                keep_alive="60s",
-                options={
-                    'num_predict': max_new_tokens,
-                    'temperature': temperature,
-                    'top_p': top_p,
-                    'top_k': top_k,
-                    'repeat_penalty': penalty,
-                    'low_vram': True,
-                },
-            ):
-                buffer += part['message']['content']
-                yield buffer
-
-chatbot = gr.Chatbot(height=600, placeholder=DESCRIPTION)
-
-with gr.Blocks(css=CSS, theme="soft") as demo:
+    generate_kwargs = dict(
+        input_ids=inputs,
+        max_new_tokens = max_new_tokens,
+        do_sample = False if temperature == 0 else True,
+        top_p = top_p,
+        top_k = top_k,
+        temperature = temperature,
+        streamer=streamer,
+        repetition_penalty=penalty,
+        pad_token_id = 1,
+        eos_token_id = 50279,
+    )
+
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+
+
+chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+
+with gr.Blocks(css=CSS, theme="Nymbo/Nymbo_Theme") as demo:
     gr.HTML(TITLE)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.ChatInterface(
@@ -153,32 +100,27 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
-            gr.Textbox(
-                value="qwen2:0.5b",
-                label="Model",
-                render=False,
-            ),
             gr.Slider(
                 minimum=0,
                 maximum=1,
                 step=0.1,
-                value=0.8,
+                value=0.3,
                 label="Temperature",
                 render=False,
             ),
             gr.Slider(
                 minimum=128,
-                maximum=2048,
+                maximum=8192,
                 step=1,
                 value=1024,
-                label="Max New Tokens",
+                label="Max new tokens",
                 render=False,
             ),
             gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
                 step=0.1,
-                value=0.8,
+                value=1.0,
                 label="top_p",
                 render=False,
             ),
@@ -194,7 +136,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 minimum=0.0,
                 maximum=2.0,
                 step=0.1,
-                value=1.0,
+                value=1.2,
                 label="Repetition penalty",
                 render=False,
             ),
@@ -210,4 +152,4 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
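
For quick verification, here is a minimal sketch (not part of the commit) of driving the new stream_chat generator outside the Gradio UI. It assumes app.py is importable, the spaces package is installed, a CUDA device is available, and MODEL_ID points at a model such as the one listed in MODEL_LIST:

# Hypothetical smoke test; not part of app.py.
import os
os.environ.setdefault("MODEL_ID", "allenai/OLMoE-1B-7B-0924-Instruct")  # assumed model id

from app import stream_chat  # importing app loads the tokenizer and model

last = ""
for partial in stream_chat("Explain mixture-of-experts in one sentence.", history=[]):
    last = partial  # each yield is the full text generated so far
print(last)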