Spaces:

mgoin
/

hermes-mistral-7b-vllm

Paused

mgoin committed on Mar 12

Commit

f51b330

•

1 Parent(s): e3233e4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
 import gradio as gr
-import spaces
 import torch
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
@@ -15,18 +16,15 @@ DESCRIPTION = """\
 """
 if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-if torch.cuda.is_available():
-    model_id = "nm-testing/OpenHermes-2.5-Mistral-7B-pruned50"
-    model = LLM(model_id, max_model_len=MAX_INPUT_TOKEN_LENGTH)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
-@spaces.GPU
-def generate(
     message: str,
     chat_history: list[tuple[str, str]],
     system_prompt: str,
@@ -35,7 +33,7 @@ def generate(
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
-) -> str:
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
@@ -53,11 +51,11 @@ def generate(
         repetition_penalty=repetition_penalty,
     )
-    outputs = model.generate(formatted_conversation, sampling_params)
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        return generated_text
 chat_interface = gr.ChatInterface(

 import os
+import uuid
 import gradio as gr
+# import spaces
 import torch
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 """
 if not torch.cuda.is_available():
+    raise ValueError("Running on CPU 🥶 This demo does not work on CPU.")
+model_id = "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+model = LLM(model_id, max_model_len=MAX_INPUT_TOKEN_LENGTH)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.use_default_system_prompt = False
+# @spaces.GPU
+async def generate(
     message: str,
     chat_history: list[tuple[str, str]],
     system_prompt: str,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
+):
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
         repetition_penalty=repetition_penalty,
     )
+    stream = await model.add_request(uuid.uuid4().hex, formatted_conversation, sampling_params)
+    async for request_output in stream:
+        text = request_output.outputs[0].text
+        yield text
 chat_interface = gr.ChatInterface(