xi0v committed on
Commit
d6164db
1 Parent(s): 6e43b71

Update app.py

Files changed (1)
  1. app.py +109 -45
app.py CHANGED
@@ -1,65 +1,129 @@
  #!/usr/bin/env python
  import gradio as gr
  import spaces
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
- import time
- import numpy as np
- from torch.nn import functional as F
- import os
- from threading import Thread
-
- print(f"Starting to load the model to memory")
- m = AutoModelForCausalLM.from_pretrained(
-     "xi0v/aether-7b-chat-v1.0", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, trust_remote_code=False)
- tok = AutoTokenizer.from_pretrained("xi0v/aether-7b-chat-v1.0", trust_remote_code=False)
- # using CUDA for an optimal experience
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- m = m.to(device)
- print(f"Sucessfully loaded the model to the memory")
-
- start_message = "You are a Helpful assistant"
-
- def user(message, history):
-     # Append the user's message to the conversation history
-     return "", history + [[message, ""]]
-
  @spaces.GPU
- def chat(message, history):
-     chat = []
-     for item in history:
-         chat.append({"role": "user", "content": item[0]})
-         if item[1] is not None:
-             chat.append({"role": "assistant", "content": item[1]})
-     chat.append({"role": "user", "content": message})
-     messages = tok.apply_chat_template(chat, tokenize=True, add_generation_prompt=True)
-     # Tokenize the messages string
-     model_inputs = tok([messages], return_tensors="pt").to(device)
-     streamer = TextIteratorStreamer(
-         tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
-         model_inputs,
          streamer=streamer,
-         max_new_tokens=1024,
          do_sample=True,
-         top_p=0.95,
-         top_k=1000,
-         temperature=0.75,
          num_beams=1,
      )
-     t = Thread(target=m.generate, kwargs=generate_kwargs)
      t.start()

-     # Initialize an empty string to store the generated text
-     partial_text = ""
-     for new_text in streamer:
-         # print(new_text)
-         partial_text += new_text
-         # Yield an empty string to cleanup the message textbox and the updated conversation history
-         yield partial_text

- demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Stable LM 2 Zephyr 1.6b")
- demo.launch()
  #!/usr/bin/env python
+
+ import os
+ from threading import Thread
+ from typing import Iterator
+
  import gradio as gr
  import spaces
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

+ DESCRIPTION = "# Aether-7b v1.0"
+
+ if not torch.cuda.is_available():
+     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+
+ MAX_MAX_NEW_TOKENS = 4096
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ if torch.cuda.is_available():
+     model_id = "xi0v/aether-7b-chat-v1.0"
+     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+     tokenizer = AutoTokenizer.from_pretrained(model_id)


  @spaces.GPU
+ def generate(
+     message: str,
+     chat_history: list[tuple[str, str]],
+     system_prompt: str = "",
+     max_new_tokens: int = 1024,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     top_k: int = 50,
+     repetition_penalty: float = 1.0,
+ ) -> Iterator[str]:
+     conversation = []
+     if system_prompt:
+         conversation.append({"role": "system", "content": system_prompt})
+     for user, assistant in chat_history:
+         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt", add_generation_prompt=True)
+     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+     input_ids = input_ids.to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
+         {"input_ids": input_ids},
          streamer=streamer,
+         max_new_tokens=max_new_tokens,
          do_sample=True,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
          num_beams=1,
+         repetition_penalty=repetition_penalty,
      )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
      t.start()

+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)


+ chat_interface = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Textbox(
+             label="System prompt",
+             lines=6,
+             placeholder="You are a friendly chatbot who always responds in the style of a pirate.",
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=MAX_MAX_NEW_TOKENS,
+             step=1,
+             value=DEFAULT_MAX_NEW_TOKENS,
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=4.0,
+             step=0.1,
+             value=0.7,
+         ),
+         gr.Slider(
+             label="Top-p (nucleus sampling)",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.95,
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=1000,
+             step=1,
+             value=50,
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.0,
+         ),
+     ],
+     stop_btn=None,
+ )
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(
+         value="Duplicate Space for private use",
+         elem_id="duplicate-button",
+         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+     )
+     chat_interface.render()

+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch()
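
A minimal usage sketch for the rewritten generate() streaming API, assuming a CUDA machine (the model and tokenizer are only loaded when torch.cuda.is_available() is True) and that app.py is importable from the working directory; the history contents below are illustrative only, not part of the commit.

# Hypothetical smoke test for generate(); not part of app.py.
from app import generate

history = [("hello", "Hi! How can I help you today?")]

# generate() yields progressively longer partial strings as tokens stream in;
# the last value yielded is the complete reply.
reply = ""
for partial in generate(
    message="Summarize our conversation so far.",
    chat_history=history,
    system_prompt="You are a helpful assistant.",
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.0,
):
    reply = partial
print(reply)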