macadeliccc committed
Commit f7d8c6a
Parent: ce3745c

Update app.py

Files changed (1):
1. app.py +75 -51
app.py CHANGED
@@ -1,58 +1,82 @@
  import spaces
  import gradio as gr
  import torch
- from gradio import State
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Select the device (GPU if available, else CPU)
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load the tokenizer and model
- tokenizer = AutoTokenizer.from_pretrained("berkeley-nest/Starling-LM-7B-alpha")
- model = AutoModelForCausalLM.from_pretrained("berkeley-nest/Starling-LM-7B-alpha").to(device)
- model.eval()  # Set the model to evaluation mode

  @spaces.GPU
- def generate_response(user_input, chat_history):
-     try:
-         prompt = "GPT4 Correct User: " + user_input + "GPT4 Correct Assistant: "
-         if chat_history:
-             prompt = chat_history[-1024:] + prompt  # Keep last 1024 tokens of history
-
-         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
-         inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to the same device as the model
-
-         with torch.no_grad():
-             output = model.generate(**inputs, max_length=512, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
-
-         response = tokenizer.decode(output[0], skip_special_tokens=True)
-         new_history = chat_history + prompt + response
-         return response, new_history[-1024:]  # Return last 1024 tokens of history
-
-     except Exception as e:
-         return f"Error occurred: {e}", chat_history
-
- # Gradio Interface
- def clear_chat():
-     return "", ""
-
- with gr.Blocks(gr.themes.Soft()) as app:
-     with gr.Row():
-         gr.Markdown("## Starling Chatbot")
-         gr.Markdown("Run with your own hardware. This application exceeds 24GB VRAM")
-         gr.Markdown("```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/macadeliccc-starling-lm-7b-alpha-chat:latest python app.py```")
-     with gr.Row():
-         chatbot = gr.Chatbot()
-
-     with gr.Row():
-         user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
-         send = gr.Button("Send")
-         clear = gr.Button("Clear")
-
-     chat_history = gr.State()  # Holds the chat history
-
-     send.click(generate_response, inputs=[user_input, chat_history], outputs=[chatbot, chat_history])
-     clear.click(clear_chat, outputs=[chatbot, chat_history])
-
- app.launch()
  import spaces
  import gradio as gr
  import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
+ from threading import Thread
+
+ # The model is loaded lazily, inside the request, to meet Hugging Face stateless GPU requirements.
+
+ # Custom stopping criteria for the model's text generation.
+ class StopOnTokens(StoppingCriteria):
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         stop_ids = [50256, 50295]  # Token IDs where generation should stop (see the note after the diff on verifying these).
+         for stop_id in stop_ids:
+             if input_ids[0][-1] == stop_id:  # Stop if the last generated token is a stop token.
+                 return True
+         return False
+
+ # Generate model predictions, streaming tokens as they are produced.
  @spaces.GPU
+ def predict(message, history):
+     torch.set_default_device("cuda")
+
+     # Load the tokenizer and model from the Hugging Face Hub on each call.
+     tokenizer = AutoTokenizer.from_pretrained(
+         "macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
+         trust_remote_code=True
+     )
+     model = AutoModelForCausalLM.from_pretrained(
+         "macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
+         torch_dtype="auto",
+         load_in_4bit=True,  # requires bitsandbytes; see the quantization sketch after the diff
+         trust_remote_code=True
+     )
+     history_transformer_format = history + [[message, ""]]
+     stop = StopOnTokens()
+
+     # Format the conversation as a ChatML prompt (an unpacked equivalent follows the diff).
+     system_prompt = "<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>"
+     messages = system_prompt + "".join(["".join(["\n<|im_start|>user\n" + item[0], "<|im_end|>\n<|im_start|>assistant\n" + item[1]]) for item in history_transformer_format])
+     # The tokenizer returns a dict-like BatchEncoding (input_ids, attention_mask),
+     # so dict(model_inputs, ...) below merges it with the generation kwargs.
+     model_inputs = tokenizer([messages], return_tensors="pt").to('cuda')
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=1024,
+         do_sample=True,
+         top_p=0.95,
+         top_k=50,
+         temperature=0.7,
+         num_beams=1,
+         stopping_criteria=StoppingCriteriaList([stop])
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()  # Start generation in a separate thread so tokens can be streamed as they arrive.
+     partial_message = ""
+     for new_token in streamer:
+         partial_message += new_token
+         if '<|im_end|>' in partial_message:  # Stop streaming once the end-of-turn marker appears.
+             break
+         yield partial_message
+
+
+ # Set up the Gradio chat interface.
+ gr.ChatInterface(predict,
+     description="""
+     <center><img src="https://huggingface.co/macadeliccc/laser-dolphin-mixtral-2x7b-dpo/resolve/main/dolphin_moe.png" width="33%"></center>\n\n
+     Chat with [macadeliccc/laser-dolphin-mixtral-2x7b-dpo](https://huggingface.co/macadeliccc/laser-dolphin-mixtral-2x7b-dpo), a Mixture of Experts built by merging two fine-tuned dolphin Mistral-7B models. Output is considered experimental.\n\n
+     ❤️ If you like this work, please follow me on [Hugging Face](https://huggingface.co/macadeliccc) and [LinkedIn](https://www.linkedin.com/in/tim-dolan-python-dev/).
+     """,
+     examples=[
+         'Can you solve the equation 2x + 3 = 11 for x?',
+         "How does Fermat's Last Theorem impact number theory?",
+         'What is a vector in the scope of computer science rather than physics?',
+         'Use a list comprehension to create a list of squares for numbers from 1 to 10.',
+         'Recommend some popular science fiction books.',
+         'Can you write a short story about a time-traveling detective?'
+     ],
+     theme=gr.themes.Soft(primary_hue="purple"),
+ ).launch()
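
Note on the hard-coded `stop_ids`: 50256 is GPT-2's `<|endoftext|>` ID, and a Mistral-based tokenizer has a much smaller vocabulary, so these values likely never match for this model. A safer approach is to look the IDs up from the tokenizer itself; a minimal check, assuming the tokenizer defines `<|im_end|>` as a token:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo", trust_remote_code=True
)
# Resolve the ChatML end-of-turn marker to its actual ID in this vocabulary;
# this is the value StopOnTokens should compare against.
im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
print(im_end_id, tokenizer.eos_token_id)
```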
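On quantization: passing `load_in_4bit=True` directly to `from_pretrained` requires `bitsandbytes` to be installed, and recent transformers releases prefer an explicit `BitsAndBytesConfig`. A sketch of the equivalent load (the compute dtype here is an assumption, not something the commit specifies):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # assumed; choose to match your GPU
)
model = AutoModelForCausalLM.from_pretrained(
    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo",
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
```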
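The one-line ChatML prompt assembly in `predict` is equivalent to the unpacked helper below (a readability sketch only; `build_chatml_prompt` is not part of the commit):

```python
def build_chatml_prompt(history, message, system="You are Dolphin, a helpful AI assistant."):
    # history is a list of [user, assistant] pairs; the current turn gets an
    # empty assistant slot so the prompt ends awaiting the assistant's reply.
    turns = history + [[message, ""]]
    prompt = "<|im_start|>system\n" + system + "<|im_end|>"
    for user_msg, assistant_msg in turns:
        prompt += "\n<|im_start|>user\n" + user_msg
        prompt += "<|im_end|>\n<|im_start|>assistant\n" + assistant_msg
    return prompt
```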