nisten committed on
Commit 9ca55ad
1 Parent(s): be3574c

Update app.py

Files changed (1)
  1. app.py +17 -12
app.py CHANGED
@@ -4,16 +4,23 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import subprocess
 import sys
+import os
 
-# Force install the latest transformers version and flash attention
-subprocess.check_call([sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "transformers", "flash-attn"])
+# Force upgrade transformers to the latest version
+subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "transformers"])
 
 model_name = "allenai/OLMoE-1B-7B-0924"
 
 # Wrap model loading in a try-except block to handle potential errors
 try:
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").to(DEVICE)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        trust_remote_code=True,
+        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+        low_cpu_mem_usage=True,
+        device_map="auto"
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
@@ -32,20 +39,18 @@ def generate_response(message, history, temperature, max_new_tokens):
 
     full_prompt = f"{system_prompt}\n\nHuman: {message}\n\nAssistant:"
 
-    inputs = tokenizer(full_prompt, return_tensors="pt")
-    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+    inputs = tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
 
     with torch.no_grad():
         generate_ids = model.generate(
             **inputs,
-            max_length=inputs['input_ids'].shape[1] + max_new_tokens,
+            max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=temperature,
+            eos_token_id=tokenizer.eos_token_id,
         )
-    response = tokenizer.decode(generate_ids[0], skip_special_tokens=True)
-    # Extract only the assistant's response
-    assistant_response = response.split("Assistant:")[-1].strip()
-    return assistant_response
+    response = tokenizer.decode(generate_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+    return response.strip()
 
 css = """
 #output {
@@ -56,9 +61,9 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Nisten's Karpathy Chatbot with OSS olMoE")
+    gr.Markdown("# Nisten's Karpathy Chatbot with OSS OLMoE")
     chatbot = gr.Chatbot(elem_id="output")
-    msg = gr.Textbox(label="Your prompt")
+    msg = gr.Textbox(label="Your message")
     with gr.Row():
         temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
         max_new_tokens = gr.Slider(minimum=50, maximum=4000, value=1000, step=50, label="Max New Tokens")
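For reference, the new loading path from the first hunk can be exercised on its own. A minimal sketch, assuming transformers and accelerate are installed; the device/dtype check at the end is illustrative and not part of this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "allenai/OLMoE-1B-7B-0924"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# device_map="auto" lets Accelerate place the weights, so no explicit .to(DEVICE) is needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Illustrative check (not in the commit): confirm where the weights landed and at what precision.
first_param = next(model.parameters())
print(first_param.device, first_param.dtype)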
 
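The key change in the second hunk is how the assistant reply is recovered: instead of decoding the whole sequence and splitting on "Assistant:", the new code decodes only the tokens generated after the prompt. A standalone sketch of that pattern, using gpt2 purely as a small stand-in model (it is not part of this Space):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# gpt2 is only a convenient small model for demonstrating the slicing; any causal LM works.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Human: Name one planet.\n\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)

# Decode only the tokens that come after the prompt, so no string splitting is needed.
prompt_len = inputs["input_ids"].shape[1]
response = tokenizer.decode(generate_ids[0, prompt_len:], skip_special_tokens=True)
print(response.strip())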