dillibabukadati committed
Commit 58505de · verified · 1 Parent(s): 1a8c8c3

Update app.py

Files changed (1)
  1. app.py +42 -53
app.py CHANGED
@@ -1,5 +1,6 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
 import gradio as gr
 import os
 import gc
@@ -7,87 +8,75 @@ import gc
 # Free up memory
 gc.collect()
 
-# Model name
-model_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"
+# Define paths and model names
+model_name = "meta-llama/Meta-Llama-3.2-3B-Instruct"  # Base model (not quantized)
+adapter_name = "unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit"  # Your adapter
 
-# First, try to load just the configuration
-print("Loading model configuration...")
-config = AutoConfig.from_pretrained(model_name)
-
-# Modify configuration to bypass quantization
-if hasattr(config, "quantization_config"):
-    print("Removing quantization configuration...")
-    delattr(config, "quantization_config")
-
-# Try loading with modified config
-print("Loading model with modified configuration...")
+print("Loading base model in float16...")
 try:
+    # Load the base model first (non-quantized)
     base_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        config=config,
         device_map="auto",
         torch_dtype=torch.float16,
-        low_cpu_mem_usage=True,
-        quantization_config=None,  # Explicitly set to None
-        trust_remote_code=True
+        low_cpu_mem_usage=True
     )
-    print("Model loaded successfully in float16")
+    print("Base model loaded successfully")
+
+    # Load your adapter configuration
+    peft_config = PeftConfig.from_pretrained(adapter_name)
+
+    # Apply the adapter to the base model
+    print("Applying adapter to base model...")
+    model = PeftModel.from_pretrained(base_model, adapter_name)
+
+    print("Model with adapter loaded successfully")
 except Exception as e:
-    print(f"Error loading model: {e}")
-    # Try the direct approach with safetensors
-    try:
-        print("Attempting to load using safetensors...")
-        base_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-            use_safetensors=True,
-            quantization_config=None,
-            trust_remote_code=True
-        )
-        print("Model loaded successfully with safetensors")
-    except Exception as e2:
-        print(f"Error loading with safetensors: {e2}")
-        raise RuntimeError("Could not load model in any format")
+    print(f"Error loading model with adapter: {e}")
+    raise RuntimeError("Could not load model")
 
 # Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 # Function to generate response
 def generate_response(message, history):
-    # Generate system prompt based on history
-    prompt = ""
-    if history:
-        for user_msg, assistant_msg in history:
-            prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
-    prompt += f"User: {message}\nAssistant: "
+    # Format conversation history for the model
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
+    # Convert messages to the format expected by the model
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Tokenize and generate
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
     # Free up memory before generation
    gc.collect()
 
     with torch.no_grad():  # Disable gradient calculation to save memory
-        outputs = base_model.generate(
+        outputs = model.generate(
            **inputs,
             max_new_tokens=300,
             do_sample=True,
             temperature=0.7,
-            top_k=50,
+            top_k=50,
             top_p=0.95
         )
 
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract only the assistant's response from the output
-    if "Assistant: " in response:
-        response = response.split("Assistant: ")[-1]
+    # Decode the response
+    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Extract just the assistant's response
+    assistant_response = full_response.split("<|assistant|>")[-1].strip()
 
-    return response
+    return assistant_response
 
-# Launch Gradio UI with memory-efficient settings
+# Launch Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("### 🦙 Chat with Your Fine-tuned LLaMA 3.2 3B")
     chatbot = gr.ChatInterface(generate_response)
 
-demo.launch(show_api=False)  # Disable API to reduce memory usage
+demo.launch(show_api=False)
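
Note on the new response extraction: splitting the decoded text on the literal marker "<|assistant|>" only works if that exact string survives decoding, which depends on the model's chat template and on skip_special_tokens=True (which normally strips such special tokens). A template-agnostic alternative is to decode only the tokens produced after the prompt; a minimal sketch, not part of this commit:

# Sketch, not part of the commit: decode only the newly generated tokens,
# so no template-specific marker needs to appear in the decoded text.
prompt_length = inputs["input_ids"].shape[-1]      # number of prompt tokens
generated_tokens = outputs[0][prompt_length:]      # tokens added by generate()
assistant_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()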
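
The updated file imports BitsAndBytesConfig but never uses it, so the base model is loaded in plain float16 (roughly 6 GB of weights for a 3B model). If the goal is to keep memory down, one option is to load the base model in 4-bit before attaching the adapter; a minimal sketch, assuming bitsandbytes is installed and a CUDA GPU is available:

# Sketch, not part of the commit: 4-bit base model plus the PEFT adapter.
# Assumes bitsandbytes is installed and a CUDA device is present.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16,   # run matmuls in fp16
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base_model, adapter_name)  # attach the adapter as before

Two related caveats: the Llama 3.2 checkpoints are published on the Hub without the "Meta-" prefix (meta-llama/Llama-3.2-3B-Instruct), so the model_name in the commit may need adjusting, and PeftModel.from_pretrained expects adapter_name to point at a repo containing an adapter_config.json; if the unsloth repo is a full 4-bit checkpoint rather than a LoRA adapter, it can instead be loaded directly with AutoModelForCausalLM.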