Spaces:

Mat17892
/

iris

Runtime error

App Files Files Community

desert committed 15 days ago

Commit

038ef00

•

1 Parent(s): d13f282

init inference

Browse files

Files changed (1) hide show

app.py +26 -38

app.py CHANGED Viewed

@@ -1,66 +1,54 @@
-import os
-import subprocess
 import gradio as gr
 from huggingface_hub import hf_hub_download
 # Hugging Face repository IDs
 base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
 adapter_repo = "Mat17892/llama_lora_gguf"
-# Download the base model GGUF file
 print("Downloading base model...")
 base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")
-# Download the LoRA adapter GGUF file
 print("Downloading LoRA adapter...")
 lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
-# Define the llama-cli path explicitly
-llama_cli_path = "./llama.cpp/build/bin/llama-cli"
-if not os.access(llama_cli_path, os.X_OK):  # Check if the file is executable
-    os.chmod(llama_cli_path, 0o755)  # Set executable permissions
-# Function to run `llama-cli` with base model and adapter
-def run_llama_cli(prompt):
-    print("Running inference with llama-cli...")
-    cmd = [
-        llama_cli_path,  # Path to the llama-cli executable
-        "-c", "2048",    # Context length
-        "-cnv",          # Enable conversational mode
-        "-m", base_model_path,
-        "--lora", lora_adapter_path,
-        "--prompt", prompt,
-    ]
-    try:
-        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, stderr = process.communicate()
-        if process.returncode != 0:
-            print("Error during inference:")
-            print(stderr.decode())
-            return "Error: Could not generate response."
-        return stdout.decode().strip()
-    except Exception as e:
-        print(f"Exception occurred: {e}")
-        return "Error: Could not generate response."
-# Gradio interface
-def chatbot_fn(user_input, chat_history):
-    # Build the full chat history as the prompt
     prompt = ""
     for user, ai in chat_history:
         prompt += f"User: {user}\nAI: {ai}\n"
     prompt += f"User: {user_input}\nAI:"  # Add latest user input
-    # Generate response using llama-cli
-    response = run_llama_cli(prompt)
     # Update chat history
     chat_history.append((user_input, response))
     return chat_history, chat_history
-# Build the Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
     chatbot = gr.Chatbot(label="Chat with the Model")
@@ -75,7 +63,7 @@ with gr.Blocks() as demo:
     # Link components
     submit_btn.click(
-        chatbot_fn,
         inputs=[user_input, chat_history],
         outputs=[chatbot, chat_history],
         show_progress=True,

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel, PeftConfig
 from huggingface_hub import hf_hub_download
 # Hugging Face repository IDs
 base_model_repo = "unsloth/Llama-3.2-3B-Instruct-GGUF"
 adapter_repo = "Mat17892/llama_lora_gguf"
+# Download model and adapter
 print("Downloading base model...")
 base_model_path = hf_hub_download(repo_id=base_model_repo, filename="Llama-3.2-3B-Instruct-Q8_0.gguf")
 print("Downloading LoRA adapter...")
 lora_adapter_path = hf_hub_download(repo_id=adapter_repo, filename="llama_lora_adapter.gguf")
+# Load the tokenizer and base model
+print("Loading base model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+base_model = AutoModelForCausalLM.from_pretrained(base_model_path)
+# Load the LoRA adapter
+print("Loading LoRA adapter...")
+config = PeftConfig.from_pretrained(lora_adapter_path)
+model = PeftModel.from_pretrained(base_model, lora_adapter_path)
+print("Model is ready!")
+# Function for inference
+def chat_with_model(user_input, chat_history):
+    """
+    Generate a response from the model using the chat history and user input.
+    """
+    # Prepare the prompt
     prompt = ""
     for user, ai in chat_history:
         prompt += f"User: {user}\nAI: {ai}\n"
     prompt += f"User: {user_input}\nAI:"  # Add latest user input
+    # Tokenize input
+    inputs = tokenizer(prompt, return_tensors="pt")
+    # Generate response
+    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id)
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     # Update chat history
     chat_history.append((user_input, response))
     return chat_history, chat_history
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# 🦙 LLaMA Chatbot with Base Model and LoRA Adapter")
     chatbot = gr.Chatbot(label="Chat with the Model")
     # Link components
     submit_btn.click(
+        chat_with_model,
         inputs=[user_input, chat_history],
         outputs=[chatbot, chat_history],
         show_progress=True,