Update app.py
app.py CHANGED
@@ -2,27 +2,39 @@ import os
 import gradio as gr
 import torch
 import json
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
 
 # Set Hugging Face Token for Authentication (ensure it's set in your environment)
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
+# Base model (needed for QLoRA adapter)
+BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+QLORA_ADAPTER = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"
+
 # Function to load Llama model
-def load_llama_model(
-
-
-
-
-
-
-
-
-
-
-
-)
+def load_llama_model():
+    print("Loading base model...")
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.bfloat16,  # bf16 halves memory; use torch.float32 on CPUs without bfloat16 support
+        device_map="cpu",  # Ensure it runs on CPU
+        token=HUGGINGFACE_TOKEN
+    )
+
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, token=HUGGINGFACE_TOKEN)
+
+    print("Loading QLoRA adapter...")
+    model = PeftModel.from_pretrained(
+        model,
+        QLORA_ADAPTER,
+        token=HUGGINGFACE_TOKEN
+    )
 
+    print("Merging LoRA weights...")
+    model = model.merge_and_unload()  # Merge LoRA weights into the base model for inference
+
     return tokenizer, model
 
 # Load Llama 3.2 model
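A quick smoke test of the loader added in this commit might look like the sketch below. It is not part of the commit: the import path, prompt, and generation settings are hypothetical, and it assumes HUGGINGFACE_TOKEN is exported and that importing app does not launch the Gradio UI as a side effect.

# Minimal usage sketch for the new loader (assumptions noted above).
import torch
from app import load_llama_model  # hypothetical import of the function added in this commit

tokenizer, model = load_llama_model()
model.eval()  # inference mode: disables dropout

# The Instruct model expects chat-formatted input, so apply the chat template.
messages = [{"role": "user", "content": "Explain QLoRA in one sentence."}]  # hypothetical prompt
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

with torch.no_grad():  # no gradients needed for generation
    output_ids = model.generate(
        input_ids,
        max_new_tokens=64,  # keep the run short on CPU
        do_sample=False,    # greedy decoding for a reproducible check
    )

# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))

Because merge_and_unload() folds the adapter deltas back into the base weights, the model returned by load_llama_model() behaves like a plain transformers causal LM, so no PEFT-specific handling is needed at inference time.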