import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Try to import Unsloth; fall back to plain Transformers/PEFT if it is unavailable
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False
class ModelManager:
    """Lazy singleton: loads the model and tokenizer once and reuses them for every request."""

    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance
    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU via Unsloth + LoRA
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"❗ Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer
        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"  # non-4bit to run on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"❗ Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer
    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        """Build an Alpaca-style prompt, run generation, and return only the response portion."""
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
            )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return raw.split("### Response:")[-1].strip()
# Initialize once
manager = ModelManager.get_instance()
def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))
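
# Illustrative call with hypothetical inputs (not part of the app flow):
#   predict(
#       "Write a Django ORM query that returns all active users",
#       "class User(models.Model):\n    is_active = models.BooleanField(default=True)",
#       128,
#   )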
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA fine-tune of Qwen2.5-Coder-1.5B-Instruct for Django ORM code (GPU via Unsloth, CPU fallback).",
)
if __name__ == "__main__":
    demo.launch(share=True)