import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers + PEFT if it is
# missing (ImportError) or unsupported on this hardware (NotImplementedError).
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

class ModelManager:
    """Loads the model and tokenizer once and reuses them (simple singleton)."""

    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance
    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU path: Unsloth 4-bit backbone + LoRA adapter
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer
        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"  # non-4bit so it runs on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer
    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        # Alpaca-style prompt: the model completes after "### Response:"
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the text generated after the response marker
        return raw.split("### Response:")[-1].strip()
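
# Illustrative direct call, bypassing the Gradio UI (the instruction and model
# fields below are made-up examples, not taken from the training data):
#
#   mgr = ModelManager.get_instance()
#   print(mgr.generate(
#       "Write a Django ORM query that returns all published posts",
#       "class Post(models.Model):\n    published = models.BooleanField(default=False)",
#   ))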

# Initialize once
manager = ModelManager.get_instance()


def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned Qwen2.5-Coder-1.5B for Django ORM code (GPU via Unsloth, CPU fallback).",
)

if __name__ == "__main__":
    demo.launch(share=True)
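
# A minimal sketch of querying the running app with gradio_client; the URL is an
# assumption (local default) and should be replaced with the actual Space URL or
# repo id. For a gr.Interface, the endpoint name defaults to "/predict".
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       "Write a Django ORM query that returns all published posts",   # Instruction
#       "class Post(models.Model):\n    published = models.BooleanField()",  # Context / Code
#       128,                                                            # Max new tokens
#       api_name="/predict",
#   )
#   print(result)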