import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers if it is unavailable
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

class ModelManager:
    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance
    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU via Unsloth + LoRA
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"❗ Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer
        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"  # non-4bit so it can run on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"❗ Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer
    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return raw.split("### Response:")[-1].strip()

# Initialize once
manager = ModelManager.get_instance()


def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))
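
# Optional local sanity check before starting the UI. The instruction/context values
# below are illustrative examples only; uncomment to run a single generation and
# confirm the model and adapter load correctly.
# if __name__ == "__main__":
#     sample = predict(
#         "Write a Django ORM query that returns all active users ordered by signup date.",
#         "class User(models.Model):\n"
#         "    is_active = models.BooleanField(default=True)\n"
#         "    signed_up = models.DateTimeField(auto_now_add=True)",
#         128,
#     )
#     print(sample)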

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned Qwen2.5-Coder-1.5B for Django ORM code (GPU via Unsloth, CPU fallback).",
)

if __name__ == "__main__":
    demo.launch(share=True)
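
# Example of calling the deployed Space programmatically with gradio_client.
# The Space id below is a placeholder; replace it with the actual Space before use.
# from gradio_client import Client
# client = Client("<username>/<space-name>")
# result = client.predict(
#     "Write a Django ORM query that returns all active users.",
#     "class User(models.Model):\n    is_active = models.BooleanField(default=True)",
#     128,
#     api_name="/predict",
# )
# print(result)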