import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers + PEFT if it is
# unavailable or unsupported on this machine (e.g. no CUDA).
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

BASE_MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
ADAPTER_REPO = "samith-a/Django-orm-code-gen"


class ModelManager:
    """Singleton that loads the model once and serves all requests."""

    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU path: 4-bit base model via Unsloth, then attach the LoRA adapter.
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                BASE_MODEL,
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    ADAPTER_REPO,
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"❗ Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer

        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
        # Load the base model in full precision (no 4-bit quantization on CPU).
        base = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                ADAPTER_REPO,
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"❗ Adapter load failed, using base model: {e}")
            model = base
        model.eval()
        return model, tokenizer

    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
        # Alpaca-style prompt; the adapter was trained on this format, so the
        # response is extracted by splitting on the "### Response:" marker.
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return raw.split("### Response:")[-1].strip()


# Initialize once at import time so the first request is not slowed by model loading.
manager = ModelManager.get_instance()


def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned Qwen2.5-Coder-1.5B-Instruct for Django ORM code (GPU via Unsloth, CPU fallback).",
)

if __name__ == "__main__":
    demo.launch(share=True)
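
# --- Optional usage sketch (kept commented so the script's behavior is unchanged) ---
# A minimal example of calling the running app programmatically with the
# `gradio_client` package. The URL assumes the default local address printed by
# `demo.launch()`, and `api_name="/predict"` assumes Gradio's default endpoint
# name for a single-function Interface; adjust both to your deployment.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       "Write a queryset that returns all active users",   # Instruction
#       "class User(models.Model):\n    is_active = models.BooleanField()",  # Context / Code
#       128,                                                 # Max new tokens
#       api_name="/predict",
#   )
#   print(result)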