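"""Gradio demo for Django ORM code generation.

Loads Qwen/Qwen2.5-Coder-1.5B-Instruct with the samith-a/Django-orm-code-gen
LoRA adapter, preferring Unsloth on GPU and falling back to plain
Transformers + PEFT on CPU.
"""
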
import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Try to import Unsloth; fall back to plain Transformers + PEFT if it is unavailable
try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except (ImportError, NotImplementedError):
    HAS_UNSLOTH = False

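# Lazy singleton: the model and tokenizer are loaded once per process and
# shared by every Gradio request.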
class ModelManager:
    _instance = None

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.tokenizer = self.load_model()

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def load_model(self):
        if HAS_UNSLOTH and self.device != "cpu":
            # GPU via Unsloth + LoRA
            backbone, tokenizer = FastLanguageModel.from_pretrained(
                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                load_in_4bit=True,
                dtype=torch.float16,
                device_map="auto",
            )
            try:
                model = PeftModel.from_pretrained(
                    backbone,
                    "samith-a/Django-orm-code-gen",
                    torch_dtype=torch.float16,
                    device_map="auto",
                )
                print("Loaded LoRA adapter via Unsloth.")
            except Exception as e:
                print(f"❗ Adapter load failed, using backbone only: {e}")
                model = backbone
            FastLanguageModel.for_inference(model)
            return model, tokenizer

        # --- Fallback: CPU-only via HF Transformers + PEFT ---
        print("Falling back to CPU-only Transformers + PEFT")
        base_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"  # non-4bit to run on CPU
        tokenizer = AutoTokenizer.from_pretrained(base_name, use_fast=True)
        base = AutoModelForCausalLM.from_pretrained(
            base_name,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
        )
        try:
            model = PeftModel.from_pretrained(
                base,
                "samith-a/Django-orm-code-gen",
                device_map={"": "cpu"},
                torch_dtype=torch.float32,
            )
            print("Loaded LoRA adapter via PEFT.")
        except Exception as e:
            print(f"❗ Adapter load failed, using base model: {e}")
            model = base

        model.eval()
        return model, tokenizer

    def generate(self, instruction: str, input_text: str, max_new_tokens: int = 128) -> str:
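        # Prompts follow the Alpaca Instruction / Input / Response layout,
        # which appears to be the template the LoRA adapter was trained on.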
        alpaca_template = (
            "### Instruction:\n{}\n\n"
            "### Input:\n{}\n\n"
            "### Response:\n"
        )
        prompt = alpaca_template.format(instruction, input_text)

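        # Tokenize, sample a completion, then keep only the text after the
        # "### Response:" marker so the echoed prompt is stripped from the output.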
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7
        )
        raw = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return raw.split("### Response:")[-1].strip()

# Initialize once
manager = ModelManager.get_instance()

def predict(instruction, context, max_tokens):
    return manager.generate(instruction, context, max_new_tokens=int(max_tokens))

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(lines=2, label="Instruction"),
        gr.Textbox(lines=5, label="Context / Code"),
        gr.Slider(16, 512, step=16, label="Max new tokens", value=128),
    ],
    outputs=gr.Textbox(label="Generated Code"),
    title="Django-ORM Code Generator",
    description="LoRA-finetuned LLaMA3.2 for Django ORM code (CPU/GPU fallback)."
)

if __name__ == "__main__":
    demo.launch(share=True)