| import gradio as gr |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
|
|
# Hugging Face Hub IDs: the instruction-tuned base model and the LoRA
# adapter (fine-tuned weights) that is layered on top of it.
BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = "alxstuff/Lumen-7b-v2"


print("Loading tokenizer...")
# trust_remote_code allows the Hub repo to supply custom tokenizer code.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)


print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,  # half precision halves the memory footprint
    device_map="auto",  # let accelerate place weights on available devices
    trust_remote_code=True,
    low_cpu_mem_usage=True,  # stream weights in instead of a full CPU copy
)


print("Loading LoRA adapter...")
# Apply the fine-tuned adapter weights on top of the frozen base model.
# NOTE(review): load_adapter requires peft to be installed — confirm it is
# listed in the project's dependencies.
model.load_adapter(LORA_REPO)
model.eval()  # inference mode: disables dropout / training-only behavior
print("✅ Lumen ready!")
|
|
def chat(message, history):
    """Stream Lumen's reply to *message*, given the prior chat *history*.

    Args:
        message: The latest user message as a string.
        history: Prior turns. Supports both Gradio history formats:
            a list of ``(user, assistant)`` tuples (classic format), or a
            list of ``{"role": ..., "content": ...}`` dicts
            (``type="messages"`` format).

    Yields:
        The accumulated response text, growing token by token so Gradio
        renders a live stream.
    """
    # Build a ChatML prompt (Qwen's native chat format) with a fixed
    # system turn that defines the assistant persona.
    prompt = "<|im_start|>system\nYou are Lumen, an expert AI coding assistant built by TheAlxLabs. You write clean, efficient code and explain it clearly.<|im_end|>\n"
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: one dict per role.
            prompt += f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        else:
            # Classic tuple format: one (user, assistant) pair per turn.
            user, assistant = turn
            prompt += f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n{assistant}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation on a worker thread so this generator can consume the
    # streamer incrementally on the request thread.
    thread = Thread(target=model.generate, kwargs={
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 1024,
        "temperature": 0.2,
        "do_sample": True,
        # Explicit pad token avoids the "pad_token_id not set" warning
        # during open-ended generation.
        "pad_token_id": tokenizer.eos_token_id,
    })
    thread.start()

    try:
        response = ""
        for token in streamer:
            response += token
            yield response
    finally:
        # Always reap the worker, even if the client disconnects mid-stream,
        # so no dangling generation thread is left behind.
        thread.join()
|
|
# Canned prompts shown under the chat box as one-click examples.
EXAMPLE_PROMPTS = [
    "Write a Python function to reverse a linked list",
    "Explain what this does: `[x for x in range(10) if x % 2 == 0]`",
    "Fix this bug: TypeError: 'NoneType' object is not subscriptable",
]

# Wire the streaming chat handler into a Gradio chat UI and serve it.
demo = gr.ChatInterface(
    fn=chat,
    title="⚡ Lumen — AI Coding Assistant",
    description="Local-first AI coding assistant by TheAlxLabs.",
    examples=EXAMPLE_PROMPTS,
)
demo.launch()