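"""Gradio demo: streaming chat UI for the Qwen2.5-Coder-1.5B-Instruct-Reasoning PEFT adapter."""
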
# Standard library imports
import os
import threading
# Third-party imports
import gradio as gr
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
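
# Hugging Face access token (optional for public models; required for gated or private repos)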
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the tokenizer published with the fine-tuned (reasoning) adapter
tokenizer = AutoTokenizer.from_pretrained(
    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
    token=HF_TOKEN,
    trust_remote_code=True
)

# Load the base model and resize its embeddings to match the fine-tuned tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto",
    token=HF_TOKEN
)
base_model.resize_token_embeddings(len(tokenizer))

# Attach the PEFT (LoRA) adapter on top of the base model and switch to inference mode
model = PeftModel.from_pretrained(
    base_model,
    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
    token=HF_TOKEN
)
model.eval()


def respond(
        message: str,
        history: list[tuple[str, str]],
        system_message: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
):
    """Stream a chat completion for the Gradio ChatInterface."""
    # Rebuild the full conversation: system prompt, prior turns, then the new user message
    messages = [{"role": "system", "content": system_message}]
    for u, a in history:
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})

    # Format with the model's chat template and tokenize
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream decoded tokens as they are produced, skipping the prompt and special tokens
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=600.0,
        skip_prompt=True,
        skip_special_tokens=True
    )
    generation_kwargs = {
        **inputs,
        "max_new_tokens": max_tokens,
        "do_sample": True,  # enable sampling so the temperature/top_p sliders take effect
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer,
    }

    # Run generation in a background thread so the streamer can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated text so Gradio renders a progressively growing reply
    output = ""
    for chunk in streamer:
        output += chunk
        yield output


# Chat UI with user-tunable decoding parameters
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful coding assistant.", label="System message"),
        gr.Slider(minimum=512, maximum=8192, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()