deveg committed on
Commit
150ed74
·
verified ·
1 Parent(s): 58e54b2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +339 -61
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,69 +1,347 @@
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
- def respond(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
  ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- with gr.Blocks() as demo:
63
- with gr.Sidebar():
64
- gr.LoginButton()
65
- chatbot.render()
 
66
 
67
 
68
- if __name__ == "__main__":
69
- demo.launch()
 
1
+ import inspect
2
+ import os
3
+ import threading
4
+
5
  import gradio as gr
6
+ import torch
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
8
+
9
+
10
# Disable tokenizer parallelism before any tokenizer is created, so the
# background generation thread does not trigger fork/parallelism warnings.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Runtime knobs, all overridable via environment variables.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")  # HF repo of the local model
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "256"))  # per-reply generation cap
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "1536"))  # prompt tokens kept (tail)
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))  # user/assistant pairs kept
N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))  # torch CPU threads
DEFAULT_SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are a helpful assistant. Keep answers clear and concise.",
)
21
+
22
# Demo presets. Each entry supplies the system prompt, the prefilled user
# message, the thinking-toggle default, and the two static "sample" boxes
# shown in the UI (the samples are illustrative text, not model output).
PRESETS = {
    "Math": {
        "system": "You are a careful math tutor. Think through the problem, then give a short final answer.",
        "prompt": "Solve: If 2x^2 - 7x + 3 = 0, what are the real solutions?",
        "thinking": True,
        "sample_reasoning": "The discriminant is 49 - 24 = 25, so the roots are easy to compute with the quadratic formula.",
        "sample_answer": "The real solutions are x = 3 and x = 1/2.",
    },
    "Coding": {
        "system": "You are a Python assistant. Prefer short, readable code.",
        "prompt": "Write a Python function that merges two sorted lists into one sorted list.",
        "thinking": True,
        "sample_reasoning": "Use two pointers. Compare the current elements, append the smaller one, then append the leftovers.",
        "sample_answer": "Here is a compact merge function plus a tiny example.",
    },
    "Structured output": {
        "system": "Return compact JSON and avoid extra commentary.",
        "prompt": "Extract JSON from: Call Mina by Friday, priority high, budget about $2400, topic is launch video edits.",
        "thinking": False,
        "sample_reasoning": "Reasoning is disabled here so the output stays short and machine-friendly.",
        "sample_answer": '{"person":"Mina","deadline":"Friday","priority":"high","budget_usd":2400,"topic":"launch video edits"}',
    },
    "Function calling style": {
        "system": "You are an assistant that plans tool use when it helps. If a tool would help, say what tool you would call and with which arguments.",
        "prompt": "Pretend you have tools. For 18.75 * 42 - 199 and converting 12 km to miles, explain which tool calls you would make, then give the result.",
        "thinking": True,
        "sample_reasoning": "I would use a calculator tool for the arithmetic and a unit-conversion tool for the distance conversion.",
        "sample_answer": "Calculator(18.75 * 42 - 199) -> 588.5\nConvert(12 km -> miles) -> about 7.46 miles",
    },
    "Creative writing": {
        "system": "Write vivid, tight prose.",
        "prompt": "Write a two-sentence opening for a sci-fi heist story set on a drifting museum ship.",
        "thinking": False,
        "sample_reasoning": "Reasoning is disabled for a faster clean draft.",
        "sample_answer": "By the time the museum ship crossed into the dead zone, every priceless relic aboard had started broadcasting a heartbeat. Nia took that as her cue to cut the lights and steal the one artifact already trying to escape.",
    },
}
59
+
60
+
61
# Pin torch's intra-op thread pool to the configured CPU thread count.
torch.set_num_threads(N_THREADS)
try:
    # Inter-op threads can only be set once per process; if something
    # (e.g. a prior torch call) already fixed it, torch raises RuntimeError.
    torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
except RuntimeError:
    pass

# Lazily created singletons shared across requests; Gradio can serve
# concurrent callers, so access is guarded by the locks below.
_tokenizer = None  # set once by get_model()
_model = None  # set once by get_model()
_load_lock = threading.Lock()  # serializes the one-time model download/load
_generate_lock = threading.Lock()  # serializes generate() on the shared model
71
+
72
+
73
def make_chatbot(label, height=520):
    """Build a gr.Chatbot, requesting message-dict format when supported.

    Some Gradio releases do not accept a ``type`` keyword on
    ``gr.Chatbot``, so it is passed only when the installed version's
    constructor signature declares it.
    """
    supports_type = "type" in inspect.signature(gr.Chatbot.__init__).parameters
    if supports_type:
        return gr.Chatbot(label=label, height=height, type="messages")
    return gr.Chatbot(label=label, height=height)
78
+
79
+
80
def get_model():
    """Return the shared (tokenizer, model) pair, loading them on first use.

    Uses double-checked locking so that concurrent first requests trigger
    only one download/load; subsequent calls return the cached objects.
    """
    global _tokenizer, _model
    if _model is None or _tokenizer is None:
        with _load_lock:
            # Re-check inside the lock: another thread may have finished
            # loading while we were waiting.
            if _model is None or _tokenizer is None:
                _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
                _model = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID,
                    # Full precision for the CPU-only target.
                    torch_dtype=torch.float32,
                )
                _model.eval()
    return _tokenizer, _model
92
+
93
+
94
def clone_messages(messages):
    """Return shallow copies of each message dict (None becomes [])."""
    if not messages:
        return []
    return [dict(entry) for entry in messages]
96
 
97
 
98
def load_preset(name):
    """Return the widget values for a preset.

    Order matches the ``preset.change`` outputs: system prompt, user
    prompt, thinking flag, sample reasoning, sample answer.
    """
    chosen = PRESETS[name]
    fields = ("system", "prompt", "thinking", "sample_reasoning", "sample_answer")
    return tuple(chosen[field] for field in fields)
107
+
108
+
109
def clear_all():
    """Reset both chat panes, the model history state, and the input box."""
    cleared_state = ([], [], [], "")
    return cleared_state
111
+
112
+
113
def strip_non_think_specials(text):
    """Drop end-of-turn special tokens but keep <think> markers intact."""
    cleaned = text if text else ""
    for marker in ("<|im_end|>", "<|endoftext|>", "<|end▁of▁sentence|>"):
        if marker in cleaned:
            cleaned = cleaned.replace(marker, "")
    return cleaned
118
+
119
+
120
def final_cleanup(text):
    """Remove every special token, including think markers, and trim whitespace."""
    without_specials = strip_non_think_specials(text)
    for marker in ("<think>", "</think>"):
        without_specials = without_specials.replace(marker, "")
    return without_specials.strip()
124
+
125
+
126
def split_stream_text(raw_text, thinking):
    """Split partially streamed output into (reasoning, answer, saw_end_of_think).

    With thinking disabled everything is answer text. With thinking
    enabled, text before the first ``</think>`` is reasoning; once the
    closing tag appears the remainder is the answer.
    """
    text = strip_non_think_specials(raw_text)
    if not thinking:
        return "", final_cleanup(text), False

    text = text.replace("<think>", "")
    head, closing_tag, tail = text.partition("</think>")
    if closing_tag:
        return head.strip(), tail.strip(), True
    # Still inside the reasoning span: no answer text yet.
    return head.strip(), "", False
137
+
138
+
139
def respond_stream(
    message,
    system_prompt,
    thinking,
    model_history,
    reasoning_chat,
    answer_chat,
):
    """Stream one chat turn, updating the reasoning and answer panes.

    Generator wired to Gradio: every ``yield`` emits
    (reasoning_chat, answer_chat, model_history, user_input_text), with
    the input box cleared ("") on each update. Generation runs in a
    background thread feeding a TextIteratorStreamer; chunks are split
    into reasoning/answer views as they arrive.
    """
    message = (message or "").strip()
    if not message:
        # Nothing to do: echo current state unchanged and stop.
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
        return

    # Copy all incoming state so Gradio's stored values are never mutated.
    model_history = list(model_history or [])
    reasoning_chat = clone_messages(reasoning_chat)
    answer_chat = clone_messages(answer_chat)

    # Append the user turn plus placeholder assistant bubbles to both panes.
    reasoning_chat.append({"role": "user", "content": message})
    reasoning_chat.append(
        {
            "role": "assistant",
            "content": "(thinking...)" if thinking else "(reasoning disabled)",
        }
    )
    answer_chat.append({"role": "user", "content": message})
    answer_chat.append({"role": "assistant", "content": ""})

    # First yield shows the placeholders immediately and clears the input box.
    yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

    try:
        tokenizer, model = get_model()
        # Keep only the last MAX_HISTORY_TURNS user/assistant pairs.
        short_history = model_history[-2 * MAX_HISTORY_TURNS :]
        messages = [
            {"role": "system", "content": (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT},
            *short_history,
            {"role": "user", "content": message},
        ]

        # NOTE(review): enable_thinking is a Qwen3 chat-template kwarg —
        # confirm support if MODEL_ID is pointed at a different family.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=thinking,
        )
        inputs = tokenizer(prompt, return_tensors="pt")
        # Tail-truncate the prompt; this can clip the system prompt/template
        # head on very long histories — accepted tradeoff for a small model.
        input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
        attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]

        # Keep special tokens in the stream so <think>/</think> markers
        # survive; split_stream_text strips the rest.
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
            timeout=None,
        )

        generation_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "max_new_tokens": MAX_NEW_TOKENS,
            "do_sample": True,
            # Sampling settings differ between thinking and direct modes.
            "temperature": 0.6 if thinking else 0.7,
            "top_p": 0.95 if thinking else 0.8,
            "top_k": 20,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }

        generation_error = {}

        def run_generation():
            # Runs on a worker thread; _generate_lock serializes access to
            # the shared model across concurrent requests.
            try:
                with _generate_lock:
                    model.generate(**generation_kwargs)
            except Exception as exc:
                generation_error["message"] = str(exc)
                # Unblock the consumer loop so the error can be reported.
                streamer.on_finalized_text("", stream_end=True)

        thread = threading.Thread(target=run_generation, daemon=True)
        thread.start()

        raw_text = ""
        saw_end_think = False

        # Consume decoded chunks as they arrive and re-render both panes.
        for chunk in streamer:
            raw_text += chunk
            reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
            saw_end_think = saw_end_think or saw_end_now

            if thinking:
                if saw_end_think:
                    reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
                else:
                    reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
            else:
                reasoning_chat[-1]["content"] = "(reasoning disabled)"

            answer_chat[-1]["content"] = answer_text
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

        thread.join()

        if generation_error:
            # Surface the generation failure in the answer pane and stop
            # without recording this turn in the model history.
            reasoning_chat[-1]["content"] = ""
            answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
            return

        # Final split over the complete text; if </think> never appeared,
        # treat everything as answer text.
        reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
        if thinking and not saw_end_think:
            reasoning_text = ""
            answer_text = final_cleanup(raw_text)

        if thinking:
            reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
        else:
            reasoning_chat[-1]["content"] = "(reasoning disabled)"

        answer_chat[-1]["content"] = answer_text or "(empty response)"
        # Persist only the truncated history plus this completed turn.
        model_history = short_history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": answer_chat[-1]["content"]},
        ]

        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

    except Exception as exc:
        # Setup failures (download, tokenization, template) land here.
        reasoning_chat[-1]["content"] = ""
        answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
269
+
270
+
271
# UI layout and event wiring. Widget creation order defines the layout;
# handlers are attached after all widgets exist.
with gr.Blocks(title="Local CPU split-reasoning chat") as demo:
    gr.Markdown(
        "# Local CPU split-reasoning chat\n"
        f"Running a local safetensors model on CPU from `{MODEL_ID}`. No GGUF and no external inference provider.\n\n"
        "The first request downloads the model, so the cold start is slower."
    )

    with gr.Row():
        preset = gr.Dropdown(
            choices=list(PRESETS.keys()),
            value="Math",
            label="Preset prompt",
        )
        thinking = gr.Checkbox(label="Enable thinking", value=True)

    # Defaults mirror the "Math" preset selected above.
    system_prompt = gr.Textbox(
        label="System prompt",
        value=PRESETS["Math"]["system"],
        lines=3,
    )

    user_input = gr.Textbox(
        label="Your message",
        value=PRESETS["Math"]["prompt"],
        lines=4,
    )

    # Read-only illustrative samples; updated when the preset changes.
    with gr.Row():
        sample_reasoning = gr.Textbox(
            label="Sample reasoning",
            value=PRESETS["Math"]["sample_reasoning"],
            lines=5,
            interactive=False,
        )
        sample_answer = gr.Textbox(
            label="Sample answer",
            value=PRESETS["Math"]["sample_answer"],
            lines=5,
            interactive=False,
        )

    with gr.Row():
        send_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear")

    # Two side-by-side panes: model reasoning on the left, answer on the right.
    with gr.Row():
        reasoning_bot = make_chatbot("Reasoning", height=520)
        answer_bot = make_chatbot("Assistant", height=520)

    # Canonical role/content history fed back into the chat template.
    model_history_state = gr.State([])

    preset.change(
        fn=load_preset,
        inputs=preset,
        outputs=[system_prompt, user_input, thinking, sample_reasoning, sample_answer],
    )

    # Both the button and Enter-in-textbox trigger the same streaming handler.
    send_btn.click(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )
    user_input.submit(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )

    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )


# Launched at import time (no __main__ guard) — the Hugging Face Spaces
# convention, where app.py is executed directly. queue() enables
# generator-based streaming handlers.
demo.queue()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==6.5.1
2
+ transformers>=4.51.0
3
+ torch>=2.2.0
4
+ safetensors>=0.4.0