deveg committed on
Commit
150ed74
·
verified ·
1 Parent(s): 58e54b2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +339 -61
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,69 +1,347 @@
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
- def respond(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
  ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- with gr.Blocks() as demo:
63
- with gr.Sidebar():
64
- gr.LoginButton()
65
- chatbot.render()
 
66
 
67
 
68
- if __name__ == "__main__":
69
- demo.launch()
 
1
+ import inspect
2
+ import os
3
+ import threading
4
+
5
  import gradio as gr
6
+ import torch
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
8
+
9
+
10
# Disable tokenizer parallelism before any tokenizer is created, so the
# background generation thread does not trigger fork/parallelism warnings.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Runtime knobs, all overridable via environment variables.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")  # HF repo of the local model
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "256"))  # per-reply generation cap
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "1536"))  # prompt tokens kept (tail)
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))  # user/assistant pairs kept
N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))  # torch CPU threads
DEFAULT_SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are a helpful assistant. Keep answers clear and concise.",
)
21
+
22
# Demo presets. Each entry supplies the system prompt, the prefilled user
# message, the thinking-toggle default, and the two static "sample" boxes
# shown in the UI (the samples are illustrative text, not model output).
PRESETS = {
    "Math": {
        "system": "You are a careful math tutor. Think through the problem, then give a short final answer.",
        "prompt": "Solve: If 2x^2 - 7x + 3 = 0, what are the real solutions?",
        "thinking": True,
        "sample_reasoning": "The discriminant is 49 - 24 = 25, so the roots are easy to compute with the quadratic formula.",
        "sample_answer": "The real solutions are x = 3 and x = 1/2.",
    },
    "Coding": {
        "system": "You are a Python assistant. Prefer short, readable code.",
        "prompt": "Write a Python function that merges two sorted lists into one sorted list.",
        "thinking": True,
        "sample_reasoning": "Use two pointers. Compare the current elements, append the smaller one, then append the leftovers.",
        "sample_answer": "Here is a compact merge function plus a tiny example.",
    },
    "Structured output": {
        "system": "Return compact JSON and avoid extra commentary.",
        "prompt": "Extract JSON from: Call Mina by Friday, priority high, budget about $2400, topic is launch video edits.",
        "thinking": False,
        "sample_reasoning": "Reasoning is disabled here so the output stays short and machine-friendly.",
        "sample_answer": '{"person":"Mina","deadline":"Friday","priority":"high","budget_usd":2400,"topic":"launch video edits"}',
    },
    "Function calling style": {
        "system": "You are an assistant that plans tool use when it helps. If a tool would help, say what tool you would call and with which arguments.",
        "prompt": "Pretend you have tools. For 18.75 * 42 - 199 and converting 12 km to miles, explain which tool calls you would make, then give the result.",
        "thinking": True,
        "sample_reasoning": "I would use a calculator tool for the arithmetic and a unit-conversion tool for the distance conversion.",
        "sample_answer": "Calculator(18.75 * 42 - 199) -> 588.5\nConvert(12 km -> miles) -> about 7.46 miles",
    },
    "Creative writing": {
        "system": "Write vivid, tight prose.",
        "prompt": "Write a two-sentence opening for a sci-fi heist story set on a drifting museum ship.",
        "thinking": False,
        "sample_reasoning": "Reasoning is disabled for a faster clean draft.",
        "sample_answer": "By the time the museum ship crossed into the dead zone, every priceless relic aboard had started broadcasting a heartbeat. Nia took that as her cue to cut the lights and steal the one artifact already trying to escape.",
    },
}
59
+
60
+
61
# Pin torch's intra-op thread pool to the configured CPU thread count.
torch.set_num_threads(N_THREADS)
try:
    # Inter-op threads can only be set once per process; if something
    # (e.g. a prior torch call) already fixed it, torch raises RuntimeError.
    torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
except RuntimeError:
    pass

# Lazily created singletons shared across requests; Gradio can serve
# concurrent callers, so access is guarded by the locks below.
_tokenizer = None  # set once by get_model()
_model = None  # set once by get_model()
_load_lock = threading.Lock()  # serializes the one-time model download/load
_generate_lock = threading.Lock()  # serializes generate() on the shared model
71
+
72
+
73
def make_chatbot(label, height=520):
    """Build a gr.Chatbot, requesting message-dict format when supported.

    Some Gradio releases do not accept a ``type`` keyword on
    ``gr.Chatbot``, so it is passed only when the installed version's
    constructor signature declares it.
    """
    supports_type = "type" in inspect.signature(gr.Chatbot.__init__).parameters
    if supports_type:
        return gr.Chatbot(label=label, height=height, type="messages")
    return gr.Chatbot(label=label, height=height)
78
+
79
+
80
def get_model():
    """Return the shared (tokenizer, model) pair, loading them on first use.

    Uses double-checked locking so that concurrent first requests trigger
    only one download/load; subsequent calls return the cached objects.
    """
    global _tokenizer, _model
    if _model is None or _tokenizer is None:
        with _load_lock:
            # Re-check inside the lock: another thread may have finished
            # loading while we were waiting.
            if _model is None or _tokenizer is None:
                _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
                _model = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID,
                    # Full precision for the CPU-only target.
                    torch_dtype=torch.float32,
                )
                _model.eval()
    return _tokenizer, _model
92
+
93
+
94
def clone_messages(messages):
    """Return shallow copies of each message dict (None becomes [])."""
    if not messages:
        return []
    return [dict(entry) for entry in messages]
96
 
97
 
98
def load_preset(name):
    """Return the widget values for a preset.

    Order matches the ``preset.change`` outputs: system prompt, user
    prompt, thinking flag, sample reasoning, sample answer.
    """
    chosen = PRESETS[name]
    fields = ("system", "prompt", "thinking", "sample_reasoning", "sample_answer")
    return tuple(chosen[field] for field in fields)
107
+
108
+
109
def clear_all():
    """Reset both chat panes, the model history state, and the input box."""
    cleared_state = ([], [], [], "")
    return cleared_state
111
+
112
+
113
def strip_non_think_specials(text):
    """Drop end-of-turn special tokens but keep <think> markers intact."""
    cleaned = text if text else ""
    for marker in ("<|im_end|>", "<|endoftext|>", "<|end▁of▁sentence|>"):
        if marker in cleaned:
            cleaned = cleaned.replace(marker, "")
    return cleaned
118
+
119
+
120
def final_cleanup(text):
    """Remove every special token, including think markers, and trim whitespace."""
    without_specials = strip_non_think_specials(text)
    for marker in ("<think>", "</think>"):
        without_specials = without_specials.replace(marker, "")
    return without_specials.strip()
124
+
125
+
126
def split_stream_text(raw_text, thinking):
    """Split partially streamed output into (reasoning, answer, saw_end_of_think).

    With thinking disabled everything is answer text. With thinking
    enabled, text before the first ``</think>`` is reasoning; once the
    closing tag appears the remainder is the answer.
    """
    text = strip_non_think_specials(raw_text)
    if not thinking:
        return "", final_cleanup(text), False

    text = text.replace("<think>", "")
    head, closing_tag, tail = text.partition("</think>")
    if closing_tag:
        return head.strip(), tail.strip(), True
    # Still inside the reasoning span: no answer text yet.
    return head.strip(), "", False
137
+
138
+
139
def respond_stream(
    message,
    system_prompt,
    thinking,
    model_history,
    reasoning_chat,
    answer_chat,
):
    """Stream one chat turn, updating the reasoning and answer panes.

    Generator wired to Gradio: every ``yield`` emits
    (reasoning_chat, answer_chat, model_history, user_input_text), with
    the input box cleared ("") on each update. Generation runs in a
    background thread feeding a TextIteratorStreamer; chunks are split
    into reasoning/answer views as they arrive.
    """
    message = (message or "").strip()
    if not message:
        # Nothing to do: echo current state unchanged and stop.
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
        return

    # Copy all incoming state so Gradio's stored values are never mutated.
    model_history = list(model_history or [])
    reasoning_chat = clone_messages(reasoning_chat)
    answer_chat = clone_messages(answer_chat)

    # Append the user turn plus placeholder assistant bubbles to both panes.
    reasoning_chat.append({"role": "user", "content": message})
    reasoning_chat.append(
        {
            "role": "assistant",
            "content": "(thinking...)" if thinking else "(reasoning disabled)",
        }
    )
    answer_chat.append({"role": "user", "content": message})
    answer_chat.append({"role": "assistant", "content": ""})

    # First yield shows the placeholders immediately and clears the input box.
    yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

    try:
        tokenizer, model = get_model()
        # Keep only the last MAX_HISTORY_TURNS user/assistant pairs.
        short_history = model_history[-2 * MAX_HISTORY_TURNS :]
        messages = [
            {"role": "system", "content": (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT},
            *short_history,
            {"role": "user", "content": message},
        ]

        # NOTE(review): enable_thinking is a Qwen3 chat-template kwarg —
        # confirm support if MODEL_ID is pointed at a different family.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=thinking,
        )
        inputs = tokenizer(prompt, return_tensors="pt")
        # Tail-truncate the prompt; this can clip the system prompt/template
        # head on very long histories — accepted tradeoff for a small model.
        input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
        attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]

        # Keep special tokens in the stream so <think>/</think> markers
        # survive; split_stream_text strips the rest.
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
            timeout=None,
        )

        generation_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "max_new_tokens": MAX_NEW_TOKENS,
            "do_sample": True,
            # Sampling settings differ between thinking and direct modes.
            "temperature": 0.6 if thinking else 0.7,
            "top_p": 0.95 if thinking else 0.8,
            "top_k": 20,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }

        generation_error = {}

        def run_generation():
            # Runs on a worker thread; _generate_lock serializes access to
            # the shared model across concurrent requests.
            try:
                with _generate_lock:
                    model.generate(**generation_kwargs)
            except Exception as exc:
                generation_error["message"] = str(exc)
                # Unblock the consumer loop so the error can be reported.
                streamer.on_finalized_text("", stream_end=True)

        thread = threading.Thread(target=run_generation, daemon=True)
        thread.start()

        raw_text = ""
        saw_end_think = False

        # Consume decoded chunks as they arrive and re-render both panes.
        for chunk in streamer:
            raw_text += chunk
            reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
            saw_end_think = saw_end_think or saw_end_now

            if thinking:
                if saw_end_think:
                    reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
                else:
                    reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
            else:
                reasoning_chat[-1]["content"] = "(reasoning disabled)"

            answer_chat[-1]["content"] = answer_text
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

        thread.join()

        if generation_error:
            # Surface the generation failure in the answer pane and stop
            # without recording this turn in the model history.
            reasoning_chat[-1]["content"] = ""
            answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
            return

        # Final split over the complete text; if </think> never appeared,
        # treat everything as answer text.
        reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
        if thinking and not saw_end_think:
            reasoning_text = ""
            answer_text = final_cleanup(raw_text)

        if thinking:
            reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
        else:
            reasoning_chat[-1]["content"] = "(reasoning disabled)"

        answer_chat[-1]["content"] = answer_text or "(empty response)"
        # Persist only the truncated history plus this completed turn.
        model_history = short_history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": answer_chat[-1]["content"]},
        ]

        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""

    except Exception as exc:
        # Setup failures (download, tokenization, template) land here.
        reasoning_chat[-1]["content"] = ""
        answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
269
+
270
+
271
# UI layout and event wiring. Widget creation order defines the layout;
# handlers are attached after all widgets exist.
with gr.Blocks(title="Local CPU split-reasoning chat") as demo:
    gr.Markdown(
        "# Local CPU split-reasoning chat\n"
        f"Running a local safetensors model on CPU from `{MODEL_ID}`. No GGUF and no external inference provider.\n\n"
        "The first request downloads the model, so the cold start is slower."
    )

    with gr.Row():
        preset = gr.Dropdown(
            choices=list(PRESETS.keys()),
            value="Math",
            label="Preset prompt",
        )
        thinking = gr.Checkbox(label="Enable thinking", value=True)

    # Defaults mirror the "Math" preset selected above.
    system_prompt = gr.Textbox(
        label="System prompt",
        value=PRESETS["Math"]["system"],
        lines=3,
    )

    user_input = gr.Textbox(
        label="Your message",
        value=PRESETS["Math"]["prompt"],
        lines=4,
    )

    # Read-only illustrative samples; updated when the preset changes.
    with gr.Row():
        sample_reasoning = gr.Textbox(
            label="Sample reasoning",
            value=PRESETS["Math"]["sample_reasoning"],
            lines=5,
            interactive=False,
        )
        sample_answer = gr.Textbox(
            label="Sample answer",
            value=PRESETS["Math"]["sample_answer"],
            lines=5,
            interactive=False,
        )

    with gr.Row():
        send_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear")

    # Two side-by-side panes: model reasoning on the left, answer on the right.
    with gr.Row():
        reasoning_bot = make_chatbot("Reasoning", height=520)
        answer_bot = make_chatbot("Assistant", height=520)

    # Canonical role/content history fed back into the chat template.
    model_history_state = gr.State([])

    preset.change(
        fn=load_preset,
        inputs=preset,
        outputs=[system_prompt, user_input, thinking, sample_reasoning, sample_answer],
    )

    # Both the button and Enter-in-textbox trigger the same streaming handler.
    send_btn.click(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )
    user_input.submit(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )

    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )


# Launched at import time (no __main__ guard) — the Hugging Face Spaces
# convention, where app.py is executed directly. queue() enables
# generator-based streaming handlers.
demo.queue()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==6.5.1
2
+ transformers>=4.51.0
3
+ torch>=2.2.0
4
+ safetensors>=0.4.0