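# Gradio Space that streams chat completions, with optional "thinking" traces,
# from a GLM-4.5 model served behind an OpenAI-compatible API.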
import copy
import re
import time
import html

from openai import OpenAI
import gradio as gr
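# Module-level flag the UI uses to cancel an in-flight generation.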
stop_generation = False
def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    global stop_generation
    # OpenAI() picks up OPENAI_API_KEY and OPENAI_BASE_URL from the environment;
    # the base URL is expected to point at an OpenAI-compatible GLM-4.5 endpoint.
    client = OpenAI()
    response = client.chat.completions.create(
        model="glm-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=32000,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    print(response)
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            delta = chunk.choices[0].delta
            yield delta
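# Wraps prompt construction and the HTML rendering used by the chat loop below.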
class GLM45Model:
    def _strip_html(self, text: str) -> str:
        # Drop any HTML tags so only plain text is sent back to the model.
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        # Render the reasoning trace in a collapsible <details> block and the answer
        # as escaped HTML, so both can be displayed while the response streams in.
        think_html = ""
        if reasoning_content and not skip_think:
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + think_content
                + "</div></details>"
            )
        answer_html = ""
        if content:
            content_escaped = html.escape(content)
            content_formatted = content_escaped.replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"
        return think_html + answer_html
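    # Convert the rendered chat history back into OpenAI-style messages,
    # stripping the <details> thinking blocks and any other HTML.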
    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs
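    # Stream one completion for the given history, yielding a growing HTML fragment
    # that combines the reasoning trace (if enabled) and the answer so far.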
    def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Depending on the server, deltas arrive either as objects with
                # reasoning_content/content attributes or as plain dicts.
                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
                    reasoning_buffer += delta.reasoning_content
                elif hasattr(delta, 'content') and delta.content:
                    content_buffer += delta.content
                elif isinstance(delta, dict):
                    if delta.get('reasoning_content'):
                        reasoning_buffer += delta['reasoning_content']
                    if delta.get('content'):
                        content_buffer += delta['content']
                yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
        except Exception as e:
            error_msg = f"Error during streaming: {str(e)}"
            yield self._stream_fragment("", error_msg)
glm45 = GLM45Model()
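# Gradio callback: append the user turn, then stream the assistant's rendered
# HTML into a placeholder history entry as it is generated.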
def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # chat() is a generator, so even the empty-input case must yield, not return, its value.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    user_rec = {"role": "user", "content": msg.strip()}
    raw_hist.append(user_rec)
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        error_content = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        place["content"] = error_content
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""
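# "Clear" handler: signal any running stream to stop, then reset the chat state.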
def reset():
    global stop_generation
    stop_generation = True
    # Give any running generator a moment to observe the stop flag.
    time.sleep(0.1)
    return [], [], ""
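# Build the UI: chat panel on the left; thinking toggle, temperature, and system prompt on the right.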
demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())

with demo:
    gr.Markdown(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
        "This Space uses the API version of the service for faster responses.<br>"
        "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | "
        "<a href='https://github.com/THUDM/GLM-4.5'>GitHub</a> | "
        "<a href='https://www.bigmodel.cn'>API</a></div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=2):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.Markdown(
                "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
                "ON: the model reasons before answering.<br>"
                "OFF: thinking is disabled and the model answers directly, without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=8)
    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox],
    )
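# A minimal way to run this locally, assuming an OpenAI-compatible GLM-4.5 server
# (e.g. vLLM) is already listening; the base URL, key, and file name are placeholders:
#   OPENAI_BASE_URL=http://localhost:8000/v1 OPENAI_API_KEY=EMPTY python app.py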
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", share=True)