github-actions[bot] committed
Commit: d778d65
Parent(s): cc25b3c
🚀 Auto-deploy backend from GitHub (3001f56)

Files changed:
- config/env.sample +5 -2
- main.py +41 -2
- tests/test_api.py +20 -0
config/env.sample CHANGED
@@ -38,17 +38,20 @@ INFERENCE_INTERACTIVE_TIMEOUT_SEC=55
 INFERENCE_BACKGROUND_TIMEOUT_SEC=120
 
 # model defaults
-
+# Leave empty unless you intentionally want one global model for every task.
+INFERENCE_MODEL_ID=
 INFERENCE_MAX_NEW_TOKENS=640
 INFERENCE_TEMPERATURE=0.2
 INFERENCE_TOP_P=0.9
-INFERENCE_CHAT_MODEL_ID=
+INFERENCE_CHAT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
 INFERENCE_CHAT_HARD_MODEL_ID=meta-llama/Meta-Llama-3-70B-Instruct
 INFERENCE_CHAT_HARD_TRIGGER_ENABLED=true
 INFERENCE_CHAT_HARD_PROMPT_CHARS=650
 INFERENCE_CHAT_HARD_HISTORY_CHARS=1500
 INFERENCE_CHAT_HARD_KEYWORDS=step-by-step,show all steps,explain each step,justify each step,derive,derivation,proof,prove,rigorous,multi-step,word problem
 CHAT_MAX_NEW_TOKENS=768
+CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC=25
+CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
 # Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
 HF_QUIZ_MODEL_ID=
 HF_QUIZ_JSON_REPAIR_MODEL_ID=meta-llama/Llama-3.1-8B-Instruct
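The two new CHAT_STREAM_* variables bound how long the chat SSE endpoint waits for streamed tokens: an idle (no-token) timeout and an overall cap. A minimal sketch of how they are parsed, restating the constants added in main.py below (the 5-second floor and the clamp of the total to at least the idle timeout come from that change):

import os

# Abort if no new token arrives within this window (floored at 5 s).
CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC = max(5, int(os.getenv("CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC", "25")))

# Overall cap on the stream; never allowed below the idle timeout.
CHAT_STREAM_TOTAL_TIMEOUT_SEC = max(
    CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC,
    int(os.getenv("CHAT_STREAM_TOTAL_TIMEOUT_SEC", "120")),
)

With the sample values, a stream that stalls for more than 25 seconds between tokens, or runs past 120 seconds in total, is cut off and reported to the client as an SSE error event.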
main.py CHANGED
@@ -206,6 +206,11 @@ FIREBASE_AUTH_PROJECT_ALLOWLIST: Set[str] = {
     if value.strip()
 }
 CHAT_MAX_NEW_TOKENS = max(256, int(os.getenv("CHAT_MAX_NEW_TOKENS", "576")))
+CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC = max(5, int(os.getenv("CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC", "25")))
+CHAT_STREAM_TOTAL_TIMEOUT_SEC = max(
+    CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC,
+    int(os.getenv("CHAT_STREAM_TOTAL_TIMEOUT_SEC", "120")),
+)
 
 ALLOWED_UPLOAD_EXTENSIONS: Set[str] = {".csv", ".xlsx", ".xls", ".pdf"}
 ALLOWED_UPLOAD_MIME_TYPES: Set[str] = {
@@ -1929,17 +1934,51 @@ async def chat_tutor_stream(request: ChatRequest):
         return "\n".join(body) + "\n\n"
 
     async def event_generator():
+        stream_iterator = None
+        stream_started_at = time.monotonic()
+        emitted_any_chunk = False
         try:
-            async for chunk in call_hf_chat_stream_async(
+            stream_iterator = call_hf_chat_stream_async(
                 messages,
                 max_tokens=CHAT_MAX_NEW_TOKENS,
                 temperature=0.3,
                 top_p=0.85,
                 task_type="chat",
-            ):
+            )
+
+            while True:
+                elapsed = time.monotonic() - stream_started_at
+                remaining_total = CHAT_STREAM_TOTAL_TIMEOUT_SEC - elapsed
+                if remaining_total <= 0:
+                    raise TimeoutError("Chat stream exceeded total timeout")
+
+                token_timeout = min(CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC, remaining_total)
+                try:
+                    chunk = await asyncio.wait_for(stream_iterator.__anext__(), timeout=token_timeout)
+                except StopAsyncIteration:
+                    break
+
+                if not chunk:
+                    continue
+
+                emitted_any_chunk = True
                 payload = json.dumps({"chunk": chunk}, ensure_ascii=False)
                 yield _sse("chunk", payload)
                 await asyncio.sleep(0)
+        except (asyncio.TimeoutError, TimeoutError):
+            logger.error(
+                "HF chat stream timed out (idle=%ss total=%ss)",
+                CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC,
+                CHAT_STREAM_TOTAL_TIMEOUT_SEC,
+            )
+            err_payload = json.dumps({
+                "detail": (
+                    "AI response stream timed out mid-response. Please retry."
+                    if emitted_any_chunk
+                    else "AI response stream timed out before any tokens were received. Please retry."
+                ),
+            })
+            yield _sse("error", err_payload)
         except Exception as hf_err:
             logger.error(f"HF chat stream failed: {hf_err}")
             err_payload = json.dumps({
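The core of the new event_generator is pulling from an async generator through asyncio.wait_for so that both a per-token (idle) deadline and a total deadline are enforced. A self-contained sketch of the same pattern, with hypothetical names (slow_tokens and consume are illustrative only, not part of this codebase):

import asyncio
import time

async def slow_tokens():
    # Stand-in for the upstream token stream (hypothetical).
    for word in ["hello", "world"]:
        await asyncio.sleep(0.2)
        yield word

async def consume(idle_timeout: float = 0.5, total_timeout: float = 2.0):
    stream = slow_tokens()
    started = time.monotonic()
    while True:
        remaining = total_timeout - (time.monotonic() - started)
        if remaining <= 0:
            raise TimeoutError("stream exceeded total timeout")
        try:
            # Bound the wait for the next token by whichever deadline is closer.
            token = await asyncio.wait_for(stream.__anext__(), timeout=min(idle_timeout, remaining))
        except StopAsyncIteration:
            break
        print(token)

asyncio.run(consume())

If wait_for expires, asyncio.TimeoutError propagates to the caller; in the endpoint above it is caught together with the explicit TimeoutError and converted into an SSE error event rather than silently dropping the connection.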
tests/test_api.py CHANGED
@@ -286,6 +286,26 @@ class TestChatEndpoint:
         assert "event: error" in content
         assert "event: end" in content
 
+    @patch("main.call_hf_chat_stream_async")
+    def test_chat_stream_timeout_emits_error_and_end_events(self, mock_stream_async):
+        async def _slow_stream(*args, **kwargs):
+            await asyncio.sleep(0.05)
+            yield "late chunk"
+
+        mock_stream_async.return_value = _slow_stream()
+
+        with patch.object(main_module, "CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC", 0.01), patch.object(main_module, "CHAT_STREAM_TOTAL_TIMEOUT_SEC", 0.03):
+            with client.stream("POST", "/api/chat/stream", json={
+                "message": "Say hello",
+                "history": [],
+            }) as response:
+                assert response.status_code == 200
+                content = "".join(response.iter_text())
+
+        assert "event: error" in content
+        assert "timed out" in content.lower()
+        assert "event: end" in content
+
 
 class TestHFChatTransport:
     @patch("main.http_requests.post")