Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
463eded
1
Parent(s): 3ab8c81
🚀 Auto-deploy backend from GitHub (1256e20)
Browse files- config/env.sample +3 -3
- config/models.yaml +10 -10
- main.py +1 -1
- services/inference_client.py +13 -14
- startup_validation.py +2 -2
config/env.sample
CHANGED
|
@@ -39,9 +39,9 @@ INFERENCE_BACKGROUND_TIMEOUT_SEC=120
|
|
| 39 |
|
| 40 |
# model defaults
|
| 41 |
# Global default model for all tasks.
|
| 42 |
-
INFERENCE_MODEL_ID=Qwen/
|
| 43 |
INFERENCE_ENFORCE_QWEN_ONLY=true
|
| 44 |
-
INFERENCE_QWEN_LOCK_MODEL=Qwen/
|
| 45 |
INFERENCE_MAX_NEW_TOKENS=640
|
| 46 |
INFERENCE_TEMPERATURE=0.2
|
| 47 |
INFERENCE_TOP_P=0.9
|
|
@@ -60,7 +60,7 @@ CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC=25
|
|
| 60 |
CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
|
| 61 |
# Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
|
| 62 |
HF_QUIZ_MODEL_ID=
|
| 63 |
-
HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/
|
| 64 |
|
| 65 |
# retry behavior
|
| 66 |
INFERENCE_MAX_RETRIES=3
|
|
|
|
| 39 |
|
| 40 |
# model defaults
|
| 41 |
# Global default model for all tasks.
|
| 42 |
+
INFERENCE_MODEL_ID=Qwen/Qwen3-32B
|
| 43 |
INFERENCE_ENFORCE_QWEN_ONLY=true
|
| 44 |
+
INFERENCE_QWEN_LOCK_MODEL=Qwen/Qwen3-32B
|
| 45 |
INFERENCE_MAX_NEW_TOKENS=640
|
| 46 |
INFERENCE_TEMPERATURE=0.2
|
| 47 |
INFERENCE_TOP_P=0.9
|
|
|
|
| 60 |
CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
|
| 61 |
# Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
|
| 62 |
HF_QUIZ_MODEL_ID=
|
| 63 |
+
HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/Qwen3-32B
|
| 64 |
|
| 65 |
# retry behavior
|
| 66 |
INFERENCE_MAX_RETRIES=3
|
config/models.yaml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
models:
|
| 2 |
primary:
|
| 3 |
-
id: Qwen/
|
| 4 |
description: Global default instruction model for interactive Grade 11-12 math tutoring
|
| 5 |
max_new_tokens: 640
|
| 6 |
temperature: 0.25
|
|
@@ -26,16 +26,16 @@ models:
|
|
| 26 |
|
| 27 |
routing:
|
| 28 |
task_model_map:
|
| 29 |
-
#
|
| 30 |
# Hard prompts can still escalate via runtime policy in inference_client.
|
| 31 |
chat: Qwen/Qwen3-32B
|
| 32 |
-
verify_solution: Qwen/
|
| 33 |
-
lesson_generation: Qwen/
|
| 34 |
-
quiz_generation: Qwen/
|
| 35 |
-
learning_path: Qwen/
|
| 36 |
-
daily_insight: Qwen/
|
| 37 |
-
risk_classification: Qwen/
|
| 38 |
-
risk_narrative: Qwen/
|
| 39 |
|
| 40 |
task_fallback_model_map:
|
| 41 |
chat: [] # Chat is strict-primary only (no fallback chain)
|
|
@@ -44,7 +44,7 @@ routing:
|
|
| 44 |
- meta-llama/Llama-3.1-8B-Instruct # Second fallback
|
| 45 |
|
| 46 |
task_provider_map:
|
| 47 |
-
# All tasks use hf_inference router (
|
| 48 |
chat: hf_inference
|
| 49 |
verify_solution: hf_inference
|
| 50 |
lesson_generation: hf_inference
|
|
|
|
| 1 |
models:
|
| 2 |
primary:
|
| 3 |
+
id: Qwen/Qwen3-32B
|
| 4 |
description: Global default instruction model for interactive Grade 11-12 math tutoring
|
| 5 |
max_new_tokens: 640
|
| 6 |
temperature: 0.25
|
|
|
|
| 26 |
|
| 27 |
routing:
|
| 28 |
task_model_map:
|
| 29 |
+
# Keep all task defaults aligned to Qwen3-32B.
|
| 30 |
# Hard prompts can still escalate via runtime policy in inference_client.
|
| 31 |
chat: Qwen/Qwen3-32B
|
| 32 |
+
verify_solution: Qwen/Qwen3-32B
|
| 33 |
+
lesson_generation: Qwen/Qwen3-32B
|
| 34 |
+
quiz_generation: Qwen/Qwen3-32B
|
| 35 |
+
learning_path: Qwen/Qwen3-32B
|
| 36 |
+
daily_insight: Qwen/Qwen3-32B
|
| 37 |
+
risk_classification: Qwen/Qwen3-32B
|
| 38 |
+
risk_narrative: Qwen/Qwen3-32B
|
| 39 |
|
| 40 |
task_fallback_model_map:
|
| 41 |
chat: [] # Chat is strict-primary only (no fallback chain)
|
|
|
|
| 44 |
- meta-llama/Llama-3.1-8B-Instruct # Second fallback
|
| 45 |
|
| 46 |
task_provider_map:
|
| 47 |
+
# All tasks use hf_inference router (Qwen/Qwen3-32B natively supported)
|
| 48 |
chat: hf_inference
|
| 49 |
verify_solution: hf_inference
|
| 50 |
lesson_generation: hf_inference
|
main.py
CHANGED
|
@@ -173,7 +173,7 @@ CHAT_MODEL = HF_MATH_MODEL_ID
|
|
| 173 |
|
| 174 |
# Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
|
| 175 |
HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
|
| 176 |
-
HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/
|
| 177 |
|
| 178 |
RISK_MODEL = "facebook/bart-large-mnli"
|
| 179 |
VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
|
|
|
|
| 173 |
|
| 174 |
# Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
|
| 175 |
HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
|
| 176 |
+
HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/Qwen3-32B")
|
| 177 |
|
| 178 |
RISK_MODEL = "facebook/bart-large-mnli"
|
| 179 |
VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
|
services/inference_client.py
CHANGED
|
@@ -109,9 +109,9 @@ class InferenceClient:
|
|
| 109 |
self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
|
| 110 |
|
| 111 |
self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 112 |
-
self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/
|
| 113 |
|
| 114 |
-
default_model_fallback = str(primary.get("id") or "Qwen/
|
| 115 |
env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
|
| 116 |
self.default_model = env_model_id or default_model_fallback
|
| 117 |
|
|
@@ -184,16 +184,16 @@ class InferenceClient:
|
|
| 184 |
)
|
| 185 |
|
| 186 |
# Default task-to-model routing.
|
| 187 |
-
#
|
| 188 |
self.task_model_map: Dict[str, str] = {
|
| 189 |
"chat": "Qwen/Qwen3-32B",
|
| 190 |
-
"verify_solution": "Qwen/
|
| 191 |
-
"lesson_generation": "Qwen/
|
| 192 |
-
"quiz_generation": "Qwen/
|
| 193 |
-
"learning_path": "Qwen/
|
| 194 |
-
"daily_insight": "Qwen/
|
| 195 |
-
"risk_classification": "Qwen/
|
| 196 |
-
"risk_narrative": "Qwen/
|
| 197 |
}
|
| 198 |
# Fallback chains (only to other HF-supported models, no featherless-ai)
|
| 199 |
self.task_fallback_model_map: Dict[str, List[str]] = {
|
|
@@ -592,7 +592,7 @@ class InferenceClient:
|
|
| 592 |
if provider == "local_space":
|
| 593 |
return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 594 |
|
| 595 |
-
# All models use HF inference router directly (including Qwen/
|
| 596 |
return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 597 |
|
| 598 |
def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
|
|
@@ -705,7 +705,7 @@ class InferenceClient:
|
|
| 705 |
|
| 706 |
def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
|
| 707 |
"""
|
| 708 |
-
Call Qwen models via Featherless AI provider
|
| 709 |
Uses HF InferenceClient with provider="featherless-ai" for direct model access.
|
| 710 |
"""
|
| 711 |
if not self.hf_token:
|
|
@@ -718,8 +718,7 @@ class InferenceClient:
|
|
| 718 |
start = time.perf_counter()
|
| 719 |
|
| 720 |
try:
|
| 721 |
-
# Use HF InferenceClient with featherless-ai provider for Qwen models
|
| 722 |
-
# This is the only provider that supports Qwen/Qwen2.5-7B-Instruct
|
| 723 |
client = HFInferenceClient(
|
| 724 |
model=target_model_base,
|
| 725 |
token=self.hf_token,
|
|
|
|
| 109 |
self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
|
| 110 |
|
| 111 |
self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 112 |
+
self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
|
| 113 |
|
| 114 |
+
default_model_fallback = str(primary.get("id") or "Qwen/Qwen3-32B")
|
| 115 |
env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
|
| 116 |
self.default_model = env_model_id or default_model_fallback
|
| 117 |
|
|
|
|
| 184 |
)
|
| 185 |
|
| 186 |
# Default task-to-model routing.
|
| 187 |
+
# Keep all tasks pinned to Qwen3-32B when qwen-only lock is active.
|
| 188 |
self.task_model_map: Dict[str, str] = {
|
| 189 |
"chat": "Qwen/Qwen3-32B",
|
| 190 |
+
"verify_solution": "Qwen/Qwen3-32B",
|
| 191 |
+
"lesson_generation": "Qwen/Qwen3-32B",
|
| 192 |
+
"quiz_generation": "Qwen/Qwen3-32B",
|
| 193 |
+
"learning_path": "Qwen/Qwen3-32B",
|
| 194 |
+
"daily_insight": "Qwen/Qwen3-32B",
|
| 195 |
+
"risk_classification": "Qwen/Qwen3-32B",
|
| 196 |
+
"risk_narrative": "Qwen/Qwen3-32B",
|
| 197 |
}
|
| 198 |
# Fallback chains (only to other HF-supported models, no featherless-ai)
|
| 199 |
self.task_fallback_model_map: Dict[str, List[str]] = {
|
|
|
|
| 592 |
if provider == "local_space":
|
| 593 |
return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 594 |
|
| 595 |
+
# All models use HF inference router directly (including Qwen/Qwen3-32B)
|
| 596 |
return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 597 |
|
| 598 |
def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
|
|
|
|
| 705 |
|
| 706 |
def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
|
| 707 |
"""
|
| 708 |
+
Call Qwen models via Featherless AI provider.
|
| 709 |
Uses HF InferenceClient with provider="featherless-ai" for direct model access.
|
| 710 |
"""
|
| 711 |
if not self.hf_token:
|
|
|
|
| 718 |
start = time.perf_counter()
|
| 719 |
|
| 720 |
try:
|
| 721 |
+
# Use HF InferenceClient with featherless-ai provider for Qwen models.
|
|
|
|
| 722 |
client = HFInferenceClient(
|
| 723 |
model=target_model_base,
|
| 724 |
token=self.hf_token,
|
startup_validation.py
CHANGED
|
@@ -92,13 +92,13 @@ def validate_environment() -> None:
|
|
| 92 |
logger.info(f" โ INFERENCE_PROVIDER: {inference_provider}")
|
| 93 |
|
| 94 |
# Check model IDs
|
| 95 |
-
chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/
|
| 96 |
logger.info(f" โ Chat model configured: {chat_model}")
|
| 97 |
|
| 98 |
chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 99 |
chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
|
| 100 |
enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 101 |
-
qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/
|
| 102 |
logger.info(f" โ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
|
| 103 |
logger.info(f" โ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
|
| 104 |
logger.info(f" โ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")
|
|
|
|
| 92 |
logger.info(f" โ INFERENCE_PROVIDER: {inference_provider}")
|
| 93 |
|
| 94 |
# Check model IDs
|
| 95 |
+
chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/Qwen3-32B"
|
| 96 |
logger.info(f" โ Chat model configured: {chat_model}")
|
| 97 |
|
| 98 |
chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 99 |
chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
|
| 100 |
enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 101 |
+
qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
|
| 102 |
logger.info(f" โ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
|
| 103 |
logger.info(f" โ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
|
| 104 |
logger.info(f" โ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")
|