Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
463eded
1
Parent(s): 3ab8c81
🚀 Auto-deploy backend from GitHub (1256e20)
Browse files- config/env.sample +3 -3
- config/models.yaml +10 -10
- main.py +1 -1
- services/inference_client.py +13 -14
- startup_validation.py +2 -2
config/env.sample
CHANGED
|
@@ -39,9 +39,9 @@ INFERENCE_BACKGROUND_TIMEOUT_SEC=120
|
|
| 39 |
|
| 40 |
# model defaults
|
| 41 |
# Global default model for all tasks.
|
| 42 |
-
INFERENCE_MODEL_ID=Qwen/
|
| 43 |
INFERENCE_ENFORCE_QWEN_ONLY=true
|
| 44 |
-
INFERENCE_QWEN_LOCK_MODEL=Qwen/
|
| 45 |
INFERENCE_MAX_NEW_TOKENS=640
|
| 46 |
INFERENCE_TEMPERATURE=0.2
|
| 47 |
INFERENCE_TOP_P=0.9
|
|
@@ -60,7 +60,7 @@ CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC=25
|
|
| 60 |
CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
|
| 61 |
# Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
|
| 62 |
HF_QUIZ_MODEL_ID=
|
| 63 |
-
HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/
|
| 64 |
|
| 65 |
# retry behavior
|
| 66 |
INFERENCE_MAX_RETRIES=3
|
|
|
|
| 39 |
|
| 40 |
# model defaults
|
| 41 |
# Global default model for all tasks.
|
| 42 |
+
INFERENCE_MODEL_ID=Qwen/Qwen3-32B
|
| 43 |
INFERENCE_ENFORCE_QWEN_ONLY=true
|
| 44 |
+
INFERENCE_QWEN_LOCK_MODEL=Qwen/Qwen3-32B
|
| 45 |
INFERENCE_MAX_NEW_TOKENS=640
|
| 46 |
INFERENCE_TEMPERATURE=0.2
|
| 47 |
INFERENCE_TOP_P=0.9
|
|
|
|
| 60 |
CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
|
| 61 |
# Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
|
| 62 |
HF_QUIZ_MODEL_ID=
|
| 63 |
+
HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/Qwen3-32B
|
| 64 |
|
| 65 |
# retry behavior
|
| 66 |
INFERENCE_MAX_RETRIES=3
|
config/models.yaml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
models:
|
| 2 |
primary:
|
| 3 |
-
id: Qwen/
|
| 4 |
description: Global default instruction model for interactive Grade 11-12 math tutoring
|
| 5 |
max_new_tokens: 640
|
| 6 |
temperature: 0.25
|
|
@@ -26,16 +26,16 @@ models:
|
|
| 26 |
|
| 27 |
routing:
|
| 28 |
task_model_map:
|
| 29 |
-
#
|
| 30 |
# Hard prompts can still escalate via runtime policy in inference_client.
|
| 31 |
chat: Qwen/Qwen3-32B
|
| 32 |
-
verify_solution: Qwen/
|
| 33 |
-
lesson_generation: Qwen/
|
| 34 |
-
quiz_generation: Qwen/
|
| 35 |
-
learning_path: Qwen/
|
| 36 |
-
daily_insight: Qwen/
|
| 37 |
-
risk_classification: Qwen/
|
| 38 |
-
risk_narrative: Qwen/
|
| 39 |
|
| 40 |
task_fallback_model_map:
|
| 41 |
chat: [] # Chat is strict-primary only (no fallback chain)
|
|
@@ -44,7 +44,7 @@ routing:
|
|
| 44 |
- meta-llama/Llama-3.1-8B-Instruct # Second fallback
|
| 45 |
|
| 46 |
task_provider_map:
|
| 47 |
-
# All tasks use hf_inference router (
|
| 48 |
chat: hf_inference
|
| 49 |
verify_solution: hf_inference
|
| 50 |
lesson_generation: hf_inference
|
|
|
|
| 1 |
models:
|
| 2 |
primary:
|
| 3 |
+
id: Qwen/Qwen3-32B
|
| 4 |
description: Global default instruction model for interactive Grade 11-12 math tutoring
|
| 5 |
max_new_tokens: 640
|
| 6 |
temperature: 0.25
|
|
|
|
| 26 |
|
| 27 |
routing:
|
| 28 |
task_model_map:
|
| 29 |
+
# Keep all task defaults aligned to Qwen3-32B.
|
| 30 |
# Hard prompts can still escalate via runtime policy in inference_client.
|
| 31 |
chat: Qwen/Qwen3-32B
|
| 32 |
+
verify_solution: Qwen/Qwen3-32B
|
| 33 |
+
lesson_generation: Qwen/Qwen3-32B
|
| 34 |
+
quiz_generation: Qwen/Qwen3-32B
|
| 35 |
+
learning_path: Qwen/Qwen3-32B
|
| 36 |
+
daily_insight: Qwen/Qwen3-32B
|
| 37 |
+
risk_classification: Qwen/Qwen3-32B
|
| 38 |
+
risk_narrative: Qwen/Qwen3-32B
|
| 39 |
|
| 40 |
task_fallback_model_map:
|
| 41 |
chat: [] # Chat is strict-primary only (no fallback chain)
|
|
|
|
| 44 |
- meta-llama/Llama-3.1-8B-Instruct # Second fallback
|
| 45 |
|
| 46 |
task_provider_map:
|
| 47 |
+
# All tasks use hf_inference router (Qwen/Qwen3-32B natively supported)
|
| 48 |
chat: hf_inference
|
| 49 |
verify_solution: hf_inference
|
| 50 |
lesson_generation: hf_inference
|
main.py
CHANGED
|
@@ -173,7 +173,7 @@ CHAT_MODEL = HF_MATH_MODEL_ID
|
|
| 173 |
|
| 174 |
# Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
|
| 175 |
HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
|
| 176 |
-
HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/
|
| 177 |
|
| 178 |
RISK_MODEL = "facebook/bart-large-mnli"
|
| 179 |
VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
|
|
|
|
| 173 |
|
| 174 |
# Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
|
| 175 |
HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
|
| 176 |
+
HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/Qwen3-32B")
|
| 177 |
|
| 178 |
RISK_MODEL = "facebook/bart-large-mnli"
|
| 179 |
VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
|
services/inference_client.py
CHANGED
|
@@ -109,9 +109,9 @@ class InferenceClient:
|
|
| 109 |
self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
|
| 110 |
|
| 111 |
self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 112 |
-
self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/
|
| 113 |
|
| 114 |
-
default_model_fallback = str(primary.get("id") or "Qwen/
|
| 115 |
env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
|
| 116 |
self.default_model = env_model_id or default_model_fallback
|
| 117 |
|
|
@@ -184,16 +184,16 @@ class InferenceClient:
|
|
| 184 |
)
|
| 185 |
|
| 186 |
# Default task-to-model routing.
|
| 187 |
-
#
|
| 188 |
self.task_model_map: Dict[str, str] = {
|
| 189 |
"chat": "Qwen/Qwen3-32B",
|
| 190 |
-
"verify_solution": "Qwen/
|
| 191 |
-
"lesson_generation": "Qwen/
|
| 192 |
-
"quiz_generation": "Qwen/
|
| 193 |
-
"learning_path": "Qwen/
|
| 194 |
-
"daily_insight": "Qwen/
|
| 195 |
-
"risk_classification": "Qwen/
|
| 196 |
-
"risk_narrative": "Qwen/
|
| 197 |
}
|
| 198 |
# Fallback chains (only to other HF-supported models, no featherless-ai)
|
| 199 |
self.task_fallback_model_map: Dict[str, List[str]] = {
|
|
@@ -592,7 +592,7 @@ class InferenceClient:
|
|
| 592 |
if provider == "local_space":
|
| 593 |
return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 594 |
|
| 595 |
-
# All models use HF inference router directly (including Qwen/
|
| 596 |
return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 597 |
|
| 598 |
def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
|
|
@@ -705,7 +705,7 @@ class InferenceClient:
|
|
| 705 |
|
| 706 |
def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
|
| 707 |
"""
|
| 708 |
-
Call Qwen models via Featherless AI provider
|
| 709 |
Uses HF InferenceClient with provider="featherless-ai" for direct model access.
|
| 710 |
"""
|
| 711 |
if not self.hf_token:
|
|
@@ -718,8 +718,7 @@ class InferenceClient:
|
|
| 718 |
start = time.perf_counter()
|
| 719 |
|
| 720 |
try:
|
| 721 |
-
# Use HF InferenceClient with featherless-ai provider for Qwen models
|
| 722 |
-
# This is the only provider that supports Qwen/Qwen2.5-7B-Instruct
|
| 723 |
client = HFInferenceClient(
|
| 724 |
model=target_model_base,
|
| 725 |
token=self.hf_token,
|
|
|
|
| 109 |
self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
|
| 110 |
|
| 111 |
self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 112 |
+
self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
|
| 113 |
|
| 114 |
+
default_model_fallback = str(primary.get("id") or "Qwen/Qwen3-32B")
|
| 115 |
env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
|
| 116 |
self.default_model = env_model_id or default_model_fallback
|
| 117 |
|
|
|
|
| 184 |
)
|
| 185 |
|
| 186 |
# Default task-to-model routing.
|
| 187 |
+
# Keep all tasks pinned to Qwen3-32B when qwen-only lock is active.
|
| 188 |
self.task_model_map: Dict[str, str] = {
|
| 189 |
"chat": "Qwen/Qwen3-32B",
|
| 190 |
+
"verify_solution": "Qwen/Qwen3-32B",
|
| 191 |
+
"lesson_generation": "Qwen/Qwen3-32B",
|
| 192 |
+
"quiz_generation": "Qwen/Qwen3-32B",
|
| 193 |
+
"learning_path": "Qwen/Qwen3-32B",
|
| 194 |
+
"daily_insight": "Qwen/Qwen3-32B",
|
| 195 |
+
"risk_classification": "Qwen/Qwen3-32B",
|
| 196 |
+
"risk_narrative": "Qwen/Qwen3-32B",
|
| 197 |
}
|
| 198 |
# Fallback chains (only to other HF-supported models, no featherless-ai)
|
| 199 |
self.task_fallback_model_map: Dict[str, List[str]] = {
|
|
|
|
| 592 |
if provider == "local_space":
|
| 593 |
return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 594 |
|
| 595 |
+
# All models use HF inference router directly (including Qwen/Qwen3-32B)
|
| 596 |
return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
|
| 597 |
|
| 598 |
def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
|
|
|
|
| 705 |
|
| 706 |
def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
|
| 707 |
"""
|
| 708 |
+
Call Qwen models via Featherless AI provider.
|
| 709 |
Uses HF InferenceClient with provider="featherless-ai" for direct model access.
|
| 710 |
"""
|
| 711 |
if not self.hf_token:
|
|
|
|
| 718 |
start = time.perf_counter()
|
| 719 |
|
| 720 |
try:
|
| 721 |
+
# Use HF InferenceClient with featherless-ai provider for Qwen models.
|
|
|
|
| 722 |
client = HFInferenceClient(
|
| 723 |
model=target_model_base,
|
| 724 |
token=self.hf_token,
|
startup_validation.py
CHANGED
|
@@ -92,13 +92,13 @@ def validate_environment() -> None:
|
|
| 92 |
logger.info(f" โ INFERENCE_PROVIDER: {inference_provider}")
|
| 93 |
|
| 94 |
# Check model IDs
|
| 95 |
-
chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/
|
| 96 |
logger.info(f" โ Chat model configured: {chat_model}")
|
| 97 |
|
| 98 |
chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 99 |
chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
|
| 100 |
enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 101 |
-
qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/
|
| 102 |
logger.info(f" โ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
|
| 103 |
logger.info(f" โ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
|
| 104 |
logger.info(f" โ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")
|
|
|
|
| 92 |
logger.info(f" โ INFERENCE_PROVIDER: {inference_provider}")
|
| 93 |
|
| 94 |
# Check model IDs
|
| 95 |
+
chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/Qwen3-32B"
|
| 96 |
logger.info(f" โ Chat model configured: {chat_model}")
|
| 97 |
|
| 98 |
chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 99 |
chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
|
| 100 |
enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
|
| 101 |
+
qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
|
| 102 |
logger.info(f" โ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
|
| 103 |
logger.info(f" โ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
|
| 104 |
logger.info(f" โ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")
|