github-actions[bot] committed on
Commit
463eded
·
1 Parent(s): 3ab8c81

🚀 Auto-deploy backend from GitHub (1256e20)

Browse files
config/env.sample CHANGED
@@ -39,9 +39,9 @@ INFERENCE_BACKGROUND_TIMEOUT_SEC=120
39
 
40
  # model defaults
41
  # Global default model for all tasks.
42
- INFERENCE_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
43
  INFERENCE_ENFORCE_QWEN_ONLY=true
44
- INFERENCE_QWEN_LOCK_MODEL=Qwen/Qwen2.5-7B-Instruct
45
  INFERENCE_MAX_NEW_TOKENS=640
46
  INFERENCE_TEMPERATURE=0.2
47
  INFERENCE_TOP_P=0.9
@@ -60,7 +60,7 @@ CHAT_STREAM_NO_TOKEN_TIMEOUT_SEC=25
60
  CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
61
  # Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
62
  HF_QUIZ_MODEL_ID=
63
- HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
64
 
65
  # retry behavior
66
  INFERENCE_MAX_RETRIES=3
 
39
 
40
  # model defaults
41
  # Global default model for all tasks.
42
+ INFERENCE_MODEL_ID=Qwen/Qwen3-32B
43
  INFERENCE_ENFORCE_QWEN_ONLY=true
44
+ INFERENCE_QWEN_LOCK_MODEL=Qwen/Qwen3-32B
45
  INFERENCE_MAX_NEW_TOKENS=640
46
  INFERENCE_TEMPERATURE=0.2
47
  INFERENCE_TOP_P=0.9
 
60
  CHAT_STREAM_TOTAL_TIMEOUT_SEC=120
61
  # Optional: force quiz-generation model. Leave empty to use routing.task_model_map.quiz_generation.
62
  HF_QUIZ_MODEL_ID=
63
+ HF_QUIZ_JSON_REPAIR_MODEL_ID=Qwen/Qwen3-32B
64
 
65
  # retry behavior
66
  INFERENCE_MAX_RETRIES=3
config/models.yaml CHANGED
@@ -1,6 +1,6 @@
1
  models:
2
  primary:
3
- id: Qwen/Qwen2.5-7B-Instruct
4
  description: Global default instruction model for interactive Grade 11-12 math tutoring
5
  max_new_tokens: 640
6
  temperature: 0.25
@@ -26,16 +26,16 @@ models:
26
 
27
  routing:
28
  task_model_map:
29
- # Non-chat defaults stay on Qwen2.5-7B; chat defaults to Qwen3-32B.
30
  # Hard prompts can still escalate via runtime policy in inference_client.
31
  chat: Qwen/Qwen3-32B
32
- verify_solution: Qwen/Qwen2.5-7B-Instruct
33
- lesson_generation: Qwen/Qwen2.5-7B-Instruct
34
- quiz_generation: Qwen/Qwen2.5-7B-Instruct
35
- learning_path: Qwen/Qwen2.5-7B-Instruct
36
- daily_insight: Qwen/Qwen2.5-7B-Instruct
37
- risk_classification: Qwen/Qwen2.5-7B-Instruct
38
- risk_narrative: Qwen/Qwen2.5-7B-Instruct
39
 
40
  task_fallback_model_map:
41
  chat: [] # Chat is strict-primary only (no fallback chain)
@@ -44,7 +44,7 @@ routing:
44
  - meta-llama/Llama-3.1-8B-Instruct # Second fallback
45
 
46
  task_provider_map:
47
- # All tasks use hf_inference router (Qwen2.5-7B-Instruct natively supported)
48
  chat: hf_inference
49
  verify_solution: hf_inference
50
  lesson_generation: hf_inference
 
1
  models:
2
  primary:
3
+ id: Qwen/Qwen3-32B
4
  description: Global default instruction model for interactive Grade 11-12 math tutoring
5
  max_new_tokens: 640
6
  temperature: 0.25
 
26
 
27
  routing:
28
  task_model_map:
29
+ # Keep all task defaults aligned to Qwen3-32B.
30
  # Hard prompts can still escalate via runtime policy in inference_client.
31
  chat: Qwen/Qwen3-32B
32
+ verify_solution: Qwen/Qwen3-32B
33
+ lesson_generation: Qwen/Qwen3-32B
34
+ quiz_generation: Qwen/Qwen3-32B
35
+ learning_path: Qwen/Qwen3-32B
36
+ daily_insight: Qwen/Qwen3-32B
37
+ risk_classification: Qwen/Qwen3-32B
38
+ risk_narrative: Qwen/Qwen3-32B
39
 
40
  task_fallback_model_map:
41
  chat: [] # Chat is strict-primary only (no fallback chain)
 
44
  - meta-llama/Llama-3.1-8B-Instruct # Second fallback
45
 
46
  task_provider_map:
47
+ # All tasks use hf_inference router (Qwen/Qwen3-32B natively supported)
48
  chat: hf_inference
49
  verify_solution: hf_inference
50
  lesson_generation: hf_inference
main.py CHANGED
@@ -173,7 +173,7 @@ CHAT_MODEL = HF_MATH_MODEL_ID
173
 
174
  # Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
175
  HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
176
- HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
177
 
178
  RISK_MODEL = "facebook/bart-large-mnli"
179
  VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
 
173
 
174
  # Dedicated quiz model override. When empty, routing.task_model_map decides quiz model.
175
  HF_QUIZ_MODEL_ID = (os.getenv("HF_QUIZ_MODEL_ID", "").strip() or None)
176
+ HF_QUIZ_JSON_REPAIR_MODEL_ID = os.getenv("HF_QUIZ_JSON_REPAIR_MODEL_ID", "Qwen/Qwen3-32B")
177
 
178
  RISK_MODEL = "facebook/bart-large-mnli"
179
  VERIFICATION_SAMPLES = 3 # Number of samples for self-consistency checking
services/inference_client.py CHANGED
@@ -109,9 +109,9 @@ class InferenceClient:
109
  self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
110
 
111
  self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
112
- self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen2.5-7B-Instruct").strip() or "Qwen/Qwen2.5-7B-Instruct"
113
 
114
- default_model_fallback = str(primary.get("id") or "Qwen/Qwen2.5-7B-Instruct")
115
  env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
116
  self.default_model = env_model_id or default_model_fallback
117
 
@@ -184,16 +184,16 @@ class InferenceClient:
184
  )
185
 
186
  # Default task-to-model routing.
187
- # Chat defaults to Qwen3-32B while other tasks stay on Qwen2.5-7B.
188
  self.task_model_map: Dict[str, str] = {
189
  "chat": "Qwen/Qwen3-32B",
190
- "verify_solution": "Qwen/Qwen2.5-7B-Instruct",
191
- "lesson_generation": "Qwen/Qwen2.5-7B-Instruct",
192
- "quiz_generation": "Qwen/Qwen2.5-7B-Instruct",
193
- "learning_path": "Qwen/Qwen2.5-7B-Instruct",
194
- "daily_insight": "Qwen/Qwen2.5-7B-Instruct",
195
- "risk_classification": "Qwen/Qwen2.5-7B-Instruct",
196
- "risk_narrative": "Qwen/Qwen2.5-7B-Instruct",
197
  }
198
  # Fallback chains (only to other HF-supported models, no featherless-ai)
199
  self.task_fallback_model_map: Dict[str, List[str]] = {
@@ -592,7 +592,7 @@ class InferenceClient:
592
  if provider == "local_space":
593
  return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
594
 
595
- # All models use HF inference router directly (including Qwen/Qwen2.5-7B-Instruct)
596
  return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
597
 
598
  def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
@@ -705,7 +705,7 @@ class InferenceClient:
705
 
706
  def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
707
  """
708
- Call Qwen models via Featherless AI provider (the only provider serving Qwen/Qwen2.5-7B-Instruct).
709
  Uses HF InferenceClient with provider="featherless-ai" for direct model access.
710
  """
711
  if not self.hf_token:
@@ -718,8 +718,7 @@ class InferenceClient:
718
  start = time.perf_counter()
719
 
720
  try:
721
- # Use HF InferenceClient with featherless-ai provider for Qwen models
722
- # This is the only provider that supports Qwen/Qwen2.5-7B-Instruct
723
  client = HFInferenceClient(
724
  model=target_model_base,
725
  token=self.hf_token,
 
109
  self.pro_route_header_value = os.getenv("INFERENCE_PRO_ROUTE_HEADER_VALUE", "true")
110
 
111
  self.enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
112
+ self.qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
113
 
114
+ default_model_fallback = str(primary.get("id") or "Qwen/Qwen3-32B")
115
  env_model_id = os.getenv("INFERENCE_MODEL_ID", "").strip()
116
  self.default_model = env_model_id or default_model_fallback
117
 
 
184
  )
185
 
186
  # Default task-to-model routing.
187
+ # Keep all tasks pinned to Qwen3-32B when qwen-only lock is active.
188
  self.task_model_map: Dict[str, str] = {
189
  "chat": "Qwen/Qwen3-32B",
190
+ "verify_solution": "Qwen/Qwen3-32B",
191
+ "lesson_generation": "Qwen/Qwen3-32B",
192
+ "quiz_generation": "Qwen/Qwen3-32B",
193
+ "learning_path": "Qwen/Qwen3-32B",
194
+ "daily_insight": "Qwen/Qwen3-32B",
195
+ "risk_classification": "Qwen/Qwen3-32B",
196
+ "risk_narrative": "Qwen/Qwen3-32B",
197
  }
198
  # Fallback chains (only to other HF-supported models, no featherless-ai)
199
  self.task_fallback_model_map: Dict[str, List[str]] = {
 
592
  if provider == "local_space":
593
  return self._call_local_space(req, provider=provider, route=route, fallback_depth=fallback_depth)
594
 
595
+ # All models use HF inference router directly (including Qwen/Qwen3-32B)
596
  return self._call_hf_inference(req, provider=provider, route=route, fallback_depth=fallback_depth)
597
 
598
  def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
 
705
 
706
  def _call_hf_inference_direct(self, req: InferenceRequest, *, provider: str, route: str, fallback_depth: int) -> str:
707
  """
708
+ Call Qwen models via Featherless AI provider.
709
  Uses HF InferenceClient with provider="featherless-ai" for direct model access.
710
  """
711
  if not self.hf_token:
 
718
  start = time.perf_counter()
719
 
720
  try:
721
+ # Use HF InferenceClient with featherless-ai provider for Qwen models.
 
722
  client = HFInferenceClient(
723
  model=target_model_base,
724
  token=self.hf_token,
startup_validation.py CHANGED
@@ -92,13 +92,13 @@ def validate_environment() -> None:
92
  logger.info(f" โœ“ INFERENCE_PROVIDER: {inference_provider}")
93
 
94
  # Check model IDs
95
- chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/Qwen2.5-7B-Instruct"
96
  logger.info(f" โœ“ Chat model configured: {chat_model}")
97
 
98
  chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
99
  chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
100
  enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
101
- qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen2.5-7B-Instruct").strip() or "Qwen/Qwen2.5-7B-Instruct"
102
  logger.info(f" โœ“ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
103
  logger.info(f" โœ“ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
104
  logger.info(f" โœ“ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")
 
92
  logger.info(f" โœ“ INFERENCE_PROVIDER: {inference_provider}")
93
 
94
  # Check model IDs
95
+ chat_model = os.getenv("INFERENCE_CHAT_MODEL_ID") or os.getenv("INFERENCE_MODEL_ID") or "Qwen/Qwen3-32B"
96
  logger.info(f" โœ“ Chat model configured: {chat_model}")
97
 
98
  chat_strict = os.getenv("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
99
  chat_hard_trigger = os.getenv("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false").strip().lower() in {"1", "true", "yes", "on"}
100
  enforce_qwen_only = os.getenv("INFERENCE_ENFORCE_QWEN_ONLY", "true").strip().lower() in {"1", "true", "yes", "on"}
101
+ qwen_lock_model = os.getenv("INFERENCE_QWEN_LOCK_MODEL", "Qwen/Qwen3-32B").strip() or "Qwen/Qwen3-32B"
102
  logger.info(f" โœ“ INFERENCE_CHAT_STRICT_MODEL_ONLY: {chat_strict}")
103
  logger.info(f" โœ“ INFERENCE_CHAT_HARD_TRIGGER_ENABLED: {chat_hard_trigger}")
104
  logger.info(f" โœ“ INFERENCE_ENFORCE_QWEN_ONLY: {enforce_qwen_only}")