Yash030 and Claude Opus 4.7 committed
Commit ebba9d6 · 1 Parent(s): 49813da

Speed optimizations and enhanced auto-model fallback routing

- Connection pool tuning: increased max_connections (100→200), max_keepalive (20→50), reduced keepalive_expiry (30s→15s)
- NIM concurrency: increased default max_concurrency to max(2x configured, 20) for better throughput
- Faster timeouts: connect (30→15s), first_chunk (60→45s), fallback (60→30s)
- Retry backoff: reduced base_delay (1→0.5s), max_delay (60→30s)
- New trivial request fast-path for hi/ok/status checks
- Auto model now routes to all 7 NIM models by default
- Pre-flight rate limit check before trying each candidate
- Timeout/transient errors trigger automatic fallback to next model
- Import cleanup for httpx/asyncio at module level

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

api/detection.py CHANGED
@@ -9,6 +9,37 @@ from core.anthropic import extract_text_from_content
 from .models.anthropic import MessagesRequest
 
 
+def is_trivial_text_request(request_data: MessagesRequest) -> tuple[bool, str]:
+    """Detect trivial requests that can be fast-pathed.
+
+    Returns (is_trivial, text_content) for trivial requests that only need
+    a simple acknowledgment or echo response.
+    """
+    # Only for single short user messages with max_tokens=1
+    if request_data.max_tokens != 1:
+        return False, ""
+    if len(request_data.messages) != 1:
+        return False, ""
+    msg = request_data.messages[0]
+    if msg.role != "user":
+        return False, ""
+
+    text = extract_text_from_content(msg.content)
+    text_lower = text.lower().strip()
+
+    # Single word or very short queries
+    if len(text_lower) < 50:
+        # "hi", "hello", "ok", "thanks", etc.
+        if text_lower in ("hi", "hello", "ok", "thanks", "thank you", "yes", "no", "okay"):
+            return True, f"OK. {text}"
+
+        # Health/status checks
+        if any(kw in text_lower for kw in ["status", "health", "ping", "are you"]):
+            return True, "I'm ready."
+
+    return False, ""
+
+
 def is_quota_check_request(request_data: MessagesRequest) -> bool:
     """Check if this is a quota probe request.
 
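For illustration, a minimal sketch of how the new detector fires. The field names match the diff above, but the exact MessagesRequest constructor (and whether it accepts dict-style messages) is an assumption:

from api.detection import is_trivial_text_request
from api.models.anthropic import MessagesRequest

req = MessagesRequest(
    model="claude-3-5-haiku",  # hypothetical model id; routing is unaffected
    max_tokens=1,              # the fast-path only fires for max_tokens == 1
    messages=[{"role": "user", "content": "hi"}],
)
trivial, reply = is_trivial_text_request(req)
# trivial is True and reply == "OK. hi" -- answered locally, no provider call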
api/model_router.py CHANGED
@@ -8,17 +8,24 @@ from loguru import logger
 
 from config.provider_ids import SUPPORTED_PROVIDER_IDS
 from config.settings import Settings
+from core.model_capabilities import find_best_model_for_task
 from core.session_tracker import SessionTracker
-from core.model_capabilities import (
-    get_model_capabilities,
-    find_best_model_for_task,
-    find_models_with_capability,
-)
 from core.task_detector import TaskDetector
+from providers.rate_limit import GlobalRateLimiter
 
 from .gateway_model_ids import decode_gateway_model_id
 from .models.anthropic import MessagesRequest, TokenCountRequest
-from providers.rate_limit import GlobalRateLimiter
+
+# Default NIM models to include in auto routing (in order of preference)
+DEFAULT_NIM_AUTO_MODELS = [
+    "nvidia_nim/qwen/qwen3-coder-480b-a35b-instruct",
+    "nvidia_nim/z-ai/glm4.7",
+    "nvidia_nim/stepfun-ai/step-3.5-flash",
+    "nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
+    "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct",
+    "nvidia_nim/bytedance/seed-oss-36b-instruct",
+    "nvidia_nim/mistralai/mistral-nemotron",
+]
 
 
 @dataclass(frozen=True, slots=True)
@@ -151,6 +158,12 @@ class ModelRouter:
 
     Used by the 'auto' routing logic to implement provider-side failover.
     Considers session load for fair resource sharing across multiple clients.
+
+    Priority order:
+    1. AUTO_MODEL_ORDER (if configured)
+    2. MODEL (primary)
+    3. NVIDIA NIM fallback models (if configured, or DEFAULT_NIM_AUTO_MODELS)
+    4. MODEL_OPUS, MODEL_SONNET, MODEL_HAIKU
     """
     if not self._is_auto(claude_model_name):
         return [self.resolve(claude_model_name)]
@@ -206,20 +219,24 @@
             source,
         )
 
-        # 1. Preferred order (AUTO_MODEL_ORDER)
+        # 1. AUTO_MODEL_ORDER (user-configured priority)
         order_csv = (self._settings.auto_model_order or "").strip()
         if order_csv:
             for cand in [c.strip() for c in order_csv.split(",") if c.strip()]:
                 add_candidate(cand, "AUTO_MODEL_PRIORITY")
 
-        # 2. Main MODEL
+        # 2. Primary MODEL
         add_candidate(self._settings.model, "MODEL")
 
-        # 3. NVIDIA Fallbacks
+        # 3. NVIDIA Fallbacks - use configured or defaults
         nim_csv = (self._settings.nvidia_nim_fallback_models or "").strip()
         if nim_csv:
            for cand in [c.strip() for c in nim_csv.split(",") if c.strip()]:
                add_candidate(cand, "NVIDIA_NIM_FALLBACK_MODELS")
+        else:
+            # Use default NIM models when no explicit fallback configured
+            for cand in DEFAULT_NIM_AUTO_MODELS:
+                add_candidate(cand, "DEFAULT_NIM_AUTO_MODELS")
 
         # 4. Model-specific overrides
         add_candidate(self._settings.model_opus, "MODEL_OPUS")
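To make the ordering concrete, here is a standalone sketch of the candidate assembly; `add` is a simplified stand-in for the `add_candidate` closure in the diff, which additionally resolves provider IDs and logs the source:

def build_candidates(auto_order: str, model: str, nim_fallbacks: str) -> list[str]:
    seen: set[str] = set()
    out: list[str] = []

    def add(cand: str) -> None:  # simplified add_candidate: ordered dedup
        if cand and cand not in seen:
            seen.add(cand)
            out.append(cand)

    for c in (auto_order or "").split(","):           # 1. AUTO_MODEL_ORDER
        add(c.strip())
    add(model)                                        # 2. primary MODEL
    configured = [c.strip() for c in (nim_fallbacks or "").split(",") if c.strip()]
    for c in configured or DEFAULT_NIM_AUTO_MODELS:   # 3. configured or default NIM
        add(c)
    return out

# With no AUTO_MODEL_ORDER and no explicit fallbacks, the result is the
# primary MODEL followed by all seven DEFAULT_NIM_AUTO_MODELS, deduplicated.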
api/optimization_handlers.py CHANGED
@@ -17,6 +17,7 @@ from .detection import (
     is_quota_check_request,
     is_suggestion_mode_request,
     is_title_generation_request,
+    is_trivial_text_request,
 )
 from .models.anthropic import MessagesRequest
 from .models.responses import MessagesResponse, Usage
@@ -133,8 +134,26 @@ def try_filepath_mock(
     )
 
 
+def try_trivial_text(
+    request_data: MessagesRequest, settings: Settings
+) -> MessagesResponse | None:
+    """Fast-path trivial text requests (hi, ok, status checks) without API call."""
+    is_trivial, text = is_trivial_text_request(request_data)
+    if not is_trivial:
+        return None
+
+    logger.info("Optimization: Fast-path trivial text request")
+    return _text_response(
+        request_data,
+        text,
+        input_tokens=5,
+        output_tokens=3,
+    )
+
+
 # Cheapest/most common optimizations first for faster short-circuit.
 OPTIMIZATION_HANDLERS = [
+    try_trivial_text,
     try_quota_mock,
     try_prefix_detection,
     try_title_skip,
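The loop that consumes OPTIMIZATION_HANDLERS is not part of this diff; the contract the list implies is first-non-None-wins, roughly:

# Assumed dispatch shape (not shown in this commit): the first handler
# that returns a response short-circuits the provider call entirely.
def run_optimizations(request_data, settings):
    for handler in OPTIMIZATION_HANDLERS:  # try_trivial_text now runs first
        response = handler(request_data, settings)
        if response is not None:
            return response
    return None  # fall through to normal model routing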
api/services.py CHANGED
@@ -244,6 +244,18 @@ class ClaudeProxyService:
 
         for i, resolved in enumerate(candidates):
             try:
+                # Pre-check: skip candidates that are currently rate limited
+                from providers.rate_limit import GlobalRateLimiter
+
+                limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
+                if limiter.is_blocked() and resolved.provider_id != "zen":
+                    logger.warning(
+                        "Provider '{}' is currently rate limited, skipping to next candidate...",
+                        resolved.provider_id,
+                    )
+                    last_exc = Exception("Rate limited")
+                    continue
+
                 provider = self._provider_getter(resolved.provider_id)
                 routed_request = request_data.model_copy(deep=True)
                 routed_request.model = resolved.provider_model
@@ -292,7 +304,39 @@
                 )
                 last_exc = e
                 continue
+            except TimeoutError as e:
+                # Timeout = slow model, try next candidate for faster response
+                logger.warning(
+                    "Provider '{}' timed out ({}). Trying next candidate...",
+                    resolved.provider_id,
+                    type(e).__name__,
+                )
+                last_exc = e
+                continue
             except Exception as e:
+                # Check if it's a transient error that should trigger fallback
+                error_str = str(e).lower()
+                is_transient = any(
+                    kw in error_str
+                    for kw in [
+                        "timeout",
+                        "connection",
+                        "refused",
+                        "reset",
+                        "unavailable",
+                        "service",
+                    ]
+                )
+                if is_transient:
+                    logger.warning(
+                        "Provider '{}' failed with transient error ({}): {}. Trying next candidate...",
+                        resolved.provider_id,
+                        type(e).__name__,
+                        e,
+                    )
+                    last_exc = e
+                    continue
+
                 logger.error(
                     "Provider '{}' failed with unexpected error: {}. Trying next candidate...",
                     resolved.provider_id,
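The transient-error classification above is a plain substring heuristic over the exception message; isolated, it behaves like this (note the deliberately broad keywords, e.g. "service" also matches "503 Service Unavailable"):

TRANSIENT_KEYWORDS = ["timeout", "connection", "refused", "reset", "unavailable", "service"]

def is_transient_error(exc: Exception) -> bool:
    msg = str(exc).lower()
    return any(kw in msg for kw in TRANSIENT_KEYWORDS)

assert is_transient_error(ConnectionError("Connection reset by peer"))
assert is_transient_error(RuntimeError("503 Service Unavailable"))
assert not is_transient_error(ValueError("unknown model"))  # takes the error path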
providers/nvidia_nim/client.py CHANGED
@@ -1,11 +1,13 @@
 """NVIDIA NIM provider implementation."""
 
+import asyncio
 import json
 from typing import Any
 
+import httpx
 import openai
-from openai import AsyncOpenAI
 from loguru import logger
+from openai import AsyncOpenAI
 
 from config.nim import NimSettings
 from config.settings import Settings
@@ -13,13 +15,13 @@ from providers.base import ProviderConfig
 from providers.defaults import NVIDIA_NIM_DEFAULT_BASE
 from providers.openai_compat import OpenAIChatTransport
 
+from . import metrics as nim_metrics
 from .request import (
     build_request_body,
     clone_body_without_chat_template,
     clone_body_without_reasoning_budget,
     clone_body_without_reasoning_content,
 )
-from . import metrics as nim_metrics
 
 
 class NvidiaNimProvider(OpenAIChatTransport):
@@ -105,12 +107,11 @@ class NvidiaNimProvider(OpenAIChatTransport):
     configured fallback models from settings `nvidia_nim_fallback_models`.
     """
     from config.settings import get_settings
-    import httpx
-    import asyncio
 
-    connect_timeout_s = 30
-    first_chunk_timeout_s = 60
-    fallback_first_chunk_timeout_s = 60
+    # Reduced timeouts for faster fallback detection
+    connect_timeout_s = 15  # Reduced from 30
+    first_chunk_timeout_s = 45  # Reduced from 60
+    fallback_first_chunk_timeout_s = 30  # Reduced from 60 - faster fallback
 
     try:
         client = self._client_for_body(body)
@@ -129,7 +130,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
             first = await asyncio.wait_for(
                 stream.__anext__(), timeout=first_chunk_timeout_s
             )
-        except asyncio.TimeoutError:
+        except TimeoutError:
             # try to close original stream if possible
             try:
                 await getattr(stream, "aclose", lambda: None)()
@@ -214,7 +215,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
             first = await asyncio.wait_for(
                 stream.__anext__(), timeout=fallback_first_chunk_timeout_s
            )
-        except asyncio.TimeoutError:
+        except TimeoutError:
            try:
                await getattr(stream, "aclose", lambda: None)()
            except Exception:
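Two details worth noting. The bare `except TimeoutError` works because `asyncio.TimeoutError` has been an alias of the builtin `TimeoutError` since Python 3.11, so this change assumes a 3.11+ runtime. And the first-chunk pattern, in isolation, looks like this sketch:

import asyncio

async def first_chunk_or_raise(stream, timeout_s: float = 45.0):
    """Await only the first chunk under a deadline; a stalled stream is
    closed and TimeoutError propagates so the caller can fall back."""
    try:
        return await asyncio.wait_for(stream.__anext__(), timeout=timeout_s)
    except TimeoutError:  # also catches asyncio.TimeoutError on 3.11+
        aclose = getattr(stream, "aclose", None)
        if aclose is not None:  # close the abandoned stream if it supports it
            await aclose()
        raise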
providers/openai_compat.py CHANGED
@@ -28,12 +28,12 @@ from providers.error_mapping import (
     map_error,
     user_visible_message_for_mapped_provider_error,
 )
-from providers.rate_limit import GlobalRateLimiter
 from providers.model_listing import (
     ProviderModelInfo,
     extract_openai_model_ids,
     model_infos_from_ids,
 )
+from providers.rate_limit import GlobalRateLimiter
 
 
 def _iter_heuristic_tool_use_sse(
@@ -77,23 +77,24 @@ class OpenAIChatTransport(BaseProvider):
         self._base_url = base_url.rstrip("/")
         self._http_client = None
         self._client_cache: dict[str, AsyncOpenAI] = {}
-        # Zen has no rate limits - use very high limits to avoid throttling
-        # NVIDIA NIM has 40 req/min - respect that limit
+        # NVIDIA NIM has 40 req/min - use burst capacity for faster initial response
+        # Increase concurrency for better throughput under load
         if provider_name.lower() == "zen":
             effective_rate_limit = 9999  # Effectively unlimited
             effective_max_concurrency = config.max_concurrency * 4  # Higher concurrency for Zen
         else:
             effective_rate_limit = config.rate_limit or 40
-            effective_max_concurrency = config.max_concurrency
+            # Increase default concurrency for NIM - allows more parallel streams
+            effective_max_concurrency = max(config.max_concurrency * 2, 20)
         self._global_rate_limiter = GlobalRateLimiter.get_scoped_instance(
             provider_name.lower(),
             rate_limit=effective_rate_limit,
             rate_window=config.rate_window,
             max_concurrency=effective_max_concurrency,
         )
-        # Always create an explicit httpx.AsyncClient with trust_env=False to avoid
-        # slow system proxy detection on Windows during initialization.
-        # Connection pool tuned for high throughput with keepalive optimization.
+        # Connection pool tuned for maximum throughput.
+        # NVIDIA NIM servers: reduce keepalive_expiry to avoid stale connections,
+        # increase pool size for high concurrency.
         http_client_args = {
             "timeout": httpx.Timeout(
                 config.http_read_timeout,
@@ -104,9 +105,9 @@ class OpenAIChatTransport(BaseProvider):
             "trust_env": False,
             "http2": True,
             "limits": httpx.Limits(
-                max_keepalive_connections=20,
-                max_connections=100,
-                keepalive_expiry=30.0,
+                max_keepalive_connections=50,  # Increased from 20
+                max_connections=200,  # Increased from 100
+                keepalive_expiry=15.0,  # Reduced from 30 - faster connection rotation
             ),
         }
         if config.proxy:
@@ -407,7 +408,7 @@
         except Exception as e:
             self._log_stream_transport_error(tag, req_tag, e)
             mapped_e = map_error(e, rate_limiter=self._global_rate_limiter)
-
+
             has_started_tool = any(s.started for s in sse.blocks.tool_states.values())
             has_content_blocks = (
                 sse.blocks.text_index != -1
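For reference, the tuned pool assembled into a standalone client. `httpx.Timeout` and `httpx.Limits` are real httpx APIs; the read/connect values here are illustrative stand-ins for `config.http_read_timeout`, and `http2=True` requires the `httpx[http2]` extra:

import httpx

client = httpx.AsyncClient(
    timeout=httpx.Timeout(300.0, connect=15.0),  # illustrative values
    trust_env=False,  # skip slow system proxy detection
    http2=True,       # needs: pip install 'httpx[http2]'
    limits=httpx.Limits(
        max_keepalive_connections=50,
        max_connections=200,
        keepalive_expiry=15.0,  # drop idle connections after 15s
    ),
)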
providers/rate_limit.py CHANGED
@@ -198,9 +198,9 @@ class GlobalRateLimiter:
         fn: Callable[..., Any],
         *args: Any,
         max_retries: int = 3,
-        base_delay: float = 1.0,
-        max_delay: float = 60.0,
-        jitter: float = 0.5,
+        base_delay: float = 0.5,  # Reduced from 1.0 for faster recovery
+        max_delay: float = 30.0,  # Reduced from 60.0 for faster fallback
+        jitter: float = 0.25,  # Reduced from 0.5 for more predictable delays
         **kwargs: Any,
     ) -> Any:
         """Execute an async callable with rate limiting and retry on 429.
  """Execute an async callable with rate limiting and retry on 429.