Speed optimizations and enhanced auto-model fallback routing
- Connection pool tuning: increased max_connections (100→200), max_keepalive (20→50), reduced keepalive_expiry (30s→15s)
- NIM concurrency: increased max_concurrency for better throughput
- Faster timeouts: connect (30→15s), first_chunk (60→45s), fallback (60→30s)
- Retry backoff: reduced base_delay (1→0.5s), max_delay (60→30s)
- New trivial request fast-path for hi/ok/status checks
- Auto model now routes to all 7 NIM models by default
- Pre-flight rate limit check before trying each candidate
- Timeout/transient errors trigger automatic fallback to the next model (see the failover sketch below)
- Import cleanup for httpx/asyncio at module level
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
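
The failover behavior described above (pre-flight rate-limit skip, timeout and transient-error fallback) reduces to a short loop. A minimal sketch under assumed helper names; call_model and is_rate_limited are illustrative, not functions from this codebase:

async def first_successful(candidates, call_model, is_rate_limited):
    """Illustrative failover loop: skip rate-limited candidates, fall through on timeouts."""
    last_exc: Exception | None = None
    for model in candidates:
        if is_rate_limited(model):
            # pre-flight check: do not waste a request on a blocked provider
            last_exc = Exception("Rate limited")
            continue
        try:
            return await call_model(model)  # first candidate that answers wins
        except TimeoutError as exc:
            # slow model: move on to the next candidate instead of waiting
            last_exc = exc
            continue
    raise last_exc if last_exc else RuntimeError("no candidates available")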
- api/detection.py +31 -0
- api/model_router.py +26 -9
- api/optimization_handlers.py +19 -0
- api/services.py +44 -0
- providers/nvidia_nim/client.py +10 -9
- providers/openai_compat.py +12 -11
- providers/rate_limit.py +3 -3
api/detection.py
CHANGED
@@ -9,6 +9,37 @@ from core.anthropic import extract_text_from_content
 from .models.anthropic import MessagesRequest
 
 
+def is_trivial_text_request(request_data: MessagesRequest) -> tuple[bool, str]:
+    """Detect trivial requests that can be fast-pathed.
+
+    Returns (is_trivial, text_content) for trivial requests that only need
+    a simple acknowledgment or echo response.
+    """
+    # Only for single short user messages with max_tokens=1
+    if request_data.max_tokens != 1:
+        return False, ""
+    if len(request_data.messages) != 1:
+        return False, ""
+    msg = request_data.messages[0]
+    if msg.role != "user":
+        return False, ""
+
+    text = extract_text_from_content(msg.content)
+    text_lower = text.lower().strip()
+
+    # Single word or very short queries
+    if len(text_lower) < 50:
+        # "hi", "hello", "ok", "thanks", etc.
+        if text_lower in ("hi", "hello", "ok", "thanks", "thank you", "yes", "no", "okay"):
+            return True, f"OK. {text}"
+
+        # Health/status checks
+        if any(kw in text_lower for kw in ["status", "health", "ping", "are you"]):
+            return True, "I'm ready."
+
+    return False, ""
+
+
 def is_quota_check_request(request_data: MessagesRequest) -> bool:
     """Check if this is a quota probe request.
 
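A quick way to exercise the new detector in isolation; this is a hedged sketch that assumes the api package is importable from the repository root and that MessagesRequest accepts model, max_tokens, and messages keyword arguments in the usual Anthropic Messages shape:

from api.detection import is_trivial_text_request
from api.models.anthropic import MessagesRequest  # import path assumed

req = MessagesRequest(
    model="claude-sonnet",   # any model id; routing is irrelevant here
    max_tokens=1,            # the fast path only triggers for max_tokens == 1
    messages=[{"role": "user", "content": "hi"}],
)
is_trivial, reply = is_trivial_text_request(req)
print(is_trivial, reply)     # expected: True "OK. hi"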
api/model_router.py
CHANGED
@@ -8,17 +8,24 @@ from loguru import logger
 
 from config.provider_ids import SUPPORTED_PROVIDER_IDS
 from config.settings import Settings
+from core.model_capabilities import find_best_model_for_task
 from core.session_tracker import SessionTracker
-from core.model_capabilities import (
-    get_model_capabilities,
-    find_best_model_for_task,
-    find_models_with_capability,
-)
 from core.task_detector import TaskDetector
+from providers.rate_limit import GlobalRateLimiter
 
 from .gateway_model_ids import decode_gateway_model_id
 from .models.anthropic import MessagesRequest, TokenCountRequest
-
+
+# Default NIM models to include in auto routing (in order of preference)
+DEFAULT_NIM_AUTO_MODELS = [
+    "nvidia_nim/qwen/qwen3-coder-480b-a35b-instruct",
+    "nvidia_nim/z-ai/glm4.7",
+    "nvidia_nim/stepfun-ai/step-3.5-flash",
+    "nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
+    "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct",
+    "nvidia_nim/bytedance/seed-oss-36b-instruct",
+    "nvidia_nim/mistralai/mistral-nemotron",
+]
 
 
 @dataclass(frozen=True, slots=True)
@@ -151,6 +158,12 @@ class ModelRouter:
 
         Used by the 'auto' routing logic to implement provider-side failover.
         Considers session load for fair resource sharing across multiple clients.
+
+        Priority order:
+        1. AUTO_MODEL_ORDER (if configured)
+        2. MODEL (primary)
+        3. NVIDIA NIM fallback models (if configured, or DEFAULT_NIM_AUTO_MODELS)
+        4. MODEL_OPUS, MODEL_SONNET, MODEL_HAIKU
         """
         if not self._is_auto(claude_model_name):
             return [self.resolve(claude_model_name)]
@@ -206,20 +219,24 @@ class ModelRouter:
                 source,
             )
 
-        # 1.
+        # 1. AUTO_MODEL_ORDER (user-configured priority)
        order_csv = (self._settings.auto_model_order or "").strip()
        if order_csv:
            for cand in [c.strip() for c in order_csv.split(",") if c.strip()]:
                add_candidate(cand, "AUTO_MODEL_PRIORITY")

-        # 2.
+        # 2. Primary MODEL
        add_candidate(self._settings.model, "MODEL")

-        # 3. NVIDIA Fallbacks
+        # 3. NVIDIA Fallbacks - use configured or defaults
        nim_csv = (self._settings.nvidia_nim_fallback_models or "").strip()
        if nim_csv:
            for cand in [c.strip() for c in nim_csv.split(",") if c.strip()]:
                add_candidate(cand, "NVIDIA_NIM_FALLBACK_MODELS")
+        else:
+            # Use default NIM models when no explicit fallback configured
+            for cand in DEFAULT_NIM_AUTO_MODELS:
+                add_candidate(cand, "DEFAULT_NIM_AUTO_MODELS")

        # 4. Model-specific overrides
        add_candidate(self._settings.model_opus, "MODEL_OPUS")
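Stripped of settings plumbing, the candidate construction above is an ordered, de-duplicated merge. A standalone sketch of that priority order (function and parameter names here are illustrative, not from the module):

def build_auto_candidates(auto_model_order: str, model: str,
                          nim_fallback_models: str, default_nim: list[str],
                          overrides: list[str]) -> list[str]:
    """Ordered, de-duplicated list: AUTO_MODEL_ORDER -> MODEL -> NIM fallbacks -> overrides."""
    seen: set[str] = set()
    out: list[str] = []

    def add(name: str) -> None:
        name = name.strip()
        if name and name not in seen:
            seen.add(name)
            out.append(name)

    for cand in (auto_model_order or "").split(","):
        add(cand)
    add(model)
    configured_nim = [c for c in (nim_fallback_models or "").split(",") if c.strip()]
    for cand in (configured_nim or default_nim):  # fall back to the default NIM list
        add(cand)
    for cand in overrides:
        add(cand)
    return out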
api/optimization_handlers.py
CHANGED
@@ -17,6 +17,7 @@ from .detection import (
     is_quota_check_request,
     is_suggestion_mode_request,
     is_title_generation_request,
+    is_trivial_text_request,
 )
 from .models.anthropic import MessagesRequest
 from .models.responses import MessagesResponse, Usage
@@ -133,8 +134,26 @@ def try_filepath_mock(
     )
 
 
+def try_trivial_text(
+    request_data: MessagesRequest, settings: Settings
+) -> MessagesResponse | None:
+    """Fast-path trivial text requests (hi, ok, status checks) without API call."""
+    is_trivial, text = is_trivial_text_request(request_data)
+    if not is_trivial:
+        return None
+
+    logger.info("Optimization: Fast-path trivial text request")
+    return _text_response(
+        request_data,
+        text,
+        input_tokens=5,
+        output_tokens=3,
+    )
+
+
 # Cheapest/most common optimizations first for faster short-circuit.
 OPTIMIZATION_HANDLERS = [
+    try_trivial_text,
     try_quota_mock,
     try_prefix_detection,
     try_title_skip,
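OPTIMIZATION_HANDLERS is consumed as a short-circuiting chain: the first handler that returns a response wins and no upstream call is made. A minimal sketch of that dispatch loop (the loop itself is not part of this diff and its signature is assumed):

def apply_optimizations(request_data, settings, handlers=OPTIMIZATION_HANDLERS):
    """Return the first mocked response produced by a handler, or None to hit a real provider."""
    for handler in handlers:
        response = handler(request_data, settings)
        if response is not None:
            return response  # fast-path hit: no API call needed
    return None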
api/services.py
CHANGED
@@ -244,6 +244,18 @@ class ClaudeProxyService:
 
         for i, resolved in enumerate(candidates):
             try:
+                # Pre-check: skip candidates that are currently rate limited
+                from providers.rate_limit import GlobalRateLimiter
+
+                limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
+                if limiter.is_blocked() and resolved.provider_id != "zen":
+                    logger.warning(
+                        "Provider '{}' is currently rate limited, skipping to next candidate...",
+                        resolved.provider_id,
+                    )
+                    last_exc = Exception("Rate limited")
+                    continue
+
                 provider = self._provider_getter(resolved.provider_id)
                 routed_request = request_data.model_copy(deep=True)
                 routed_request.model = resolved.provider_model
@@ -292,7 +304,39 @@ class ClaudeProxyService:
                 )
                 last_exc = e
                 continue
+            except TimeoutError as e:
+                # Timeout = slow model, try next candidate for faster response
+                logger.warning(
+                    "Provider '{}' timed out ({}). Trying next candidate...",
+                    resolved.provider_id,
+                    type(e).__name__,
+                )
+                last_exc = e
+                continue
             except Exception as e:
+                # Check if it's a transient error that should trigger fallback
+                error_str = str(e).lower()
+                is_transient = any(
+                    kw in error_str
+                    for kw in [
+                        "timeout",
+                        "connection",
+                        "refused",
+                        "reset",
+                        "unavailable",
+                        "service",
+                    ]
+                )
+                if is_transient:
+                    logger.warning(
+                        "Provider '{}' failed with transient error ({}): {}. Trying next candidate...",
+                        resolved.provider_id,
+                        type(e).__name__,
+                        e,
+                    )
+                    last_exc = e
+                    continue
+
                 logger.error(
                     "Provider '{}' failed with unexpected error: {}. Trying next candidate...",
                     resolved.provider_id,
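The transient-error test above is a plain keyword match on the stringified exception. The same logic as a small reusable helper (a sketch, not code from the repository):

TRANSIENT_KEYWORDS = ("timeout", "connection", "refused", "reset", "unavailable", "service")

def is_transient_error(exc: Exception) -> bool:
    """True if the error text suggests a retryable infrastructure problem."""
    error_str = str(exc).lower()
    return any(kw in error_str for kw in TRANSIENT_KEYWORDS)

assert is_transient_error(ConnectionError("Connection reset by peer"))
assert not is_transient_error(ValueError("invalid tool schema"))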
providers/nvidia_nim/client.py
CHANGED
@@ -1,11 +1,13 @@
 """NVIDIA NIM provider implementation."""
 
+import asyncio
 import json
 from typing import Any
 
+import httpx
 import openai
-from openai import AsyncOpenAI
 from loguru import logger
+from openai import AsyncOpenAI
 
 from config.nim import NimSettings
 from config.settings import Settings
@@ -13,13 +15,13 @@ from providers.base import ProviderConfig
 from providers.defaults import NVIDIA_NIM_DEFAULT_BASE
 from providers.openai_compat import OpenAIChatTransport
 
+from . import metrics as nim_metrics
 from .request import (
     build_request_body,
     clone_body_without_chat_template,
     clone_body_without_reasoning_budget,
     clone_body_without_reasoning_content,
 )
-from . import metrics as nim_metrics
 
 
 class NvidiaNimProvider(OpenAIChatTransport):
@@ -105,12 +107,11 @@ class NvidiaNimProvider(OpenAIChatTransport):
         configured fallback models from settings `nvidia_nim_fallback_models`.
         """
         from config.settings import get_settings
-        import httpx
-        import asyncio
 
-        connect_timeout_s = 30
-        first_chunk_timeout_s = 60
-        fallback_first_chunk_timeout_s = 60
+        # Reduced timeouts for faster fallback detection
+        connect_timeout_s = 15  # Reduced from 30
+        first_chunk_timeout_s = 45  # Reduced from 60
+        fallback_first_chunk_timeout_s = 30  # Reduced from 60 - faster fallback
 
         try:
             client = self._client_for_body(body)
@@ -129,7 +130,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
                 first = await asyncio.wait_for(
                     stream.__anext__(), timeout=first_chunk_timeout_s
                 )
-            except asyncio.TimeoutError:
+            except TimeoutError:
                 # try to close original stream if possible
                 try:
                     await getattr(stream, "aclose", lambda: None)()
@@ -214,7 +215,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
                 first = await asyncio.wait_for(
                     stream.__anext__(), timeout=fallback_first_chunk_timeout_s
                 )
-            except asyncio.TimeoutError:
+            except TimeoutError:
                 try:
                     await getattr(stream, "aclose", lambda: None)()
                 except Exception:
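Both the primary and fallback paths use the same first-chunk watchdog: await the first streamed item under asyncio.wait_for and give up if nothing arrives in time. A generic sketch of that pattern, assuming Python 3.11+ where asyncio.TimeoutError is an alias of the built-in TimeoutError (which is why the bare name works in the diff):

import asyncio

async def first_chunk_or_none(stream, timeout_s: float = 45.0):
    """Await the first item of an async stream; close it and return None on timeout."""
    try:
        return await asyncio.wait_for(stream.__anext__(), timeout=timeout_s)
    except TimeoutError:
        try:
            await getattr(stream, "aclose", lambda: None)()  # best-effort close, as in the diff
        except Exception:
            pass
        return None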
providers/openai_compat.py
CHANGED
@@ -28,12 +28,12 @@ from providers.error_mapping import (
     map_error,
     user_visible_message_for_mapped_provider_error,
 )
-from providers.rate_limit import GlobalRateLimiter
 from providers.model_listing import (
     ProviderModelInfo,
     extract_openai_model_ids,
     model_infos_from_ids,
 )
+from providers.rate_limit import GlobalRateLimiter
 
 
 def _iter_heuristic_tool_use_sse(
@@ -77,23 +77,24 @@ class OpenAIChatTransport(BaseProvider):
         self._base_url = base_url.rstrip("/")
         self._http_client = None
         self._client_cache: dict[str, AsyncOpenAI] = {}
-        #
-        #
+        # NVIDIA NIM has 40 req/min - use burst capacity for faster initial response
+        # Increase concurrency for better throughput under load
         if provider_name.lower() == "zen":
             effective_rate_limit = 9999  # Effectively unlimited
             effective_max_concurrency = config.max_concurrency * 4  # Higher concurrency for Zen
         else:
             effective_rate_limit = config.rate_limit or 40
-            effective_max_concurrency = config.max_concurrency
+            # Increase default concurrency for NIM - allows more parallel streams
+            effective_max_concurrency = max(config.max_concurrency * 2, 20)
         self._global_rate_limiter = GlobalRateLimiter.get_scoped_instance(
             provider_name.lower(),
             rate_limit=effective_rate_limit,
             rate_window=config.rate_window,
             max_concurrency=effective_max_concurrency,
         )
-        #
-        #
-        #
+        # Connection pool tuned for maximum throughput.
+        # NVIDIA NIM servers: reduce keepalive_expiry to avoid stale connections,
+        # increase pool size for high concurrency.
         http_client_args = {
             "timeout": httpx.Timeout(
                 config.http_read_timeout,
@@ -104,9 +105,9 @@ class OpenAIChatTransport(BaseProvider):
             "trust_env": False,
             "http2": True,
             "limits": httpx.Limits(
-                max_keepalive_connections=20,
-                max_connections=100,
-                keepalive_expiry=30.0,
+                max_keepalive_connections=50,  # Increased from 20
+                max_connections=200,  # Increased from 100
+                keepalive_expiry=15.0,  # Reduced from 30 - faster connection rotation
             ),
         }
         if config.proxy:
@@ -407,7 +408,7 @@ class OpenAIChatTransport(BaseProvider):
         except Exception as e:
             self._log_stream_transport_error(tag, req_tag, e)
             mapped_e = map_error(e, rate_limiter=self._global_rate_limiter)
-
+
             has_started_tool = any(s.started for s in sse.blocks.tool_states.values())
             has_content_blocks = (
                 sse.blocks.text_index != -1
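For reference, the tuned pool corresponds to an httpx client built roughly like this; only the Limits values come from the diff, the read and connect timeout values below are placeholders:

import httpx

# Larger pool for many concurrent streams; shorter keepalive to rotate stale connections.
# http2=True requires the optional h2 dependency (pip install "httpx[http2]").
http_client = httpx.AsyncClient(
    http2=True,
    trust_env=False,
    timeout=httpx.Timeout(300.0, connect=15.0),  # placeholder values
    limits=httpx.Limits(
        max_keepalive_connections=50,
        max_connections=200,
        keepalive_expiry=15.0,
    ),
)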
providers/rate_limit.py
CHANGED
@@ -198,9 +198,9 @@ class GlobalRateLimiter:
         fn: Callable[..., Any],
         *args: Any,
         max_retries: int = 3,
-        base_delay: float = 1.0,
-        max_delay: float = 60.0,
-        jitter: float = 0.5,
+        base_delay: float = 0.5,  # Reduced from 1.0 for faster recovery
+        max_delay: float = 30.0,  # Reduced from 60.0 for faster fallback
+        jitter: float = 0.25,  # Reduced from 0.5 for more predictable delays
         **kwargs: Any,
     ) -> Any:
         """Execute an async callable with rate limiting and retry on 429.
|