Yash030 and Claude Opus 4.7 committed
Commit ebba9d6 · 1 Parent(s): 49813da

Speed optimizations and enhanced auto-model fallback routing

- Connection pool tuning: increased max_connections (100→200), max_keepalive (20→50), reduced keepalive_expiry (30s→15s)
- NIM concurrency: increased default max_concurrency to max(2x configured, 20) for better throughput
- Faster timeouts: connect (30→15s), first_chunk (60→45s), fallback (60→30s)
- Retry backoff: reduced base_delay (1→0.5s), max_delay (60→30s)
- New trivial request fast-path for hi/ok/status checks
- Auto model now routes to all 7 NIM models by default
- Pre-flight rate limit check before trying each candidate
- Timeout/transient errors trigger automatic fallback to next model
- Import cleanup for httpx/asyncio at module level

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

api/detection.py CHANGED
@@ -9,6 +9,37 @@ from core.anthropic import extract_text_from_content
 from .models.anthropic import MessagesRequest
 
 
+def is_trivial_text_request(request_data: MessagesRequest) -> tuple[bool, str]:
+    """Detect trivial requests that can be fast-pathed.
+
+    Returns (is_trivial, text_content) for trivial requests that only need
+    a simple acknowledgment or echo response.
+    """
+    # Only for single short user messages with max_tokens=1
+    if request_data.max_tokens != 1:
+        return False, ""
+    if len(request_data.messages) != 1:
+        return False, ""
+    msg = request_data.messages[0]
+    if msg.role != "user":
+        return False, ""
+
+    text = extract_text_from_content(msg.content)
+    text_lower = text.lower().strip()
+
+    # Single word or very short queries
+    if len(text_lower) < 50:
+        # "hi", "hello", "ok", "thanks", etc.
+        if text_lower in ("hi", "hello", "ok", "thanks", "thank you", "yes", "no", "okay"):
+            return True, f"OK. {text}"
+
+        # Health/status checks
+        if any(kw in text_lower for kw in ["status", "health", "ping", "are you"]):
+            return True, "I'm ready."
+
+    return False, ""
+
+
 def is_quota_check_request(request_data: MessagesRequest) -> bool:
     """Check if this is a quota probe request.
 
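For illustration, a minimal sketch of how the new detector fires. The field names match the diff above, but the exact MessagesRequest constructor (and whether it accepts dict-style messages) is an assumption:

from api.detection import is_trivial_text_request
from api.models.anthropic import MessagesRequest

req = MessagesRequest(
    model="claude-3-5-haiku",  # hypothetical model id; routing is unaffected
    max_tokens=1,              # the fast-path only fires for max_tokens == 1
    messages=[{"role": "user", "content": "hi"}],
)
trivial, reply = is_trivial_text_request(req)
# trivial is True and reply == "OK. hi" -- answered locally, no provider call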
api/model_router.py CHANGED
@@ -8,17 +8,24 @@ from loguru import logger
 
 from config.provider_ids import SUPPORTED_PROVIDER_IDS
 from config.settings import Settings
+from core.model_capabilities import find_best_model_for_task
 from core.session_tracker import SessionTracker
-from core.model_capabilities import (
-    get_model_capabilities,
-    find_best_model_for_task,
-    find_models_with_capability,
-)
 from core.task_detector import TaskDetector
+from providers.rate_limit import GlobalRateLimiter
 
 from .gateway_model_ids import decode_gateway_model_id
 from .models.anthropic import MessagesRequest, TokenCountRequest
-from providers.rate_limit import GlobalRateLimiter
+
+# Default NIM models to include in auto routing (in order of preference)
+DEFAULT_NIM_AUTO_MODELS = [
+    "nvidia_nim/qwen/qwen3-coder-480b-a35b-instruct",
+    "nvidia_nim/z-ai/glm4.7",
+    "nvidia_nim/stepfun-ai/step-3.5-flash",
+    "nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
+    "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct",
+    "nvidia_nim/bytedance/seed-oss-36b-instruct",
+    "nvidia_nim/mistralai/mistral-nemotron",
+]
 
 
 @dataclass(frozen=True, slots=True)
@@ -151,6 +158,12 @@ class ModelRouter:
 
     Used by the 'auto' routing logic to implement provider-side failover.
     Considers session load for fair resource sharing across multiple clients.
+
+    Priority order:
+    1. AUTO_MODEL_ORDER (if configured)
+    2. MODEL (primary)
+    3. NVIDIA NIM fallback models (if configured, or DEFAULT_NIM_AUTO_MODELS)
+    4. MODEL_OPUS, MODEL_SONNET, MODEL_HAIKU
     """
     if not self._is_auto(claude_model_name):
         return [self.resolve(claude_model_name)]
@@ -206,20 +219,24 @@
             source,
         )
 
-        # 1. Preferred order (AUTO_MODEL_ORDER)
+        # 1. AUTO_MODEL_ORDER (user-configured priority)
         order_csv = (self._settings.auto_model_order or "").strip()
         if order_csv:
             for cand in [c.strip() for c in order_csv.split(",") if c.strip()]:
                 add_candidate(cand, "AUTO_MODEL_PRIORITY")
 
-        # 2. Main MODEL
+        # 2. Primary MODEL
         add_candidate(self._settings.model, "MODEL")
 
-        # 3. NVIDIA Fallbacks
+        # 3. NVIDIA Fallbacks - use configured or defaults
         nim_csv = (self._settings.nvidia_nim_fallback_models or "").strip()
         if nim_csv:
            for cand in [c.strip() for c in nim_csv.split(",") if c.strip()]:
                add_candidate(cand, "NVIDIA_NIM_FALLBACK_MODELS")
+        else:
+            # Use default NIM models when no explicit fallback configured
+            for cand in DEFAULT_NIM_AUTO_MODELS:
+                add_candidate(cand, "DEFAULT_NIM_AUTO_MODELS")
 
         # 4. Model-specific overrides
         add_candidate(self._settings.model_opus, "MODEL_OPUS")
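To make the ordering concrete, here is a standalone sketch of the candidate assembly; `add` is a simplified stand-in for the `add_candidate` closure in the diff, which additionally resolves provider IDs and logs the source:

def build_candidates(auto_order: str, model: str, nim_fallbacks: str) -> list[str]:
    seen: set[str] = set()
    out: list[str] = []

    def add(cand: str) -> None:  # simplified add_candidate: ordered dedup
        if cand and cand not in seen:
            seen.add(cand)
            out.append(cand)

    for c in (auto_order or "").split(","):           # 1. AUTO_MODEL_ORDER
        add(c.strip())
    add(model)                                        # 2. primary MODEL
    configured = [c.strip() for c in (nim_fallbacks or "").split(",") if c.strip()]
    for c in configured or DEFAULT_NIM_AUTO_MODELS:   # 3. configured or default NIM
        add(c)
    return out

# With no AUTO_MODEL_ORDER and no explicit fallbacks, the result is the
# primary MODEL followed by all seven DEFAULT_NIM_AUTO_MODELS, deduplicated.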
api/optimization_handlers.py CHANGED
@@ -17,6 +17,7 @@ from .detection import (
     is_quota_check_request,
     is_suggestion_mode_request,
     is_title_generation_request,
+    is_trivial_text_request,
 )
 from .models.anthropic import MessagesRequest
 from .models.responses import MessagesResponse, Usage
@@ -133,8 +134,26 @@ def try_filepath_mock(
     )
 
 
+def try_trivial_text(
+    request_data: MessagesRequest, settings: Settings
+) -> MessagesResponse | None:
+    """Fast-path trivial text requests (hi, ok, status checks) without API call."""
+    is_trivial, text = is_trivial_text_request(request_data)
+    if not is_trivial:
+        return None
+
+    logger.info("Optimization: Fast-path trivial text request")
+    return _text_response(
+        request_data,
+        text,
+        input_tokens=5,
+        output_tokens=3,
+    )
+
+
 # Cheapest/most common optimizations first for faster short-circuit.
 OPTIMIZATION_HANDLERS = [
+    try_trivial_text,
     try_quota_mock,
     try_prefix_detection,
     try_title_skip,
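The loop that consumes OPTIMIZATION_HANDLERS is not part of this diff; the contract the list implies is first-non-None-wins, roughly:

# Assumed dispatch shape (not shown in this commit): the first handler
# that returns a response short-circuits the provider call entirely.
def run_optimizations(request_data, settings):
    for handler in OPTIMIZATION_HANDLERS:  # try_trivial_text now runs first
        response = handler(request_data, settings)
        if response is not None:
            return response
    return None  # fall through to normal model routing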
api/services.py CHANGED
@@ -244,6 +244,18 @@ class ClaudeProxyService:
 
         for i, resolved in enumerate(candidates):
             try:
+                # Pre-check: skip candidates that are currently rate limited
+                from providers.rate_limit import GlobalRateLimiter
+
+                limiter = GlobalRateLimiter.get_scoped_instance(resolved.provider_id)
+                if limiter.is_blocked() and resolved.provider_id != "zen":
+                    logger.warning(
+                        "Provider '{}' is currently rate limited, skipping to next candidate...",
+                        resolved.provider_id,
+                    )
+                    last_exc = Exception("Rate limited")
+                    continue
+
                 provider = self._provider_getter(resolved.provider_id)
                 routed_request = request_data.model_copy(deep=True)
                 routed_request.model = resolved.provider_model
@@ -292,7 +304,39 @@
                 )
                 last_exc = e
                 continue
+            except TimeoutError as e:
+                # Timeout = slow model, try next candidate for faster response
+                logger.warning(
+                    "Provider '{}' timed out ({}). Trying next candidate...",
+                    resolved.provider_id,
+                    type(e).__name__,
+                )
+                last_exc = e
+                continue
             except Exception as e:
+                # Check if it's a transient error that should trigger fallback
+                error_str = str(e).lower()
+                is_transient = any(
+                    kw in error_str
+                    for kw in [
+                        "timeout",
+                        "connection",
+                        "refused",
+                        "reset",
+                        "unavailable",
+                        "service",
+                    ]
+                )
+                if is_transient:
+                    logger.warning(
+                        "Provider '{}' failed with transient error ({}): {}. Trying next candidate...",
+                        resolved.provider_id,
+                        type(e).__name__,
+                        e,
+                    )
+                    last_exc = e
+                    continue
+
                 logger.error(
                     "Provider '{}' failed with unexpected error: {}. Trying next candidate...",
                     resolved.provider_id,
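The transient-error classification above is a plain substring heuristic over the exception message; isolated, it behaves like this (note the deliberately broad keywords, e.g. "service" also matches "503 Service Unavailable"):

TRANSIENT_KEYWORDS = ["timeout", "connection", "refused", "reset", "unavailable", "service"]

def is_transient_error(exc: Exception) -> bool:
    msg = str(exc).lower()
    return any(kw in msg for kw in TRANSIENT_KEYWORDS)

assert is_transient_error(ConnectionError("Connection reset by peer"))
assert is_transient_error(RuntimeError("503 Service Unavailable"))
assert not is_transient_error(ValueError("unknown model"))  # takes the error path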
providers/nvidia_nim/client.py CHANGED
@@ -1,11 +1,13 @@
 """NVIDIA NIM provider implementation."""
 
+import asyncio
 import json
 from typing import Any
 
+import httpx
 import openai
-from openai import AsyncOpenAI
 from loguru import logger
+from openai import AsyncOpenAI
 
 from config.nim import NimSettings
 from config.settings import Settings
@@ -13,13 +15,13 @@ from providers.base import ProviderConfig
 from providers.defaults import NVIDIA_NIM_DEFAULT_BASE
 from providers.openai_compat import OpenAIChatTransport
 
+from . import metrics as nim_metrics
 from .request import (
     build_request_body,
     clone_body_without_chat_template,
     clone_body_without_reasoning_budget,
     clone_body_without_reasoning_content,
 )
-from . import metrics as nim_metrics
 
 
 class NvidiaNimProvider(OpenAIChatTransport):
@@ -105,12 +107,11 @@ class NvidiaNimProvider(OpenAIChatTransport):
     configured fallback models from settings `nvidia_nim_fallback_models`.
     """
     from config.settings import get_settings
-    import httpx
-    import asyncio
 
-    connect_timeout_s = 30
-    first_chunk_timeout_s = 60
-    fallback_first_chunk_timeout_s = 60
+    # Reduced timeouts for faster fallback detection
+    connect_timeout_s = 15  # Reduced from 30
+    first_chunk_timeout_s = 45  # Reduced from 60
+    fallback_first_chunk_timeout_s = 30  # Reduced from 60 - faster fallback
 
     try:
         client = self._client_for_body(body)
@@ -129,7 +130,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
             first = await asyncio.wait_for(
                 stream.__anext__(), timeout=first_chunk_timeout_s
             )
-        except asyncio.TimeoutError:
+        except TimeoutError:
             # try to close original stream if possible
             try:
                 await getattr(stream, "aclose", lambda: None)()
@@ -214,7 +215,7 @@ class NvidiaNimProvider(OpenAIChatTransport):
             first = await asyncio.wait_for(
                 stream.__anext__(), timeout=fallback_first_chunk_timeout_s
            )
-        except asyncio.TimeoutError:
+        except TimeoutError:
            try:
                await getattr(stream, "aclose", lambda: None)()
            except Exception:
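Two details worth noting. The bare `except TimeoutError` works because `asyncio.TimeoutError` has been an alias of the builtin `TimeoutError` since Python 3.11, so this change assumes a 3.11+ runtime. And the first-chunk pattern, in isolation, looks like this sketch:

import asyncio

async def first_chunk_or_raise(stream, timeout_s: float = 45.0):
    """Await only the first chunk under a deadline; a stalled stream is
    closed and TimeoutError propagates so the caller can fall back."""
    try:
        return await asyncio.wait_for(stream.__anext__(), timeout=timeout_s)
    except TimeoutError:  # also catches asyncio.TimeoutError on 3.11+
        aclose = getattr(stream, "aclose", None)
        if aclose is not None:  # close the abandoned stream if it supports it
            await aclose()
        raise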
providers/openai_compat.py CHANGED
@@ -28,12 +28,12 @@ from providers.error_mapping import (
     map_error,
     user_visible_message_for_mapped_provider_error,
 )
-from providers.rate_limit import GlobalRateLimiter
 from providers.model_listing import (
     ProviderModelInfo,
     extract_openai_model_ids,
     model_infos_from_ids,
 )
+from providers.rate_limit import GlobalRateLimiter
 
 
 def _iter_heuristic_tool_use_sse(
@@ -77,23 +77,24 @@ class OpenAIChatTransport(BaseProvider):
         self._base_url = base_url.rstrip("/")
         self._http_client = None
         self._client_cache: dict[str, AsyncOpenAI] = {}
-        # Zen has no rate limits - use very high limits to avoid throttling
-        # NVIDIA NIM has 40 req/min - respect that limit
+        # NVIDIA NIM has 40 req/min - use burst capacity for faster initial response
+        # Increase concurrency for better throughput under load
         if provider_name.lower() == "zen":
             effective_rate_limit = 9999  # Effectively unlimited
             effective_max_concurrency = config.max_concurrency * 4  # Higher concurrency for Zen
         else:
             effective_rate_limit = config.rate_limit or 40
-            effective_max_concurrency = config.max_concurrency
+            # Increase default concurrency for NIM - allows more parallel streams
+            effective_max_concurrency = max(config.max_concurrency * 2, 20)
         self._global_rate_limiter = GlobalRateLimiter.get_scoped_instance(
             provider_name.lower(),
             rate_limit=effective_rate_limit,
             rate_window=config.rate_window,
             max_concurrency=effective_max_concurrency,
         )
-        # Always create an explicit httpx.AsyncClient with trust_env=False to avoid
-        # slow system proxy detection on Windows during initialization.
-        # Connection pool tuned for high throughput with keepalive optimization.
+        # Connection pool tuned for maximum throughput.
+        # NVIDIA NIM servers: reduce keepalive_expiry to avoid stale connections,
+        # increase pool size for high concurrency.
         http_client_args = {
             "timeout": httpx.Timeout(
                 config.http_read_timeout,
@@ -104,9 +105,9 @@ class OpenAIChatTransport(BaseProvider):
             "trust_env": False,
             "http2": True,
             "limits": httpx.Limits(
-                max_keepalive_connections=20,
-                max_connections=100,
-                keepalive_expiry=30.0,
+                max_keepalive_connections=50,  # Increased from 20
+                max_connections=200,  # Increased from 100
+                keepalive_expiry=15.0,  # Reduced from 30 - faster connection rotation
             ),
         }
         if config.proxy:
@@ -407,7 +408,7 @@
         except Exception as e:
             self._log_stream_transport_error(tag, req_tag, e)
             mapped_e = map_error(e, rate_limiter=self._global_rate_limiter)
-
+
             has_started_tool = any(s.started for s in sse.blocks.tool_states.values())
             has_content_blocks = (
                 sse.blocks.text_index != -1
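For reference, the tuned pool assembled into a standalone client. `httpx.Timeout` and `httpx.Limits` are real httpx APIs; the read/connect values here are illustrative stand-ins for `config.http_read_timeout`, and `http2=True` requires the `httpx[http2]` extra:

import httpx

client = httpx.AsyncClient(
    timeout=httpx.Timeout(300.0, connect=15.0),  # illustrative values
    trust_env=False,  # skip slow system proxy detection
    http2=True,       # needs: pip install 'httpx[http2]'
    limits=httpx.Limits(
        max_keepalive_connections=50,
        max_connections=200,
        keepalive_expiry=15.0,  # drop idle connections after 15s
    ),
)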
providers/rate_limit.py CHANGED
@@ -198,9 +198,9 @@ class GlobalRateLimiter:
         fn: Callable[..., Any],
         *args: Any,
         max_retries: int = 3,
-        base_delay: float = 1.0,
-        max_delay: float = 60.0,
-        jitter: float = 0.5,
+        base_delay: float = 0.5,  # Reduced from 1.0 for faster recovery
+        max_delay: float = 30.0,  # Reduced from 60.0 for faster fallback
+        jitter: float = 0.25,  # Reduced from 0.5 for more predictable delays
         **kwargs: Any,
     ) -> Any:
         """Execute an async callable with rate limiting and retry on 429.
  """Execute an async callable with rate limiting and retry on 429.