Spaces:

Yash030
/

claude-code-proxy

Running

App Files Files Community

Yash030 Claude Opus 4.7 committed 1 day ago

Commit

0157ac7

1 Parent(s): 0c3f08f

Deploy claude-code-nvidia proxy to Hugging Face Spaces

Browse files

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env.example +107 -0
Dockerfile +26 -0
api/__init__.py +17 -0
api/__pycache__/__init__.cpython-314.pyc +0 -0
api/__pycache__/app.cpython-314.pyc +0 -0
api/__pycache__/command_utils.cpython-314.pyc +0 -0
api/__pycache__/dependencies.cpython-314.pyc +0 -0
api/__pycache__/detection.cpython-314.pyc +0 -0
api/__pycache__/gateway_model_ids.cpython-314.pyc +0 -0
api/__pycache__/model_router.cpython-314.pyc +0 -0
api/__pycache__/optimization_handlers.cpython-314.pyc +0 -0
api/__pycache__/routes.cpython-314.pyc +0 -0
api/__pycache__/runtime.cpython-314.pyc +0 -0
api/__pycache__/services.cpython-314.pyc +0 -0
api/__pycache__/validation_log.cpython-314.pyc +0 -0
api/app.py +175 -0
api/command_utils.py +164 -0
api/dependencies.py +144 -0
api/detection.py +136 -0
api/gateway_model_ids.py +54 -0
api/model_router.py +261 -0
api/models/__init__.py +45 -0
api/models/__pycache__/__init__.cpython-314.pyc +0 -0
api/models/__pycache__/anthropic.cpython-314.pyc +0 -0
api/models/__pycache__/responses.cpython-314.pyc +0 -0
api/models/anthropic.py +163 -0
api/models/responses.py +56 -0
api/optimization_handlers.py +154 -0
api/routes.py +271 -0
api/runtime.py +338 -0
api/services.py +305 -0
api/validation_log.py +48 -0
api/web_server_tools.py +22 -0
api/web_tools/__init__.py +17 -0
api/web_tools/__pycache__/__init__.cpython-314.pyc +0 -0
api/web_tools/__pycache__/constants.cpython-314.pyc +0 -0
api/web_tools/__pycache__/egress.cpython-314.pyc +0 -0
api/web_tools/__pycache__/parsers.cpython-314.pyc +0 -0
api/web_tools/__pycache__/request.cpython-314.pyc +0 -0
api/web_tools/__pycache__/streaming.cpython-314.pyc +0 -0
api/web_tools/constants.py +15 -0
api/web_tools/egress.py +99 -0
api/web_tools/outbound.py +278 -0
api/web_tools/parsers.py +104 -0
api/web_tools/request.py +86 -0
api/web_tools/streaming.py +206 -0
cli/__init__.py +6 -0
cli/entrypoints.py +60 -0
cli/manager.py +163 -0
cli/process_registry.py +74 -0

.env.example ADDED Viewed

	@@ -0,0 +1,107 @@

+# NVIDIA NIM Config
+NVIDIA_NIM_API_KEY=""
+# All Claude model requests are mapped to these models; the plain MODEL value is the fallback.
+# Format: provider_type/model/name
+# Valid provider: "nvidia_nim"
+MODEL_OPUS=
+MODEL_SONNET=
+MODEL_HAIKU=
+MODEL="nvidia_nim/z-ai/glm4.7"
+# Optional live smoke model overrides. Smoke runs for NVIDIA NIM.
+FCC_SMOKE_MODEL_NVIDIA_NIM=
+# Thinking output
+# Per-Claude-model switches for provider reasoning requests and Claude thinking blocks.
+# Blank per-model switches inherit ENABLE_MODEL_THINKING.
+ENABLE_OPUS_THINKING=
+ENABLE_SONNET_THINKING=
+ENABLE_HAIKU_THINKING=
+ENABLE_MODEL_THINKING=true
+# Provider config
+# Per-provider proxy support: http and socks5, example: "http://username:password@host:port"
+NVIDIA_NIM_PROXY=""
+PROVIDER_RATE_LIMIT=1
+PROVIDER_RATE_WINDOW=3
+PROVIDER_MAX_CONCURRENCY=5
+# HTTP client timeouts (seconds) for provider API requests
+HTTP_READ_TIMEOUT=300
+HTTP_WRITE_TIMEOUT=10
+HTTP_CONNECT_TIMEOUT=10
+# Optional server API key (Anthropic-style). Leave empty to disable the check; replace the sample value before exposing the server.
+ANTHROPIC_AUTH_TOKEN="freecc"
+# Messaging Platform: "telegram" | "discord" | "none"
+MESSAGING_PLATFORM="discord"
+MESSAGING_RATE_LIMIT=1
+MESSAGING_RATE_WINDOW=1
+# Voice Note Transcription
+VOICE_NOTE_ENABLED=false
+# WHISPER_DEVICE: "cpu" | "cuda" | "nvidia_nim"
+# - "cpu"/"cuda": Hugging Face transformers Whisper (offline, free; install with: uv sync --extra voice_local)
+# - "nvidia_nim": NVIDIA NIM Whisper via Riva gRPC (requires NVIDIA_NIM_API_KEY; install with: uv sync --extra voice)
+# (Independent of MODEL=nvidia_nim/...: that selects the *chat* provider; this selects voice STT only.)
+WHISPER_DEVICE="nvidia_nim"
+# WHISPER_MODEL:
+# - For cpu/cuda: Hugging Face ID or short name (tiny, base, small, medium, large-v2, large-v3, large-v3-turbo)
+# - For nvidia_nim: NVIDIA NIM model (e.g., "nvidia/parakeet-ctc-1.1b-asr", "openai/whisper-large-v3")
+# - For nvidia_nim, default to "openai/whisper-large-v3" for best performance
+WHISPER_MODEL="openai/whisper-large-v3"
+HF_TOKEN=""
+# Telegram Config
+TELEGRAM_BOT_TOKEN=""
+ALLOWED_TELEGRAM_USER_ID=""
+# Discord Config
+DISCORD_BOT_TOKEN=""
+ALLOWED_DISCORD_CHANNELS=""
+# Agent Config
+CLAUDE_WORKSPACE="./agent_workspace"
+ALLOWED_DIR=""
+CLAUDE_CLI_BIN="claude"
+FAST_PREFIX_DETECTION=true
+ENABLE_NETWORK_PROBE_MOCK=true
+ENABLE_TITLE_GENERATION_SKIP=true
+ENABLE_SUGGESTION_MODE_SKIP=true
+ENABLE_FILEPATH_EXTRACTION_MOCK=true
+# Local Anthropic web_search / web_fetch handling (performs outbound HTTP; on by default)
+ENABLE_WEB_SERVER_TOOLS=true
+WEB_FETCH_ALLOWED_SCHEMES=http,https
+WEB_FETCH_ALLOW_PRIVATE_NETWORKS=false
+# Verbose diagnostics (avoid logging raw prompts / SSE bodies in production)
+DEBUG_PLATFORM_EDITS=false
+DEBUG_SUBAGENT_STACK=false
+# When true, also allows DEBUG-level httpx/httpcore/telegram log noise (not just payload logging).
+LOG_RAW_API_PAYLOADS=false
+LOG_RAW_SSE_EVENTS=false
+# When true, log full exception text and tracebacks for unhandled errors (may leak request-derived data).
+LOG_API_ERROR_TRACEBACKS=false
+# When true, log message/transcription text previews in messaging adapters (may leak user content).
+LOG_RAW_MESSAGING_CONTENT=false
+# When true, log full Claude CLI stderr, non-JSON stdout lines, and parser error text.
+LOG_RAW_CLI_DIAGNOSTICS=false
+# When true, log full exception and CLI error message strings in messaging (may leak user content).
+LOG_MESSAGING_ERROR_DETAILS=false

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.14-slim
+WORKDIR /app
+# Install uv; --no-cache-dir keeps pip's download cache out of the image layer
+RUN pip install --no-cache-dir uv
+# Copy project files
+COPY pyproject.toml uv.lock ./
+COPY api/ ./api/
+COPY cli/ ./cli/
+COPY config/ ./config/
+COPY core/ ./core/
+COPY messaging/ ./messaging/
+COPY providers/ ./providers/
+COPY server.py ./
+COPY .env.example ./
+# Install locked runtime dependencies only (no dev group)
+RUN uv sync --frozen --no-dev
+# Expose port (HF Spaces default)
+EXPOSE 7860
+# Run server
+CMD ["uv", "run", "uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]

api/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""API layer for Claude Code Proxy."""
+from .app import create_app
+from .models import (
+    MessagesRequest,
+    MessagesResponse,
+    TokenCountRequest,
+    TokenCountResponse,
+)
+__all__ = [
+    "MessagesRequest",
+    "MessagesResponse",
+    "TokenCountRequest",
+    "TokenCountResponse",
+    "create_app",
+]

api/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (431 Bytes). View file

api/__pycache__/app.cpython-314.pyc ADDED Viewed

Binary file (10.7 kB). View file

api/__pycache__/command_utils.cpython-314.pyc ADDED Viewed

Binary file (5.83 kB). View file

api/__pycache__/dependencies.cpython-314.pyc ADDED Viewed

Binary file (7.56 kB). View file

api/__pycache__/detection.cpython-314.pyc ADDED Viewed

Binary file (6.71 kB). View file

api/__pycache__/gateway_model_ids.cpython-314.pyc ADDED Viewed

Binary file (2.55 kB). View file

api/__pycache__/model_router.cpython-314.pyc ADDED Viewed

Binary file (12.6 kB). View file

api/__pycache__/optimization_handlers.cpython-314.pyc ADDED Viewed

Binary file (5.7 kB). View file

api/__pycache__/routes.cpython-314.pyc ADDED Viewed

Binary file (14.1 kB). View file

api/__pycache__/runtime.cpython-314.pyc ADDED Viewed

Binary file (20.1 kB). View file

api/__pycache__/services.cpython-314.pyc ADDED Viewed

Binary file (14 kB). View file

api/__pycache__/validation_log.cpython-314.pyc ADDED Viewed

Binary file (2.95 kB). View file

api/app.py ADDED Viewed

	@@ -0,0 +1,175 @@

+"""FastAPI application factory and configuration."""
+import traceback
+from contextlib import asynccontextmanager
+from typing import Any
+from fastapi import FastAPI, Request
+from fastapi.exception_handlers import request_validation_exception_handler
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from loguru import logger
+from starlette.types import Receive, Scope, Send
+from config.logging_config import configure_logging
+from config.settings import get_settings
+from providers.exceptions import ProviderError
+from .routes import router
+from .runtime import AppRuntime, startup_failure_message
+from .validation_log import summarize_request_validation_body
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application lifespan manager."""
+    runtime = AppRuntime.for_app(app, settings=get_settings())
+    await runtime.startup()
+    yield
+    await runtime.shutdown()
+class GracefulLifespanApp:
+    """ASGI wrapper that reports startup failures without Starlette tracebacks."""
+    def __init__(self, app: FastAPI):
+        self.app = app
+    def __getattr__(self, name: str) -> Any:
+        return getattr(self.app, name)
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        if scope["type"] != "lifespan":
+            await self.app(scope, receive, send)
+            return
+        await self._lifespan(receive, send)
+    async def _lifespan(self, receive: Receive, send: Send) -> None:
+        settings = get_settings()
+        runtime = AppRuntime.for_app(self.app, settings=settings)
+        startup_complete = False
+        while True:
+            message = await receive()
+            if message["type"] == "lifespan.startup":
+                try:
+                    await runtime.startup()
+                except Exception as exc:
+                    await send(
+                        {
+                            "type": "lifespan.startup.failed",
+                            "message": startup_failure_message(settings, exc),
+                        }
+                    )
+                    return
+                startup_complete = True
+                await send({"type": "lifespan.startup.complete"})
+                continue
+            if message["type"] == "lifespan.shutdown":
+                if startup_complete:
+                    try:
+                        await runtime.shutdown()
+                    except Exception as exc:
+                        logger.error("Shutdown failed: exc_type={}", type(exc).__name__)
+                        await send({"type": "lifespan.shutdown.failed", "message": ""})
+                        return
+                await send({"type": "lifespan.shutdown.complete"})
+                return
+def create_app(*, lifespan_enabled: bool = True) -> FastAPI:
+    """Create and configure the FastAPI application."""
+    settings = get_settings()
+    configure_logging(
+        settings.log_file, verbose_third_party=settings.log_raw_api_payloads
+    )
+    app_kwargs: dict[str, Any] = {
+        "title": "Claude Code Proxy",
+        "version": "2.0.0",
+    }
+    if lifespan_enabled:
+        app_kwargs["lifespan"] = lifespan
+    app = FastAPI(**app_kwargs)
+    # Register routes
+    app.include_router(router)
+    # Exception handlers
+    @app.exception_handler(RequestValidationError)
+    async def validation_error_handler(request: Request, exc: RequestValidationError):
+        """Log request shape for 422 debugging without content values."""
+        body: Any
+        try:
+            body = await request.json()
+        except Exception as e:
+            body = {"_json_error": type(e).__name__}
+        message_summary, tool_names = summarize_request_validation_body(body)
+        logger.debug(
+            "Request validation failed: path={} query={} error_locs={} error_types={} message_summary={} tool_names={}",
+            request.url.path,
+            str(request.url.query),
+            [list(error.get("loc", ())) for error in exc.errors()],
+            [str(error.get("type", "")) for error in exc.errors()],
+            message_summary,
+            tool_names,
+        )
+        return await request_validation_exception_handler(request, exc)
+    @app.exception_handler(ProviderError)
+    async def provider_error_handler(request: Request, exc: ProviderError):
+        """Handle provider-specific errors and return Anthropic format."""
+        err_settings = get_settings()
+        if err_settings.log_api_error_tracebacks:
+            logger.error(
+                "Provider Error: error_type={} status_code={} message={}",
+                exc.error_type,
+                exc.status_code,
+                exc.message,
+            )
+        else:
+            logger.error(
+                "Provider Error: error_type={} status_code={}",
+                exc.error_type,
+                exc.status_code,
+            )
+        return JSONResponse(
+            status_code=exc.status_code,
+            content=exc.to_anthropic_format(),
+        )
+    @app.exception_handler(Exception)
+    async def general_error_handler(request: Request, exc: Exception):
+        """Handle general errors and return Anthropic format."""
+        settings = get_settings()
+        if settings.log_api_error_tracebacks:
+            logger.error("General Error: {}", exc)
+            logger.error(traceback.format_exc())
+        else:
+            logger.error(
+                "General Error: path={} method={} exc_type={}",
+                request.url.path,
+                request.method,
+                type(exc).__name__,
+            )
+        return JSONResponse(
+            status_code=500,
+            content={
+                "type": "error",
+                "error": {
+                    "type": "api_error",
+                    "message": "An unexpected error occurred.",
+                },
+            },
+        )
+    return app
+def create_asgi_app() -> GracefulLifespanApp:
+    """Create the server ASGI app with graceful lifespan failure reporting."""
+    return GracefulLifespanApp(create_app(lifespan_enabled=False))

api/command_utils.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Command parsing utilities for API optimizations."""
+import re
+import shlex
+_ENV_ASSIGNMENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=.*$")
+def _is_env_assignment(part: str) -> bool:
+    """Return True when a token is a shell-style env assignment."""
+    return bool(_ENV_ASSIGNMENT_RE.match(part))
+def _strip_env_assignments(parts: list[str]) -> list[str]:
+    """Return command parts after leading shell-style env assignments."""
+    cmd_start = 0
+    for i, part in enumerate(parts):
+        if _is_env_assignment(part):
+            cmd_start = i + 1
+        else:
+            break
+    return parts[cmd_start:]
+def extract_command_prefix(command: str) -> str:
+    """Extract the command prefix for fast prefix detection.
+    Parses a shell command safely, handling environment variables and
+    command injection attempts. Returns the command prefix suitable
+    for quick identification.
+    Returns:
+        Command prefix (e.g., "git", "git commit", "npm install")
+        or "none" if no valid command found
+    """
+    if "`" in command or "$(" in command:
+        return "command_injection_detected"
+    try:
+        parts = shlex.split(command, posix=False)
+        if not parts:
+            return "none"
+        env_prefix = []
+        cmd_start = 0
+        for i, part in enumerate(parts):
+            if _is_env_assignment(part):
+                env_prefix.append(part)
+                cmd_start = i + 1
+            else:
+                break
+        if cmd_start >= len(parts):
+            return "none"
+        cmd_parts = parts[cmd_start:]
+        if not cmd_parts:
+            return "none"
+        first_word = cmd_parts[0]
+        two_word_commands = {
+            "git",
+            "npm",
+            "docker",
+            "kubectl",
+            "cargo",
+            "go",
+            "pip",
+            "yarn",
+        }
+        if first_word in two_word_commands and len(cmd_parts) > 1:
+            second_word = cmd_parts[1]
+            if not second_word.startswith("-"):
+                return f"{first_word} {second_word}"
+            return first_word
+        return first_word if not env_prefix else " ".join(env_prefix) + " " + first_word
+    except ValueError:
+        parts = command.split()
+        if not parts:
+            return "none"
+        cmd_parts = _strip_env_assignments(parts)
+        return cmd_parts[0] if cmd_parts else "none"
+def extract_filepaths_from_command(command: str, output: str) -> str:
+    """Extract file paths from a command locally without API call.
+    Determines if the command reads file contents and extracts paths accordingly.
+    Commands like ls/dir/find just list files, so return empty.
+    Commands like cat/head/tail actually read contents, so extract the file path.
+    Returns:
+        Filepath extraction result in <filepaths> format
+    """
+    listing_commands = {
+        "ls",
+        "dir",
+        "find",
+        "tree",
+        "pwd",
+        "cd",
+        "mkdir",
+        "rmdir",
+        "rm",
+    }
+    reading_commands = {"cat", "head", "tail", "less", "more", "bat", "type"}
+    try:
+        parts = shlex.split(command, posix=False)
+        if not parts:
+            return "<filepaths>\n</filepaths>"
+        cmd_parts = _strip_env_assignments(parts)
+        if not cmd_parts:
+            return "<filepaths>\n</filepaths>"
+        base_cmd = cmd_parts[0].split("/")[-1].split("\\")[-1].lower()
+        if base_cmd in listing_commands:
+            return "<filepaths>\n</filepaths>"
+        if base_cmd in reading_commands:
+            filepaths = []
+            for part in cmd_parts[1:]:
+                if part.startswith("-"):
+                    continue
+                filepaths.append(part)
+            if filepaths:
+                paths_str = "\n".join(filepaths)
+                return f"<filepaths>\n{paths_str}\n</filepaths>"
+            return "<filepaths>\n</filepaths>"
+        if base_cmd == "grep":
+            flags_with_args = {"-e", "-f", "-m", "-A", "-B", "-C"}
+            pattern_provided_via_flag = False
+            positional = []
+            skip_next = False
+            for part in cmd_parts[1:]:
+                if skip_next:
+                    skip_next = False
+                    continue
+                if part.startswith("-"):
+                    if part in flags_with_args:
+                        if part in {"-e", "-f"}:
+                            pattern_provided_via_flag = True
+                        skip_next = True
+                    continue
+                positional.append(part)
+            filepaths = positional if pattern_provided_via_flag else positional[1:]
+            if filepaths:
+                paths_str = "\n".join(filepaths)
+                return f"<filepaths>\n{paths_str}\n</filepaths>"
+            return "<filepaths>\n</filepaths>"
+        return "<filepaths>\n</filepaths>"
+    except ValueError:
+        return "<filepaths>\n</filepaths>"

api/dependencies.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""Dependency injection for FastAPI."""
+import secrets
+from fastapi import Depends, HTTPException, Request
+from loguru import logger
+from starlette.applications import Starlette
+from config.settings import Settings
+from config.settings import get_settings as _get_settings
+from core.anthropic import get_user_facing_error_message
+from providers.base import BaseProvider
+from providers.exceptions import (
+    AuthenticationError,
+    ServiceUnavailableError,
+    UnknownProviderTypeError,
+)
+from providers.registry import PROVIDER_DESCRIPTORS, ProviderRegistry
+# Process-level cache: only for :func:`get_provider_for_type` / :func:`get_provider`
+# when there is no ``Request``/``app`` (unit tests, scripts). HTTP handlers must pass
+# ``app`` to :func:`resolve_provider` so the app-scoped registry is used.
+_providers: dict[str, BaseProvider] = {}
+def get_settings() -> Settings:
+    """Return cached :class:`~config.settings.Settings` (FastAPI-friendly alias)."""
+    return _get_settings()
+def resolve_provider(
+    provider_type: str,
+    *,
+    app: Starlette | None,
+    settings: Settings,
+) -> BaseProvider:
+    """Resolve a provider using the app-scoped registry when ``app`` is set.
+    When ``app`` is not ``None``, the app-owned :attr:`app.state.provider_registry`
+    must exist (installed by :class:`~api.runtime.AppRuntime` during startup).
+    Callers that construct a bare ``FastAPI`` without lifespan must set
+    ``app.state.provider_registry`` explicitly.
+    When ``app`` is ``None`` (no HTTP context), uses the process-level
+    :data:`_providers` cache only.
+    """
+    if app is not None:
+        reg = getattr(app.state, "provider_registry", None)
+        if reg is None:
+            raise ServiceUnavailableError(
+                "Provider registry is not configured. Ensure AppRuntime startup ran "
+                "or assign app.state.provider_registry for test apps."
+            )
+        return _resolve_with_registry(reg, provider_type, settings)
+    return _resolve_with_registry(ProviderRegistry(_providers), provider_type, settings)
+def _resolve_with_registry(
+    registry: ProviderRegistry, provider_type: str, settings: Settings
+) -> BaseProvider:
+    should_log_init = not registry.is_cached(provider_type)
+    try:
+        provider = registry.get(provider_type, settings)
+    except AuthenticationError as e:
+        # Provider :class:`~providers.exceptions.AuthenticationError` messages are
+        # curated configuration hints (env var names, docs links), not upstream noise.
+        detail = str(e).strip() or get_user_facing_error_message(e)
+        raise HTTPException(status_code=503, detail=detail) from e
+    except UnknownProviderTypeError:
+        logger.error(
+            "Unknown provider_type: '{}'. Supported: {}",
+            provider_type,
+            ", ".join(f"'{key}'" for key in PROVIDER_DESCRIPTORS),
+        )
+        raise
+    if should_log_init:
+        logger.info("Provider initialized: {}", provider_type)
+    return provider
+def get_provider_for_type(provider_type: str) -> BaseProvider:
+    """Get or create a provider in the process-level cache (no ``app``/Request).
+    HTTP route handlers should call :func:`resolve_provider` with the active
+    :attr:`request.app` (via :class:`~api.runtime.AppRuntime`) instead of this
+    process-wide cache.
+    """
+    return resolve_provider(provider_type, app=None, settings=get_settings())
+def require_api_key(
+    request: Request, settings: Settings = Depends(get_settings)
+) -> None:
+    """Require a server API key (Anthropic-style).
+    Checks `x-api-key` header or `Authorization: Bearer ...` against
+    `Settings.anthropic_auth_token`. If `ANTHROPIC_AUTH_TOKEN` is empty, this is a no-op.
+    """
+    anthropic_auth_token = settings.anthropic_auth_token
+    if not anthropic_auth_token:
+        # No API key configured -> allow
+        return
+    header = (
+        request.headers.get("x-api-key")
+        or request.headers.get("authorization")
+        or request.headers.get("anthropic-auth-token")
+    )
+    if not header:
+        raise HTTPException(status_code=401, detail="Missing API key")
+    # Support both raw key in X-API-Key and Bearer token in Authorization
+    token = header
+    if header.lower().startswith("bearer "):
+        token = header.split(" ", 1)[1]
+    # Strip anything after the first colon to handle tokens with appended model names
+    if token and ":" in token:
+        token = token.split(":", 1)[0]
+    # Constant-time comparison to avoid leaking the configured token via
+    # response-time differences on a per-byte mismatch (CWE-208).
+    if not secrets.compare_digest(
+        token.encode("utf-8"), anthropic_auth_token.encode("utf-8")
+    ):
+        raise HTTPException(status_code=401, detail="Invalid API key")
+def get_provider() -> BaseProvider:
+    """Get or create the default provider (``MODEL`` / ``provider_type``).
+    Process-cache helper for scripts, unit tests, and non-FastAPI callers. HTTP
+    handlers must use :func:`resolve_provider` with :attr:`request.app` so the
+    app-scoped :class:`~providers.registry.ProviderRegistry` is used.
+    """
+    return get_provider_for_type(get_settings().provider_type)
+async def cleanup_provider():
+    """Cleanup all provider resources."""
+    global _providers
+    await ProviderRegistry(_providers).cleanup()
+    _providers = {}
+    logger.debug("Provider cleanup completed")

api/detection.py ADDED Viewed

	@@ -0,0 +1,136 @@

+"""Request detection utilities for API optimizations.
+Detects quota checks, title generation, prefix detection, suggestion mode,
+and filepath extraction requests to enable fast-path responses.
+"""
+from core.anthropic import extract_text_from_content
+from .models.anthropic import MessagesRequest
+def is_quota_check_request(request_data: MessagesRequest) -> bool:
+    """Check if this is a quota probe request.
+    Quota checks are typically simple requests with max_tokens=1
+    and a single message containing the word "quota".
+    """
+    if (
+        request_data.max_tokens == 1
+        and len(request_data.messages) == 1
+        and request_data.messages[0].role == "user"
+    ):
+        text = extract_text_from_content(request_data.messages[0].content)
+        if "quota" in text.lower():
+            return True
+    return False
+def is_title_generation_request(request_data: MessagesRequest) -> bool:
+    """Check if this is a conversation title generation request.
+    Title generation requests are detected by a system prompt containing
+    title extraction instructions, no tools, and a single user message.
+    Matches Claude Code session title prompts (sentence-case title, JSON
+    \"title\" field, etc.).
+    """
+    if not request_data.system or request_data.tools:
+        return False
+    system_text = extract_text_from_content(request_data.system).lower()
+    if "title" not in system_text:
+        return False
+    return "sentence-case title" in system_text or (
+        "return json" in system_text
+        and "field" in system_text
+        and ("coding session" in system_text or "this session" in system_text)
+    )
+def is_prefix_detection_request(request_data: MessagesRequest) -> tuple[bool, str]:
+    """Check if this is a fast prefix detection request.
+    Prefix detection requests contain a policy_spec block and
+    a Command: section for extracting shell command prefixes.
+    Returns:
+        Tuple of (is_prefix_request, command_string)
+    """
+    if len(request_data.messages) != 1 or request_data.messages[0].role != "user":
+        return False, ""
+    content = extract_text_from_content(request_data.messages[0].content)
+    if "<policy_spec>" in content and "Command:" in content:
+        try:
+            cmd_start = content.rfind("Command:") + len("Command:")
+            return True, content[cmd_start:].strip()
+        except TypeError:
+            return False, ""
+    return False, ""
+def is_suggestion_mode_request(request_data: MessagesRequest) -> bool:
+    """Check if this is a suggestion mode request.
+    Suggestion mode requests contain "[SUGGESTION MODE:" in the user's message,
+    used for auto-suggesting what the user might type next.
+    """
+    for msg in request_data.messages:
+        if msg.role == "user":
+            text = extract_text_from_content(msg.content)
+            if "[SUGGESTION MODE:" in text:
+                return True
+    return False
+def is_filepath_extraction_request(
+    request_data: MessagesRequest,
+) -> tuple[bool, str, str]:
+    """Check if this is a filepath extraction request.
+    Filepath extraction requests have a single user message with
+    "Command:" and "Output:" sections, asking to extract file paths
+    from command output.
+    Returns:
+        Tuple of (is_filepath_request, command, output)
+    """
+    if len(request_data.messages) != 1 or request_data.messages[0].role != "user":
+        return False, "", ""
+    if request_data.tools:
+        return False, "", ""
+    content = extract_text_from_content(request_data.messages[0].content)
+    if "Command:" not in content or "Output:" not in content:
+        return False, "", ""
+    # Match if user content OR system block indicates filepath extraction
+    user_has_filepaths = (
+        "filepaths" in content.lower() or "<filepaths>" in content.lower()
+    )
+    system_text = (
+        extract_text_from_content(request_data.system) if request_data.system else ""
+    )
+    system_has_extract = (
+        "extract any file paths" in system_text.lower()
+        or "file paths that this command" in system_text.lower()
+    )
+    if not user_has_filepaths and not system_has_extract:
+        return False, "", ""
+    cmd_start = content.find("Command:") + len("Command:")
+    output_marker = content.find("Output:", cmd_start)
+    if output_marker == -1:
+        return False, "", ""
+    command = content[cmd_start:output_marker].strip()
+    output = content[output_marker + len("Output:") :].strip()
+    for marker in ["<", "\n\n"]:
+        if marker in output:
+            output = output.split(marker)[0].strip()
+    return True, command, output

api/gateway_model_ids.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""Gateway-safe model id encoding for Claude Code model discovery."""
+from __future__ import annotations
+from dataclasses import dataclass
+GATEWAY_MODEL_ID_PREFIX = "anthropic"
+# Claude Code currently treats any model id containing ``claude-3-`` as not
+# supporting thinking. This intentionally uses that client-side capability
+# heuristic while keeping the real provider/model ref reversible for routing.
+NO_THINKING_GATEWAY_MODEL_ID_PREFIX = "claude-3-freecc-no-thinking"
+@dataclass(frozen=True, slots=True)
+class DecodedGatewayModelId:
+    """Parts recovered from a gateway-advertised model id by ``decode_gateway_model_id``."""
+    # Path segment that directly follows the gateway prefix.
+    provider_id: str
+    # Everything after the provider segment (may itself contain "/").
+    provider_model: str
+    # None for the normal gateway prefix; False for the no-thinking prefix.
+    force_thinking_enabled: bool | None = None
+def gateway_model_id(provider_model_ref: str) -> str:
+    """Return the normal Claude Code-discoverable id for a provider/model ref."""
+    return f"{GATEWAY_MODEL_ID_PREFIX}/{provider_model_ref}"
+def no_thinking_gateway_model_id(provider_model_ref: str) -> str:
+    """Return a Claude Code-discoverable id that disables client thinking."""
+    return f"{NO_THINKING_GATEWAY_MODEL_ID_PREFIX}/{provider_model_ref}"
+def decode_gateway_model_id(model_name: str) -> DecodedGatewayModelId | None:
+    """Decode a model id advertised by this gateway, if it is one."""
+    prefix, separator, remainder = model_name.partition("/")
+    if not separator:
+        return None
+    force_thinking_enabled: bool | None
+    if prefix == GATEWAY_MODEL_ID_PREFIX:
+        force_thinking_enabled = None
+    elif prefix == NO_THINKING_GATEWAY_MODEL_ID_PREFIX:
+        force_thinking_enabled = False
+    else:
+        return None
+    provider_id, provider_separator, provider_model = remainder.partition("/")
+    if not provider_separator or not provider_model:
+        return None
+    return DecodedGatewayModelId(
+        provider_id=provider_id,
+        provider_model=provider_model,
+        force_thinking_enabled=force_thinking_enabled,
+    )

api/model_router.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""Model routing for Claude-compatible requests."""
+from __future__ import annotations
+from dataclasses import dataclass
+from loguru import logger
+from config.provider_ids import SUPPORTED_PROVIDER_IDS
+from config.settings import Settings
+from .gateway_model_ids import decode_gateway_model_id
+from .models.anthropic import MessagesRequest, TokenCountRequest
+from providers.rate_limit import GlobalRateLimiter
+@dataclass(frozen=True, slots=True)
+class ResolvedModel:
+    """Immutable record describing how a requested model name was resolved."""
+    # NOTE(review): presumably the model name exactly as the client requested it — confirm in ModelRouter.resolve.
+    original_model: str
+    # NOTE(review): presumably one of SUPPORTED_PROVIDER_IDS — confirm.
+    provider_id: str
+    # NOTE(review): presumably the model id passed to that provider — confirm.
+    provider_model: str
+    # NOTE(review): presumably the combined "provider/model" reference — confirm.
+    provider_model_ref: str
+    # NOTE(review): presumably whether thinking is enabled for this request — confirm.
+    thinking_enabled: bool
@dataclass(frozen=True, slots=True)
class RoutedMessagesRequest:
    """A messages request paired with the routing decision made for it."""

    # Copy of the incoming request whose ``model`` is the provider model.
    request: MessagesRequest
    # Routing decision, including the originally requested model name.
    resolved: ResolvedModel
@dataclass(frozen=True, slots=True)
class RoutedTokenCountRequest:
    """A token-count request paired with the routing decision made for it."""

    # Copy of the incoming request whose ``model`` is the provider model.
    request: TokenCountRequest
    # Routing decision, including the originally requested model name.
    resolved: ResolvedModel
class ModelRouter:
    """Resolve incoming Claude model names to configured provider/model pairs."""

    def __init__(self, settings: Settings):
        self._settings = settings

    @staticmethod
    def _split_csv(raw: str | None) -> list[str]:
        """Split a comma-separated setting into stripped, non-empty entries."""
        stripped = (part.strip() for part in (raw or "").split(","))
        return [part for part in stripped if part]

    def _is_auto(self, model_name: str) -> bool:
        """Return whether the model name refers to the virtual 'auto' model.

        Besides the bare ``auto`` / ``anthropic/auto`` spellings this also
        accepts the gateway-prefixed id that the models listing advertises
        (``gateway_model_id("auto")``); otherwise a client selecting the
        advertised entry would never reach the auto/fallback path.
        """
        name_lower = model_name.lower()
        return (
            name_lower == "auto"
            or name_lower == "anthropic/auto"
            or name_lower == gateway_model_id("auto").lower()
        )

    def _normalize_candidate_ref(self, raw_ref: str) -> str | None:
        """Normalize auto candidate refs to ``provider/model`` when possible.

        Returns ``None`` for empty input. Refs that already start with a
        supported provider id are kept; anything else is prefixed with
        ``nvidia_nim/``.
        """
        candidate = (raw_ref or "").strip()
        if not candidate:
            return None
        provider_id, separator, remainder = candidate.partition("/")
        if separator and provider_id in SUPPORTED_PROVIDER_IDS and remainder:
            return f"{provider_id}/{remainder}"
        # Treat bare model ids and vendor/model ids as NVIDIA NIM models.
        return f"nvidia_nim/{candidate}"

    def _resolved_from_ref(
        self, original_model: str, provider_model_ref: str
    ) -> ResolvedModel:
        """Build a ResolvedModel by parsing a ``provider/model`` reference.

        Thinking is resolved from the *requested* model name, not the ref.
        """
        return ResolvedModel(
            original_model=original_model,
            provider_id=Settings.parse_provider_type(provider_model_ref),
            provider_model=Settings.parse_model_name(provider_model_ref),
            provider_model_ref=provider_model_ref,
            thinking_enabled=self._settings.resolve_thinking(original_model),
        )

    def _resolve_auto(self, claude_model_name: str) -> ResolvedModel:
        """Resolve the virtual 'auto' model to a single provider model.

        Walks AUTO_MODEL_ORDER and returns the first entry whose provider is
        configured; falls back to the default MODEL when the order is empty
        or nothing in it is usable. Entries are normalized the same way as in
        ``resolve_candidates`` so both methods agree on what a bare or
        vendor-prefixed entry means.
        """
        for raw_ref in self._split_csv(self._settings.auto_model_order):
            ref = self._normalize_candidate_ref(raw_ref)
            if ref is None:
                continue
            provider_id = Settings.parse_provider_type(ref)
            if self._settings.provider_is_configured(provider_id):
                return self._resolved_from_ref(claude_model_name, ref)
        # No explicit order matched or none configured: use the default MODEL.
        return self._resolved_from_ref(claude_model_name, self._settings.model)

    def resolve(self, claude_model_name: str) -> ResolvedModel:
        """Resolve a requested model name to one provider/model pair.

        Resolution order:
        1. the virtual 'auto' model (see ``_resolve_auto``);
        2. a direct ``provider/model`` or gateway-encoded id;
        3. the settings-level model mapping.

        The requested name is always preserved in ``original_model``.
        """
        if self._is_auto(claude_model_name):
            return self._resolve_auto(claude_model_name)

        (
            direct_provider_id,
            direct_provider_model,
            force_thinking_enabled,
        ) = self._direct_provider_model(claude_model_name)
        if direct_provider_id is not None and direct_provider_model is not None:
            thinking_enabled = (
                force_thinking_enabled
                if force_thinking_enabled is not None
                else self._settings.resolve_thinking(direct_provider_model)
            )
            logger.debug(
                "MODEL DIRECT: '{}' -> provider='{}' model='{}' thinking={}",
                claude_model_name,
                direct_provider_id,
                direct_provider_model,
                thinking_enabled,
            )
            return ResolvedModel(
                original_model=claude_model_name,
                provider_id=direct_provider_id,
                provider_model=direct_provider_model,
                # Always a plain ``provider/model`` ref, as on every other
                # path; gateway-encoded ids would otherwise leak their prefix
                # here (the requested name stays in ``original_model``).
                provider_model_ref=f"{direct_provider_id}/{direct_provider_model}",
                thinking_enabled=thinking_enabled,
            )

        provider_model_ref = self._settings.resolve_model(claude_model_name)
        resolved = self._resolved_from_ref(claude_model_name, provider_model_ref)
        if resolved.provider_model != claude_model_name:
            logger.debug(
                "MODEL MAPPING: '{}' -> '{}'",
                claude_model_name,
                resolved.provider_model,
            )
        return resolved

    def resolve_candidates(self, claude_model_name: str) -> list[ResolvedModel]:
        """Resolve a model name to a prioritized list of candidates.

        Used by the 'auto' routing logic to implement provider-side failover.
        Non-auto names yield a single-element list. For 'auto', candidates
        whose provider rate limiter is not blocked come first, followed by
        blocked ones; each group keeps configuration order and duplicates
        are dropped.
        """
        if not self._is_auto(claude_model_name):
            return [self.resolve(claude_model_name)]

        healthy_candidates: list[ResolvedModel] = []
        blocked_candidates: list[ResolvedModel] = []
        seen: set[str] = set()

        def add_candidate(ref: str | None, source: str) -> None:
            normalized_ref = self._normalize_candidate_ref(ref or "")
            if normalized_ref is None or normalized_ref in seen:
                return
            provider_id = Settings.parse_provider_type(normalized_ref)
            provider_model = Settings.parse_model_name(normalized_ref)
            if not self._settings.provider_is_configured(provider_id):
                logger.debug(
                    "Routing: candidate '{}' (from {}) is NOT CONFIGURED",
                    normalized_ref,
                    source,
                )
                return
            seen.add(normalized_ref)
            resolved = ResolvedModel(
                original_model=claude_model_name,
                provider_id=provider_id,
                provider_model=provider_model,
                provider_model_ref=normalized_ref,
                thinking_enabled=self._settings.resolve_thinking(claude_model_name),
            )
            limiter = GlobalRateLimiter.get_scoped_instance(provider_id)
            if limiter.is_blocked():
                logger.debug(
                    "Routing: candidate '{}' (from {}) is BLOCKED",
                    normalized_ref,
                    source,
                )
                blocked_candidates.append(resolved)
            else:
                logger.debug(
                    "Routing: added candidate '{}' (from {})",
                    normalized_ref,
                    source,
                )
                healthy_candidates.append(resolved)

        # 1. Preferred order (AUTO_MODEL_ORDER)
        for cand in self._split_csv(self._settings.auto_model_order):
            add_candidate(cand, "AUTO_MODEL_PRIORITY")
        # 2. Main MODEL
        add_candidate(self._settings.model, "MODEL")
        # 3. NVIDIA Fallbacks
        for cand in self._split_csv(self._settings.nvidia_nim_fallback_models):
            add_candidate(cand, "NVIDIA_NIM_FALLBACK_MODELS")
        # 4. Model-specific overrides
        add_candidate(self._settings.model_opus, "MODEL_OPUS")
        add_candidate(self._settings.model_sonnet, "MODEL_SONNET")
        add_candidate(self._settings.model_haiku, "MODEL_HAIKU")

        all_candidates = healthy_candidates + blocked_candidates
        logger.info(
            "Routing: resolved '{}' to {} candidates: {}",
            claude_model_name,
            len(all_candidates),
            ", ".join(c.provider_model_ref for c in all_candidates),
        )
        return all_candidates

    def _direct_provider_model(
        self, model_name: str
    ) -> tuple[str | None, str | None, bool | None]:
        """Return ``(provider_id, provider_model, force_thinking)`` for direct ids.

        Handles gateway-encoded ids first, then plain ``provider/model``
        names. Returns ``(None, None, None)`` when the name is neither, or
        when the provider is not in ``SUPPORTED_PROVIDER_IDS``.
        """
        decoded = decode_gateway_model_id(model_name)
        if decoded is not None:
            if decoded.provider_id not in SUPPORTED_PROVIDER_IDS:
                return None, None, None
            return (
                decoded.provider_id,
                decoded.provider_model,
                decoded.force_thinking_enabled,
            )
        provider_id, separator, provider_model = model_name.partition("/")
        if (
            not separator
            or provider_id not in SUPPORTED_PROVIDER_IDS
            or not provider_model
        ):
            return None, None, None
        return provider_id, provider_model, None

    def resolve_messages_request(
        self, request: MessagesRequest
    ) -> RoutedMessagesRequest:
        """Return an internal routed request context.

        The incoming request is deep-copied; only the copy's ``model`` is
        replaced with the provider model.
        """
        resolved = self.resolve(request.model)
        routed = request.model_copy(
            update={"model": resolved.provider_model}, deep=True
        )
        return RoutedMessagesRequest(request=routed, resolved=resolved)

    def resolve_token_count_request(
        self, request: TokenCountRequest
    ) -> RoutedTokenCountRequest:
        """Return an internal token-count request context.

        The incoming request is deep-copied; only the copy's ``model`` is
        replaced with the provider model.
        """
        resolved = self.resolve(request.model)
        routed = request.model_copy(
            update={"model": resolved.provider_model}, deep=True
        )
        return RoutedTokenCountRequest(request=routed, resolved=resolved)

api/models/__init__.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""API models exports."""
+from .anthropic import (
+    ContentBlockImage,
+    ContentBlockRedactedThinking,
+    ContentBlockText,
+    ContentBlockThinking,
+    ContentBlockToolResult,
+    ContentBlockToolUse,
+    Message,
+    MessagesRequest,
+    Role,
+    SystemContent,
+    ThinkingConfig,
+    TokenCountRequest,
+    Tool,
+)
+from .responses import (
+    MessagesResponse,
+    ModelResponse,
+    ModelsListResponse,
+    TokenCountResponse,
+    Usage,
+)
+__all__ = [
+    "ContentBlockImage",
+    "ContentBlockRedactedThinking",
+    "ContentBlockText",
+    "ContentBlockThinking",
+    "ContentBlockToolResult",
+    "ContentBlockToolUse",
+    "Message",
+    "MessagesRequest",
+    "MessagesResponse",
+    "ModelResponse",
+    "ModelsListResponse",
+    "Role",
+    "SystemContent",
+    "ThinkingConfig",
+    "TokenCountRequest",
+    "TokenCountResponse",
+    "Tool",
+    "Usage",
+]

api/models/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (849 Bytes). View file

api/models/__pycache__/anthropic.cpython-314.pyc ADDED Viewed

Binary file (11.6 kB). View file

api/models/__pycache__/responses.cpython-314.pyc ADDED Viewed

Binary file (3.69 kB). View file

api/models/anthropic.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""Pydantic models for Anthropic-compatible requests."""
+from enum import StrEnum
+from typing import Any, Literal
+from pydantic import BaseModel, ConfigDict, Field
+# =============================================================================
+# Content Block Types
+# =============================================================================
class Role(StrEnum):
    # Speaker roles as plain string values.
    user = "user"
    assistant = "assistant"
    system = "system"
class _AnthropicBlockBase(BaseModel):
    """Pass through provider fields (e.g. ``cache_control``) for native transports."""

    # ``extra="allow"`` keeps unknown fields instead of rejecting them.
    model_config = ConfigDict(extra="allow")
class ContentBlockText(_AnthropicBlockBase):
    # Plain text content block.
    type: Literal["text"]
    text: str
class ContentBlockImage(_AnthropicBlockBase):
    # Image content block; ``source`` is kept as an unvalidated mapping.
    type: Literal["image"]
    source: dict[str, Any]
class ContentBlockToolUse(_AnthropicBlockBase):
    # Tool invocation block: tool ``name`` plus its ``input`` arguments.
    type: Literal["tool_use"]
    id: str
    name: str
    input: dict[str, Any]
class ContentBlockToolResult(_AnthropicBlockBase):
    # Result for an earlier tool invocation, linked through ``tool_use_id``.
    type: Literal["tool_result"]
    tool_use_id: str
    # Accepted as a string, a list, or a mapping.
    content: str | list[Any] | dict[str, Any]
class ContentBlockThinking(_AnthropicBlockBase):
    # Thinking block; ``signature`` is optional.
    type: Literal["thinking"]
    thinking: str
    signature: str | None = None
class ContentBlockRedactedThinking(_AnthropicBlockBase):
    # Redacted thinking block carrying an opaque ``data`` string.
    type: Literal["redacted_thinking"]
    data: str
class ContentBlockServerToolUse(_AnthropicBlockBase):
    """Anthropic server-side tool invocation (e.g. ``web_search``, ``web_fetch``)."""

    # Same field layout as ``ContentBlockToolUse``; only ``type`` differs.
    type: Literal["server_tool_use"]
    id: str
    name: str
    input: dict[str, Any]
class ContentBlockWebSearchToolResult(_AnthropicBlockBase):
    # Web search tool result; ``content`` is accepted without validation.
    type: Literal["web_search_tool_result"]
    tool_use_id: str
    content: Any
class ContentBlockWebFetchToolResult(_AnthropicBlockBase):
    # Web fetch tool result; ``content`` is accepted without validation.
    type: Literal["web_fetch_tool_result"]
    tool_use_id: str
    content: Any
class SystemContent(_AnthropicBlockBase):
    # One text block of a list-form ``system`` prompt.
    type: Literal["text"]
    text: str
+# =============================================================================
+# Message Types
+# =============================================================================
class Message(BaseModel):
    # A single conversation turn; only user and assistant roles are accepted.
    role: Literal["user", "assistant"]
    # Either a plain string or a list of typed content blocks.
    content: (
        str
        | list[
            ContentBlockText
            | ContentBlockImage
            | ContentBlockToolUse
            | ContentBlockToolResult
            | ContentBlockThinking
            | ContentBlockRedactedThinking
            | ContentBlockServerToolUse
            | ContentBlockWebSearchToolResult
            | ContentBlockWebFetchToolResult
        ]
    )
    # Optional reasoning text attached at message level.
    reasoning_content: str | None = None
class Tool(_AnthropicBlockBase):
    # Tool definition supplied by the client; only ``name`` is required.
    name: str
    # Anthropic server tools (e.g. web_search beta tools) include a ``type`` and
    # may omit ``input_schema`` because the provider owns the schema.
    type: str | None = None
    description: str | None = None
    input_schema: dict[str, Any] | None = None
class ThinkingConfig(BaseModel):
    # Client thinking settings; ``enabled`` defaults to True when omitted.
    enabled: bool | None = True
    type: str | None = None
    budget_tokens: int | None = None
+# =============================================================================
+# Request Models
+# =============================================================================
class MessagesRequest(BaseModel):
    # Body of a messages request. Unknown fields are kept (``extra="allow"``).
    model_config = ConfigDict(extra="allow")
    model: str
    # Internal routing / debug: accepted on parse but not serialized to providers.
    original_model: str | None = Field(default=None, exclude=True)
    resolved_provider_model: str | None = Field(default=None, exclude=True)
    max_tokens: int | None = None
    messages: list[Message]
    system: str | list[SystemContent] | None = None
    stop_sequences: list[str] | None = None
    # Defaults to True when the client omits it.
    stream: bool | None = True
    temperature: float | None = None
    top_p: float | None = None
    top_k: int | None = None
    metadata: dict[str, Any] | None = None
    tools: list[Tool] | None = None
    tool_choice: dict[str, Any] | None = None
    thinking: ThinkingConfig | None = None
    # Native Anthropic / SDK client hints: ignored (not forwarded) for OpenAI Chat conversion.
    context_management: dict[str, Any] | None = None
    output_config: dict[str, Any] | None = None
    mcp_servers: list[dict[str, Any]] | None = None
    extra_body: dict[str, Any] | None = None
class TokenCountRequest(BaseModel):
    # Body of a token-count request. Unknown fields are kept (``extra="allow"``).
    model_config = ConfigDict(extra="allow")
    model: str
    # Internal routing fields: accepted on parse, excluded from serialization.
    original_model: str | None = Field(default=None, exclude=True)
    resolved_provider_model: str | None = Field(default=None, exclude=True)
    messages: list[Message]
    system: str | list[SystemContent] | None = None
    tools: list[Tool] | None = None
    thinking: ThinkingConfig | None = None
    tool_choice: dict[str, Any] | None = None
    context_management: dict[str, Any] | None = None
    output_config: dict[str, Any] | None = None
    mcp_servers: list[dict[str, Any]] | None = None

api/models/responses.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""Pydantic models for API responses."""
+from typing import Any, Literal
+from pydantic import BaseModel
+from .anthropic import (
+    ContentBlockRedactedThinking,
+    ContentBlockText,
+    ContentBlockThinking,
+    ContentBlockToolUse,
+)
class TokenCountResponse(BaseModel):
    # Response body for token counting: a single input-token total.
    input_tokens: int
class ModelResponse(BaseModel):
    # One entry of the models listing.
    created_at: str
    display_name: str
    id: str
    # Constant discriminator; always "model".
    type: Literal["model"] = "model"
class ModelsListResponse(BaseModel):
    # Models listing with cursor-style paging fields.
    data: list[ModelResponse]
    first_id: str | None
    has_more: bool
    last_id: str | None
class Usage(BaseModel):
    # Token accounting for a response; cache counters default to zero.
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int = 0
    cache_read_input_tokens: int = 0
class MessagesResponse(BaseModel):
    # Non-streaming message response body.
    id: str
    model: str
    role: Literal["assistant"] = "assistant"
    # Typed content blocks, with raw dicts also accepted.
    content: list[
        ContentBlockText
        | ContentBlockToolUse
        | ContentBlockThinking
        | ContentBlockRedactedThinking
        | dict[str, Any]
    ]
    type: Literal["message"] = "message"
    stop_reason: (
        Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"] | None
    ) = None
    stop_sequence: str | None = None
    usage: Usage

api/optimization_handlers.py ADDED Viewed

	@@ -0,0 +1,154 @@

+"""Optimization handlers for fast-path API responses.
+Each handler returns a MessagesResponse if the request matches and the
+optimization is enabled, otherwise None.
+"""
+import uuid
+from loguru import logger
+from config.settings import Settings
+from .command_utils import extract_command_prefix, extract_filepaths_from_command
+from .detection import (
+    is_filepath_extraction_request,
+    is_prefix_detection_request,
+    is_quota_check_request,
+    is_suggestion_mode_request,
+    is_title_generation_request,
+)
+from .models.anthropic import MessagesRequest
+from .models.responses import MessagesResponse, Usage
def _text_response(
    request_data: MessagesRequest,
    text: str,
    *,
    input_tokens: int,
    output_tokens: int,
) -> MessagesResponse:
    """Build a synthetic single-text-block response for ``request_data``.

    The response echoes the request's model, gets a fresh ``msg_<uuid4>`` id,
    stops with ``end_turn`` and reports the given token counts as usage.
    """
    usage = Usage(input_tokens=input_tokens, output_tokens=output_tokens)
    text_block = {"type": "text", "text": text}
    return MessagesResponse(
        id=f"msg_{uuid.uuid4()}",
        model=request_data.model,
        content=[text_block],
        stop_reason="end_turn",
        usage=usage,
    )
def try_prefix_detection(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Answer command-prefix detection requests locally, without an API call.

    Returns ``None`` when the optimization is disabled or the request is not
    a prefix detection request.
    """
    if not settings.fast_prefix_detection:
        return None
    matched, command = is_prefix_detection_request(request_data)
    if matched:
        logger.info("Optimization: Fast prefix detection request")
        prefix = extract_command_prefix(command)
        return _text_response(
            request_data, prefix, input_tokens=100, output_tokens=5
        )
    return None
def try_quota_mock(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Return a canned reply for quota probe requests.

    Returns ``None`` when the optimization is disabled or the request is not
    a quota probe.
    """
    # Short-circuit keeps the detector from running when mocking is disabled.
    if not (
        settings.enable_network_probe_mock and is_quota_check_request(request_data)
    ):
        return None
    logger.info("Optimization: Intercepted and mocked quota probe")
    return _text_response(
        request_data, "Quota check passed.", input_tokens=10, output_tokens=5
    )
def try_title_skip(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Return a fixed title for title generation requests.

    Returns ``None`` when the optimization is disabled or the request is not
    a title generation request.
    """
    # Short-circuit keeps the detector from running when skipping is disabled.
    if not (
        settings.enable_title_generation_skip
        and is_title_generation_request(request_data)
    ):
        return None
    logger.info("Optimization: Skipped title generation request")
    return _text_response(
        request_data, "Conversation", input_tokens=100, output_tokens=5
    )
def try_suggestion_skip(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Return an empty reply for suggestion mode requests.

    Returns ``None`` when the optimization is disabled or the request is not
    a suggestion mode request.
    """
    # Short-circuit keeps the detector from running when skipping is disabled.
    if not (
        settings.enable_suggestion_mode_skip
        and is_suggestion_mode_request(request_data)
    ):
        return None
    logger.info("Optimization: Skipped suggestion mode request")
    return _text_response(request_data, "", input_tokens=100, output_tokens=1)
def try_filepath_mock(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Answer filepath extraction requests locally.

    Returns ``None`` when the optimization is disabled or the request is not
    a filepath extraction request.
    """
    if not settings.enable_filepath_extraction_mock:
        return None
    matched, cmd, output = is_filepath_extraction_request(request_data)
    if matched:
        extracted = extract_filepaths_from_command(cmd, output)
        logger.info("Optimization: Mocked filepath extraction")
        return _text_response(
            request_data, extracted, input_tokens=100, output_tokens=10
        )
    return None
+# Cheapest/most common optimizations first for faster short-circuit.
+OPTIMIZATION_HANDLERS = [
+    try_quota_mock,
+    try_prefix_detection,
+    try_title_skip,
+    try_suggestion_skip,
+    try_filepath_mock,
+]
def try_optimizations(
    request_data: MessagesRequest, settings: Settings
) -> MessagesResponse | None:
    """Run the handlers in ``OPTIMIZATION_HANDLERS`` order.

    Returns the first non-``None`` handler result; later handlers are not
    called once one matches. Returns ``None`` when nothing matches.
    """
    for optimize in OPTIMIZATION_HANDLERS:
        if (response := optimize(request_data, settings)) is not None:
            return response
    return None

api/routes.py ADDED Viewed

	@@ -0,0 +1,271 @@

+"""FastAPI route handlers."""
+from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from loguru import logger
+from config.settings import Settings
+from core.anthropic import get_token_count
+from providers.registry import ProviderRegistry
+from . import dependencies
+from .dependencies import get_settings, require_api_key
+from .gateway_model_ids import gateway_model_id, no_thinking_gateway_model_id
+from .models.anthropic import MessagesRequest, TokenCountRequest
+from .models.responses import ModelResponse, ModelsListResponse
+from .services import ClaudeProxyService
+from providers.nvidia_nim import metrics as nvidia_nim_metrics
router = APIRouter()
# Placeholder creation timestamp (Unix epoch) reported for advertised models.
DISCOVERED_MODEL_CREATED_AT = "1970-01-01T00:00:00Z"
# The proxy advertises a curated, hardcoded set of provider-backed models
# (NVIDIA NIM, Groq and Cerebras refs) in place of a Claude model list, so
# clients only see these options. Each entry is a ``provider/model`` ref.
REQUESTED_PROVIDER_MODELS = [
    "nvidia_nim/qwen/qwen3-coder-480b-a35b-instruct",
    "nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
    "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct",
    "nvidia_nim/z-ai/glm4.7",
    "nvidia_nim/stepfun-ai/step-3.5-flash",
    "nvidia_nim/bytedance/seed-oss-36b-instruct",
    "nvidia_nim/mistralai/mistral-nemotron",
    "groq/openai/gpt-oss-120b",
    "groq/openai/gpt-oss-20b",
    "groq/llama-3.3-70b-versatile",
    "groq/meta-llama/llama-4-scout-17b-16e-instruct",
    "groq/qwen/qwen3-32b",
    "cerebras/gpt-oss-120b",
    "cerebras/qwen-3-235b-a22b-instruct-2507",
    "cerebras/zai-glm-4.7",
    "cerebras/llama3.1-8b",
]
def get_proxy_service(
    request: Request,
    settings: Settings = Depends(get_settings),
) -> ClaudeProxyService:
    """Build the per-request proxy service used by the route handlers.

    Providers are looked up lazily through ``dependencies.resolve_provider``
    against the current application and settings.
    """

    def _provider_for(provider_type):
        return dependencies.resolve_provider(
            provider_type, app=request.app, settings=settings
        )

    return ClaudeProxyService(
        settings,
        provider_getter=_provider_for,
        token_counter=get_token_count,
    )
def _probe_response(allow: str) -> Response:
    """Return an empty 204 response advertising ``allow`` in the Allow header."""
    allow_header = {"Allow": allow}
    return Response(headers=allow_header, status_code=204)
def _discovered_model_response(model_id: str, *, display_name: str) -> ModelResponse:
    """Wrap an id and display name in a ModelResponse with the fixed timestamp."""
    fields = {
        "id": model_id,
        "display_name": display_name,
        "created_at": DISCOVERED_MODEL_CREATED_AT,
    }
    return ModelResponse(**fields)
+def _append_unique_model(
+    models: list[ModelResponse], seen: set[str], model: ModelResponse
+) -> None:
+    if model.id in seen:
+        return
+    seen.add(model.id)
+    models.append(model)
def _append_provider_model_variants(
    models: list[ModelResponse],
    seen: set[str],
    provider_model_ref: str,
    *,
    supports_thinking: bool | None = None,
) -> None:
    """Append the advertised variants of one provider/model ref.

    The normal variant is added unless ``supports_thinking`` is explicitly
    ``False``; the "(no thinking)" variant is always added. Ids already in
    ``seen`` are skipped.
    """
    variants: list[tuple[str, str]] = []
    if supports_thinking is not False:
        variants.append((gateway_model_id(provider_model_ref), provider_model_ref))
    variants.append(
        (
            no_thinking_gateway_model_id(provider_model_ref),
            f"{provider_model_ref} (no thinking)",
        )
    )
    for model_id, label in variants:
        _append_unique_model(
            models, seen, _discovered_model_response(model_id, display_name=label)
        )
def _build_models_list_response(
    settings: Settings, provider_registry: ProviderRegistry | None
) -> ModelsListResponse:
    """Build the models listing advertised to Claude-compatible clients.

    Only the refs in ``REQUESTED_PROVIDER_MODELS`` are advertised (no Claude
    models, no registry auto-discovery), each with its thinking and
    no-thinking variants, followed by a virtual ``auto`` entry.

    Args:
        settings: Application settings (currently unused by the listing).
        provider_registry: Optional registry consulted for cached thinking
            support; when ``None`` thinking support is treated as unknown.

    Returns:
        A single-page listing (``has_more`` is always ``False``).
    """
    models: list[ModelResponse] = []
    seen: set[str] = set()

    for provider_ref in REQUESTED_PROVIDER_MODELS:
        # If the ref already contains a provider prefix, use it as-is;
        # otherwise assume it belongs to the NVIDIA NIM provider.
        ref = provider_ref if "/" in provider_ref else f"nvidia_nim/{provider_ref}"
        supports_thinking = None
        if provider_registry is not None:
            # ``ref`` always contains "/" at this point.
            provider, model_id = ref.split("/", 1)
            supports_thinking = provider_registry.cached_model_supports_thinking(
                provider, model_id
            )
        _append_provider_model_variants(
            models, seen, ref, supports_thinking=supports_thinking
        )

    # Virtual `auto` model that maps to the configured MODEL and enables
    # automatic fallback behavior when used by clients.
    auto_model = ModelResponse(
        id=gateway_model_id("auto"),
        display_name="auto (use configured fallbacks)",
        created_at=DISCOVERED_MODEL_CREATED_AT,
    )
    _append_unique_model(models, seen, auto_model)

    # Drop Claude-branded entries by *display name* only. Advertised ids carry
    # the gateway prefix (the no-thinking prefix literally contains "claude"),
    # so matching on the id would strip the very provider models this listing
    # exists to expose.
    filtered = [m for m in models if "claude" not in (m.display_name or "").lower()]

    # Ensure `auto` model remains available even if filtering removed others.
    if not any(m.id == auto_model.id for m in filtered):
        filtered.append(auto_model)

    return ModelsListResponse(
        data=filtered,
        first_id=filtered[0].id if filtered else None,
        has_more=False,
        last_id=filtered[-1].id if filtered else None,
    )
+# =============================================================================
+# Routes
+# =============================================================================
@router.post("/v1/messages")
async def create_message(
    request_data: MessagesRequest,
    service: ClaudeProxyService = Depends(get_proxy_service),
    _auth=Depends(require_api_key),
):
    """Create a message (always streaming)."""
    # All request handling is delegated to the per-request proxy service.
    return service.create_message(request_data)
@router.api_route("/v1/messages", methods=["HEAD", "OPTIONS"])
async def probe_messages(_auth=Depends(require_api_key)):
    """Respond to Claude compatibility probes for the messages endpoint."""
    # Empty 204 whose Allow header lists the methods this path accepts.
    return _probe_response("POST, HEAD, OPTIONS")
@router.post("/v1/messages/count_tokens")
async def count_tokens(
    request_data: TokenCountRequest,
    service: ClaudeProxyService = Depends(get_proxy_service),
    _auth=Depends(require_api_key),
):
    """Count tokens for a request."""
    # Counting is delegated to the per-request proxy service.
    return service.count_tokens(request_data)
@router.api_route("/v1/messages/count_tokens", methods=["HEAD", "OPTIONS"])
async def probe_count_tokens(_auth=Depends(require_api_key)):
    """Respond to Claude compatibility probes for the token count endpoint."""
    # Empty 204 whose Allow header lists the methods this path accepts.
    return _probe_response("POST, HEAD, OPTIONS")
@router.get("/")
async def root(
    settings: Settings = Depends(get_settings), _auth=Depends(require_api_key)
):
    """Root endpoint."""
    # Reports the configured provider type and default model.
    return {
        "status": "ok",
        "provider": settings.provider_type,
        "model": settings.model,
    }
@router.api_route("/", methods=["HEAD", "OPTIONS"])
async def probe_root(_auth=Depends(require_api_key)):
    """Respond to compatibility probes for the root endpoint."""
    # Empty 204 whose Allow header lists the methods this path accepts.
    return _probe_response("GET, HEAD, OPTIONS")
@router.get("/health")
async def health():
    """Health check endpoint."""
    # Unlike most routes here, this one declares no API-key dependency.
    return {"status": "healthy"}
@router.api_route("/health", methods=["HEAD", "OPTIONS"])
async def probe_health():
    """Respond to compatibility probes for the health endpoint."""
    # Declares no API-key dependency, matching the GET health route.
    return _probe_response("GET, HEAD, OPTIONS")
@router.get("/v1/models", response_model=ModelsListResponse)
async def list_models(
    request: Request,
    settings: Settings = Depends(get_settings),
    _auth=Depends(require_api_key),
):
    """List the model ids this proxy advertises to Claude-compatible clients."""
    # Use the app-level registry only when it really is a ProviderRegistry;
    # anything else (including a missing attribute) is treated as absent.
    provider_registry = getattr(request.app.state, "provider_registry", None)
    if not isinstance(provider_registry, ProviderRegistry):
        provider_registry = None
    return _build_models_list_response(settings, provider_registry)
@router.post("/stop")
async def stop_cli(request: Request, _auth=Depends(require_api_key)):
    """Stop all CLI sessions and pending tasks.

    Prefers the message handler on the app state; falls back to the CLI
    manager when no handler is set. Raises a 503 when neither is available.
    """
    app_state = request.app.state
    handler = getattr(app_state, "message_handler", None)
    if handler:
        count = await handler.stop_all_tasks()
        logger.info("STOP_CLI: source=handler cancelled_count={}", count)
        return {"status": "stopped", "cancelled_count": count}

    # Fallback if messaging not initialized
    cli_manager = getattr(app_state, "cli_manager", None)
    if not cli_manager:
        raise HTTPException(status_code=503, detail="Messaging system not initialized")
    await cli_manager.stop_all()
    logger.info("STOP_CLI: source=cli_manager cancelled_count=N/A")
    return {"status": "stopped", "source": "cli_manager"}
@router.get("/admin/fallbacks")
async def admin_fallbacks(_auth=Depends(require_api_key)):
    """Admin endpoint exposing NVIDIA NIM fallback metrics.

    Protected by the same API key as other endpoints.

    Raises:
        HTTPException: 500 when the metrics snapshot cannot be read.
    """
    try:
        data = nvidia_nim_metrics.snapshot()
    except Exception as e:
        logger.warning("ADMIN_FALLBACKS: failed to read metrics: {}", e)
        # Chain the original error so tracebacks show the real cause instead
        # of "During handling of the above exception, another exception occurred".
        raise HTTPException(status_code=500, detail="failed to read metrics") from e
    return {"provider": "nvidia_nim", "fallbacks": data}

api/runtime.py ADDED Viewed

	@@ -0,0 +1,338 @@

+"""Application runtime composition and lifecycle ownership."""
+from __future__ import annotations
+import asyncio
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+from fastapi import FastAPI
+from loguru import logger
+from config.settings import Settings, get_settings
+from providers.exceptions import ServiceUnavailableError
+from providers.registry import ProviderRegistry
+if TYPE_CHECKING:
+    from cli.manager import CLISessionManager
+    from messaging.handler import ClaudeMessageHandler
+    from messaging.platforms.base import MessagingPlatform
+    from messaging.session import SessionStore
+_SHUTDOWN_TIMEOUT_S = 5.0
+async def best_effort(
+    name: str,
+    awaitable: Any,
+    timeout_s: float = _SHUTDOWN_TIMEOUT_S,
+    *,
+    log_verbose_errors: bool = False,
+) -> None:
+    """Run a shutdown step with timeout; never raise to callers."""
+    try:
+        await asyncio.wait_for(awaitable, timeout=timeout_s)
+    except TimeoutError:
+        logger.warning("Shutdown step timed out: {} ({}s)", name, timeout_s)
+    except Exception as e:
+        if log_verbose_errors:
+            logger.warning(
+                "Shutdown step failed: {}: {}: {}",
+                name,
+                type(e).__name__,
+                e,
+            )
+        else:
+            logger.warning(
+                "Shutdown step failed: {}: exc_type={}",
+                name,
+                type(e).__name__,
+            )
+def warn_if_process_auth_token(settings: Settings) -> None:
+    """Warn when server auth was implicitly inherited from the shell."""
+    if settings.uses_process_anthropic_auth_token():
+        logger.warning(
+            "ANTHROPIC_AUTH_TOKEN is set in the process environment but not in "
+            "a configured .env file. The proxy will require that token. Add "
+            "ANTHROPIC_AUTH_TOKEN= to .env to disable proxy auth, or set the "
+            "same token in .env to make server auth explicit."
+        )
+def log_startup_failure(settings: Settings, exc: Exception) -> None:
+    """Log startup failures without traceback noise unless verbose diagnostics are enabled."""
+    message = startup_failure_message(settings, exc)
+    logger.error("Startup failed:\n{}", message)
+def startup_failure_message(settings: Settings, exc: Exception) -> str:
+    """Return a concise startup failure message for logs and ASGI lifespan failure."""
+    if isinstance(exc, ServiceUnavailableError):
+        return exc.message.strip() or "Server startup failed."
+    if settings.log_api_error_tracebacks:
+        return f"{type(exc).__name__}: {exc}"
+    return f"Server startup failed: exc_type={type(exc).__name__}"
+def _should_continue_after_model_validation_failure(exc: Exception) -> bool:
+    """Return whether a model-validation failure should be downgraded to a warning.
+    Provider discovery can fail transiently or due to local environment issues
+    (for example, a missing runtime dependency in the provider's process path).
+    We keep startup alive in those cases so the configured proxy can still serve
+    requests and advertise the models that are already known from settings.
+    """
+    if not isinstance(exc, ServiceUnavailableError):
+        return False
+    message = (exc.message or str(exc)).lower()
+    return "problem=query failure:" in message
+@dataclass(slots=True)
+class AppRuntime:
+    """Own optional messaging, CLI, session, and provider runtime resources."""
+    app: FastAPI
+    settings: Settings
+    _provider_registry: ProviderRegistry | None = field(default=None, init=False)
+    messaging_platform: MessagingPlatform | None = None
+    message_handler: ClaudeMessageHandler | None = None
+    cli_manager: CLISessionManager | None = None
+    @classmethod
+    def for_app(
+        cls,
+        app: FastAPI,
+        settings: Settings | None = None,
+    ) -> AppRuntime:
+        return cls(app=app, settings=settings or get_settings())
+    async def startup(self) -> None:
+        """Create the provider registry, validate models, and start optional messaging.
+        The registry is published on ``app.state`` before anything else runs.
+        Model validation is bounded by a timeout and is non-fatal: any exception
+        it raises (including the timeout) is logged and startup continues.
+        Any other failure is logged, the registry is cleaned up best-effort,
+        and the exception is re-raised to the caller.
+        """
+        logger.info("Starting Claude Code Proxy...")
+        self._provider_registry = ProviderRegistry()
+        self.app.state.provider_registry = self._provider_registry
+        try:
+            warn_if_process_auth_token(self.settings)
+            try:
+                # Cap startup validation at 15s so a slow provider cannot block health checks.
+                await asyncio.wait_for(
+                    self._provider_registry.validate_configured_models(self.settings),
+                    timeout=15.0,
+                )
+            except Exception as exc:
+                # NOTE(review): every validation failure is downgraded here, not only
+                # the ones _should_continue_after_model_validation_failure accepts.
+                logger.warning(
+                    "Startup model validation skipped or timed out: continuing in lazy mode. "
+                    "Reason: {}",
+                    str(exc) or type(exc).__name__,
+                )
+            self._provider_registry.start_model_list_refresh(self.settings)
+            await self._start_messaging_if_configured()
+            self._publish_state()
+        except Exception as exc:
+            log_startup_failure(self.settings, exc)
+            # Release whatever the registry already acquired before propagating the failure.
+            await best_effort(
+                "provider_registry.cleanup",
+                self._provider_registry.cleanup(),
+                log_verbose_errors=self.settings.log_api_error_tracebacks,
+            )
+            raise
+    async def shutdown(self) -> None:
+        """Flush session state, then stop every owned resource best-effort.
+        Order: session store flush, messaging platform, CLI manager, provider
+        registry, rate limiter. Each step is skipped when the resource is absent,
+        and step failures are logged rather than raised.
+        """
+        verbose = self.settings.log_api_error_tracebacks
+        if self.message_handler is not None:
+            try:
+                # Synchronous flush performed before the async stop steps below.
+                self.message_handler.session_store.flush_pending_save()
+            except Exception as e:
+                # Exception text is only logged when verbose diagnostics are enabled.
+                if verbose:
+                    logger.warning("Session store flush on shutdown: {}", e)
+                else:
+                    logger.warning(
+                        "Session store flush on shutdown: exc_type={}",
+                        type(e).__name__,
+                    )
+        logger.info("Shutdown requested, cleaning up...")
+        if self.messaging_platform:
+            await best_effort(
+                "messaging_platform.stop",
+                self.messaging_platform.stop(),
+                log_verbose_errors=verbose,
+            )
+        if self.cli_manager:
+            await best_effort(
+                "cli_manager.stop_all",
+                self.cli_manager.stop_all(),
+                log_verbose_errors=verbose,
+            )
+        if self._provider_registry is not None:
+            await best_effort(
+                "provider_registry.cleanup",
+                self._provider_registry.cleanup(),
+                log_verbose_errors=verbose,
+            )
+        await self._shutdown_limiter()
+        logger.info("Server shut down cleanly")
+    async def _start_messaging_if_configured(self) -> None:
+        """Create the configured messaging platform and start its handler.
+        When the factory returns no platform, nothing is started. Import errors
+        and any other failure are logged and swallowed, so a messaging problem
+        never propagates out of this method.
+        """
+        try:
+            # Imported lazily so a missing messaging package surfaces as the ImportError handled below.
+            from messaging.platforms.factory import (
+                MessagingPlatformOptions,
+                create_messaging_platform,
+            )
+            self.messaging_platform = create_messaging_platform(
+                self.settings.messaging_platform,
+                MessagingPlatformOptions(
+                    telegram_bot_token=self.settings.telegram_bot_token,
+                    allowed_telegram_user_id=self.settings.allowed_telegram_user_id,
+                    discord_bot_token=self.settings.discord_bot_token,
+                    allowed_discord_channels=self.settings.allowed_discord_channels,
+                    voice_note_enabled=self.settings.voice_note_enabled,
+                    whisper_model=self.settings.whisper_model,
+                    whisper_device=self.settings.whisper_device,
+                    hf_token=self.settings.hf_token,
+                    # NOTE(review): the generic NIM key option is fed from the
+                    # qwen-specific setting; confirm this is intentional.
+                    nvidia_nim_api_key=self.settings.nvidia_nim_api_key_qwen,
+                    messaging_rate_limit=self.settings.messaging_rate_limit,
+                    messaging_rate_window=self.settings.messaging_rate_window,
+                    log_raw_messaging_content=self.settings.log_raw_messaging_content,
+                    log_api_error_tracebacks=self.settings.log_api_error_tracebacks,
+                ),
+            )
+            if self.messaging_platform:
+                await self._start_message_handler()
+        except ImportError as e:
+            if self.settings.log_api_error_tracebacks:
+                logger.warning("Messaging module import error: {}", e)
+            else:
+                logger.warning(
+                    "Messaging module import error: exc_type={}",
+                    type(e).__name__,
+                )
+        except Exception as e:
+            # Exception text and traceback are only logged when verbose diagnostics are enabled.
+            if self.settings.log_api_error_tracebacks:
+                logger.error("Failed to start messaging platform: {}", e)
+                import traceback
+                logger.error(traceback.format_exc())
+            else:
+                logger.error(
+                    "Failed to start messaging platform: exc_type={}",
+                    type(e).__name__,
+                )
+    async def _start_message_handler(self) -> None:
+        """Build the CLI manager, session store, and message handler, then start the platform.
+        Expects ``self.messaging_platform`` to be set already (asserted below).
+        Creates the workspace and data directories if they do not exist.
+        """
+        from cli.manager import CLISessionManager
+        from messaging.handler import ClaudeMessageHandler
+        from messaging.session import SessionStore
+        # Workspace is the configured allowed_dir (made absolute), else the current directory.
+        workspace = (
+            os.path.abspath(self.settings.allowed_dir)
+            if self.settings.allowed_dir
+            else os.getcwd()
+        )
+        os.makedirs(workspace, exist_ok=True)
+        data_path = os.path.abspath(self.settings.claude_workspace)
+        os.makedirs(data_path, exist_ok=True)
+        # URL built from this server's own host/port settings and handed to the CLI manager.
+        api_url = f"http://{self.settings.host}:{self.settings.port}/v1"
+        # Only restrict directories when allowed_dir was explicitly configured.
+        allowed_dirs = [workspace] if self.settings.allowed_dir else []
+        plans_dir_abs = os.path.abspath(
+            os.path.join(self.settings.claude_workspace, "plans")
+        )
+        # The plans directory is passed on relative to the workspace.
+        plans_directory = os.path.relpath(plans_dir_abs, workspace)
+        self.cli_manager = CLISessionManager(
+            workspace_path=workspace,
+            api_url=api_url,
+            allowed_dirs=allowed_dirs,
+            plans_directory=plans_directory,
+            claude_bin=self.settings.claude_cli_bin,
+            log_raw_cli_diagnostics=self.settings.log_raw_cli_diagnostics,
+            log_messaging_error_details=self.settings.log_messaging_error_details,
+        )
+        session_store = SessionStore(
+            storage_path=os.path.join(data_path, "sessions.json"),
+            message_log_cap=self.settings.max_message_log_entries_per_chat,
+        )
+        platform = self.messaging_platform
+        # Narrows the optional type; the caller only invokes this when a platform exists.
+        assert platform is not None
+        self.message_handler = ClaudeMessageHandler(
+            platform=platform,
+            cli_manager=self.cli_manager,
+            session_store=session_store,
+            debug_platform_edits=self.settings.debug_platform_edits,
+            debug_subagent_stack=self.settings.debug_subagent_stack,
+            log_raw_messaging_content=self.settings.log_raw_messaging_content,
+            log_raw_cli_diagnostics=self.settings.log_raw_cli_diagnostics,
+            log_messaging_error_details=self.settings.log_messaging_error_details,
+        )
+        # Saved trees are restored before the message callback is registered and the platform starts.
+        self._restore_tree_state(session_store)
+        platform.on_message(self.message_handler.handle_message)
+        await platform.start()
+        logger.info(f"{platform.name} platform started with message handler")
+    def _restore_tree_state(self, session_store: SessionStore) -> None:
+        """Rebuild the handler's tree queue from trees saved in ``session_store``.
+        Does nothing when there are no saved trees or no message handler.
+        """
+        saved_trees = session_store.get_all_trees()
+        if not saved_trees:
+            return
+        if self.message_handler is None:
+            return
+        logger.info(f"Restoring {len(saved_trees)} conversation trees...")
+        from messaging.trees.queue_manager import TreeQueueManager
+        self.message_handler.replace_tree_queue(
+            TreeQueueManager.from_dict(
+                {
+                    "trees": saved_trees,
+                    "node_to_tree": session_store.get_node_mapping(),
+                },
+                queue_update_callback=self.message_handler.update_queue_positions,
+                node_started_callback=self.message_handler.mark_node_processing,
+            )
+        )
+        # When the cleanup reports a positive count (presumably the number of stale
+        # nodes removed; confirm in TreeQueueManager), write the tree data back to the store.
+        if self.message_handler.tree_queue.cleanup_stale_nodes() > 0:
+            tree_data = self.message_handler.tree_queue.to_dict()
+            session_store.sync_from_tree_data(
+                tree_data["trees"], tree_data["node_to_tree"]
+            )
+    def _publish_state(self) -> None:
+        self.app.state.messaging_platform = self.messaging_platform
+        self.app.state.message_handler = self.message_handler
+        self.app.state.cli_manager = self.cli_manager
+    async def _shutdown_limiter(self) -> None:
+        """Shut down the messaging rate limiter, best-effort with a 2s timeout.
+        If the limiter module cannot be imported, the step is skipped with a
+        debug-level log entry.
+        """
+        verbose = self.settings.log_api_error_tracebacks
+        try:
+            from messaging.limiter import MessagingRateLimiter
+        except Exception as e:
+            # Any import-time failure (not only ImportError) skips the step.
+            if verbose:
+                logger.debug(
+                    "Rate limiter shutdown skipped (import failed): {}: {}",
+                    type(e).__name__,
+                    e,
+                )
+            else:
+                logger.debug(
+                    "Rate limiter shutdown skipped (import failed): exc_type={}",
+                    type(e).__name__,
+                )
+            return
+        await best_effort(
+            "MessagingRateLimiter.shutdown_instance",
+            MessagingRateLimiter.shutdown_instance(),
+            timeout_s=2.0,
+            log_verbose_errors=verbose,
+        )

api/services.py ADDED Viewed

	@@ -0,0 +1,305 @@

+"""Application services for the Claude-compatible API."""
+from __future__ import annotations
+import traceback
+import uuid
+from collections.abc import AsyncIterator, Callable
+from typing import Any
+from fastapi import HTTPException
+from fastapi.responses import StreamingResponse
+from loguru import logger
+from config.settings import Settings
+from core.anthropic import get_token_count, get_user_facing_error_message
+from core.anthropic.sse import ANTHROPIC_SSE_RESPONSE_HEADERS, format_sse_event
+from providers.base import BaseProvider
+from providers.exceptions import (
+    InvalidRequestError,
+    OverloadedError,
+    ProviderError,
+    RateLimitError,
+)
+from .model_router import ModelRouter
+from .models.anthropic import MessagesRequest, TokenCountRequest
+from .models.responses import TokenCountResponse
+from .optimization_handlers import try_optimizations
+from .web_tools.egress import WebFetchEgressPolicy
+from .web_tools.request import (
+    is_web_server_tool_request,
+    openai_chat_upstream_server_tool_error,
+)
+TokenCounter = Callable[[list[Any], str | list[Any] | None, list[Any] | None], int]
+ProviderGetter = Callable[[str], BaseProvider]
+# Providers that use ``/chat/completions`` + Anthropic-to-OpenAI conversion (not native Messages).
+_OPENAI_CHAT_UPSTREAM_IDS = frozenset({"nvidia_nim", "groq", "cerebras"})
+def anthropic_sse_streaming_response(
+    body: AsyncIterator[str],
+) -> StreamingResponse:
+    """Return a :class:`StreamingResponse` for Anthropic-style SSE streams."""
+    return StreamingResponse(
+        body,
+        media_type="text/event-stream",
+        headers=ANTHROPIC_SSE_RESPONSE_HEADERS,
+    )
+def _http_status_for_unexpected_service_exception(_exc: BaseException) -> int:
+    """HTTP status for uncaught non-provider failures (stable client contract)."""
+    return 500
+def _log_unexpected_service_exception(
+    settings: Settings,
+    exc: BaseException,
+    *,
+    context: str,
+    request_id: str | None = None,
+) -> None:
+    """Log service-layer failures without echoing exception text unless opted in."""
+    if settings.log_api_error_tracebacks:
+        if request_id is not None:
+            logger.error("{} request_id={}: {}", context, request_id, exc)
+        else:
+            logger.error("{}: {}", context, exc)
+        logger.error(traceback.format_exc())
+        return
+    if request_id is not None:
+        logger.error(
+            "{} request_id={} exc_type={}",
+            context,
+            request_id,
+            type(exc).__name__,
+        )
+    else:
+        logger.error("{} exc_type={}", context, type(exc).__name__)
+def _require_non_empty_messages(messages: list[Any]) -> None:
+    if not messages:
+        raise InvalidRequestError("messages cannot be empty")
+class ClaudeProxyService:
+    """Coordinate request optimization, model routing, token count, and providers."""
+    def __init__(
+        self,
+        settings: Settings,
+        provider_getter: ProviderGetter,
+        model_router: ModelRouter | None = None,
+        token_counter: TokenCounter = get_token_count,
+    ):
+        self._settings = settings
+        self._provider_getter = provider_getter
+        self._model_router = model_router or ModelRouter(settings)
+        self._token_counter = token_counter
+    def create_message(self, request_data: MessagesRequest) -> object:
+        """Create a message response or streaming response with optional failover."""
+        from .web_tools.streaming import stream_web_server_tool_response
+        try:
+            _require_non_empty_messages(request_data.messages)
+            candidates = self._model_router.resolve_candidates(request_data.model)
+            if not candidates:
+                raise InvalidRequestError(f"No configured models available for '{request_data.model}'")
+            # For 'auto' requests with multiple candidates, we wrap the stream in a failover loop.
+            if len(candidates) > 1:
+                return anthropic_sse_streaming_response(
+                    self._stream_with_fallbacks(candidates, request_data)
+                )
+            # Standard path for single-model requests
+            return self._create_single_message(candidates[0], request_data)
+        except ProviderError:
+            raise
+        except Exception as e:
+            _log_unexpected_service_exception(
+                self._settings, e, context="CREATE_MESSAGE_ERROR"
+            )
+            raise HTTPException(
+                status_code=_http_status_for_unexpected_service_exception(e),
+                detail=get_user_facing_error_message(e),
+            ) from e
+    def _create_single_message(
+        self, resolved: ResolvedModel, request_data: MessagesRequest
+    ) -> object:
+        """Create a single message response from a resolved model."""
+        routed_request = request_data.model_copy(deep=True)
+        routed_request.model = resolved.provider_model
+        if resolved.provider_id in _OPENAI_CHAT_UPSTREAM_IDS:
+            tool_err = openai_chat_upstream_server_tool_error(
+                routed_request,
+                web_tools_enabled=self._settings.enable_web_server_tools,
+            )
+            if tool_err is not None:
+                raise InvalidRequestError(tool_err)
+        if self._settings.enable_web_server_tools and is_web_server_tool_request(
+            routed_request
+        ):
+            input_tokens = self._token_counter(
+                routed_request.messages, routed_request.system, routed_request.tools
+            )
+            logger.info("Optimization: Handling Anthropic web server tool")
+            egress = WebFetchEgressPolicy(
+                allow_private_network_targets=self._settings.web_fetch_allow_private_networks,
+                allowed_schemes=self._settings.web_fetch_allowed_scheme_set(),
+            )
+            return anthropic_sse_streaming_response(
+                stream_web_server_tool_response(
+                    routed_request,
+                    input_tokens=input_tokens,
+                    web_fetch_egress=egress,
+                    verbose_client_errors=self._settings.log_api_error_tracebacks,
+                ),
+            )
+        optimized = try_optimizations(routed_request, self._settings)
+        if optimized is not None:
+            return optimized
+        provider = self._provider_getter(resolved.provider_id)
+        provider.preflight_stream(
+            routed_request,
+            thinking_enabled=resolved.thinking_enabled,
+        )
+        request_id = f"req_{uuid.uuid4().hex[:12]}"
+        logger.info(
+            "API_REQUEST: request_id={} model={} messages={}",
+            request_id,
+            routed_request.model,
+            len(routed_request.messages),
+        )
+        input_tokens = self._token_counter(
+            routed_request.messages, routed_request.system, routed_request.tools
+        )
+        return anthropic_sse_streaming_response(
+            provider.stream_response(
+                routed_request,
+                input_tokens=input_tokens,
+                request_id=request_id,
+                thinking_enabled=resolved.thinking_enabled,
+            ),
+        )
+    async def _stream_with_fallbacks(
+        self, candidates: list[ResolvedModel], request_data: MessagesRequest
+    ) -> AsyncIterator[str]:
+        """Iterate through candidates until one succeeds or all fail."""
+        last_exc: Exception | None = None
+        for i, resolved in enumerate(candidates):
+            try:
+                provider = self._provider_getter(resolved.provider_id)
+                routed_request = request_data.model_copy(deep=True)
+                routed_request.model = resolved.provider_model
+                provider.preflight_stream(
+                    routed_request,
+                    thinking_enabled=resolved.thinking_enabled,
+                )
+                request_id = f"req_{uuid.uuid4().hex[:12]}"
+                logger.info(
+                    "API_REQUEST (auto fallback {}/{}): request_id={} provider={} model={}",
+                    i + 1,
+                    len(candidates),
+                    request_id,
+                    resolved.provider_id,
+                    resolved.provider_model,
+                )
+                input_tokens = self._token_counter(
+                    routed_request.messages, routed_request.system, routed_request.tools
+                )
+                # Attempt to stream from this provider.
+                async for event in provider.stream_response(
+                    routed_request,
+                    input_tokens=input_tokens,
+                    request_id=request_id,
+                    thinking_enabled=resolved.thinking_enabled,
+                ):
+                    yield event
+                    # CRITICAL: If we have yielded even one event, we have committed to this provider.
+                    # We must not fallback to another candidate mid-stream.
+                return  # Success, exit the fallback loop.
+            except (RateLimitError, OverloadedError) as e:
+                logger.warning(
+                    "Provider '{}' is rate limited or overloaded ({}). Trying next candidate...",
+                    resolved.provider_id,
+                    e.status_code,
+                )
+                last_exc = e
+                continue
+            except Exception as e:
+                logger.error(
+                    "Provider '{}' failed with unexpected error: {}. Trying next candidate...",
+                    resolved.provider_id,
+                    e,
+                )
+                last_exc = e
+                continue
+        err_msg = str(last_exc) if last_exc else "No candidates succeeded"
+        yield format_sse_event(
+            "error",
+            {
+                "type": "error",
+                "error": {
+                    "type": "api_error",
+                    "message": f"All fallback candidates failed: {err_msg}",
+                },
+            },
+        )
+        if last_exc:
+            raise last_exc
+        raise InvalidRequestError("No candidates succeeded")
+    def count_tokens(self, request_data: TokenCountRequest) -> TokenCountResponse:
+        """Count tokens for a request after applying configured model routing.
+        Raises:
+            ProviderError: propagated unchanged.
+            HTTPException: for any other failure, with a user-facing detail message.
+        """
+        request_id = f"req_{uuid.uuid4().hex[:12]}"
+        # Bind the request id to the logging context for everything logged in this call.
+        with logger.contextualize(request_id=request_id):
+            try:
+                _require_non_empty_messages(request_data.messages)
+                routed = self._model_router.resolve_token_count_request(request_data)
+                tokens = self._token_counter(
+                    routed.request.messages, routed.request.system, routed.request.tools
+                )
+                logger.info(
+                    "COUNT_TOKENS: request_id={} model={} messages={} input_tokens={}",
+                    request_id,
+                    routed.request.model,
+                    len(routed.request.messages),
+                    tokens,
+                )
+                return TokenCountResponse(input_tokens=tokens)
+            except ProviderError:
+                raise
+            except Exception as e:
+                _log_unexpected_service_exception(
+                    self._settings,
+                    e,
+                    context="COUNT_TOKENS_ERROR",
+                    request_id=request_id,
+                )
+                raise HTTPException(
+                    status_code=_http_status_for_unexpected_service_exception(e),
+                    detail=get_user_facing_error_message(e),
+                ) from e

api/validation_log.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""Safe metadata summaries for HTTP 422 validation logging (no raw text content)."""
+from __future__ import annotations
+from typing import Any
+def summarize_request_validation_body(
+    body: Any,
+) -> tuple[list[dict[str, Any]], list[str]]:
+    """Return message shape summary and tool name list for debug logs."""
+    messages = body.get("messages") if isinstance(body, dict) else None
+    message_summary: list[dict[str, Any]] = []
+    if isinstance(messages, list):
+        for msg in messages:
+            if not isinstance(msg, dict):
+                message_summary.append({"message_kind": type(msg).__name__})
+                continue
+            content = msg.get("content")
+            item: dict[str, Any] = {
+                "role": msg.get("role"),
+                "content_kind": type(content).__name__,
+            }
+            if isinstance(content, list):
+                item["block_types"] = [
+                    block.get("type", "dict")
+                    if isinstance(block, dict)
+                    else type(block).__name__
+                    for block in content[:12]
+                ]
+                item["block_keys"] = [
+                    sorted(str(key) for key in block)[:12]
+                    for block in content[:5]
+                    if isinstance(block, dict)
+                ]
+            elif isinstance(content, str):
+                item["content_length"] = len(content)
+            message_summary.append(item)
+    tool_names: list[str] = []
+    if isinstance(body, dict) and isinstance(body.get("tools"), list):
+        tool_names = [
+            str(tool.get("name", ""))
+            for tool in body["tools"]
+            if isinstance(tool, dict)
+        ]
+    return message_summary, tool_names

api/web_server_tools.py ADDED Viewed

	@@ -0,0 +1,22 @@

+"""Compatibility re-exports for :mod:`api.web_tools` (web_search / web_fetch)."""
+from __future__ import annotations
+import httpx
+from api.web_tools.egress import (
+    WebFetchEgressPolicy,
+    WebFetchEgressViolation,
+    enforce_web_fetch_egress,
+)
+from api.web_tools.request import is_web_server_tool_request
+from api.web_tools.streaming import stream_web_server_tool_response
+__all__ = [
+    "WebFetchEgressPolicy",
+    "WebFetchEgressViolation",
+    "enforce_web_fetch_egress",
+    "httpx",
+    "is_web_server_tool_request",
+    "stream_web_server_tool_response",
+]

api/web_tools/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""Submodules for Anthropic web server tool handling (search/fetch, egress, streaming)."""
+from .egress import (
+    WebFetchEgressPolicy,
+    WebFetchEgressViolation,
+    enforce_web_fetch_egress,
+)
+from .request import is_web_server_tool_request
+from .streaming import stream_web_server_tool_response
+__all__ = [
+    "WebFetchEgressPolicy",
+    "WebFetchEgressViolation",
+    "enforce_web_fetch_egress",
+    "is_web_server_tool_request",
+    "stream_web_server_tool_response",
+]

api/web_tools/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (571 Bytes). View file

api/web_tools/__pycache__/constants.cpython-314.pyc ADDED Viewed

Binary file (680 Bytes). View file

api/web_tools/__pycache__/egress.cpython-314.pyc ADDED Viewed

Binary file (5.32 kB). View file

api/web_tools/__pycache__/parsers.cpython-314.pyc ADDED Viewed

Binary file (8.64 kB). View file

api/web_tools/__pycache__/request.cpython-314.pyc ADDED Viewed

Binary file (6.48 kB). View file

api/web_tools/__pycache__/streaming.cpython-314.pyc ADDED Viewed

Binary file (6.6 kB). View file

api/web_tools/constants.py ADDED Viewed

	@@ -0,0 +1,15 @@

+"""Limits and defaults for outbound web server tool HTTP."""
+# Request timeout; the _S suffix suggests seconds (usage is outside this module).
+_REQUEST_TIMEOUT_S = 20.0
+# Cap on search results, per the name; confirm at the call site.
+_MAX_SEARCH_RESULTS = 10
+# Cap on fetched text, in characters per the name; confirm at the call site.
+_MAX_FETCH_CHARS = 24_000
+# Hard cap on raw bytes read from HTTP responses before decode / HTML parse (memory bound).
+# 2 * 1024 * 1024 bytes = 2 MiB.
+_MAX_WEB_FETCH_RESPONSE_BYTES = 2 * 1024 * 1024
+# Drain at most this many bytes from redirect responses before following Location.
+_REDIRECT_RESPONSE_BODY_CAP_BYTES = 65_536
+# Maximum number of redirects, per the name; confirm at the call site.
+_MAX_WEB_FETCH_REDIRECTS = 10
+# HTTP status codes treated as redirects by web_fetch.
+_WEB_FETCH_REDIRECT_STATUSES = frozenset({301, 302, 303, 307, 308})
+# Header set for web tool HTTP requests; presumably sent on every outbound call.
+_WEB_TOOL_HTTP_HEADERS = {
+    "User-Agent": "Mozilla/5.0 compatible; free-claude-code/2.0",
+}

api/web_tools/egress.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""Egress policy for user-controlled web_fetch URLs (SSRF guard)."""
+from __future__ import annotations
+import ipaddress
+import socket
+from dataclasses import dataclass
+from urllib.parse import urlparse
+@dataclass(frozen=True, slots=True)
+class WebFetchEgressPolicy:
+    """Egress rules for user-influenced web_fetch URLs."""
+    # When True, the localhost / .local / non-public-address checks are skipped.
+    allow_private_network_targets: bool
+    # URL schemes accepted for web_fetch; compared against the lowercased scheme.
+    allowed_schemes: frozenset[str]
+class WebFetchEgressViolation(ValueError):
+    """Raised when a web_fetch URL is rejected by egress policy (SSRF guard).
+    Subclasses ``ValueError``, so handlers catching ``ValueError`` also catch it.
+    """
+def _port_for_url(parsed) -> int:
+    if parsed.port is not None:
+        return parsed.port
+    return 443 if (parsed.scheme or "").lower() == "https" else 80
+def _stream_getaddrinfo_or_raise(host: str, port: int) -> list[tuple]:
+    """Resolve ``host``/``port`` for TCP streams, turning resolver errors into egress violations."""
+    try:
+        infos = socket.getaddrinfo(
+            host, port, type=socket.SOCK_STREAM, proto=socket.IPPROTO_TCP
+        )
+    except OSError as exc:
+        message = f"Could not resolve host {host!r}: {exc}"
+        raise WebFetchEgressViolation(message) from exc
+    return infos
def get_validated_stream_addrinfos_for_egress(
    url: str, policy: WebFetchEgressPolicy
) -> list[tuple]:
    """Resolve and validate a URL for web_fetch, returning getaddrinfo rows for pinning.

    Each HTTP connect pins to only these `getaddrinfo` results so a malicious DNS
    server cannot rebind to a disallowed address between resolution and the TCP
    connect (used by :func:`api.web_tools.outbound._run_web_fetch`).

    Args:
        url: The user-influenced URL to fetch.
        policy: Allowed schemes and whether private-network targets are permitted.

    Returns:
        The ``socket.getaddrinfo`` rows for the URL's host and port.

    Raises:
        WebFetchEgressViolation: if the URL is malformed, uses a disallowed scheme,
            has no host, cannot be resolved, or (when private targets are not
            allowed) names or resolves to a non-global address.
    """
    # urlparse / .port raise plain ValueError on malformed input (e.g. an unbalanced
    # IPv6 bracket or an out-of-range port); report those as policy rejections so
    # every refusal surfaces as the same exception type.
    try:
        parsed = urlparse(url)
    except ValueError as exc:
        raise WebFetchEgressViolation("web_fetch URL could not be parsed") from exc

    scheme = (parsed.scheme or "").lower()
    if scheme not in policy.allowed_schemes:
        raise WebFetchEgressViolation(
            f"URL scheme {scheme!r} is not allowed for web_fetch"
        )

    host = parsed.hostname
    if not host:
        raise WebFetchEgressViolation("web_fetch URL must include a host")

    try:
        port = _port_for_url(parsed)
    except ValueError as exc:
        raise WebFetchEgressViolation("web_fetch URL has an invalid port") from exc

    if policy.allow_private_network_targets:
        return _stream_getaddrinfo_or_raise(host, port)

    host_lower = host.lower()
    if host_lower == "localhost" or host_lower.endswith(".localhost"):
        raise WebFetchEgressViolation("localhost targets are not allowed for web_fetch")
    if host_lower.endswith(".local"):
        raise WebFetchEgressViolation(".local hostnames are not allowed for web_fetch")

    try:
        parsed_ip = ipaddress.ip_address(host)
    except ValueError:
        parsed_ip = None

    if parsed_ip is not None:
        # IP-literal host: judge the literal itself, then build rows for pinning.
        if not parsed_ip.is_global:
            raise WebFetchEgressViolation(
                f"Non-public IP host {host!r} is not allowed for web_fetch"
            )
        return _stream_getaddrinfo_or_raise(host, port)

    infos = _stream_getaddrinfo_or_raise(host, port)
    for *_, sockaddr in infos:
        addr = sockaddr[0]
        try:
            resolved = ipaddress.ip_address(addr)
        except ValueError as exc:
            # Fail closed: an address we cannot classify must not be connected to.
            raise WebFetchEgressViolation(
                f"Host {host!r} resolves to an unrecognized address ({addr!r})"
            ) from exc
        if not resolved.is_global:
            raise WebFetchEgressViolation(
                f"Host {host!r} resolves to a non-public address ({resolved})"
            )
    return infos
def enforce_web_fetch_egress(url: str, policy: WebFetchEgressPolicy) -> None:
    """Check ``url`` against the web_fetch egress policy, discarding the resolved rows.

    Delegates to :func:`get_validated_stream_addrinfos_for_egress`; any exception it
    raises propagates unchanged.
    """
    _ = get_validated_stream_addrinfos_for_egress(url, policy)
    return None

api/web_tools/outbound.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""Outbound HTTP for web_search / web_fetch (client, body caps, logging)."""
+from __future__ import annotations
+import asyncio
+import socket
+from collections.abc import AsyncIterator
+from urllib.parse import urljoin, urlparse
+import aiohttp
+import httpx
+from aiohttp import ClientSession, ClientTimeout, TCPConnector
+from aiohttp.abc import AbstractResolver, ResolveResult
+from loguru import logger
+from . import constants
+from .constants import (
+    _MAX_FETCH_CHARS,
+    _MAX_SEARCH_RESULTS,
+    _REDIRECT_RESPONSE_BODY_CAP_BYTES,
+    _REQUEST_TIMEOUT_S,
+    _WEB_FETCH_REDIRECT_STATUSES,
+    _WEB_TOOL_HTTP_HEADERS,
+)
+from .egress import (
+    WebFetchEgressPolicy,
+    WebFetchEgressViolation,
+    get_validated_stream_addrinfos_for_egress,
+)
+from .parsers import HTMLTextParser, SearchResultParser
+def _safe_public_host_for_logs(url: str) -> str:
+    host = urlparse(url).hostname or ""
+    return host[:253]
def _log_web_tool_failure(
    tool_name: str,
    error: BaseException,
    *,
    fetch_url: str | None = None,
) -> None:
    """Log a web tool failure as a warning, recording only the exception type and host.

    Egress-policy rejections get their own event name. For other errors the host is
    included only for ``web_fetch`` calls that supplied ``fetch_url``.
    """
    error_kind = type(error).__name__

    if isinstance(error, WebFetchEgressViolation):
        rejected_host = _safe_public_host_for_logs(fetch_url) if fetch_url else ""
        logger.warning(
            "web_tool_egress_rejected tool={} exc_type={} host={!r}",
            tool_name,
            error_kind,
            rejected_host,
        )
        return

    if not (tool_name == "web_fetch" and fetch_url):
        logger.warning("web_tool_failure tool={} exc_type={}", tool_name, error_kind)
        return

    logger.warning(
        "web_tool_failure tool={} exc_type={} host={!r}",
        tool_name,
        error_kind,
        _safe_public_host_for_logs(fetch_url),
    )
+def _web_tool_client_error_summary(
+    tool_name: str,
+    error: BaseException,
+    *,
+    verbose: bool,
+) -> str:
+    if verbose:
+        return f"{tool_name} failed: {type(error).__name__}"
+    return "Web tool request failed."
+async def _iter_response_body_under_cap(
+    response: httpx.Response, max_bytes: int
+) -> AsyncIterator[bytes]:
+    if max_bytes <= 0:
+        return
+    received = 0
+    async for chunk in response.aiter_bytes(chunk_size=65_536):
+        if received >= max_bytes:
+            break
+        remaining = max_bytes - received
+        if len(chunk) <= remaining:
+            received += len(chunk)
+            yield chunk
+            if received >= max_bytes:
+                break
+        else:
+            yield chunk[:remaining]
+            break
async def _drain_response_body_capped(response: httpx.Response, max_bytes: int) -> None:
    """Consume and discard up to ``max_bytes`` of the response body."""
    capped_body = _iter_response_body_under_cap(response, max_bytes)
    async for _discarded in capped_body:
        continue
async def _read_response_body_capped(response: httpx.Response, max_bytes: int) -> bytes:
    """Read at most ``max_bytes`` of the response body and return it as one bytes object."""
    pieces: list[bytes] = []
    async for piece in _iter_response_body_under_cap(response, max_bytes):
        pieces.append(piece)
    return b"".join(pieces)
# Flags stored on every pinned result: host and service are given in numeric form.
_NUMERIC_RESOLVE_FLAGS = socket.AI_NUMERICHOST | socket.AI_NUMERICSERV
# Flags for getnameinfo: ask for numeric host and service strings.
_NAME_RESOLVE_FLAGS = socket.NI_NUMERICHOST | socket.NI_NUMERICSERV


def getaddrinfo_rows_to_resolve_results(
    host: str, addrinfos: list[tuple]
) -> list[ResolveResult]:
    """Map :func:`socket.getaddrinfo` rows to aiohttp :class:`ResolveResult` (ThreadedResolver logic).

    Args:
        host: Hostname the rows were resolved for; stored as ``hostname`` on each result.
        addrinfos: Rows shaped like ``(family, type, proto, canonname, sockaddr)``.

    Returns:
        One result per usable IPv4/IPv6 row. IPv6 rows whose ``sockaddr`` is not the
        full 4-tuple and rows of any other address family are skipped.
    """
    out: list[ResolveResult] = []
    for family, _type, proto, _canon, sockaddr in addrinfos:
        if family == socket.AF_INET6:
            # sockaddr[3] (scope id) is read below, so the full
            # (host, port, flowinfo, scope_id) tuple is required.
            if len(sockaddr) < 4:
                continue
            if sockaddr[3]:
                # Non-zero scope id: let getnameinfo render the numeric host and port.
                resolved_host, port = socket.getnameinfo(sockaddr, _NAME_RESOLVE_FLAGS)
            else:
                resolved_host, port = sockaddr[:2]
        elif family == socket.AF_INET:
            resolved_host, port = sockaddr[0], sockaddr[1]
        else:
            # Not an IP family; skip the row instead of relying on an assert
            # that disappears under ``python -O``.
            continue
        out.append(
            ResolveResult(
                hostname=host,
                host=str(resolved_host),
                port=int(port),
                family=family,
                proto=proto,
                flags=_NUMERIC_RESOLVE_FLAGS,
            )
        )
    return out
class _PinnedEgressStaticResolver(AbstractResolver):
    """Resolver that answers every lookup with a fixed, pre-validated result list."""

    def __init__(self, results: list[ResolveResult]) -> None:
        # Kept by reference; the same list object is handed back from every resolve().
        self._results = results

    async def resolve(
        self, host: str, port: int = 0, family: int = socket.AF_INET
    ) -> list[ResolveResult]:
        """Return the pinned results, ignoring the requested host, port, and family."""
        pinned = self._results
        return pinned

    async def close(self) -> None:  # pragma: no cover - aiohttp contract
        """Do nothing; there are no resources to release."""
        return None
+async def _read_aiohttp_body_capped(
+    response: aiohttp.ClientResponse, max_bytes: int
+) -> bytes:
+    received = 0
+    parts: list[bytes] = []
+    async for chunk in response.content.iter_chunked(65_536):
+        if received >= max_bytes:
+            break
+        remaining = max_bytes - received
+        if len(chunk) <= remaining:
+            received += len(chunk)
+            parts.append(chunk)
+        else:
+            parts.append(chunk[:remaining])
+            break
+    return b"".join(parts)
+async def _drain_aiohttp_body_capped(
+    response: aiohttp.ClientResponse, max_bytes: int
+) -> None:
+    if max_bytes <= 0:
+        return
+    received = 0
+    async for chunk in response.content.iter_chunked(65_536):
+        received += len(chunk)
+        if received >= max_bytes:
+            break
async def _run_web_search(query: str) -> list[dict[str, str]]:
    """Query DuckDuckGo lite for ``query`` and return the parsed results.

    The response body is read under the web-fetch byte cap, decoded as UTF-8 with
    replacement, and the result list is truncated to ``_MAX_SEARCH_RESULTS`` entries.
    A non-success HTTP status raises via ``raise_for_status``.
    """
    async with httpx.AsyncClient(
        timeout=_REQUEST_TIMEOUT_S,
        follow_redirects=True,
        headers=_WEB_TOOL_HTTP_HEADERS,
    ) as client:
        async with client.stream(
            "GET",
            "https://lite.duckduckgo.com/lite/",
            params={"q": query},
        ) as response:
            response.raise_for_status()
            raw_body = await _read_response_body_capped(
                response, constants._MAX_WEB_FETCH_RESPONSE_BYTES
            )

    page = raw_body.decode("utf-8", errors="replace")
    result_parser = SearchResultParser()
    result_parser.feed(page)
    return result_parser.results[:_MAX_SEARCH_RESULTS]
async def _run_web_fetch(url: str, egress: WebFetchEgressPolicy) -> dict[str, str]:
    """Fetch ``url``, following redirects by hand so every hop is validated and DNS-pinned.

    Each hop is checked against ``egress`` and connected through a resolver that only
    returns the addresses validated for that hop. HTML bodies are reduced to their
    visible text and title; other bodies are returned as decoded text.

    Returns:
        A dict with ``url``, ``title``, ``media_type`` (always ``"text/plain"``) and
        ``data`` (truncated to ``_MAX_FETCH_CHARS`` characters).

    Raises:
        WebFetchEgressViolation: on a policy rejection, too many redirects, or a
            redirect without a usable ``Location`` header.
    """
    request_timeout = ClientTimeout(total=_REQUEST_TIMEOUT_S)
    target = url
    hops_taken = 0

    while True:
        # Validation calls socket.getaddrinfo, so it is run in a worker thread.
        pinned_rows = await asyncio.to_thread(
            get_validated_stream_addrinfos_for_egress, target, egress
        )
        pinned_results = getaddrinfo_rows_to_resolve_results(
            urlparse(target).hostname or "", pinned_rows
        )
        connector = TCPConnector(
            resolver=_PinnedEgressStaticResolver(pinned_results),
            force_close=True,
        )
        try:
            async with ClientSession(
                timeout=request_timeout,
                headers=_WEB_TOOL_HTTP_HEADERS,
                connector=connector,
            ) as session:
                async with session.get(target, allow_redirects=False) as response:
                    if response.status not in _WEB_FETCH_REDIRECT_STATUSES:
                        # Final hop: capture everything needed after the session closes.
                        response.raise_for_status()
                        content_type = response.headers.get(
                            "content-type", "text/plain"
                        )
                        final_url = str(response.url)
                        encoding = response.get_encoding() or "utf-8"
                        body_bytes = await _read_aiohttp_body_capped(
                            response, constants._MAX_WEB_FETCH_RESPONSE_BYTES
                        )
                        break

                    # Redirect hop: discard a bounded amount of body, then validate
                    # the hop budget and the Location header before looping.
                    await _drain_aiohttp_body_capped(
                        response, _REDIRECT_RESPONSE_BODY_CAP_BYTES
                    )
                    if hops_taken >= constants._MAX_WEB_FETCH_REDIRECTS:
                        raise WebFetchEgressViolation(
                            "web_fetch exceeded maximum redirects "
                            f"({constants._MAX_WEB_FETCH_REDIRECTS})"
                        )
                    location = response.headers.get("location")
                    if not location or not location.strip():
                        raise WebFetchEgressViolation(
                            "web_fetch redirect response missing Location header"
                        )
                    target = urljoin(str(response.url), location.strip())
                    hops_taken += 1
        finally:
            await connector.close()

    text = body_bytes.decode(encoding, errors="replace")
    if "html" in content_type.lower():
        html_parser = HTMLTextParser()
        html_parser.feed(text)
        title = html_parser.title or final_url
        data = "\n".join(html_parser.text_parts)
    else:
        title, data = final_url, text
    return {
        "url": final_url,
        "title": title,
        "media_type": "text/plain",
        "data": data[:_MAX_FETCH_CHARS],
    }

api/web_tools/parsers.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""HTML parsing for web_search / web_fetch."""
+from __future__ import annotations
+import html
+import re
+from html.parser import HTMLParser
+from typing import Any
+from urllib.parse import parse_qs, unquote, urlparse
class SearchResultParser(HTMLParser):
    """DuckDuckGo lite HTML: extract result links and titles.

    Result anchors carry the real target in the ``uddg`` query parameter of their
    ``href``. Each such anchor yields one ``{"title", "url"}`` entry in
    :attr:`results`; anchors without visible text and repeated URLs are dropped.
    """

    def __init__(self) -> None:
        super().__init__()
        self.results: list[dict[str, str]] = []
        # Target URL of the result anchor currently open, or None outside one.
        self._href: str | None = None
        # Text fragments collected inside the currently open result anchor.
        self._title_parts: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Start collecting a title when an ``<a>`` with a ``uddg`` target opens."""
        if tag != "a":
            return
        href = dict(attrs).get("href")
        if not href or "uddg=" not in href:
            return
        # parse_qs already percent-decodes values. Decoding a second time would
        # corrupt targets that legitimately contain encoded characters (``%20``).
        target = parse_qs(urlparse(href).query).get("uddg", [""])[0]
        if not target:
            return
        self._href = target
        self._title_parts = []

    def handle_data(self, data: str) -> None:
        """Collect text only while inside a result anchor."""
        if self._href is not None:
            self._title_parts.append(data)

    def handle_endtag(self, tag: str) -> None:
        """Finish the current result anchor and record it if it is new and titled."""
        if tag != "a" or self._href is None:
            return
        # HTMLParser (convert_charrefs=True by default) has already turned character
        # references into text, so the title is only whitespace-normalized here.
        title = " ".join("".join(self._title_parts).split())
        if title and not any(result["url"] == self._href for result in self.results):
            self.results.append({"title": title, "url": self._href})
        self._href = None
        self._title_parts = []
class HTMLTextParser(HTMLParser):
    """Collect a page's title and visible text, ignoring script/style/noscript content."""

    def __init__(self) -> None:
        super().__init__()
        self.title = ""
        self.text_parts: list[str] = []
        self._in_title = False
        # Number of currently open script/style/noscript elements.
        self._skip_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Track entry into ``<title>`` and into elements whose text is ignored."""
        if tag == "title":
            self._in_title = True
        elif tag in ("script", "style", "noscript"):
            self._skip_depth += 1

    def handle_endtag(self, tag: str) -> None:
        """Track exit from ``<title>`` and from ignored elements (never below zero)."""
        if tag == "title":
            self._in_title = False
        elif tag in ("script", "style", "noscript") and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data: str) -> None:
        """Whitespace-normalize ``data`` and route it to the title or the text list."""
        words = data.split()
        if not words:
            return
        text = " ".join(words)
        if self._in_title:
            self.title = f"{self.title} {text}".strip()
            return
        if self._skip_depth == 0:
            self.text_parts.append(text)
def content_text(content: Any) -> str:
    """Flatten message content to text.

    Strings are returned unchanged. For a list, the ``text`` of each item (dict key
    or attribute, missing treated as empty) is stringified and the non-empty pieces
    are joined with newlines. Anything else is passed through ``str()``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)
    pieces = (
        str(item.get("text", ""))
        if isinstance(item, dict)
        else str(getattr(item, "text", ""))
        for item in content
    )
    return "\n".join(piece for piece in pieces if piece)
def extract_query(text: str) -> str:
    """Return what follows a case-insensitive ``query:`` marker, else the stripped text.

    The captured remainder (which may span lines) is stripped of surrounding
    whitespace and then of surrounding quote characters.
    """
    found = re.search(r"query:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return text.strip()
    remainder = found.group(1).strip()
    return remainder.strip("\"'")
def extract_url(text: str) -> str:
    """Return the first http(s) URL in ``text`` minus trailing ``).,]``, else the stripped text."""
    found = re.search(r"https?://\S+", text)
    if not found:
        return text.strip()
    return found.group(0).rstrip(").,]")

api/web_tools/request.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""Detect forced Anthropic web server tool requests."""
+from __future__ import annotations
+from api.models.anthropic import MessagesRequest, Tool
def request_text(request: MessagesRequest) -> str:
    """Return the text of every message in the request, one message per line group."""
    from .parsers import content_text

    pieces = [content_text(message.content) for message in request.messages]
    return "\n".join(pieces)
def forced_tool_turn_text(request: MessagesRequest) -> str:
    """Return the text of the most recent user message, or ``""`` when there is none.

    Only the latest user turn is used so that inputs for a forced server tool are
    not parsed out of older history.
    """
    messages = request.messages
    if not messages:
        return ""
    from .parsers import content_text

    latest_user = next(
        (message for message in reversed(messages) if message.role == "user"), None
    )
    if latest_user is None:
        return ""
    return content_text(latest_user.content)
def forced_server_tool_name(request: MessagesRequest) -> str | None:
    """Return ``"web_search"``/``"web_fetch"`` when ``tool_choice`` forces it, else ``None``.

    ``tool_choice`` must be a dict of type ``"tool"`` naming one of the two tools.
    """
    choice = request.tool_choice
    if isinstance(choice, dict) and choice.get("type") == "tool":
        forced = choice.get("name")
        if forced in {"web_search", "web_fetch"}:
            return str(forced)
    return None
def has_tool_named(request: MessagesRequest, name: str) -> bool:
    """Report whether any tool listed on the request has exactly this ``name``."""
    for tool in request.tools or []:
        if tool.name == name:
            return True
    return False
def is_web_server_tool_request(request: MessagesRequest) -> bool:
    """Report whether ``tool_choice`` forces a web server tool that the request also lists."""
    forced = forced_server_tool_name(request)
    return forced is not None and has_tool_named(request, forced)
def is_anthropic_server_tool_definition(tool: Tool) -> bool:
    """Report whether ``tool`` is a web_search / web_fetch style server tool.

    True when the stripped name is ``web_search`` or ``web_fetch``, or when ``type``
    is a string beginning with one of those two names.
    """
    stripped_name = (tool.name or "").strip()
    if stripped_name in ("web_search", "web_fetch"):
        return True
    kind = tool.type
    return isinstance(kind, str) and kind.startswith(("web_search", "web_fetch"))
def has_listed_anthropic_server_tools(request: MessagesRequest) -> bool:
    """Report whether the request lists any web_search / web_fetch style tool, forced or not."""
    for tool in request.tools or []:
        if is_anthropic_server_tool_definition(tool):
            return True
    return False
def openai_chat_upstream_server_tool_error(
    request: MessagesRequest, *, web_tools_enabled: bool
) -> str | None:
    """Explain why an OpenAI Chat upstream cannot serve this request's server tools.

    Returns:
        An error message when a server tool is forced while local web tools are
        disabled, or when server tools are listed without being forced; otherwise
        ``None``.
    """
    forced = forced_server_tool_name(request)
    if forced:
        if web_tools_enabled:
            return None
        return (
            f"tool_choice forces Anthropic server tool {forced!r}, but local web server tools are "
            "disabled (ENABLE_WEB_SERVER_TOOLS=false). Enable them to use this tool."
        )
    if has_listed_anthropic_server_tools(request):
        return (
            "OpenAI Chat upstreams (NVIDIA NIM) cannot use listed Anthropic server tools "
            "(web_search / web_fetch) without the local web server tool handler. "
            "Set ENABLE_WEB_SERVER_TOOLS=true and force the tool with "
            "tool_choice, or remove these tools from the request."
        )
    return None

api/web_tools/streaming.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""SSE streaming for local web_search / web_fetch server tool results."""
+from __future__ import annotations
+import uuid
+from collections.abc import AsyncIterator
+from datetime import UTC, datetime
+from typing import Any
+from api.models.anthropic import MessagesRequest
+from core.anthropic.server_tool_sse import (
+    SERVER_TOOL_USE,
+    WEB_FETCH_TOOL_ERROR,
+    WEB_FETCH_TOOL_RESULT,
+    WEB_SEARCH_TOOL_RESULT,
+    WEB_SEARCH_TOOL_RESULT_ERROR,
+)
+from core.anthropic.sse import format_sse_event
+from .constants import _MAX_FETCH_CHARS
+from .egress import WebFetchEgressPolicy
+from .parsers import extract_query, extract_url
+from .request import (
+    forced_server_tool_name,
+    forced_tool_turn_text,
+    has_tool_named,
+)
+def _search_summary(query: str, results: list[dict[str, str]]) -> str:
+    if not results:
+        return f"No web search results found for: {query}"
+    lines = [f"Search results for: {query}"]
+    for index, result in enumerate(results, start=1):
+        lines.append(f"{index}. {result['title']}\n{result['url']}")
+    return "\n\n".join(lines)
async def stream_web_server_tool_response(
    request: MessagesRequest,
    input_tokens: int,
    *,
    web_fetch_egress: WebFetchEgressPolicy,
    verbose_client_errors: bool = False,
) -> AsyncIterator[str]:
    """Stream a minimal Anthropic-shaped turn for a forced `web_search` / `web_fetch`.

    This is the proxy-side execution path used when `ENABLE_WEB_SERVER_TOOLS` is on;
    it is not a full hosted Anthropic citation or encrypted-content pipeline.

    The emitted events are: message start, a server-tool-use block (index 0), the
    tool result or error block (index 1), a text block carrying the summary
    (index 2), a message delta with usage, and message stop. Nothing is emitted
    unless ``tool_choice`` forces a web tool that the request also lists.

    Args:
        request: Incoming messages request.
        input_tokens: Input token count reported in the usage payloads.
        web_fetch_egress: Egress policy passed to the web_fetch runner.
        verbose_client_errors: Whether the error summary names the exception type.
    """
    from . import outbound

    tool_name = forced_server_tool_name(request)
    if tool_name is None or not has_tool_named(request, tool_name):
        return

    is_search = tool_name == "web_search"
    text = forced_tool_turn_text(request)
    message_id = f"msg_{uuid.uuid4()}"
    tool_id = f"srvtoolu_{uuid.uuid4().hex}"

    # Everything that depends only on which of the two tools was forced.
    if is_search:
        usage_key = "web_search_requests"
        tool_input = {"query": extract_query(text)}
        result_block_type = WEB_SEARCH_TOOL_RESULT
        error_payload_type = WEB_SEARCH_TOOL_RESULT_ERROR
    else:
        usage_key = "web_fetch_requests"
        tool_input = {"url": extract_url(text)}
        result_block_type = WEB_FETCH_TOOL_RESULT
        error_payload_type = WEB_FETCH_TOOL_ERROR

    def _block_start(index: int, content_block: dict[str, Any]) -> str:
        return format_sse_event(
            "content_block_start",
            {
                "type": "content_block_start",
                "index": index,
                "content_block": content_block,
            },
        )

    def _block_stop(index: int) -> str:
        return format_sse_event(
            "content_block_stop", {"type": "content_block_stop", "index": index}
        )

    yield format_sse_event(
        "message_start",
        {
            "type": "message_start",
            "message": {
                "id": message_id,
                "type": "message",
                "role": "assistant",
                "content": [],
                "model": request.model,
                "stop_reason": None,
                "stop_sequence": None,
                "usage": {"input_tokens": input_tokens, "output_tokens": 1},
            },
        },
    )
    yield _block_start(
        0,
        {
            "type": SERVER_TOOL_USE,
            "id": tool_id,
            "name": tool_name,
            "input": tool_input,
        },
    )
    yield _block_stop(0)

    try:
        if is_search:
            query = str(tool_input["query"])
            results = await outbound._run_web_search(query)
            result_content: Any = [
                {
                    "type": "web_search_result",
                    "title": result["title"],
                    "url": result["url"],
                }
                for result in results
            ]
            summary = _search_summary(query, results)
        else:
            fetched = await outbound._run_web_fetch(
                str(tool_input["url"]), web_fetch_egress
            )
            result_content = {
                "type": "web_fetch_result",
                "url": fetched["url"],
                "content": {
                    "type": "document",
                    "source": {
                        "type": "text",
                        "media_type": fetched["media_type"],
                        "data": fetched["data"],
                    },
                    "title": fetched["title"],
                    "citations": {"enabled": True},
                },
                "retrieved_at": datetime.now(UTC).isoformat(),
            }
            summary = fetched["data"][:_MAX_FETCH_CHARS]
    except Exception as error:
        # Any tool failure is logged and reported to the client as an
        # "unavailable" tool error instead of aborting the stream.
        fetch_url = None if is_search else str(tool_input["url"])
        outbound._log_web_tool_failure(tool_name, error, fetch_url=fetch_url)
        result_content = {
            "type": error_payload_type,
            "error_code": "unavailable",
        }
        summary = outbound._web_tool_client_error_summary(
            tool_name, error, verbose=verbose_client_errors
        )

    # Rough estimate: one token per four characters of summary, at least one.
    output_tokens = max(1, len(summary) // 4)

    yield _block_start(
        1,
        {
            "type": result_block_type,
            "tool_use_id": tool_id,
            "content": result_content,
        },
    )
    yield _block_stop(1)

    # Model-facing summary: stream as normal text deltas (CLI/transcript code reads
    # `text_delta`, not eager `text` on `content_block_start`).
    yield _block_start(2, {"type": "text", "text": ""})
    yield format_sse_event(
        "content_block_delta",
        {
            "type": "content_block_delta",
            "index": 2,
            "delta": {"type": "text_delta", "text": summary},
        },
    )
    yield _block_stop(2)

    yield format_sse_event(
        "message_delta",
        {
            "type": "message_delta",
            "delta": {"stop_reason": "end_turn", "stop_sequence": None},
            "usage": {
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "server_tool_use": {usage_key: 1},
            },
        },
    )
    yield format_sse_event("message_stop", {"type": "message_stop"})

cli/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""CLI integration for Claude Code."""
+from .manager import CLISessionManager
+from .session import CLISession
+__all__ = ["CLISession", "CLISessionManager"]

cli/entrypoints.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""CLI entry points for the installed package."""
+from __future__ import annotations
+from pathlib import Path
def _load_env_template() -> str:
    """Return the env template text, preferring the packaged copy over the source tree.

    Raises:
        FileNotFoundError: if neither ``cli/env.example`` (package resource) nor the
            repository-root ``.env.example`` exists.
    """
    import importlib.resources

    bundled = importlib.resources.files("cli").joinpath("env.example")
    if bundled.is_file():
        return bundled.read_text("utf-8")

    repo_root = Path(__file__).resolve().parents[1]
    from_source = repo_root / ".env.example"
    if not from_source.is_file():
        raise FileNotFoundError(
            "Could not find bundled or source .env.example template."
        )
    return from_source.read_text(encoding="utf-8")
def serve() -> None:
    """Run the FastAPI app under uvicorn (registered as the `free-claude-code` script).

    Registered CLI subprocesses are killed on the way out, whether uvicorn returns
    normally or raises.
    """
    import uvicorn

    from cli.process_registry import kill_all_best_effort
    from config.settings import get_settings

    settings = get_settings()
    server_options = {
        "factory": True,
        "host": settings.host,
        "port": settings.port,
        "log_level": "debug",
        "timeout_graceful_shutdown": 5,
    }
    try:
        uvicorn.run("api.app:create_asgi_app", **server_options)
    finally:
        kill_all_best_effort()
def init() -> None:
    """Create ~/.config/free-claude-code/.env from the template (registered as `fcc-init`).

    An existing file is left untouched and a hint is printed instead.
    """
    env_file = Path.home() / ".config" / "free-claude-code" / ".env"

    if env_file.exists():
        print(f"Config already exists at {env_file}")
        print("Delete it first if you want to reset to defaults.")
        return

    env_file.parent.mkdir(parents=True, exist_ok=True)
    env_file.write_text(_load_env_template(), encoding="utf-8")

    print(f"Config created at {env_file}")
    print(
        "Edit it to set your API keys and model preferences, then run: free-claude-code"
    )

cli/manager.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""
+CLI Session Manager for Multi-Instance Claude CLI Support
+Manages a pool of CLISession instances, each handling one conversation.
+This enables true parallel processing where multiple conversations run
+simultaneously in separate CLI processes.
+"""
+import asyncio
+import uuid
+from loguru import logger
+from .session import CLISession
class CLISessionManager:
    """
    Manages multiple CLISession instances for parallel conversation processing.

    Each new conversation gets its own CLISession with its own subprocess.
    Replies to existing conversations reuse the same CLISession instance.

    A session starts out "pending" under a temporary id and is moved to the active
    table once the CLI reports its real session id; both ids keep resolving to the
    same session afterwards.
    """

    def __init__(
        self,
        workspace_path: str,
        api_url: str,
        allowed_dirs: list[str] | None = None,
        plans_directory: str | None = None,
        claude_bin: str = "claude",
        *,
        log_raw_cli_diagnostics: bool = False,
        log_messaging_error_details: bool = False,
    ):
        """
        Initialize the session manager.

        Args:
            workspace_path: Working directory for CLI processes
            api_url: API URL for the proxy
            allowed_dirs: Directories the CLI is allowed to access
            plans_directory: Directory for Claude Code CLI plan files (passed via --settings)
            claude_bin: Claude CLI executable handed to each CLISession
            log_raw_cli_diagnostics: Forwarded unchanged to each CLISession
            log_messaging_error_details: Include the exception message (not only its
                type) when logging errors raised while stopping sessions
        """
        self.workspace = workspace_path
        self.api_url = api_url
        self.allowed_dirs = allowed_dirs or []
        self.plans_directory = plans_directory
        self.claude_bin = claude_bin
        self._log_raw_cli_diagnostics = log_raw_cli_diagnostics
        self._log_messaging_error_details = log_messaging_error_details
        # Active sessions keyed by real session id.
        self._sessions: dict[str, CLISession] = {}
        # Sessions whose real id is not known yet, keyed by temporary id.
        self._pending_sessions: dict[str, CLISession] = {}
        # Two-way mapping between temporary and real ids of registered sessions.
        self._temp_to_real: dict[str, str] = {}
        self._real_to_temp: dict[str, str] = {}
        self._lock = asyncio.Lock()
        logger.info("CLISessionManager initialized")

    async def get_or_create_session(
        self, session_id: str | None = None
    ) -> tuple[CLISession, str, bool]:
        """
        Get an existing session or create a new one.

        A known ``session_id`` (temporary or real) returns the existing session.
        Otherwise a new pending session is created under ``session_id`` if given,
        or under a generated ``pending_<hex>`` id.

        Returns:
            Tuple of (CLISession instance, session_id, is_new_session)
        """
        async with self._lock:
            if session_id:
                lookup_id = self._temp_to_real.get(session_id, session_id)
                if lookup_id in self._sessions:
                    return self._sessions[lookup_id], lookup_id, False
                if lookup_id in self._pending_sessions:
                    return self._pending_sessions[lookup_id], lookup_id, False
            temp_id = session_id if session_id else f"pending_{uuid.uuid4().hex[:8]}"
            new_session = CLISession(
                workspace_path=self.workspace,
                api_url=self.api_url,
                allowed_dirs=self.allowed_dirs,
                plans_directory=self.plans_directory,
                claude_bin=self.claude_bin,
                log_raw_cli_diagnostics=self._log_raw_cli_diagnostics,
            )
            self._pending_sessions[temp_id] = new_session
            logger.info(f"Created new session: {temp_id}")
            return new_session, temp_id, True

    async def register_real_session_id(
        self, temp_id: str, real_session_id: str
    ) -> bool:
        """Register the real session ID from CLI output.

        Returns:
            True when the pending session was promoted, False when ``temp_id`` is
            not a pending session.
        """
        async with self._lock:
            if temp_id not in self._pending_sessions:
                logger.warning(f"Temp session {temp_id} not found")
                return False
            session = self._pending_sessions.pop(temp_id)
            self._sessions[real_session_id] = session
            self._temp_to_real[temp_id] = real_session_id
            self._real_to_temp[real_session_id] = temp_id
            logger.info(f"Registered session: {temp_id} -> {real_session_id}")
            return True

    async def remove_session(self, session_id: str) -> bool:
        """Remove a session from the manager and stop it.

        ``session_id`` may be a pending id, a real id, or the temporary id of an
        already registered session (resolved the same way as in
        :meth:`get_or_create_session`).

        Returns:
            True when a session was found and removed, otherwise False.
        """
        async with self._lock:
            pending = self._pending_sessions.pop(session_id, None)
            if pending is not None:
                await pending.stop()
                return True

            real_id = self._temp_to_real.get(session_id, session_id)
            session = self._sessions.pop(real_id, None)
            if session is None:
                return False

            # Drop the id mappings before stopping so they cannot be left stale
            # if stop() raises.
            temp_id = self._real_to_temp.pop(real_id, None)
            if temp_id is not None:
                self._temp_to_real.pop(temp_id, None)
            await session.stop()
            return True

    async def stop_all(self):
        """Stop all sessions and clear every table; per-session stop errors are logged."""
        async with self._lock:
            all_sessions = list(self._sessions.values()) + list(
                self._pending_sessions.values()
            )
            for session in all_sessions:
                try:
                    await session.stop()
                except Exception as e:
                    # Best-effort shutdown: keep stopping the remaining sessions.
                    if self._log_messaging_error_details:
                        logger.error(
                            "Error stopping session: {}: {}",
                            type(e).__name__,
                            e,
                        )
                    else:
                        logger.error(
                            "Error stopping session: exc_type={}",
                            type(e).__name__,
                        )
            self._sessions.clear()
            self._pending_sessions.clear()
            self._temp_to_real.clear()
            self._real_to_temp.clear()
            logger.info("All sessions stopped")

    def get_stats(self) -> dict:
        """Get session statistics.

        ``busy_count`` counts only active (registered) sessions that report ``is_busy``.
        """
        return {
            "active_sessions": len(self._sessions),
            "pending_sessions": len(self._pending_sessions),
            "busy_count": sum(1 for s in self._sessions.values() if s.is_busy),
        }

cli/process_registry.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Track and clean up spawned CLI subprocesses.
+This is a safety net for cases where the server is interrupted (Ctrl+C) and the
+FastAPI lifespan cleanup doesn't run to completion. We only track processes we
+spawn so we don't accidentally kill unrelated system processes.
+"""
+from __future__ import annotations
+import atexit
+import os
+import subprocess
+import threading
+from loguru import logger
+# Guards every read and write of _pids and _atexit_registered in this module.
+_lock = threading.Lock()
+# PIDs recorded via register_pid() and not yet unregistered or killed.
+_pids: set[int] = set()
+# Set to True once kill_all_best_effort has been registered with atexit.
+_atexit_registered = False
+def ensure_atexit_registered() -> None:
+    global _atexit_registered
+    with _lock:
+        if _atexit_registered:
+            return
+        atexit.register(kill_all_best_effort)
+        _atexit_registered = True
+def register_pid(pid: int) -> None:
+    if not pid:
+        return
+    ensure_atexit_registered()
+    with _lock:
+        _pids.add(int(pid))
+def unregister_pid(pid: int) -> None:
+    if not pid:
+        return
+    with _lock:
+        _pids.discard(int(pid))
+def kill_all_best_effort() -> None:
+    """Kill any still-running registered pids (best-effort)."""
+    with _lock:
+        pids = list(_pids)
+        _pids.clear()
+    if not pids:
+        return
+    if os.name == "nt":
+        for pid in pids:
+            try:
+                # /T kills child processes, /F forces termination.
+                subprocess.run(
+                    ["taskkill", "/PID", str(pid), "/T", "/F"],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                    check=False,
+                )
+            except Exception as e:
+                logger.debug("process_registry: taskkill failed pid=%s: %s", pid, e)
+        return
+    # Best-effort fallback for non-Windows.
+    for pid in pids:
+        try:
+            os.kill(pid, 9)
+        except Exception as e:
+            logger.debug("process_registry: kill failed pid=%s: %s", pid, e)