NeerajCodz committed
Commit ca1fd98 · 1 Parent(s): 3bfb250

feat: add multi-model LLM router with providers
backend/app/models/__init__.py ADDED
@@ -0,0 +1,57 @@
+"""Models module - LLM providers, routing, and ensemble capabilities."""
+
+from app.models.router import (
+    SmartModelRouter,
+    RoutingStrategy,
+    RoutingConfig,
+    CostTracker,
+    ModelScore,
+)
+from app.models.ensemble import (
+    ModelEnsemble,
+    AggregationStrategy,
+    EnsembleResult,
+)
+from app.models.providers import (
+    # Base
+    BaseProvider,
+    ProviderError,
+    RateLimitError,
+    ModelNotFoundError,
+    CompletionResponse,
+    ModelInfo,
+    TokenUsage,
+    # Providers
+    OpenAIProvider,
+    AnthropicProvider,
+    GoogleProvider,
+    GroqProvider,
+)
+from app.models.providers.base import TaskType
+
+__all__ = [
+    # Router
+    "SmartModelRouter",
+    "RoutingStrategy",
+    "RoutingConfig",
+    "CostTracker",
+    "ModelScore",
+    "TaskType",
+    # Ensemble
+    "ModelEnsemble",
+    "AggregationStrategy",
+    "EnsembleResult",
+    # Base
+    "BaseProvider",
+    "ProviderError",
+    "RateLimitError",
+    "ModelNotFoundError",
+    "CompletionResponse",
+    "ModelInfo",
+    "TokenUsage",
+    # Providers
+    "OpenAIProvider",
+    "AnthropicProvider",
+    "GoogleProvider",
+    "GroqProvider",
+]
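
For downstream code, the package root is the whole import surface; a minimal sketch of consuming the re-exports above:

    from app.models import (
        SmartModelRouter,
        ModelEnsemble,
        AggregationStrategy,
        TaskType,
    )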
backend/app/models/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (1.03 kB).
 
backend/app/models/__pycache__/ensemble.cpython-314.pyc ADDED
Binary file (23.7 kB).
 
backend/app/models/__pycache__/router.cpython-314.pyc ADDED
Binary file (27.7 kB).
 
backend/app/models/ensemble.py ADDED
@@ -0,0 +1,505 @@
+"""Model ensemble for running multiple models and aggregating results."""
+
+import asyncio
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+from app.models.providers.base import (
+    BaseProvider,
+    CompletionResponse,
+    ProviderError,
+    TokenUsage,
+)
+from app.models.router import SmartModelRouter
+
+logger = logging.getLogger(__name__)
+
+
+class AggregationStrategy(str, Enum):
+    """Strategy for aggregating ensemble results."""
+
+    MAJORITY_VOTE = "majority_vote"  # Use most common response
+    CONFIDENCE_WEIGHTED = "confidence_weighted"  # Weight by model confidence
+    FIRST_SUCCESS = "first_success"  # Use first successful response
+    BEST_QUALITY = "best_quality"  # Use response from highest quality model
+    CONCATENATE = "concatenate"  # Combine all responses
+    CONSENSUS = "consensus"  # Only return if models agree
+
+
+@dataclass
+class EnsembleResult:
+    """Result from an ensemble run."""
+
+    content: str
+    responses: list[CompletionResponse]
+    agreement_score: float  # 0-1, how much models agreed
+    strategy: AggregationStrategy
+    selected_model: str | None = None
+    total_cost: float = 0.0
+    total_tokens: TokenUsage = field(default_factory=TokenUsage)
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary."""
+        return {
+            "content": self.content,
+            "responses": [r.to_dict() for r in self.responses],
+            "agreement_score": self.agreement_score,
+            "strategy": self.strategy.value,
+            "selected_model": self.selected_model,
+            "total_cost": self.total_cost,
+            "total_tokens": {
+                "prompt": self.total_tokens.prompt_tokens,
+                "completion": self.total_tokens.completion_tokens,
+                "total": self.total_tokens.total_tokens,
+            },
+            "metadata": self.metadata,
+        }
+
+
+class ModelEnsemble:
+    """Run multiple models and aggregate their results."""
+
+    # Model quality tiers for weighted voting
+    MODEL_QUALITY_TIERS: dict[str, float] = {
+        # Tier 1: Highest quality
+        "claude-3-opus-20240229": 1.0,
+        "gpt-4o": 0.98,
+        "claude-3-5-sonnet-20241022": 0.97,
+        "gemini-1.5-pro": 0.95,
+        # Tier 2: High quality
+        "gpt-4-turbo": 0.90,
+        "gpt-4": 0.88,
+        "claude-3-sonnet-20240229": 0.85,
+        "llama-3.3-70b-versatile": 0.83,
+        # Tier 3: Good quality
+        "gpt-4o-mini": 0.75,
+        "claude-3-5-haiku-20241022": 0.73,
+        "gemini-1.5-flash": 0.70,
+        "mixtral-8x7b-32768": 0.68,
+        # Tier 4: Fast/cheap
+        "claude-3-haiku-20240307": 0.60,
+        "llama-3.1-8b-instant": 0.55,
+        "gpt-3.5-turbo": 0.50,
+    }
+
+    def __init__(
+        self,
+        router: SmartModelRouter,
+        default_models: list[str] | None = None,
+        default_strategy: AggregationStrategy = AggregationStrategy.CONFIDENCE_WEIGHTED,
+        timeout: float = 60.0,
+    ):
+        """Initialize the ensemble.
+
+        Args:
+            router: SmartModelRouter instance for accessing providers
+            default_models: Default models to use in ensemble
+            default_strategy: Default aggregation strategy
+            timeout: Timeout for each model completion
+        """
+        self.router = router
+        self.default_models = default_models or []
+        self.default_strategy = default_strategy
+        self.timeout = timeout
+
+    async def run(
+        self,
+        messages: list[dict[str, Any]],
+        models: list[str] | None = None,
+        strategy: AggregationStrategy | None = None,
+        min_responses: int = 1,
+        **kwargs: Any,
+    ) -> EnsembleResult:
+        """Run multiple models and aggregate results.
+
+        Args:
+            messages: List of message dicts
+            models: List of model IDs to use (uses defaults if not specified)
+            strategy: Aggregation strategy (uses default if not specified)
+            min_responses: Minimum number of successful responses required
+            **kwargs: Additional completion parameters
+
+        Returns:
+            EnsembleResult with aggregated content and metadata
+
+        Raises:
+            ProviderError: If not enough models respond successfully
+        """
+        models_to_use = models or self.default_models
+        strategy = strategy or self.default_strategy
+
+        if not models_to_use:
+            # Use top 3 available models
+            available = self.router.get_available_models()
+            models_to_use = [m.id for m in available[:3]]
+
+        if not models_to_use:
+            raise ProviderError("No models available for ensemble", "ensemble")
+
+        # Run all models concurrently
+        tasks = []
+        for model_id in models_to_use:
+            provider = self.router.get_provider_for_model(model_id)
+            if provider:
+                task = self._run_model(provider, model_id, messages, **kwargs)
+                tasks.append((model_id, task))
+
+        if not tasks:
+            raise ProviderError("No valid models for ensemble", "ensemble")
+
+        # Gather results
+        responses: list[CompletionResponse] = []
+        errors: list[tuple[str, Exception]] = []
+
+        results = await asyncio.gather(
+            *[t[1] for t in tasks],
+            return_exceptions=True,
+        )
+
+        for (model_id, _), result in zip(tasks, results):
+            if isinstance(result, Exception):
+                logger.warning(f"Model {model_id} failed: {result}")
+                errors.append((model_id, result))
+            elif result is not None:
+                responses.append(result)
+
+        if len(responses) < min_responses:
+            raise ProviderError(
+                f"Only {len(responses)} models responded, need {min_responses}. "
+                f"Errors: {[str(e) for _, e in errors]}",
+                "ensemble",
+            )
+
+        # Aggregate results
+        result = self._aggregate(responses, strategy)
+
+        return result
+
+    async def _run_model(
+        self,
+        provider: BaseProvider,
+        model_id: str,
+        messages: list[dict[str, Any]],
+        **kwargs: Any,
+    ) -> CompletionResponse | None:
+        """Run a single model with timeout."""
+        try:
+            return await asyncio.wait_for(
+                provider.complete(messages, model_id, **kwargs),
+                timeout=self.timeout,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(f"Model {model_id} timed out")
+            return None
+        except Exception as e:
+            logger.warning(f"Model {model_id} error: {e}")
+            raise
+
+    def _aggregate(
+        self,
+        responses: list[CompletionResponse],
+        strategy: AggregationStrategy,
+    ) -> EnsembleResult:
+        """Aggregate responses based on strategy."""
+        if not responses:
+            raise ProviderError("No responses to aggregate", "ensemble")
+
+        # Calculate total cost and tokens
+        total_cost = sum(r.cost for r in responses)
+        total_tokens = TokenUsage()
+        for r in responses:
+            total_tokens = total_tokens + r.usage
+
+        # Calculate agreement score
+        agreement_score = self._calculate_agreement(responses)
+
+        # Select content based on strategy
+        if strategy == AggregationStrategy.FIRST_SUCCESS:
+            content, selected_model = self._first_success(responses)
+        elif strategy == AggregationStrategy.MAJORITY_VOTE:
+            content, selected_model = self._majority_vote(responses)
+        elif strategy == AggregationStrategy.CONFIDENCE_WEIGHTED:
+            content, selected_model = self._confidence_weighted(responses)
+        elif strategy == AggregationStrategy.BEST_QUALITY:
+            content, selected_model = self._best_quality(responses)
+        elif strategy == AggregationStrategy.CONCATENATE:
+            content, selected_model = self._concatenate(responses)
+        elif strategy == AggregationStrategy.CONSENSUS:
+            content, selected_model = self._consensus(responses, agreement_score)
+        else:
+            content, selected_model = self._first_success(responses)
+
+        return EnsembleResult(
+            content=content,
+            responses=responses,
+            agreement_score=agreement_score,
+            strategy=strategy,
+            selected_model=selected_model,
+            total_cost=total_cost,
+            total_tokens=total_tokens,
+            metadata={
+                "num_responses": len(responses),
+                "models_used": [r.model for r in responses],
+            },
+        )
+
+    def _calculate_agreement(self, responses: list[CompletionResponse]) -> float:
+        """Calculate agreement score between responses.
+
+        Uses simple similarity based on common words/tokens.
+        """
+        if len(responses) < 2:
+            return 1.0
+
+        # Tokenize responses (simple word-based)
+        response_tokens = []
+        for r in responses:
+            words = set(r.content.lower().split())
+            response_tokens.append(words)
+
+        # Calculate pairwise Jaccard similarity
+        similarities = []
+        for i in range(len(response_tokens)):
+            for j in range(i + 1, len(response_tokens)):
+                set_i = response_tokens[i]
+                set_j = response_tokens[j]
+
+                if not set_i and not set_j:
+                    similarities.append(1.0)
+                elif not set_i or not set_j:
+                    similarities.append(0.0)
+                else:
+                    intersection = len(set_i & set_j)
+                    union = len(set_i | set_j)
+                    similarities.append(intersection / union)
+
+        return sum(similarities) / len(similarities) if similarities else 1.0
+
+    def _first_success(
+        self, responses: list[CompletionResponse]
+    ) -> tuple[str, str | None]:
+        """Return the first successful response."""
+        r = responses[0]
+        return r.content, r.model
+
+    def _majority_vote(
+        self, responses: list[CompletionResponse]
+    ) -> tuple[str, str | None]:
+        """Return the most common response (by content similarity)."""
+        if len(responses) == 1:
+            return responses[0].content, responses[0].model
+
+        # Find response most similar to others
+        best_idx = 0
+        best_score = 0.0
+
+        for i, r in enumerate(responses):
+            score = 0.0
+            words_i = set(r.content.lower().split())
+
+            for j, other in enumerate(responses):
+                if i != j:
+                    words_j = set(other.content.lower().split())
+                    if words_i and words_j:
+                        intersection = len(words_i & words_j)
+                        union = len(words_i | words_j)
+                        score += intersection / union
+
+            if score > best_score:
+                best_score = score
+                best_idx = i
+
+        return responses[best_idx].content, responses[best_idx].model
+
+    def _confidence_weighted(
+        self, responses: list[CompletionResponse]
+    ) -> tuple[str, str | None]:
+        """Weight responses by model quality/confidence."""
+        if len(responses) == 1:
+            return responses[0].content, responses[0].model
+
+        # Score each response by model quality
+        scored = []
+        for r in responses:
+            quality = self.MODEL_QUALITY_TIERS.get(r.model, 0.5)
+            scored.append((quality, r))
+
+        # Sort by quality
+        scored.sort(key=lambda x: x[0], reverse=True)
+
+        # Return highest quality response
+        best = scored[0][1]
+        return best.content, best.model
+
+    def _best_quality(
+        self, responses: list[CompletionResponse]
+    ) -> tuple[str, str | None]:
+        """Return response from highest quality model."""
+        best_quality = 0.0
+        best_response = responses[0]
+
+        for r in responses:
+            quality = self.MODEL_QUALITY_TIERS.get(r.model, 0.5)
+            if quality > best_quality:
+                best_quality = quality
+                best_response = r
+
+        return best_response.content, best_response.model
+
+    def _concatenate(
+        self, responses: list[CompletionResponse]
+    ) -> tuple[str, str | None]:
+        """Concatenate all responses."""
+        parts = []
+        models = []
+
+        for r in responses:
+            parts.append(f"[{r.model}]:\n{r.content}")
+            models.append(r.model)
+
+        content = "\n\n---\n\n".join(parts)
+        return content, None  # No single model selected
+
+    def _consensus(
+        self,
+        responses: list[CompletionResponse],
+        agreement_score: float,
+    ) -> tuple[str, str | None]:
+        """Return result only if models agree (high agreement score)."""
+        if agreement_score < 0.5:
+            # Low agreement, return best quality with warning
+            content, model = self._best_quality(responses)
+            return f"[LOW CONSENSUS - {agreement_score:.2f}]\n{content}", model
+
+        # Good agreement, return majority vote
+        return self._majority_vote(responses)
+
+    async def compare(
+        self,
+        messages: list[dict[str, Any]],
+        models: list[str] | None = None,
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """Compare responses from multiple models side-by-side.
+
+        Args:
+            messages: List of message dicts
+            models: List of model IDs to compare
+            **kwargs: Additional completion parameters
+
+        Returns:
+            Dictionary with comparison data
+        """
+        result = await self.run(
+            messages,
+            models,
+            strategy=AggregationStrategy.CONCATENATE,
+            **kwargs,
+        )
+
+        # Build comparison
+        comparison = {
+            "responses": [],
+            "agreement_score": result.agreement_score,
+            "total_cost": result.total_cost,
+            "total_tokens": {
+                "prompt": result.total_tokens.prompt_tokens,
+                "completion": result.total_tokens.completion_tokens,
+                "total": result.total_tokens.total_tokens,
+            },
+        }
+
+        for r in result.responses:
+            comparison["responses"].append({
+                "model": r.model,
+                "provider": r.provider,
+                "content": r.content,
+                "cost": r.cost,
+                "latency_ms": r.latency_ms,
+                "tokens": {
+                    "prompt": r.usage.prompt_tokens,
+                    "completion": r.usage.completion_tokens,
+                },
+                "quality_tier": self.MODEL_QUALITY_TIERS.get(r.model, 0.5),
+            })
+
+        return comparison
+
+    async def debate(
+        self,
+        messages: list[dict[str, Any]],
+        models: list[str] | None = None,
+        rounds: int = 2,
+        **kwargs: Any,
+    ) -> EnsembleResult:
+        """Run a debate between models where they can respond to each other.
+
+        Args:
+            messages: Initial messages
+            models: Models to participate in debate
+            rounds: Number of debate rounds
+            **kwargs: Additional completion parameters
+
+        Returns:
+            Final ensemble result with debate history
+        """
+        models_to_use = models or self.default_models[:2]  # Default to 2 models
+
+        if len(models_to_use) < 2:
+            raise ProviderError("Debate requires at least 2 models", "ensemble")
+
+        all_responses: list[CompletionResponse] = []
+        debate_history: list[dict[str, Any]] = []
+        current_messages = messages.copy()
+
+        for round_num in range(rounds):
+            round_responses = []
+
+            for model_id in models_to_use:
+                provider = self.router.get_provider_for_model(model_id)
+                if not provider:
+                    continue
+
+                try:
+                    response = await asyncio.wait_for(
+                        provider.complete(current_messages, model_id, **kwargs),
+                        timeout=self.timeout,
+                    )
+                    round_responses.append(response)
+                    all_responses.append(response)
+
+                    debate_history.append({
+                        "round": round_num + 1,
+                        "model": model_id,
+                        "content": response.content,
+                    })
+
+                except Exception as e:
+                    logger.warning(f"Model {model_id} failed in round {round_num + 1}: {e}")
+
+            # Add responses to messages for next round
+            if round_responses and round_num < rounds - 1:
+                for r in round_responses:
+                    current_messages.append({
+                        "role": "assistant",
+                        "content": f"[{r.model}]: {r.content}",
+                    })
+
+                # Ask for follow-up
+                current_messages.append({
+                    "role": "user",
+                    "content": "Consider the other perspectives and refine your answer.",
+                })
+
+        # Aggregate final round responses
+        final_responses = all_responses[-len(models_to_use):]
+        result = self._aggregate(final_responses, AggregationStrategy.CONFIDENCE_WEIGHTED)
+
+        # Add debate history to metadata
+        result.metadata["debate_history"] = debate_history
+        result.metadata["total_rounds"] = rounds
+
+        return result
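
A short usage sketch of the ensemble above. SmartModelRouter's constructor lives in router.py, which is not part of this diff, so `router` here is assumed to be an already-initialized instance; the model IDs come from the quality-tier table.

    import asyncio

    from app.models import ModelEnsemble, AggregationStrategy

    async def ask(router) -> None:
        ensemble = ModelEnsemble(
            router,
            default_models=[
                "gpt-4o-mini",
                "claude-3-5-haiku-20241022",
                "gemini-1.5-flash",
            ],
        )
        result = await ensemble.run(
            messages=[{"role": "user", "content": "Is 2**31 - 1 prime?"}],
            strategy=AggregationStrategy.CONSENSUS,
            min_responses=2,  # tolerate one provider failing or timing out
        )
        print(result.selected_model, result.agreement_score, result.total_cost)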
backend/app/models/providers/__init__.py ADDED
@@ -0,0 +1,31 @@
+"""LLM Providers - Multiple provider implementations for model routing."""
+
+from app.models.providers.base import (
+    BaseProvider,
+    ProviderError,
+    RateLimitError,
+    ModelNotFoundError,
+    CompletionResponse,
+    ModelInfo,
+    TokenUsage,
+)
+from app.models.providers.openai import OpenAIProvider
+from app.models.providers.anthropic import AnthropicProvider
+from app.models.providers.google import GoogleProvider
+from app.models.providers.groq import GroqProvider
+
+__all__ = [
+    # Base
+    "BaseProvider",
+    "ProviderError",
+    "RateLimitError",
+    "ModelNotFoundError",
+    "CompletionResponse",
+    "ModelInfo",
+    "TokenUsage",
+    # Providers
+    "OpenAIProvider",
+    "AnthropicProvider",
+    "GoogleProvider",
+    "GroqProvider",
+]
backend/app/models/providers/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (824 Bytes).
 
backend/app/models/providers/__pycache__/anthropic.cpython-314.pyc ADDED
Binary file (17.7 kB).
 
backend/app/models/providers/__pycache__/base.cpython-314.pyc ADDED
Binary file (22.1 kB).
 
backend/app/models/providers/__pycache__/google.cpython-314.pyc ADDED
Binary file (18 kB).
 
backend/app/models/providers/__pycache__/groq.cpython-314.pyc ADDED
Binary file (14.4 kB).
 
backend/app/models/providers/__pycache__/openai.cpython-314.pyc ADDED
Binary file (14.9 kB).
 
backend/app/models/providers/anthropic.py ADDED
@@ -0,0 +1,413 @@
+"""Anthropic provider implementation."""
+
+import json
+import time
+from typing import Any, AsyncIterator
+
+import httpx
+
+from app.models.providers.base import (
+    AuthenticationError,
+    BaseProvider,
+    CompletionResponse,
+    ModelInfo,
+    ModelNotFoundError,
+    ProviderError,
+    RateLimitError,
+    TokenUsage,
+)
+
+
+class AnthropicProvider(BaseProvider):
+    """Anthropic API provider supporting Claude models."""
+
+    PROVIDER_NAME = "anthropic"
+    DEFAULT_BASE_URL = "https://api.anthropic.com/v1"
+    API_VERSION = "2023-06-01"
+
+    # Model definitions with pricing (per 1K tokens)
+    MODELS = {
+        "claude-3-opus-20240229": ModelInfo(
+            id="claude-3-opus-20240229",
+            name="Claude 3 Opus",
+            provider="anthropic",
+            context_window=200000,
+            max_output_tokens=4096,
+            supports_functions=True,
+            supports_vision=True,
+            supports_streaming=True,
+            cost_per_1k_input=0.015,
+            cost_per_1k_output=0.075,
+        ),
+        "claude-3-sonnet-20240229": ModelInfo(
+            id="claude-3-sonnet-20240229",
+            name="Claude 3 Sonnet",
+            provider="anthropic",
+            context_window=200000,
+            max_output_tokens=4096,
+            supports_functions=True,
+            supports_vision=True,
+            supports_streaming=True,
+            cost_per_1k_input=0.003,
+            cost_per_1k_output=0.015,
+        ),
+        "claude-3-5-sonnet-20241022": ModelInfo(
+            id="claude-3-5-sonnet-20241022",
+            name="Claude 3.5 Sonnet",
+            provider="anthropic",
+            context_window=200000,
+            max_output_tokens=8192,
+            supports_functions=True,
+            supports_vision=True,
+            supports_streaming=True,
+            cost_per_1k_input=0.003,
+            cost_per_1k_output=0.015,
+        ),
+        "claude-3-haiku-20240307": ModelInfo(
+            id="claude-3-haiku-20240307",
+            name="Claude 3 Haiku",
+            provider="anthropic",
+            context_window=200000,
+            max_output_tokens=4096,
+            supports_functions=True,
+            supports_vision=True,
+            supports_streaming=True,
+            cost_per_1k_input=0.00025,
+            cost_per_1k_output=0.00125,
+        ),
+        "claude-3-5-haiku-20241022": ModelInfo(
+            id="claude-3-5-haiku-20241022",
+            name="Claude 3.5 Haiku",
+            provider="anthropic",
+            context_window=200000,
+            max_output_tokens=8192,
+            supports_functions=True,
+            supports_vision=True,
+            supports_streaming=True,
+            cost_per_1k_input=0.001,
+            cost_per_1k_output=0.005,
+        ),
+    }
+
+    # Aliases for convenience
+    MODEL_ALIASES = {
+        "claude-3-opus": "claude-3-opus-20240229",
+        "claude-3-sonnet": "claude-3-sonnet-20240229",
+        "claude-3.5-sonnet": "claude-3-5-sonnet-20241022",
+        "claude-3-haiku": "claude-3-haiku-20240307",
+        "claude-3.5-haiku": "claude-3-5-haiku-20241022",
+    }
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str | None = None,
+        timeout: float = 60.0,
+        max_retries: int = 3,
+        rate_limit_rpm: int = 60,
+    ):
+        super().__init__(
+            api_key=api_key,
+            base_url=base_url or self.DEFAULT_BASE_URL,
+            timeout=timeout,
+            max_retries=max_retries,
+            rate_limit_rpm=rate_limit_rpm,
+        )
+        self._client: httpx.AsyncClient | None = None
+
+    async def initialize(self) -> None:
+        """Initialize the HTTP client."""
+        self._client = httpx.AsyncClient(
+            base_url=self.base_url,
+            headers={
+                "x-api-key": self.api_key,
+                "anthropic-version": self.API_VERSION,
+                "Content-Type": "application/json",
+            },
+            timeout=self.timeout,
+        )
+
+    async def shutdown(self) -> None:
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    async def _ensure_client(self) -> httpx.AsyncClient:
+        """Ensure client is initialized."""
+        if not self._client:
+            await self.initialize()
+        return self._client  # type: ignore
+
+    def _resolve_model(self, model: str) -> str:
+        """Resolve model alias to full model ID."""
+        return self.MODEL_ALIASES.get(model, model)
+
+    def get_models(self) -> list[ModelInfo]:
+        """Get available Anthropic models."""
+        return list(self.MODELS.values())
+
+    def _convert_messages(
+        self, messages: list[dict[str, Any]]
+    ) -> tuple[str | None, list[dict[str, Any]]]:
+        """Convert OpenAI-style messages to Anthropic format.
+
+        Returns:
+            Tuple of (system_message, converted_messages)
+        """
+        system_message: str | None = None
+        converted: list[dict[str, Any]] = []
+
+        for msg in messages:
+            role = msg["role"]
+            content = msg["content"]
+
+            if role == "system":
+                system_message = content
+            elif role == "assistant":
+                converted.append({"role": "assistant", "content": content})
+            elif role == "user":
+                converted.append({"role": "user", "content": content})
+            elif role == "function":
+                # Convert function result to user message
+                converted.append({
+                    "role": "user",
+                    "content": f"Function result for {msg.get('name', 'function')}: {content}",
+                })
+            elif role == "tool":
+                # Convert tool result
+                converted.append({
+                    "role": "user",
+                    "content": [{
+                        "type": "tool_result",
+                        "tool_use_id": msg.get("tool_call_id", ""),
+                        "content": content,
+                    }],
+                })
+
+        return system_message, converted
+
+    def _convert_tools(
+        self, tools: list[dict[str, Any]] | None
+    ) -> list[dict[str, Any]] | None:
+        """Convert OpenAI-style tools to Anthropic format."""
+        if not tools:
+            return None
+
+        converted = []
+        for tool in tools:
+            if tool.get("type") == "function":
+                func = tool["function"]
+                converted.append({
+                    "name": func["name"],
+                    "description": func.get("description", ""),
+                    "input_schema": func.get("parameters", {"type": "object", "properties": {}}),
+                })
+        return converted if converted else None
+
+    async def complete(
+        self,
+        messages: list[dict[str, Any]],
+        model: str,
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        function_call: str | dict[str, str] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        stop: list[str] | None = None,
+        **kwargs: Any,
+    ) -> CompletionResponse:
+        """Generate a completion using Anthropic API."""
+        await self._acquire_rate_limit()
+
+        model = self._resolve_model(model)
+        model_info = self.get_model_info(model)
+        if not model_info:
+            raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+        client = await self._ensure_client()
+
+        # Convert messages
+        system_message, converted_messages = self._convert_messages(messages)
+
+        # Build request payload
+        payload: dict[str, Any] = {
+            "model": model,
+            "messages": converted_messages,
+            "max_tokens": max_tokens or model_info.max_output_tokens,
+        }
+
+        if system_message:
+            payload["system"] = system_message
+
+        if temperature is not None:
+            payload["temperature"] = temperature
+
+        if stop:
+            payload["stop_sequences"] = stop
+
+        # Convert tools (prefer tools over functions)
+        anthropic_tools = self._convert_tools(tools)
+        if not anthropic_tools and functions:
+            # Convert legacy functions format
+            anthropic_tools = [
+                {
+                    "name": f["name"],
+                    "description": f.get("description", ""),
+                    "input_schema": f.get("parameters", {"type": "object", "properties": {}}),
+                }
+                for f in functions
+            ]
+
+        if anthropic_tools:
+            payload["tools"] = anthropic_tools
+
+            # Handle tool choice (only valid when tools are present;
+            # the API rejects tool_choice without tools)
+            if tool_choice == "auto" or tool_choice is None:
+                payload["tool_choice"] = {"type": "auto"}
+            elif tool_choice == "required":
+                payload["tool_choice"] = {"type": "any"}
+            elif isinstance(tool_choice, dict) and "function" in tool_choice:
+                payload["tool_choice"] = {"type": "tool", "name": tool_choice["function"]["name"]}
+
+        start_time = time.time()
+
+        try:
+            response = await self._retry_with_backoff(
+                self._make_request, client, payload
+            )
+        except httpx.HTTPStatusError as e:
+            self._handle_http_error(e)
+
+        latency_ms = (time.time() - start_time) * 1000
+
+        # Parse response
+        content_blocks = response.get("content", [])
+        usage_data = response.get("usage", {})
+
+        # Extract text content and tool uses
+        text_content = ""
+        tool_calls = []
+
+        for block in content_blocks:
+            if block["type"] == "text":
+                text_content += block["text"]
+            elif block["type"] == "tool_use":
+                tool_calls.append({
+                    "id": block["id"],
+                    "type": "function",
+                    "function": {
+                        "name": block["name"],
+                        "arguments": json.dumps(block["input"]),
+                    },
+                })
+
+        usage = TokenUsage(
+            prompt_tokens=usage_data.get("input_tokens", 0),
+            completion_tokens=usage_data.get("output_tokens", 0),
+            total_tokens=usage_data.get("input_tokens", 0) + usage_data.get("output_tokens", 0),
+        )
+
+        cost = self.calculate_cost(model, usage)
+        self._track_usage(usage, cost)
+
+        return CompletionResponse(
+            content=text_content,
+            model=response.get("model", model),
+            provider=self.PROVIDER_NAME,
+            usage=usage,
+            finish_reason=response.get("stop_reason"),
+            function_call=None,
+            tool_calls=tool_calls if tool_calls else None,
+            raw_response=response,
+            latency_ms=latency_ms,
+            cost=cost,
+        )
+
+    async def _make_request(
+        self, client: httpx.AsyncClient, payload: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Make the API request."""
+        response = await client.post("/messages", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def _handle_http_error(self, error: httpx.HTTPStatusError) -> None:
+        """Handle HTTP errors from Anthropic."""
+        status = error.response.status_code
+        try:
+            body = error.response.json()
+            message = body.get("error", {}).get("message", str(error))
+        except Exception:
+            message = str(error)
+
+        if status == 401:
+            raise AuthenticationError(self.PROVIDER_NAME, message)
+        elif status == 429:
+            retry_after = error.response.headers.get("retry-after")
+            raise RateLimitError(
+                self.PROVIDER_NAME,
+                retry_after=float(retry_after) if retry_after else None,
+                message=message,
+            )
+        elif status == 404:
+            raise ModelNotFoundError(self.PROVIDER_NAME, "unknown")
+        else:
+            raise ProviderError(message, self.PROVIDER_NAME, status)
+
+    async def stream(
+        self,
+        messages: list[dict[str, Any]],
+        model: str,
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[str]:
+        """Stream a completion from Anthropic."""
+        await self._acquire_rate_limit()
+
+        model = self._resolve_model(model)
+        model_info = self.get_model_info(model)
+        if not model_info:
+            raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+        client = await self._ensure_client()
+
+        system_message, converted_messages = self._convert_messages(messages)
+
+        payload: dict[str, Any] = {
+            "model": model,
+            "messages": converted_messages,
+            "max_tokens": max_tokens or model_info.max_output_tokens,
+            "stream": True,
+        }
+
+        if system_message:
+            payload["system"] = system_message
+
+        if temperature is not None:
+            payload["temperature"] = temperature
+
+        try:
+            async with client.stream("POST", "/messages", json=payload) as response:
+                response.raise_for_status()
+
+                async for line in response.aiter_lines():
+                    if line.startswith("data: "):
+                        data = line[6:]
+
+                        try:
+                            event = json.loads(data)
+                            event_type = event.get("type")
+
+                            if event_type == "content_block_delta":
+                                delta = event.get("delta", {})
+                                if delta.get("type") == "text_delta":
+                                    yield delta.get("text", "")
+
+                        except json.JSONDecodeError:
+                            continue
+
+        except httpx.HTTPStatusError as e:
+            self._handle_http_error(e)
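
Two behaviors of this provider worth seeing concretely: _convert_messages hoists the system turn into Anthropic's top-level system field, and _resolve_model maps short aliases to pinned snapshots. A sketch with a placeholder key (both helpers are synchronous, so no event loop is needed):

    from app.models.providers import AnthropicProvider

    provider = AnthropicProvider(api_key="sk-ant-...")  # placeholder key
    system, converted = provider._convert_messages([
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hi"},
    ])
    # system == "You are terse."
    # converted == [{"role": "user", "content": "Hi"}]
    assert provider._resolve_model("claude-3.5-sonnet") == "claude-3-5-sonnet-20241022"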
backend/app/models/providers/base.py ADDED
@@ -0,0 +1,374 @@
+"""Base provider abstract class and common types."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any, AsyncIterator, Callable
+import asyncio
+import time
+
+
+class ProviderError(Exception):
+    """Base exception for provider errors."""
+
+    def __init__(self, message: str, provider: str, status_code: int | None = None):
+        self.message = message
+        self.provider = provider
+        self.status_code = status_code
+        super().__init__(f"[{provider}] {message}")
+
+
+class RateLimitError(ProviderError):
+    """Rate limit exceeded error."""
+
+    def __init__(
+        self,
+        provider: str,
+        retry_after: float | None = None,
+        message: str = "Rate limit exceeded",
+    ):
+        self.retry_after = retry_after
+        super().__init__(message, provider, status_code=429)
+
+
+class ModelNotFoundError(ProviderError):
+    """Model not found or not available error."""
+
+    def __init__(self, provider: str, model: str):
+        super().__init__(f"Model '{model}' not found", provider, status_code=404)
+
+
+class AuthenticationError(ProviderError):
+    """Authentication failed error."""
+
+    def __init__(self, provider: str, message: str = "Authentication failed"):
+        super().__init__(message, provider, status_code=401)
+
+
+@dataclass
+class TokenUsage:
+    """Token usage tracking."""
+
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_tokens: int = 0
+
+    def __add__(self, other: "TokenUsage") -> "TokenUsage":
+        return TokenUsage(
+            prompt_tokens=self.prompt_tokens + other.prompt_tokens,
+            completion_tokens=self.completion_tokens + other.completion_tokens,
+            total_tokens=self.total_tokens + other.total_tokens,
+        )
+
+
+@dataclass
+class CompletionResponse:
+    """Standardized completion response across providers."""
+
+    content: str
+    model: str
+    provider: str
+    usage: TokenUsage
+    finish_reason: str | None = None
+    function_call: dict[str, Any] | None = None
+    tool_calls: list[dict[str, Any]] | None = None
+    raw_response: dict[str, Any] | None = None
+    latency_ms: float = 0.0
+    cost: float = 0.0
+    timestamp: datetime = field(default_factory=datetime.utcnow)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert response to dictionary."""
+        return {
+            "content": self.content,
+            "model": self.model,
+            "provider": self.provider,
+            "usage": {
+                "prompt_tokens": self.usage.prompt_tokens,
+                "completion_tokens": self.usage.completion_tokens,
+                "total_tokens": self.usage.total_tokens,
+            },
+            "finish_reason": self.finish_reason,
+            "function_call": self.function_call,
+            "tool_calls": self.tool_calls,
+            "latency_ms": self.latency_ms,
+            "cost": self.cost,
+            "timestamp": self.timestamp.isoformat(),
+        }
+
+
+@dataclass
+class ModelInfo:
+    """Model information and capabilities."""
+
+    id: str
+    name: str
+    provider: str
+    context_window: int
+    max_output_tokens: int
+    supports_functions: bool = False
+    supports_vision: bool = False
+    supports_streaming: bool = True
+    cost_per_1k_input: float = 0.0
+    cost_per_1k_output: float = 0.0
+
+    @property
+    def cost_per_million_input(self) -> float:
+        """Cost per million input tokens."""
+        return self.cost_per_1k_input * 1000
+
+    @property
+    def cost_per_million_output(self) -> float:
+        """Cost per million output tokens."""
+        return self.cost_per_1k_output * 1000
+
+
+class TaskType(str, Enum):
+    """Types of tasks for model routing."""
+
+    GENERAL = "general"
+    CODE = "code"
+    REASONING = "reasoning"
+    EXTRACTION = "extraction"
+    SUMMARIZATION = "summarization"
+    CLASSIFICATION = "classification"
+    CREATIVE = "creative"
+    FAST = "fast"
+
+
+@dataclass
+class RateLimitState:
+    """Rate limiter state."""
+
+    tokens: float
+    last_update: float
+    max_tokens: float
+    refill_rate: float  # tokens per second
+
+
+class BaseProvider(ABC):
+    """Abstract base class for LLM providers."""
+
+    PROVIDER_NAME: str = "base"
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str | None = None,
+        timeout: float = 60.0,
+        max_retries: int = 3,
+        rate_limit_rpm: int = 60,
+    ):
+        self.api_key = api_key
+        self.base_url = base_url
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+        # Rate limiting (token bucket)
+        self._rate_limit = RateLimitState(
+            tokens=rate_limit_rpm,
+            last_update=time.time(),
+            max_tokens=rate_limit_rpm,
+            refill_rate=rate_limit_rpm / 60.0,
+        )
+        self._rate_limit_lock = asyncio.Lock()
+
+        # Usage tracking
+        self._total_usage = TokenUsage()
+        self._total_cost: float = 0.0
+        self._request_count: int = 0
+
+    @abstractmethod
+    async def complete(
+        self,
+        messages: list[dict[str, Any]],
+        model: str,
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        function_call: str | dict[str, str] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        stop: list[str] | None = None,
+        **kwargs: Any,
+    ) -> CompletionResponse:
+        """Generate a completion from the model.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'
+            model: Model identifier
+            temperature: Sampling temperature (0-2)
+            max_tokens: Maximum tokens to generate
+            functions: Function definitions for function calling
+            function_call: Function call mode or specific function
+            tools: Tool definitions (newer format)
+            tool_choice: Tool choice mode or specific tool
+            stop: Stop sequences
+            **kwargs: Additional provider-specific parameters
+
+        Returns:
+            CompletionResponse with generated content and metadata
+        """
+        ...
+
+    @abstractmethod
+    async def stream(
+        self,
+        messages: list[dict[str, Any]],
+        model: str,
+        temperature: float = 0.7,
+        max_tokens: int | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterator[str]:
+        """Stream a completion from the model.
+
+        Args:
+            messages: List of message dicts
+            model: Model identifier
+            temperature: Sampling temperature
+            max_tokens: Maximum tokens to generate
+            **kwargs: Additional parameters
+
+        Yields:
+            Content chunks as they arrive
+        """
+        ...
+
+    @abstractmethod
+    def get_models(self) -> list[ModelInfo]:
+        """Get list of available models from this provider.
+
+        Returns:
+            List of ModelInfo objects
+        """
+        ...
+
+    def get_model_info(self, model_id: str) -> ModelInfo | None:
+        """Get info for a specific model.
+
+        Args:
+            model_id: Model identifier
+
+        Returns:
+            ModelInfo or None if not found
+        """
+        for model in self.get_models():
+            if model.id == model_id:
+                return model
+        return None
+
+    def calculate_cost(self, model: str, usage: TokenUsage) -> float:
+        """Calculate cost for a completion.
+
+        Args:
+            model: Model identifier
+            usage: Token usage
+
+        Returns:
+            Cost in USD
+        """
+        model_info = self.get_model_info(model)
+        if not model_info:
+            return 0.0
+
+        input_cost = (usage.prompt_tokens / 1000) * model_info.cost_per_1k_input
+        output_cost = (usage.completion_tokens / 1000) * model_info.cost_per_1k_output
+        return input_cost + output_cost
+
+    async def _acquire_rate_limit(self) -> None:
+        """Acquire a token from the rate limiter."""
+        async with self._rate_limit_lock:
+            now = time.time()
+            elapsed = now - self._rate_limit.last_update
+
+            # Refill tokens
+            self._rate_limit.tokens = min(
+                self._rate_limit.max_tokens,
+                self._rate_limit.tokens + elapsed * self._rate_limit.refill_rate,
+            )
+            self._rate_limit.last_update = now
+
+            if self._rate_limit.tokens < 1:
+                # Calculate wait time
+                wait_time = (1 - self._rate_limit.tokens) / self._rate_limit.refill_rate
+                await asyncio.sleep(wait_time)
+                self._rate_limit.tokens = 0
+            else:
+                self._rate_limit.tokens -= 1
+
+    def _track_usage(self, usage: TokenUsage, cost: float) -> None:
+        """Track usage and cost."""
+        self._total_usage = self._total_usage + usage
+        self._total_cost += cost
+        self._request_count += 1
+
+    @property
+    def total_usage(self) -> TokenUsage:
+        """Get total token usage."""
+        return self._total_usage
+
+    @property
+    def total_cost(self) -> float:
+        """Get total cost in USD."""
+        return self._total_cost
+
+    @property
+    def request_count(self) -> int:
+        """Get total request count."""
+        return self._request_count
+
+    def reset_tracking(self) -> None:
+        """Reset usage tracking."""
+        self._total_usage = TokenUsage()
+        self._total_cost = 0.0
+        self._request_count = 0
+
+    async def _retry_with_backoff(
+        self,
+        func: Callable,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        """Retry a function with exponential backoff.
+
+        Args:
+            func: Async function to retry
+            *args: Positional arguments
+            **kwargs: Keyword arguments
+
+        Returns:
+            Function result
+
+        Raises:
+            Last exception if all retries fail
+        """
+        last_exception: Exception | None = None
+
+        for attempt in range(self.max_retries):
+            try:
+                return await func(*args, **kwargs)
+            except RateLimitError as e:
+                last_exception = e
+                wait_time = e.retry_after or (2**attempt)
+                await asyncio.sleep(wait_time)
+            except ProviderError as e:
+                # Don't retry auth or not found errors
+                if e.status_code in (401, 403, 404):
+                    raise
+                last_exception = e
+                await asyncio.sleep(2**attempt)
+
+        if last_exception:
+            raise last_exception
+
+    async def initialize(self) -> None:
+        """Initialize the provider (optional setup)."""
+        pass
+
+    async def shutdown(self) -> None:
+        """Cleanup resources."""
+        pass
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(requests={self._request_count}, cost=${self._total_cost:.4f})"
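
calculate_cost is linear in token counts: (prompt_tokens / 1000) * cost_per_1k_input + (completion_tokens / 1000) * cost_per_1k_output. A worked sketch against the Claude 3.5 Sonnet pricing declared in anthropic.py above ($0.003 input / $0.015 output per 1K tokens):

    from app.models.providers import AnthropicProvider, TokenUsage

    provider = AnthropicProvider(api_key="sk-ant-...")  # placeholder key
    usage = TokenUsage(prompt_tokens=2000, completion_tokens=500, total_tokens=2500)
    # (2000 / 1000) * 0.003 + (500 / 1000) * 0.015 = 0.006 + 0.0075
    cost = provider.calculate_cost("claude-3-5-sonnet-20241022", usage)
    assert abs(cost - 0.0135) < 1e-9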
backend/app/models/providers/google.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Google AI provider implementation (Gemini models)."""
2
+
3
+ import json
4
+ import time
5
+ from typing import Any, AsyncIterator
6
+
7
+ import httpx
8
+
9
+ from app.models.providers.base import (
10
+ AuthenticationError,
11
+ BaseProvider,
12
+ CompletionResponse,
13
+ ModelInfo,
14
+ ModelNotFoundError,
15
+ ProviderError,
16
+ RateLimitError,
17
+ TokenUsage,
18
+ )
19
+
20
+
21
+ class GoogleProvider(BaseProvider):
22
+ """Google AI API provider supporting Gemini models."""
23
+
24
+ PROVIDER_NAME = "google"
25
+ DEFAULT_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
26
+
27
+ # Model definitions with pricing (per 1K tokens)
28
+ MODELS = {
29
+ "gemini-1.5-pro": ModelInfo(
30
+ id="gemini-1.5-pro",
31
+ name="Gemini 1.5 Pro",
32
+ provider="google",
33
+ context_window=2097152,
34
+ max_output_tokens=8192,
35
+ supports_functions=True,
36
+ supports_vision=True,
37
+ supports_streaming=True,
38
+ cost_per_1k_input=0.00125,
39
+ cost_per_1k_output=0.005,
40
+ ),
41
+ "gemini-1.5-flash": ModelInfo(
42
+ id="gemini-1.5-flash",
43
+ name="Gemini 1.5 Flash",
44
+ provider="google",
45
+ context_window=1048576,
46
+ max_output_tokens=8192,
47
+ supports_functions=True,
48
+ supports_vision=True,
49
+ supports_streaming=True,
50
+ cost_per_1k_input=0.000075,
51
+ cost_per_1k_output=0.0003,
52
+ ),
53
+ "gemini-2.0-flash-exp": ModelInfo(
54
+ id="gemini-2.0-flash-exp",
55
+ name="Gemini 2.0 Flash (Experimental)",
56
+ provider="google",
57
+ context_window=1048576,
58
+ max_output_tokens=8192,
59
+ supports_functions=True,
60
+ supports_vision=True,
61
+ supports_streaming=True,
62
+ cost_per_1k_input=0.0,
63
+ cost_per_1k_output=0.0,
64
+ ),
65
+ "gemini-pro": ModelInfo(
66
+ id="gemini-pro",
67
+ name="Gemini Pro",
68
+ provider="google",
69
+ context_window=32760,
70
+ max_output_tokens=8192,
71
+ supports_functions=True,
72
+ supports_vision=False,
73
+ supports_streaming=True,
74
+ cost_per_1k_input=0.0005,
75
+ cost_per_1k_output=0.0015,
76
+ ),
77
+ }
78
+
79
+ # Aliases
80
+ MODEL_ALIASES = {
81
+ "gemini-flash": "gemini-1.5-flash",
82
+ "gemini-1.5": "gemini-1.5-pro",
83
+ }
84
+
85
+ def __init__(
86
+ self,
87
+ api_key: str,
88
+ base_url: str | None = None,
89
+ timeout: float = 60.0,
90
+ max_retries: int = 3,
91
+ rate_limit_rpm: int = 60,
92
+ ):
93
+ super().__init__(
94
+ api_key=api_key,
95
+ base_url=base_url or self.DEFAULT_BASE_URL,
96
+ timeout=timeout,
97
+ max_retries=max_retries,
98
+ rate_limit_rpm=rate_limit_rpm,
99
+ )
100
+ self._client: httpx.AsyncClient | None = None
101
+
102
+ async def initialize(self) -> None:
103
+ """Initialize the HTTP client."""
104
+ self._client = httpx.AsyncClient(
105
+ base_url=self.base_url,
106
+ headers={"Content-Type": "application/json"},
107
+ timeout=self.timeout,
108
+ )
109
+
110
+ async def shutdown(self) -> None:
111
+ """Close the HTTP client."""
112
+ if self._client:
113
+ await self._client.aclose()
114
+ self._client = None
115
+
116
+ async def _ensure_client(self) -> httpx.AsyncClient:
117
+ """Ensure client is initialized."""
118
+ if not self._client:
119
+ await self.initialize()
120
+ return self._client # type: ignore
121
+
122
+ def _resolve_model(self, model: str) -> str:
123
+ """Resolve model alias to full model ID."""
124
+ return self.MODEL_ALIASES.get(model, model)
125
+
126
+ def get_models(self) -> list[ModelInfo]:
127
+ """Get available Google AI models."""
128
+ return list(self.MODELS.values())
129
+
130
+ def _convert_messages(
131
+ self, messages: list[dict[str, Any]]
132
+ ) -> tuple[str | None, list[dict[str, Any]]]:
133
+ """Convert OpenAI-style messages to Gemini format.
134
+
135
+ Returns:
136
+ Tuple of (system_instruction, contents)
137
+ """
138
+ system_instruction: str | None = None
139
+ contents: list[dict[str, Any]] = []
140
+
141
+ for msg in messages:
142
+ role = msg["role"]
143
+ content = msg["content"]
144
+
145
+ if role == "system":
146
+ system_instruction = content
147
+ elif role == "assistant":
148
+ contents.append({
149
+ "role": "model",
150
+ "parts": [{"text": content}] if isinstance(content, str) else content,
151
+ })
152
+ elif role == "user":
153
+ contents.append({
154
+ "role": "user",
155
+ "parts": [{"text": content}] if isinstance(content, str) else content,
156
+ })
157
+ elif role == "function":
158
+ # Function response
159
+ contents.append({
160
+ "role": "function",
161
+ "parts": [{
162
+ "functionResponse": {
163
+ "name": msg.get("name", "function"),
164
+ "response": {"result": content},
165
+ }
166
+ }],
167
+ })
168
+ elif role == "tool":
169
+ # Tool response
170
+ contents.append({
171
+ "role": "function",
172
+ "parts": [{
173
+ "functionResponse": {
174
+ "name": msg.get("tool_call_id", "tool"),
175
+ "response": {"result": content},
176
+ }
177
+ }],
178
+ })
179
+
180
+ return system_instruction, contents
181
+
182
+ def _convert_tools(
183
+ self, tools: list[dict[str, Any]] | None
184
+ ) -> list[dict[str, Any]] | None:
185
+ """Convert OpenAI-style tools to Gemini format."""
186
+ if not tools:
187
+ return None
188
+
189
+ function_declarations = []
190
+ for tool in tools:
191
+ if tool.get("type") == "function":
192
+ func = tool["function"]
193
+ function_declarations.append({
194
+ "name": func["name"],
195
+ "description": func.get("description", ""),
196
+ "parameters": func.get("parameters", {"type": "object", "properties": {}}),
197
+ })
198
+
199
+ return [{"functionDeclarations": function_declarations}] if function_declarations else None
200
+
201
+ async def complete(
202
+ self,
203
+ messages: list[dict[str, Any]],
204
+ model: str,
205
+ temperature: float = 0.7,
206
+ max_tokens: int | None = None,
207
+ functions: list[dict[str, Any]] | None = None,
208
+ function_call: str | dict[str, str] | None = None,
209
+ tools: list[dict[str, Any]] | None = None,
210
+ tool_choice: str | dict[str, Any] | None = None,
211
+ stop: list[str] | None = None,
212
+ **kwargs: Any,
213
+ ) -> CompletionResponse:
214
+ """Generate a completion using Google AI API."""
215
+ await self._acquire_rate_limit()
216
+
217
+ model = self._resolve_model(model)
218
+ model_info = self.get_model_info(model)
219
+ if not model_info:
220
+ raise ModelNotFoundError(self.PROVIDER_NAME, model)
221
+
222
+ client = await self._ensure_client()
223
+
224
+ # Convert messages
225
+ system_instruction, contents = self._convert_messages(messages)
226
+
227
+ # Build request payload
228
+ payload: dict[str, Any] = {
229
+ "contents": contents,
230
+ "generationConfig": {
231
+ "temperature": temperature,
232
+ },
233
+ }
234
+
235
+ if max_tokens:
236
+ payload["generationConfig"]["maxOutputTokens"] = max_tokens
237
+
238
+ if stop:
239
+ payload["generationConfig"]["stopSequences"] = stop
240
+
241
+ if system_instruction:
242
+ payload["systemInstruction"] = {"parts": [{"text": system_instruction}]}
243
+
244
+ # Convert tools
245
+ gemini_tools = self._convert_tools(tools)
246
+         if not gemini_tools and functions:
+             gemini_tools = [{
+                 "functionDeclarations": [
+                     {
+                         "name": f["name"],
+                         "description": f.get("description", ""),
+                         "parameters": f.get("parameters", {"type": "object", "properties": {}}),
+                     }
+                     for f in functions
+                 ]
+             }]
+
+         if gemini_tools:
+             payload["tools"] = gemini_tools
+
+         start_time = time.time()
+
+         url = f"/models/{model}:generateContent?key={self.api_key}"
+
+         try:
+             response = await self._retry_with_backoff(
+                 self._make_request, client, url, payload
+             )
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
+
+         latency_ms = (time.time() - start_time) * 1000
+
+         # Parse response
+         candidates = response.get("candidates", [])
+         if not candidates:
+             raise ProviderError("No candidates in response", self.PROVIDER_NAME)
+
+         candidate = candidates[0]
+         content_parts = candidate.get("content", {}).get("parts", [])
+
+         # Extract text content and function calls
+         text_content = ""
+         tool_calls = []
+
+         for part in content_parts:
+             if "text" in part:
+                 text_content += part["text"]
+             elif "functionCall" in part:
+                 fc = part["functionCall"]
+                 tool_calls.append({
+                     "id": f"call_{fc['name']}",
+                     "type": "function",
+                     "function": {
+                         "name": fc["name"],
+                         "arguments": json.dumps(fc.get("args", {})),
+                     },
+                 })
+
+         # Parse usage
+         usage_data = response.get("usageMetadata", {})
+         usage = TokenUsage(
+             prompt_tokens=usage_data.get("promptTokenCount", 0),
+             completion_tokens=usage_data.get("candidatesTokenCount", 0),
+             total_tokens=usage_data.get("totalTokenCount", 0),
+         )
+
+         cost = self.calculate_cost(model, usage)
+         self._track_usage(usage, cost)
+
+         # Map finish reason
+         finish_reason_map = {
+             "STOP": "stop",
+             "MAX_TOKENS": "length",
+             "SAFETY": "content_filter",
+             "RECITATION": "content_filter",
+         }
+         finish_reason = finish_reason_map.get(
+             candidate.get("finishReason", ""), candidate.get("finishReason")
+         )
+
+         return CompletionResponse(
+             content=text_content,
+             model=model,
+             provider=self.PROVIDER_NAME,
+             usage=usage,
+             finish_reason=finish_reason,
+             function_call=None,
+             tool_calls=tool_calls if tool_calls else None,
+             raw_response=response,
+             latency_ms=latency_ms,
+             cost=cost,
+         )
+
+     async def _make_request(
+         self, client: httpx.AsyncClient, url: str, payload: dict[str, Any]
+     ) -> dict[str, Any]:
+         """Make the API request."""
+         response = await client.post(url, json=payload)
+         response.raise_for_status()
+         return response.json()
+
+     def _handle_http_error(self, error: httpx.HTTPStatusError) -> None:
+         """Handle HTTP errors from Google AI."""
+         status = error.response.status_code
+         try:
+             body = error.response.json()
+             message = body.get("error", {}).get("message", str(error))
+         except Exception:
+             message = str(error)
+
+         if status in (401, 403):
+             raise AuthenticationError(self.PROVIDER_NAME, message)
+         elif status == 429:
+             retry_after = error.response.headers.get("retry-after")
+             raise RateLimitError(
+                 self.PROVIDER_NAME,
+                 retry_after=float(retry_after) if retry_after else None,
+                 message=message,
+             )
+         elif status == 404:
+             raise ModelNotFoundError(self.PROVIDER_NAME, "unknown")
+         else:
+             raise ProviderError(message, self.PROVIDER_NAME, status)
+
+     async def stream(
+         self,
+         messages: list[dict[str, Any]],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+         **kwargs: Any,
+     ) -> AsyncIterator[str]:
+         """Stream a completion from Google AI."""
+         await self._acquire_rate_limit()
+
+         model = self._resolve_model(model)
+         model_info = self.get_model_info(model)
+         if not model_info:
+             raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+         client = await self._ensure_client()
+
+         system_instruction, contents = self._convert_messages(messages)
+
+         payload: dict[str, Any] = {
+             "contents": contents,
+             "generationConfig": {
+                 "temperature": temperature,
+             },
+         }
+
+         if max_tokens:
+             payload["generationConfig"]["maxOutputTokens"] = max_tokens
+
+         if system_instruction:
+             payload["systemInstruction"] = {"parts": [{"text": system_instruction}]}
+
+         url = f"/models/{model}:streamGenerateContent?key={self.api_key}&alt=sse"
+
+         try:
+             async with client.stream("POST", url, json=payload) as response:
+                 response.raise_for_status()
+
+                 async for line in response.aiter_lines():
+                     if line.startswith("data: "):
+                         data = line[6:]
+
+                         try:
+                             chunk = json.loads(data)
+                             candidates = chunk.get("candidates", [])
+                             if candidates:
+                                 parts = candidates[0].get("content", {}).get("parts", [])
+                                 for part in parts:
+                                     if "text" in part:
+                                         yield part["text"]
+                         except json.JSONDecodeError:
+                             continue
+
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
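Usage note: a minimal sketch of driving the streaming path above (placeholder API key; "gemini-1.5-flash" matches the model IDs used elsewhere in this commit):

import asyncio

from app.models.providers import GoogleProvider


async def demo_google_stream() -> None:
    provider = GoogleProvider(api_key="YOUR_GOOGLE_API_KEY")  # placeholder key
    await provider.initialize()
    try:
        # stream() yields text chunks parsed from the SSE "data:" lines
        async for chunk in provider.stream(
            messages=[{"role": "user", "content": "Say hello in one sentence."}],
            model="gemini-1.5-flash",
        ):
            print(chunk, end="", flush=True)
    finally:
        await provider.shutdown()


asyncio.run(demo_google_stream())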
backend/app/models/providers/groq.py ADDED
@@ -0,0 +1,361 @@
+ """Groq provider implementation (fast inference)."""
+
+ import json
+ import time
+ from typing import Any, AsyncIterator
+
+ import httpx
+
+ from app.models.providers.base import (
+     AuthenticationError,
+     BaseProvider,
+     CompletionResponse,
+     ModelInfo,
+     ModelNotFoundError,
+     ProviderError,
+     RateLimitError,
+     TokenUsage,
+ )
+
+
+ class GroqProvider(BaseProvider):
+     """Groq API provider for fast LLM inference."""
+
+     PROVIDER_NAME = "groq"
+     DEFAULT_BASE_URL = "https://api.groq.com/openai/v1"
+
+     # Model definitions with pricing (per 1K tokens)
+     MODELS = {
+         "llama-3.3-70b-versatile": ModelInfo(
+             id="llama-3.3-70b-versatile",
+             name="Llama 3.3 70B Versatile",
+             provider="groq",
+             context_window=128000,
+             max_output_tokens=32768,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00059,
+             cost_per_1k_output=0.00079,
+         ),
+         "llama-3.1-70b-versatile": ModelInfo(
+             id="llama-3.1-70b-versatile",
+             name="Llama 3.1 70B Versatile",
+             provider="groq",
+             context_window=128000,
+             max_output_tokens=32768,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00059,
+             cost_per_1k_output=0.00079,
+         ),
+         "llama-3.1-8b-instant": ModelInfo(
+             id="llama-3.1-8b-instant",
+             name="Llama 3.1 8B Instant",
+             provider="groq",
+             context_window=128000,
+             max_output_tokens=8000,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00005,
+             cost_per_1k_output=0.00008,
+         ),
+         "llama3-70b-8192": ModelInfo(
+             id="llama3-70b-8192",
+             name="Llama 3 70B",
+             provider="groq",
+             context_window=8192,
+             max_output_tokens=8192,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00059,
+             cost_per_1k_output=0.00079,
+         ),
+         "llama3-8b-8192": ModelInfo(
+             id="llama3-8b-8192",
+             name="Llama 3 8B",
+             provider="groq",
+             context_window=8192,
+             max_output_tokens=8192,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00005,
+             cost_per_1k_output=0.00008,
+         ),
+         "mixtral-8x7b-32768": ModelInfo(
+             id="mixtral-8x7b-32768",
+             name="Mixtral 8x7B",
+             provider="groq",
+             context_window=32768,
+             max_output_tokens=32768,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00024,
+             cost_per_1k_output=0.00024,
+         ),
+         "gemma2-9b-it": ModelInfo(
+             id="gemma2-9b-it",
+             name="Gemma 2 9B IT",
+             provider="groq",
+             context_window=8192,
+             max_output_tokens=8192,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.00020,
+             cost_per_1k_output=0.00020,
+         ),
+     }
+
+     # Aliases for convenience
+     MODEL_ALIASES = {
+         "llama3": "llama3-70b-8192",
+         "llama3-70b": "llama3-70b-8192",
+         "llama3-8b": "llama3-8b-8192",
+         "llama-3.1": "llama-3.1-70b-versatile",
+         "llama-3.3": "llama-3.3-70b-versatile",
+         "mixtral": "mixtral-8x7b-32768",
+         "gemma2": "gemma2-9b-it",
+     }
+
+     def __init__(
+         self,
+         api_key: str,
+         base_url: str | None = None,
+         timeout: float = 60.0,
+         max_retries: int = 3,
+         rate_limit_rpm: int = 30,  # Groq has stricter limits
+     ):
+         super().__init__(
+             api_key=api_key,
+             base_url=base_url or self.DEFAULT_BASE_URL,
+             timeout=timeout,
+             max_retries=max_retries,
+             rate_limit_rpm=rate_limit_rpm,
+         )
+         self._client: httpx.AsyncClient | None = None
+
+     async def initialize(self) -> None:
+         """Initialize the HTTP client."""
+         self._client = httpx.AsyncClient(
+             base_url=self.base_url,
+             headers={
+                 "Authorization": f"Bearer {self.api_key}",
+                 "Content-Type": "application/json",
+             },
+             timeout=self.timeout,
+         )
+
+     async def shutdown(self) -> None:
+         """Close the HTTP client."""
+         if self._client:
+             await self._client.aclose()
+             self._client = None
+
+     async def _ensure_client(self) -> httpx.AsyncClient:
+         """Ensure client is initialized."""
+         if not self._client:
+             await self.initialize()
+         return self._client  # type: ignore
+
+     def _resolve_model(self, model: str) -> str:
+         """Resolve model alias to full model ID."""
+         return self.MODEL_ALIASES.get(model, model)
+
+     def get_models(self) -> list[ModelInfo]:
+         """Get available Groq models."""
+         return list(self.MODELS.values())
+
+     async def complete(
+         self,
+         messages: list[dict[str, Any]],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+         functions: list[dict[str, Any]] | None = None,
+         function_call: str | dict[str, str] | None = None,
+         tools: list[dict[str, Any]] | None = None,
+         tool_choice: str | dict[str, Any] | None = None,
+         stop: list[str] | None = None,
+         **kwargs: Any,
+     ) -> CompletionResponse:
+         """Generate a completion using Groq API (OpenAI-compatible)."""
+         await self._acquire_rate_limit()
+
+         model = self._resolve_model(model)
+         model_info = self.get_model_info(model)
+         if not model_info:
+             raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+         client = await self._ensure_client()
+
+         # Build request payload (OpenAI-compatible format)
+         payload: dict[str, Any] = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+         }
+
+         if max_tokens:
+             payload["max_tokens"] = max_tokens
+         if stop:
+             payload["stop"] = stop
+
+         # Function calling
+         if functions and model_info.supports_functions:
+             payload["functions"] = functions
+             if function_call:
+                 payload["function_call"] = function_call
+
+         # Tools (newer format)
+         if tools and model_info.supports_functions:
+             payload["tools"] = tools
+             if tool_choice:
+                 payload["tool_choice"] = tool_choice
+
+         # Additional params
+         for key in ["top_p", "presence_penalty", "frequency_penalty"]:
+             if key in kwargs:
+                 payload[key] = kwargs[key]
+
+         start_time = time.time()
+
+         try:
+             response = await self._retry_with_backoff(
+                 self._make_request, client, payload
+             )
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
+
+         latency_ms = (time.time() - start_time) * 1000
+
+         # Parse response (OpenAI-compatible)
+         choice = response["choices"][0]
+         message = choice["message"]
+         usage_data = response.get("usage", {})
+
+         usage = TokenUsage(
+             prompt_tokens=usage_data.get("prompt_tokens", 0),
+             completion_tokens=usage_data.get("completion_tokens", 0),
+             total_tokens=usage_data.get("total_tokens", 0),
+         )
+
+         cost = self.calculate_cost(model, usage)
+         self._track_usage(usage, cost)
+
+         # Extract function/tool calls
+         func_call = message.get("function_call")
+         tool_calls_raw = message.get("tool_calls")
+
+         tool_calls = None
+         if tool_calls_raw:
+             tool_calls = [
+                 {
+                     "id": tc["id"],
+                     "type": tc["type"],
+                     "function": {
+                         "name": tc["function"]["name"],
+                         "arguments": tc["function"]["arguments"],
+                     },
+                 }
+                 for tc in tool_calls_raw
+             ]
+
+         return CompletionResponse(
+             content=message.get("content") or "",
+             model=response.get("model", model),
+             provider=self.PROVIDER_NAME,
+             usage=usage,
+             finish_reason=choice.get("finish_reason"),
+             function_call=func_call,
+             tool_calls=tool_calls,
+             raw_response=response,
+             latency_ms=latency_ms,
+             cost=cost,
+         )
+
+     async def _make_request(
+         self, client: httpx.AsyncClient, payload: dict[str, Any]
+     ) -> dict[str, Any]:
+         """Make the API request."""
+         response = await client.post("/chat/completions", json=payload)
+         response.raise_for_status()
+         return response.json()
+
+     def _handle_http_error(self, error: httpx.HTTPStatusError) -> None:
+         """Handle HTTP errors from Groq."""
+         status = error.response.status_code
+         try:
+             body = error.response.json()
+             message = body.get("error", {}).get("message", str(error))
+         except Exception:
+             message = str(error)
+
+         if status == 401:
+             raise AuthenticationError(self.PROVIDER_NAME, message)
+         elif status == 429:
+             retry_after = error.response.headers.get("retry-after")
+             raise RateLimitError(
+                 self.PROVIDER_NAME,
+                 retry_after=float(retry_after) if retry_after else None,
+                 message=message,
+             )
+         elif status == 404:
+             raise ModelNotFoundError(self.PROVIDER_NAME, "unknown")
+         else:
+             raise ProviderError(message, self.PROVIDER_NAME, status)
+
+     async def stream(
+         self,
+         messages: list[dict[str, Any]],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+         **kwargs: Any,
+     ) -> AsyncIterator[str]:
+         """Stream a completion from Groq."""
+         await self._acquire_rate_limit()
+
+         model = self._resolve_model(model)
+         model_info = self.get_model_info(model)
+         if not model_info:
+             raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+         client = await self._ensure_client()
+
+         payload: dict[str, Any] = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+             "stream": True,
+         }
+
+         if max_tokens:
+             payload["max_tokens"] = max_tokens
+
+         try:
+             async with client.stream("POST", "/chat/completions", json=payload) as response:
+                 response.raise_for_status()
+
+                 async for line in response.aiter_lines():
+                     if line.startswith("data: "):
+                         data = line[6:]
+                         if data == "[DONE]":
+                             break
+
+                         try:
+                             chunk = json.loads(data)
+                             delta = chunk["choices"][0].get("delta", {})
+                             content = delta.get("content")
+                             if content:
+                                 yield content
+                         except json.JSONDecodeError:
+                             continue
+
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
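Usage note: a minimal sketch of a non-streaming Groq call. The key is a placeholder; "llama3-8b" resolves through MODEL_ALIASES above, and complete() lazily creates the HTTP client via _ensure_client():

import asyncio

from app.models.providers import GroqProvider


async def demo_groq() -> None:
    provider = GroqProvider(api_key="YOUR_GROQ_API_KEY")  # placeholder key
    try:
        resp = await provider.complete(
            messages=[{"role": "user", "content": "Summarize Groq in one line."}],
            model="llama3-8b",  # alias for "llama3-8b-8192"
            max_tokens=64,
        )
        print(resp.content)
        print(f"{resp.usage.total_tokens} tokens, ${resp.cost:.6f}, {resp.latency_ms:.0f} ms")
    finally:
        await provider.shutdown()


asyncio.run(demo_groq())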
backend/app/models/providers/openai.py ADDED
@@ -0,0 +1,353 @@
+ """OpenAI provider implementation."""
+
+ import json
+ import time
+ from typing import Any, AsyncIterator
+
+ import httpx
+
+ from app.models.providers.base import (
+     AuthenticationError,
+     BaseProvider,
+     CompletionResponse,
+     ModelInfo,
+     ModelNotFoundError,
+     ProviderError,
+     RateLimitError,
+     TokenUsage,
+ )
+
+
+ class OpenAIProvider(BaseProvider):
+     """OpenAI API provider supporting GPT models."""
+
+     PROVIDER_NAME = "openai"
+     DEFAULT_BASE_URL = "https://api.openai.com/v1"
+
+     # Model definitions with pricing (per 1K tokens)
+     MODELS = {
+         "gpt-4o": ModelInfo(
+             id="gpt-4o",
+             name="GPT-4o",
+             provider="openai",
+             context_window=128000,
+             max_output_tokens=16384,
+             supports_functions=True,
+             supports_vision=True,
+             supports_streaming=True,
+             cost_per_1k_input=0.005,
+             cost_per_1k_output=0.015,
+         ),
+         "gpt-4o-mini": ModelInfo(
+             id="gpt-4o-mini",
+             name="GPT-4o Mini",
+             provider="openai",
+             context_window=128000,
+             max_output_tokens=16384,
+             supports_functions=True,
+             supports_vision=True,
+             supports_streaming=True,
+             cost_per_1k_input=0.00015,
+             cost_per_1k_output=0.0006,
+         ),
+         "gpt-4-turbo": ModelInfo(
+             id="gpt-4-turbo",
+             name="GPT-4 Turbo",
+             provider="openai",
+             context_window=128000,
+             max_output_tokens=4096,
+             supports_functions=True,
+             supports_vision=True,
+             supports_streaming=True,
+             cost_per_1k_input=0.01,
+             cost_per_1k_output=0.03,
+         ),
+         "gpt-4": ModelInfo(
+             id="gpt-4",
+             name="GPT-4",
+             provider="openai",
+             context_window=8192,
+             max_output_tokens=4096,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.03,
+             cost_per_1k_output=0.06,
+         ),
+         "gpt-3.5-turbo": ModelInfo(
+             id="gpt-3.5-turbo",
+             name="GPT-3.5 Turbo",
+             provider="openai",
+             context_window=16385,
+             max_output_tokens=4096,
+             supports_functions=True,
+             supports_vision=False,
+             supports_streaming=True,
+             cost_per_1k_input=0.0005,
+             cost_per_1k_output=0.0015,
+         ),
+     }
+
+     def __init__(
+         self,
+         api_key: str,
+         base_url: str | None = None,
+         organization: str | None = None,
+         timeout: float = 60.0,
+         max_retries: int = 3,
+         rate_limit_rpm: int = 60,
+     ):
+         super().__init__(
+             api_key=api_key,
+             base_url=base_url or self.DEFAULT_BASE_URL,
+             timeout=timeout,
+             max_retries=max_retries,
+             rate_limit_rpm=rate_limit_rpm,
+         )
+         self.organization = organization
+         self._client: httpx.AsyncClient | None = None
+
+     async def initialize(self) -> None:
+         """Initialize the HTTP client."""
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Content-Type": "application/json",
+         }
+         if self.organization:
+             headers["OpenAI-Organization"] = self.organization
+
+         self._client = httpx.AsyncClient(
+             base_url=self.base_url,
+             headers=headers,
+             timeout=self.timeout,
+         )
+
+     async def shutdown(self) -> None:
+         """Close the HTTP client."""
+         if self._client:
+             await self._client.aclose()
+             self._client = None
+
+     async def _ensure_client(self) -> httpx.AsyncClient:
+         """Ensure client is initialized."""
+         if not self._client:
+             await self.initialize()
+         return self._client  # type: ignore
+
+     def get_models(self) -> list[ModelInfo]:
+         """Get available OpenAI models."""
+         return list(self.MODELS.values())
+
+     async def complete(
+         self,
+         messages: list[dict[str, Any]],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+         functions: list[dict[str, Any]] | None = None,
+         function_call: str | dict[str, str] | None = None,
+         tools: list[dict[str, Any]] | None = None,
+         tool_choice: str | dict[str, Any] | None = None,
+         stop: list[str] | None = None,
+         **kwargs: Any,
+     ) -> CompletionResponse:
+         """Generate a completion using OpenAI API."""
+         await self._acquire_rate_limit()
+
+         model_info = self.get_model_info(model)
+         if not model_info:
+             raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+         client = await self._ensure_client()
+
+         # Build request payload
+         payload: dict[str, Any] = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+         }
+
+         if max_tokens:
+             payload["max_tokens"] = max_tokens
+         if stop:
+             payload["stop"] = stop
+
+         # Function calling (legacy format)
+         if functions and model_info.supports_functions:
+             payload["functions"] = functions
+             if function_call:
+                 payload["function_call"] = function_call
+
+         # Tools (newer format)
+         if tools and model_info.supports_functions:
+             payload["tools"] = tools
+             if tool_choice:
+                 payload["tool_choice"] = tool_choice
+
+         # Additional kwargs
+         for key in ["top_p", "presence_penalty", "frequency_penalty", "logit_bias", "user"]:
+             if key in kwargs:
+                 payload[key] = kwargs[key]
+
+         start_time = time.time()
+
+         try:
+             response = await self._retry_with_backoff(
+                 self._make_request, client, payload
+             )
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
+
+         latency_ms = (time.time() - start_time) * 1000
+
+         # Parse response
+         choice = response["choices"][0]
+         message = choice["message"]
+         usage_data = response.get("usage", {})
+
+         usage = TokenUsage(
+             prompt_tokens=usage_data.get("prompt_tokens", 0),
+             completion_tokens=usage_data.get("completion_tokens", 0),
+             total_tokens=usage_data.get("total_tokens", 0),
+         )
+
+         cost = self.calculate_cost(model, usage)
+         self._track_usage(usage, cost)
+
+         # Extract function call / tool calls
+         func_call = message.get("function_call")
+         tool_calls_raw = message.get("tool_calls")
+
+         tool_calls = None
+         if tool_calls_raw:
+             tool_calls = [
+                 {
+                     "id": tc["id"],
+                     "type": tc["type"],
+                     "function": {
+                         "name": tc["function"]["name"],
+                         "arguments": tc["function"]["arguments"],
+                     },
+                 }
+                 for tc in tool_calls_raw
+             ]
+
+         return CompletionResponse(
+             content=message.get("content") or "",
+             model=response.get("model", model),
+             provider=self.PROVIDER_NAME,
+             usage=usage,
+             finish_reason=choice.get("finish_reason"),
+             function_call=func_call,
+             tool_calls=tool_calls,
+             raw_response=response,
+             latency_ms=latency_ms,
+             cost=cost,
+         )
+
+     async def _make_request(
+         self, client: httpx.AsyncClient, payload: dict[str, Any]
+     ) -> dict[str, Any]:
+         """Make the API request."""
+         response = await client.post("/chat/completions", json=payload)
+         response.raise_for_status()
+         return response.json()
+
+     def _handle_http_error(self, error: httpx.HTTPStatusError) -> None:
+         """Handle HTTP errors from OpenAI."""
+         status = error.response.status_code
+         try:
+             body = error.response.json()
+             message = body.get("error", {}).get("message", str(error))
+         except Exception:
+             message = str(error)
+
+         if status == 401:
+             raise AuthenticationError(self.PROVIDER_NAME, message)
+         elif status == 429:
+             retry_after = error.response.headers.get("retry-after")
+             raise RateLimitError(
+                 self.PROVIDER_NAME,
+                 retry_after=float(retry_after) if retry_after else None,
+                 message=message,
+             )
+         elif status == 404:
+             raise ModelNotFoundError(self.PROVIDER_NAME, "unknown")
+         else:
+             raise ProviderError(message, self.PROVIDER_NAME, status)
+
+     async def stream(
+         self,
+         messages: list[dict[str, Any]],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+         **kwargs: Any,
+     ) -> AsyncIterator[str]:
+         """Stream a completion from OpenAI."""
+         await self._acquire_rate_limit()
+
+         model_info = self.get_model_info(model)
+         if not model_info:
+             raise ModelNotFoundError(self.PROVIDER_NAME, model)
+
+         client = await self._ensure_client()
+
+         payload: dict[str, Any] = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+             "stream": True,
+         }
+
+         if max_tokens:
+             payload["max_tokens"] = max_tokens
+
+         try:
+             async with client.stream("POST", "/chat/completions", json=payload) as response:
+                 response.raise_for_status()
+
+                 async for line in response.aiter_lines():
+                     if line.startswith("data: "):
+                         data = line[6:]
+                         if data == "[DONE]":
+                             break
+
+                         try:
+                             chunk = json.loads(data)
+                             delta = chunk["choices"][0].get("delta", {})
+                             content = delta.get("content")
+                             if content:
+                                 yield content
+                         except json.JSONDecodeError:
+                             continue
+
+         except httpx.HTTPStatusError as e:
+             self._handle_http_error(e)
+
+     async def create_embedding(
+         self,
+         text: str | list[str],
+         model: str = "text-embedding-3-small",
+     ) -> list[list[float]]:
+         """Create embeddings for text.
+
+         Args:
+             text: Text or list of texts to embed
+             model: Embedding model to use
+
+         Returns:
+             List of embedding vectors
+         """
+         client = await self._ensure_client()
+
+         payload = {
+             "model": model,
+             "input": text if isinstance(text, list) else [text],
+         }
+
+         response = await client.post("/embeddings", json=payload)
+         response.raise_for_status()
+
+         data = response.json()
+         return [item["embedding"] for item in data["data"]]
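Usage note: a minimal sketch of create_embedding above (placeholder key; text-embedding-3-small is the method's default and returns 1536-dimensional vectors):

import asyncio

from app.models.providers import OpenAIProvider


async def demo_embeddings() -> None:
    provider = OpenAIProvider(api_key="YOUR_OPENAI_API_KEY")  # placeholder key
    try:
        vectors = await provider.create_embedding(["first document", "second document"])
        print(len(vectors), "vectors of dimension", len(vectors[0]))
    finally:
        await provider.shutdown()


asyncio.run(demo_embeddings())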
backend/app/models/router.py ADDED
@@ -0,0 +1,526 @@
+ """Smart model router for intelligent model selection and fallback."""
+
+ import asyncio
+ import logging
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any
+
+ from pydantic import SecretStr
+
+ from app.models.providers.base import (
+     BaseProvider,
+     CompletionResponse,
+     ModelInfo,
+     ProviderError,
+     RateLimitError,
+     TaskType,
+     TokenUsage,
+ )
+ from app.models.providers.openai import OpenAIProvider
+ from app.models.providers.anthropic import AnthropicProvider
+ from app.models.providers.google import GoogleProvider
+ from app.models.providers.groq import GroqProvider
+
+ logger = logging.getLogger(__name__)
+
+
+ class RoutingStrategy(str, Enum):
+     """Model routing strategies."""
+
+     BEST_QUALITY = "best_quality"  # Use highest quality model
+     BEST_SPEED = "best_speed"  # Use fastest model
+     BEST_VALUE = "best_value"  # Balance quality/cost
+     LOWEST_COST = "lowest_cost"  # Use cheapest model
+     ROUND_ROBIN = "round_robin"  # Rotate between models
+
+
+ @dataclass
+ class ModelScore:
+     """Scoring for model routing decisions."""
+
+     model_id: str
+     provider: str
+     quality_score: float = 0.0  # 0-1, higher is better
+     speed_score: float = 0.0  # 0-1, higher is faster
+     cost_score: float = 0.0  # 0-1, higher is cheaper
+     overall_score: float = 0.0
+
+
+ @dataclass
+ class RoutingConfig:
+     """Configuration for model routing."""
+
+     default_strategy: RoutingStrategy = RoutingStrategy.BEST_VALUE
+     max_fallback_attempts: int = 3
+     fallback_delay_seconds: float = 1.0
+     enable_caching: bool = True
+     cache_ttl_seconds: int = 300
+
+     # Task-specific model preferences
+     task_preferences: dict[TaskType, list[str]] = field(default_factory=lambda: {
+         TaskType.GENERAL: ["gpt-4o", "claude-3-5-sonnet-20241022", "gemini-1.5-pro"],
+         TaskType.CODE: ["claude-3-5-sonnet-20241022", "gpt-4o", "gemini-1.5-pro"],
+         TaskType.REASONING: ["claude-3-opus-20240229", "gpt-4o", "gemini-1.5-pro"],
+         TaskType.EXTRACTION: ["gpt-4o-mini", "claude-3-haiku-20240307", "gemini-1.5-flash"],
+         TaskType.SUMMARIZATION: ["gpt-4o-mini", "claude-3-5-haiku-20241022", "gemini-1.5-flash"],
+         TaskType.CLASSIFICATION: ["gpt-4o-mini", "claude-3-haiku-20240307", "llama-3.1-8b-instant"],
+         TaskType.CREATIVE: ["claude-3-5-sonnet-20241022", "gpt-4o", "gemini-1.5-pro"],
+         TaskType.FAST: ["llama-3.1-8b-instant", "gemini-1.5-flash", "gpt-4o-mini"],
+     })
+
+
+ @dataclass
+ class CostTracker:
+     """Track costs across providers and models."""
+
+     total_cost: float = 0.0
+     cost_by_provider: dict[str, float] = field(default_factory=dict)
+     cost_by_model: dict[str, float] = field(default_factory=dict)
+     request_count: int = 0
+     total_tokens: TokenUsage = field(default_factory=TokenUsage)
+     start_time: datetime = field(default_factory=datetime.utcnow)
+
+     def track(self, response: CompletionResponse) -> None:
+         """Track a completion response."""
+         self.total_cost += response.cost
+         self.request_count += 1
+         self.total_tokens = self.total_tokens + response.usage
+
+         # By provider
+         self.cost_by_provider[response.provider] = (
+             self.cost_by_provider.get(response.provider, 0.0) + response.cost
+         )
+
+         # By model
+         self.cost_by_model[response.model] = (
+             self.cost_by_model.get(response.model, 0.0) + response.cost
+         )
+
+     def get_summary(self) -> dict[str, Any]:
+         """Get cost summary."""
+         return {
+             "total_cost_usd": self.total_cost,
+             "request_count": self.request_count,
+             "total_tokens": {
+                 "prompt": self.total_tokens.prompt_tokens,
+                 "completion": self.total_tokens.completion_tokens,
+                 "total": self.total_tokens.total_tokens,
+             },
+             "cost_by_provider": self.cost_by_provider,
+             "cost_by_model": self.cost_by_model,
+             "avg_cost_per_request": (
+                 self.total_cost / self.request_count if self.request_count > 0 else 0
+             ),
+             "tracking_since": self.start_time.isoformat(),
+         }
+
+     def reset(self) -> None:
+         """Reset cost tracking."""
+         self.total_cost = 0.0
+         self.cost_by_provider = {}
+         self.cost_by_model = {}
+         self.request_count = 0
+         self.total_tokens = TokenUsage()
+         self.start_time = datetime.utcnow()
+
+
+ class SmartModelRouter:
+     """Intelligent model router with fallback and cost tracking."""
+
+     # Model quality rankings (subjective, based on benchmarks)
+     MODEL_QUALITY_SCORES: dict[str, float] = {
+         # OpenAI
+         "gpt-4o": 0.95,
+         "gpt-4-turbo": 0.92,
+         "gpt-4": 0.90,
+         "gpt-4o-mini": 0.80,
+         "gpt-3.5-turbo": 0.70,
+         # Anthropic
+         "claude-3-opus-20240229": 0.97,
+         "claude-3-5-sonnet-20241022": 0.94,
+         "claude-3-sonnet-20240229": 0.88,
+         "claude-3-5-haiku-20241022": 0.82,
+         "claude-3-haiku-20240307": 0.75,
+         # Google
+         "gemini-1.5-pro": 0.91,
+         "gemini-2.0-flash-exp": 0.88,
+         "gemini-1.5-flash": 0.78,
+         "gemini-pro": 0.75,
+         # Groq
+         "llama-3.3-70b-versatile": 0.85,
+         "llama-3.1-70b-versatile": 0.84,
+         "llama3-70b-8192": 0.82,
+         "mixtral-8x7b-32768": 0.78,
+         "llama-3.1-8b-instant": 0.65,
+         "llama3-8b-8192": 0.60,
+         "gemma2-9b-it": 0.62,
+     }
+
+     # Model speed rankings (relative, based on typical latency)
+     MODEL_SPEED_SCORES: dict[str, float] = {
+         # Groq is fastest
+         "llama-3.1-8b-instant": 0.98,
+         "llama3-8b-8192": 0.97,
+         "gemma2-9b-it": 0.96,
+         "mixtral-8x7b-32768": 0.94,
+         "llama3-70b-8192": 0.92,
+         "llama-3.1-70b-versatile": 0.91,
+         "llama-3.3-70b-versatile": 0.90,
+         # Google Flash is fast
+         "gemini-1.5-flash": 0.88,
+         "gemini-2.0-flash-exp": 0.87,
+         # Mini models
+         "gpt-4o-mini": 0.85,
+         "claude-3-haiku-20240307": 0.84,
+         "claude-3-5-haiku-20241022": 0.83,
+         "gpt-3.5-turbo": 0.82,
+         # Pro models
+         "gemini-pro": 0.75,
+         "gemini-1.5-pro": 0.70,
+         "gpt-4o": 0.68,
+         "claude-3-5-sonnet-20241022": 0.65,
+         "claude-3-sonnet-20240229": 0.62,
+         "gpt-4-turbo": 0.55,
+         "gpt-4": 0.50,
+         "claude-3-opus-20240229": 0.40,
+     }
+
+     def __init__(
+         self,
+         openai_api_key: str | SecretStr | None = None,
+         anthropic_api_key: str | SecretStr | None = None,
+         google_api_key: str | SecretStr | None = None,
+         groq_api_key: str | SecretStr | None = None,
+         config: RoutingConfig | None = None,
+     ):
+         self.config = config or RoutingConfig()
+         self.providers: dict[str, BaseProvider] = {}
+         self.cost_tracker = CostTracker()
+         self._initialized = False
+         self._round_robin_index = 0
+
+         # Store API keys (handle SecretStr)
+         self._api_keys = {
+             "openai": self._get_key_value(openai_api_key),
+             "anthropic": self._get_key_value(anthropic_api_key),
+             "google": self._get_key_value(google_api_key),
+             "groq": self._get_key_value(groq_api_key),
+         }
+
+     @staticmethod
+     def _get_key_value(key: str | SecretStr | None) -> str | None:
+         """Extract string value from SecretStr if needed."""
+         if key is None:
+             return None
+         if isinstance(key, SecretStr):
+             return key.get_secret_value()
+         return key
+
+     async def initialize(self) -> None:
+         """Initialize all configured providers."""
+         if self._initialized:
+             return
+
+         # Initialize providers based on available API keys
+         if self._api_keys["openai"]:
+             provider = OpenAIProvider(api_key=self._api_keys["openai"])
+             await provider.initialize()
+             self.providers["openai"] = provider
+             logger.info("Initialized OpenAI provider")
+
+         if self._api_keys["anthropic"]:
+             provider = AnthropicProvider(api_key=self._api_keys["anthropic"])
+             await provider.initialize()
+             self.providers["anthropic"] = provider
+             logger.info("Initialized Anthropic provider")
+
+         if self._api_keys["google"]:
+             provider = GoogleProvider(api_key=self._api_keys["google"])
+             await provider.initialize()
+             self.providers["google"] = provider
+             logger.info("Initialized Google provider")
+
+         if self._api_keys["groq"]:
+             provider = GroqProvider(api_key=self._api_keys["groq"])
+             await provider.initialize()
+             self.providers["groq"] = provider
+             logger.info("Initialized Groq provider")
+
+         if not self.providers:
+             logger.warning("No LLM providers configured")
+
+         self._initialized = True
+
+     async def shutdown(self) -> None:
+         """Shutdown all providers."""
+         for provider in self.providers.values():
+             await provider.shutdown()
+         self.providers.clear()
+         self._initialized = False
+
+     def get_available_models(self) -> list[ModelInfo]:
+         """Get all available models across providers."""
+         models = []
+         for provider in self.providers.values():
+             models.extend(provider.get_models())
+         return models
+
+     def get_provider_for_model(self, model: str) -> BaseProvider | None:
+         """Get the provider for a specific model."""
+         for provider in self.providers.values():
+             if provider.get_model_info(model):
+                 return provider
+
+             # Check aliases (Anthropic, Google, and Groq define MODEL_ALIASES)
+             if hasattr(provider, "MODEL_ALIASES"):
+                 if model in provider.MODEL_ALIASES:  # type: ignore
+                     return provider
+
+         return None
+
+     def _score_model(
+         self,
+         model_info: ModelInfo,
+         strategy: RoutingStrategy,
+     ) -> ModelScore:
+         """Score a model based on routing strategy."""
+         model_id = model_info.id
+
+         quality = self.MODEL_QUALITY_SCORES.get(model_id, 0.5)
+         speed = self.MODEL_SPEED_SCORES.get(model_id, 0.5)
+
+         # Calculate cost score (inverse of cost, normalized)
+         max_cost = 0.1  # $0.10 per 1K tokens as reference
+         avg_cost = (model_info.cost_per_1k_input + model_info.cost_per_1k_output) / 2
+         cost_score = 1.0 - min(avg_cost / max_cost, 1.0)
+
+         # Calculate overall score based on strategy
+         if strategy == RoutingStrategy.BEST_QUALITY:
+             overall = quality * 0.8 + speed * 0.1 + cost_score * 0.1
+         elif strategy == RoutingStrategy.BEST_SPEED:
+             overall = quality * 0.1 + speed * 0.8 + cost_score * 0.1
+         elif strategy == RoutingStrategy.LOWEST_COST:
+             overall = quality * 0.1 + speed * 0.1 + cost_score * 0.8
+         else:  # BEST_VALUE
+             overall = quality * 0.4 + speed * 0.3 + cost_score * 0.3
+
+         return ModelScore(
+             model_id=model_id,
+             provider=model_info.provider,
+             quality_score=quality,
+             speed_score=speed,
+             cost_score=cost_score,
+             overall_score=overall,
+         )
+
+     def route(
+         self,
+         task_type: TaskType = TaskType.GENERAL,
+         strategy: RoutingStrategy | None = None,
+         required_features: list[str] | None = None,
+     ) -> tuple[str, BaseProvider] | None:
+         """Route to the best model for the task.
+
+         Args:
+             task_type: Type of task to perform
+             strategy: Routing strategy (uses default if not specified)
+             required_features: Required model features (e.g., 'functions', 'vision')
+
+         Returns:
+             Tuple of (model_id, provider) or None if no suitable model found
+         """
+         if not self.providers:
+             return None
+
+         strategy = strategy or self.config.default_strategy
+
+         # Handle round robin specially
+         if strategy == RoutingStrategy.ROUND_ROBIN:
+             models = self.get_available_models()
+             if not models:
+                 return None
+
+             # Filter by features if needed
+             if required_features:
+                 models = self._filter_by_features(models, required_features)
+
+             if not models:
+                 return None
+
+             model = models[self._round_robin_index % len(models)]
+             self._round_robin_index += 1
+             provider = self.get_provider_for_model(model.id)
+             return (model.id, provider) if provider else None
+
+         # Get task preferences
+         preferred_models = self.config.task_preferences.get(task_type, [])
+
+         # Check preferred models first
+         for model_id in preferred_models:
+             provider = self.get_provider_for_model(model_id)
+             if provider:
+                 model_info = provider.get_model_info(model_id)
+                 if model_info and self._meets_requirements(model_info, required_features):
+                     return (model_id, provider)
+
+         # Score all available models
+         scored_models: list[tuple[ModelScore, BaseProvider]] = []
+         for provider in self.providers.values():
+             for model_info in provider.get_models():
+                 if self._meets_requirements(model_info, required_features):
+                     score = self._score_model(model_info, strategy)
+                     scored_models.append((score, provider))
+
+         if not scored_models:
+             return None
+
+         # Sort by overall score
+         scored_models.sort(key=lambda x: x[0].overall_score, reverse=True)
+         best_score, best_provider = scored_models[0]
+
+         return (best_score.model_id, best_provider)
+
+     def _meets_requirements(
+         self,
+         model_info: ModelInfo,
+         required_features: list[str] | None,
+     ) -> bool:
+         """Check if model meets required features."""
+         if not required_features:
+             return True
+
+         for feature in required_features:
+             if feature == "functions" and not model_info.supports_functions:
+                 return False
+             if feature == "vision" and not model_info.supports_vision:
+                 return False
+             if feature == "streaming" and not model_info.supports_streaming:
+                 return False
+
+         return True
+
+     def _filter_by_features(
+         self,
+         models: list[ModelInfo],
+         required_features: list[str],
+     ) -> list[ModelInfo]:
+         """Filter models by required features."""
+         return [m for m in models if self._meets_requirements(m, required_features)]
+
+     async def complete(
+         self,
+         messages: list[dict[str, Any]],
+         model: str | None = None,
+         task_type: TaskType = TaskType.GENERAL,
+         strategy: RoutingStrategy | None = None,
+         required_features: list[str] | None = None,
+         fallback: bool = True,
+         **kwargs: Any,
+     ) -> CompletionResponse:
+         """Generate a completion with automatic routing and fallback.
+
+         Args:
+             messages: List of message dicts
+             model: Specific model to use (overrides routing)
+             task_type: Type of task for routing
+             strategy: Routing strategy
+             required_features: Required model features
+             fallback: Enable fallback on failure
+             **kwargs: Additional completion parameters
+
+         Returns:
+             CompletionResponse from the model
+
+         Raises:
+             ProviderError: If all models fail
+         """
+         if not self._initialized:
+             await self.initialize()
+
+         # Determine model(s) to try
+         models_to_try: list[tuple[str, BaseProvider]] = []
+
+         if model:
+             # Specific model requested
+             provider = self.get_provider_for_model(model)
+             if provider:
+                 models_to_try.append((model, provider))
+             else:
+                 raise ProviderError(f"Model {model} not found", "router")
+         else:
+             # Use routing
+             route_result = self.route(task_type, strategy, required_features)
+             if route_result:
+                 models_to_try.append(route_result)
+
+         # Add fallback models
+         if fallback and len(models_to_try) < self.config.max_fallback_attempts:
+             # Get additional models for fallback
+             preferred = self.config.task_preferences.get(task_type, [])
+             for fallback_model in preferred:
+                 if len(models_to_try) >= self.config.max_fallback_attempts:
+                     break
+
+                 provider = self.get_provider_for_model(fallback_model)
+                 if provider and (fallback_model, provider) not in models_to_try:
+                     models_to_try.append((fallback_model, provider))
+
+         if not models_to_try:
+             raise ProviderError("No suitable models available", "router")
+
+         # Try models in order
+         last_error: Exception | None = None
+
+         for i, (model_id, provider) in enumerate(models_to_try):
+             try:
+                 logger.info(f"Attempting completion with {provider.PROVIDER_NAME}/{model_id}")
+                 response = await provider.complete(messages, model_id, **kwargs)
+
+                 # Track cost
+                 self.cost_tracker.track(response)
+
+                 return response
+
+             except RateLimitError as e:
+                 logger.warning(f"Rate limited by {provider.PROVIDER_NAME}: {e}")
+                 last_error = e
+                 if i < len(models_to_try) - 1:
+                     await asyncio.sleep(self.config.fallback_delay_seconds)
+
+             except ProviderError as e:
+                 logger.warning(f"Provider error from {provider.PROVIDER_NAME}: {e}")
+                 last_error = e
+                 if i < len(models_to_try) - 1:
+                     await asyncio.sleep(self.config.fallback_delay_seconds)
+
+             except Exception as e:
+                 logger.error(f"Unexpected error from {provider.PROVIDER_NAME}: {e}")
+                 last_error = e
+
+         # All models failed
+         raise ProviderError(
+             f"All models failed. Last error: {last_error}",
+             "router",
+         )
+
+     def get_cost_summary(self) -> dict[str, Any]:
+         """Get cost tracking summary."""
+         return self.cost_tracker.get_summary()
+
+     def reset_cost_tracking(self) -> None:
+         """Reset cost tracking."""
+         self.cost_tracker.reset()
+
+     @property
+     def available_providers(self) -> list[str]:
+         """List of initialized provider names."""
+         return list(self.providers.keys())
+
+     def __repr__(self) -> str:
+         return (
+             f"SmartModelRouter(providers={list(self.providers.keys())}, "
+             f"requests={self.cost_tracker.request_count}, "
+             f"cost=${self.cost_tracker.total_cost:.4f})"
+         )
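Usage note: a minimal end-to-end sketch of the router (placeholder keys; providers without a key are simply skipped at initialize(), and complete() falls back through the task-preference list on failure):

import asyncio

from app.models import RoutingStrategy, SmartModelRouter, TaskType


async def demo_router() -> None:
    router = SmartModelRouter(
        openai_api_key="YOUR_OPENAI_API_KEY",  # placeholder keys
        groq_api_key="YOUR_GROQ_API_KEY",
    )
    await router.initialize()
    try:
        resp = await router.complete(
            messages=[{"role": "user", "content": "Classify the sentiment: 'great product!'"}],
            task_type=TaskType.CLASSIFICATION,
            strategy=RoutingStrategy.BEST_SPEED,
        )
        print(resp.provider, resp.model, resp.content)
        print(router.get_cost_summary())
    finally:
        await router.shutdown()


asyncio.run(demo_router())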