Spaces:

NeerajCodz
/

scrapeRL

Running

App Files Files Community

NeerajCodz commited on 30 days ago

Commit

afefaea

1 Parent(s): bb3ee41

feat: add MCP tool registry and search engine integration

Browse files

Files changed (24) hide show

backend/app/search/__init__.py +17 -0
backend/app/search/__pycache__/__init__.cpython-314.pyc +0 -0
backend/app/search/__pycache__/engine.cpython-314.pyc +0 -0
backend/app/search/engine.py +261 -0
backend/app/search/providers/__init__.py +13 -0
backend/app/search/providers/__pycache__/__init__.cpython-314.pyc +0 -0
backend/app/search/providers/__pycache__/base.cpython-314.pyc +0 -0
backend/app/search/providers/__pycache__/bing.cpython-314.pyc +0 -0
backend/app/search/providers/__pycache__/duckduckgo.cpython-314.pyc +0 -0
backend/app/search/providers/__pycache__/google.cpython-314.pyc +0 -0
backend/app/search/providers/base.py +70 -0
backend/app/search/providers/bing.py +107 -0
backend/app/search/providers/duckduckgo.py +126 -0
backend/app/search/providers/google.py +112 -0
backend/app/tools/__init__.py +25 -0
backend/app/tools/__pycache__/__init__.cpython-314.pyc +0 -0
backend/app/tools/__pycache__/browser.cpython-314.pyc +0 -0
backend/app/tools/__pycache__/html.cpython-314.pyc +0 -0
backend/app/tools/__pycache__/registry.cpython-314.pyc +0 -0
backend/app/tools/__pycache__/search.cpython-314.pyc +0 -0
backend/app/tools/browser.py +362 -0
backend/app/tools/html.py +22 -0
backend/app/tools/registry.py +317 -0
backend/app/tools/search.py +152 -0

backend/app/search/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""Search module for ScrapeRL backend."""
+from app.search.engine import SearchEngineRouter
+from app.search.providers import (
+    BaseSearchProvider,
+    GoogleSearchProvider,
+    BingSearchProvider,
+    DuckDuckGoProvider,
+)
+# Public API of the search package: the router and the provider classes
+# imported above.
+__all__ = [
+    "SearchEngineRouter",
+    "BaseSearchProvider",
+    "GoogleSearchProvider",
+    "BingSearchProvider",
+    "DuckDuckGoProvider",
+]

backend/app/search/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (443 Bytes). View file

backend/app/search/__pycache__/engine.cpython-314.pyc ADDED Viewed

Binary file (13.4 kB). View file

backend/app/search/engine.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""Search engine router for aggregating multiple search providers."""
+from typing import Any, Optional
+from dataclasses import dataclass, field
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+@dataclass
+class SearchResult:
+    """Individual search result."""
+    title: str
+    url: str
+    snippet: str
+    position: int
+    source: str
+    score: float = 1.0
+    metadata: dict[str, Any] = field(default_factory=dict)
+class SearchEngineRouter:
+    """
+    Routes search queries to different providers and aggregates results.
+    Supports multiple search providers and can aggregate/rank results
+    from multiple sources.
+    """
+    def __init__(self) -> None:
+        self._providers: dict[str, Any] = {}
+        self._default_provider: Optional[str] = None
+        self._initialized: bool = False
+    async def initialize(self) -> None:
+        """Initialize the search engine router and all providers."""
+        logger.info("Initializing SearchEngineRouter")
+        # Initialize all registered providers
+        for name, provider in self._providers.items():
+            try:
+                if hasattr(provider, "initialize"):
+                    await provider.initialize()
+                logger.info(f"Initialized provider: {name}")
+            except Exception as e:
+                logger.error(f"Failed to initialize provider {name}: {e}")
+        self._initialized = True
+        logger.info("SearchEngineRouter initialized")
+    async def shutdown(self) -> None:
+        """Shutdown the router and all providers."""
+        logger.info("Shutting down SearchEngineRouter")
+        for name, provider in self._providers.items():
+            try:
+                if hasattr(provider, "shutdown"):
+                    await provider.shutdown()
+                logger.info(f"Shut down provider: {name}")
+            except Exception as e:
+                logger.error(f"Error shutting down provider {name}: {e}")
+        self._initialized = False
+    def register_provider(
+        self,
+        name: str,
+        provider: Any,
+        set_default: bool = False,
+    ) -> None:
+        """
+        Register a search provider.
+        Args:
+            name: Provider identifier
+            provider: Provider instance
+            set_default: Set as the default provider
+        """
+        self._providers[name] = provider
+        logger.info(f"Registered search provider: {name}")
+        if set_default or self._default_provider is None:
+            self._default_provider = name
+            logger.info(f"Set default provider: {name}")
+    def unregister_provider(self, name: str) -> bool:
+        """
+        Unregister a search provider.
+        Args:
+            name: Provider identifier
+        Returns:
+            True if provider was removed
+        """
+        if name in self._providers:
+            del self._providers[name]
+            if self._default_provider == name:
+                self._default_provider = next(iter(self._providers), None)
+            logger.info(f"Unregistered provider: {name}")
+            return True
+        return False
+    def get_providers(self) -> list[str]:
+        """
+        Get list of registered provider names.
+        Returns:
+            List of provider identifiers
+        """
+        return list(self._providers.keys())
+    def get_provider(self, name: str) -> Optional[Any]:
+        """
+        Get a specific provider by name.
+        Args:
+            name: Provider identifier
+        Returns:
+            Provider instance or None
+        """
+        return self._providers.get(name)
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        provider: Optional[str] = None,
+    ) -> list[SearchResult]:
+        """
+        Perform a search using a specific provider.
+        Args:
+            query: Search query string
+            max_results: Maximum results to return
+            provider: Provider to use (defaults to default provider)
+        Returns:
+            List of search results
+        Raises:
+            ValueError: If provider not found
+        """
+        provider_name = provider or self._default_provider
+        if provider_name is None:
+            raise ValueError("No search provider configured")
+        if provider_name not in self._providers:
+            raise ValueError(f"Provider '{provider_name}' not found")
+        provider_instance = self._providers[provider_name]
+        logger.info(f"Searching with provider '{provider_name}': {query}")
+        try:
+            results = await provider_instance.search(query, max_results)
+            # Ensure results have proper source attribution
+            for i, result in enumerate(results):
+                if isinstance(result, dict):
+                    result["source"] = provider_name
+                    result["position"] = i + 1
+                elif hasattr(result, "source"):
+                    result.source = provider_name
+                    result.position = i + 1
+            return results
+        except Exception as e:
+            logger.error(f"Search failed with provider '{provider_name}': {e}")
+            raise
+    async def search_all(
+        self,
+        query: str,
+        max_results_per_provider: int = 10,
+        providers: Optional[list[str]] = None,
+    ) -> list[SearchResult]:
+        """
+        Search across multiple providers and aggregate results.
+        Args:
+            query: Search query string
+            max_results_per_provider: Max results from each provider
+            providers: Specific providers to use (defaults to all)
+        Returns:
+            Aggregated and ranked list of results
+        """
+        provider_names = providers or list(self._providers.keys())
+        all_results: list[SearchResult] = []
+        for provider_name in provider_names:
+            try:
+                results = await self.search(
+                    query=query,
+                    max_results=max_results_per_provider,
+                    provider=provider_name,
+                )
+                all_results.extend(results)
+            except Exception as e:
+                logger.warning(f"Provider '{provider_name}' failed: {e}")
+                continue
+        # Rank and deduplicate results
+        ranked_results = self._rank_results(all_results)
+        return ranked_results
+    def _rank_results(
+        self,
+        results: list[SearchResult],
+    ) -> list[SearchResult]:
+        """
+        Rank and deduplicate search results.
+        Args:
+            results: Raw results from multiple providers
+        Returns:
+            Ranked and deduplicated results
+        """
+        # Deduplicate by URL
+        seen_urls: set[str] = set()
+        unique_results: list[SearchResult] = []
+        for result in results:
+            url = result.url if hasattr(result, "url") else result.get("url", "")
+            if url and url not in seen_urls:
+                seen_urls.add(url)
+                unique_results.append(result)
+        # Sort by score (higher is better) then by position (lower is better)
+        def sort_key(r: Any) -> tuple[float, int]:
+            score = r.score if hasattr(r, "score") else r.get("score", 1.0)
+            position = r.position if hasattr(r, "position") else r.get("position", 999)
+            return (-score, position)
+        unique_results.sort(key=sort_key)
+        # Update positions
+        for i, result in enumerate(unique_results):
+            if hasattr(result, "position"):
+                result.position = i + 1
+            elif isinstance(result, dict):
+                result["position"] = i + 1
+        return unique_results
+    @property
+    def is_initialized(self) -> bool:
+        """Check if the router is initialized."""
+        return self._initialized
+    @property
+    def default_provider(self) -> Optional[str]:
+        """Get the default provider name."""
+        return self._default_provider

backend/app/search/providers/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""Search providers for ScrapeRL backend."""
+from app.search.providers.base import BaseSearchProvider
+from app.search.providers.google import GoogleSearchProvider
+from app.search.providers.bing import BingSearchProvider
+from app.search.providers.duckduckgo import DuckDuckGoProvider
+# Public API of the providers package: the abstract base class and the
+# concrete provider classes imported above.
+__all__ = [
+    "BaseSearchProvider",
+    "GoogleSearchProvider",
+    "BingSearchProvider",
+    "DuckDuckGoProvider",
+]

backend/app/search/providers/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (516 Bytes). View file

backend/app/search/providers/__pycache__/base.cpython-314.pyc ADDED Viewed

Binary file (4.65 kB). View file

backend/app/search/providers/__pycache__/bing.cpython-314.pyc ADDED Viewed

Binary file (4.19 kB). View file

backend/app/search/providers/__pycache__/duckduckgo.cpython-314.pyc ADDED Viewed

Binary file (6.63 kB). View file

backend/app/search/providers/__pycache__/google.cpython-314.pyc ADDED Viewed

Binary file (4.65 kB). View file

backend/app/search/providers/base.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Base search provider interface."""
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+from dataclasses import dataclass, field
+@dataclass
+class SearchResult:
+    """Standard search result format."""
+    # Title of the result.
+    title: str
+    # URL of the result.
+    url: str
+    # Short text excerpt for the result.
+    snippet: str
+    # Rank of the result; the providers in this package fill in 1-based values.
+    position: int = 0
+    # Identifier of the provider that produced the result (e.g. "bing").
+    source: str = ""
+    # Relevance score; defaults to 1.0.
+    score: float = 1.0
+    # Free-form extra data; each instance gets its own dict.
+    metadata: dict[str, Any] = field(default_factory=dict)
+class BaseSearchProvider(ABC):
+    """
+    Abstract base class for search providers.
+    All search providers must implement this interface.
+    """
+    def __init__(self, api_key: Optional[str] = None) -> None:
+        self.api_key = api_key
+        self._initialized: bool = False
+    async def initialize(self) -> None:
+        """Initialize the provider (optional override)."""
+        self._initialized = True
+    async def shutdown(self) -> None:
+        """Shutdown the provider (optional override)."""
+        self._initialized = False
+    @abstractmethod
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+    ) -> list[SearchResult]:
+        """
+        Perform a search query.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results
+        Returns:
+            List of SearchResult objects
+        """
+        pass
+    @property
+    def name(self) -> str:
+        """Provider name for identification."""
+        return self.__class__.__name__.replace("Provider", "").replace("Search", "")
+    @property
+    def is_initialized(self) -> bool:
+        """Check if provider is initialized."""
+        return self._initialized
+    def health_check(self) -> bool:
+        """Check provider health."""
+        return self._initialized

backend/app/search/providers/bing.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""Bing Search provider (stub implementation)."""
+from typing import Optional
+from app.search.providers.base import BaseSearchProvider, SearchResult
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+class BingSearchProvider(BaseSearchProvider):
+    """
+    Bing Search provider using Bing Web Search API.
+    This is a stub implementation. To use Bing Search API:
+    1. Get API key from Azure Portal (Bing Search resource)
+    2. Set the BING_API_KEY environment variable
+    Environment variables:
+        BING_API_KEY: Bing Search API key
+    """
+    def __init__(self, api_key: Optional[str] = None) -> None:
+        super().__init__(api_key)
+        self._base_url = "https://api.bing.microsoft.com/v7.0/search"
+    async def initialize(self) -> None:
+        """Initialize the Bing Search provider."""
+        logger.info("Initializing BingSearchProvider")
+        if not self.api_key:
+            logger.warning("Bing API key not configured - stub mode enabled")
+        self._initialized = True
+        logger.info("BingSearchProvider initialized")
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+    ) -> list[SearchResult]:
+        """
+        Search using Bing Web Search API.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results
+        Returns:
+            List of SearchResult objects
+        """
+        logger.info(f"Bing search: {query}")
+        if not self.api_key:
+            logger.warning("Bing Search not configured, returning stub results")
+            return self._get_stub_results(query, max_results)
+        # Real implementation would look like:
+        # import httpx
+        # async with httpx.AsyncClient() as client:
+        #     headers = {"Ocp-Apim-Subscription-Key": self.api_key}
+        #     params = {
+        #         "q": query,
+        #         "count": max_results,
+        #         "responseFilter": "Webpages",
+        #     }
+        #     response = await client.get(
+        #         self._base_url,
+        #         headers=headers,
+        #         params=params,
+        #     )
+        #     data = response.json()
+        #
+        #     results = []
+        #     web_pages = data.get("webPages", {}).get("value", [])
+        #     for i, item in enumerate(web_pages):
+        #         results.append(SearchResult(
+        #             title=item.get("name", ""),
+        #             url=item.get("url", ""),
+        #             snippet=item.get("snippet", ""),
+        #             position=i + 1,
+        #             source="bing",
+        #         ))
+        #     return results
+        return self._get_stub_results(query, max_results)
+    def _get_stub_results(
+        self,
+        query: str,
+        max_results: int,
+    ) -> list[SearchResult]:
+        """Generate stub results for testing."""
+        results = []
+        for i in range(min(max_results, 3)):
+            results.append(
+                SearchResult(
+                    title=f"Bing Result {i + 1}: {query}",
+                    url=f"https://example.com/bing/{i + 1}",
+                    snippet=f"This is a stub Bing search result for '{query}'. "
+                    f"Configure BING_API_KEY for real results.",
+                    position=i + 1,
+                    source="bing",
+                    metadata={"stub": True},
+                )
+            )
+        return results

backend/app/search/providers/duckduckgo.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""DuckDuckGo Search provider using duckduckgo-search library."""
+from typing import Optional
+from app.search.providers.base import BaseSearchProvider, SearchResult
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+class DuckDuckGoProvider(BaseSearchProvider):
+    """
+    DuckDuckGo Search provider using the duckduckgo-search library.
+    This provider works without an API key.
+    Requirements:
+        pip install duckduckgo-search
+    """
+    def __init__(self) -> None:
+        super().__init__(api_key=None)
+        self._ddgs: Optional[object] = None
+    async def initialize(self) -> None:
+        """Initialize the DuckDuckGo Search provider."""
+        logger.info("Initializing DuckDuckGoProvider")
+        try:
+            from duckduckgo_search import DDGS
+            self._ddgs = DDGS()
+            self._initialized = True
+            logger.info("DuckDuckGoProvider initialized with duckduckgo-search")
+        except ImportError:
+            logger.warning(
+                "duckduckgo-search not installed. "
+                "Install with: pip install duckduckgo-search"
+            )
+            self._initialized = True  # Still mark as initialized for stub mode
+            logger.info("DuckDuckGoProvider initialized in stub mode")
+    async def shutdown(self) -> None:
+        """Shutdown the DuckDuckGo provider."""
+        self._ddgs = None
+        self._initialized = False
+        logger.info("DuckDuckGoProvider shut down")
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+    ) -> list[SearchResult]:
+        """
+        Search using DuckDuckGo.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results
+        Returns:
+            List of SearchResult objects
+        """
+        logger.info(f"DuckDuckGo search: {query}")
+        if self._ddgs is None:
+            logger.warning("DuckDuckGo not available, returning stub results")
+            return self._get_stub_results(query, max_results)
+        try:
+            # duckduckgo-search is synchronous, run in executor for async
+            import asyncio
+            loop = asyncio.get_event_loop()
+            raw_results = await loop.run_in_executor(
+                None,
+                lambda: list(self._ddgs.text(query, max_results=max_results)),  # type: ignore
+            )
+            results = []
+            for i, item in enumerate(raw_results):
+                results.append(
+                    SearchResult(
+                        title=item.get("title", ""),
+                        url=item.get("href", item.get("link", "")),
+                        snippet=item.get("body", item.get("snippet", "")),
+                        position=i + 1,
+                        source="duckduckgo",
+                        metadata={
+                            "raw": item,
+                        },
+                    )
+                )
+            logger.info(f"DuckDuckGo returned {len(results)} results")
+            return results
+        except Exception as e:
+            logger.error(f"DuckDuckGo search failed: {e}")
+            return self._get_stub_results(query, max_results)
+    def _get_stub_results(
+        self,
+        query: str,
+        max_results: int,
+    ) -> list[SearchResult]:
+        """Generate stub results for testing."""
+        results = []
+        for i in range(min(max_results, 3)):
+            results.append(
+                SearchResult(
+                    title=f"DuckDuckGo Result {i + 1}: {query}",
+                    url=f"https://example.com/ddg/{i + 1}",
+                    snippet=f"This is a stub DuckDuckGo search result for '{query}'. "
+                    f"Install duckduckgo-search for real results.",
+                    position=i + 1,
+                    source="duckduckgo",
+                    metadata={"stub": True},
+                )
+            )
+        return results
+    @property
+    def is_available(self) -> bool:
+        """Check if DuckDuckGo search is available."""
+        return self._ddgs is not None

backend/app/search/providers/google.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Google Search provider (stub implementation)."""
+from typing import Optional
+from app.search.providers.base import BaseSearchProvider, SearchResult
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+class GoogleSearchProvider(BaseSearchProvider):
+    """
+    Google Search provider using Custom Search API.
+    This is a stub implementation. To use Google Search API:
+    1. Get API key from Google Cloud Console
+    2. Create a Custom Search Engine (CSE)
+    3. Get the Search Engine ID (cx)
+    Environment variables:
+        GOOGLE_API_KEY: Google Cloud API key
+        GOOGLE_CSE_ID: Custom Search Engine ID
+    """
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        search_engine_id: Optional[str] = None,
+    ) -> None:
+        super().__init__(api_key)
+        self.search_engine_id = search_engine_id
+        self._base_url = "https://www.googleapis.com/customsearch/v1"
+    async def initialize(self) -> None:
+        """Initialize the Google Search provider."""
+        logger.info("Initializing GoogleSearchProvider")
+        if not self.api_key:
+            logger.warning("Google API key not configured - stub mode enabled")
+        if not self.search_engine_id:
+            logger.warning("Google CSE ID not configured - stub mode enabled")
+        self._initialized = True
+        logger.info("GoogleSearchProvider initialized")
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+    ) -> list[SearchResult]:
+        """
+        Search using Google Custom Search API.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results (max 10 per request)
+        Returns:
+            List of SearchResult objects
+        """
+        logger.info(f"Google search: {query}")
+        if not self.api_key or not self.search_engine_id:
+            logger.warning("Google Search not configured, returning stub results")
+            return self._get_stub_results(query, max_results)
+        # Real implementation would look like:
+        # import httpx
+        # async with httpx.AsyncClient() as client:
+        #     params = {
+        #         "key": self.api_key,
+        #         "cx": self.search_engine_id,
+        #         "q": query,
+        #         "num": min(max_results, 10),
+        #     }
+        #     response = await client.get(self._base_url, params=params)
+        #     data = response.json()
+        #
+        #     results = []
+        #     for i, item in enumerate(data.get("items", [])):
+        #         results.append(SearchResult(
+        #             title=item.get("title", ""),
+        #             url=item.get("link", ""),
+        #             snippet=item.get("snippet", ""),
+        #             position=i + 1,
+        #             source="google",
+        #         ))
+        #     return results
+        return self._get_stub_results(query, max_results)
+    def _get_stub_results(
+        self,
+        query: str,
+        max_results: int,
+    ) -> list[SearchResult]:
+        """Generate stub results for testing."""
+        results = []
+        for i in range(min(max_results, 3)):
+            results.append(
+                SearchResult(
+                    title=f"Google Result {i + 1}: {query}",
+                    url=f"https://example.com/google/{i + 1}",
+                    snippet=f"This is a stub Google search result for '{query}'. "
+                    f"Configure GOOGLE_API_KEY and GOOGLE_CSE_ID for real results.",
+                    position=i + 1,
+                    source="google",
+                    metadata={"stub": True},
+                )
+            )
+        return results

backend/app/tools/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""Tools module for ScrapeRL backend."""
+from app.tools.registry import MCPToolRegistry
+from app.tools.browser import BrowserTool
+from app.tools.search import SearchTool
+from app.tools.html import (
+    parse_html,
+    clean_html,
+    extract_text,
+    semantic_chunk,
+    extract_links,
+    extract_tables,
+)
+# Public API of the tools package: the tool registry, the browser and search
+# tools, and the HTML helper functions imported above.
+__all__ = [
+    "MCPToolRegistry",
+    "BrowserTool",
+    "SearchTool",
+    "parse_html",
+    "clean_html",
+    "extract_text",
+    "semantic_chunk",
+    "extract_links",
+    "extract_tables",
+]

backend/app/tools/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (580 Bytes). View file

backend/app/tools/__pycache__/browser.cpython-314.pyc ADDED Viewed

Binary file (13.2 kB). View file

backend/app/tools/__pycache__/html.cpython-314.pyc ADDED Viewed

Binary file (437 Bytes). View file

backend/app/tools/__pycache__/registry.cpython-314.pyc ADDED Viewed

Binary file (15.1 kB). View file

backend/app/tools/__pycache__/search.cpython-314.pyc ADDED Viewed

Binary file (7.2 kB). View file

backend/app/tools/browser.py ADDED Viewed

	@@ -0,0 +1,362 @@

+"""Browser automation tool for web scraping."""
+from typing import Any, Optional
+from dataclasses import dataclass
+from enum import Enum
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+class BrowserType(Enum):
+    """Supported browser types."""
+    # Each value is the lowercase engine name; BrowserTool.initialize logs it.
+    CHROMIUM = "chromium"
+    FIREFOX = "firefox"
+    WEBKIT = "webkit"
+@dataclass
+class BrowserConfig:
+    """Configuration for browser instance."""
+    # NOTE(review): of these fields, the stub BrowserTool code visible here
+    # only reads browser_type; the rest are presumably for the future
+    # Playwright-backed implementation - confirm when it lands.
+    # Browser engine to use; defaults to Chromium.
+    browser_type: BrowserType = BrowserType.CHROMIUM
+    # Whether to run headless; defaults to True.
+    headless: bool = True
+    timeout: int = 30000  # milliseconds
+    # Viewport size; defaults to 1920x1080.
+    viewport_width: int = 1920
+    viewport_height: int = 1080
+    # Optional user-agent override; None by default.
+    user_agent: Optional[str] = None
+    # Optional proxy setting; None by default.
+    proxy: Optional[str] = None
+@dataclass
+class NavigationResult:
+    """Result of a navigation action."""
+    # URL associated with the navigation.
+    url: str
+    # Status code; BrowserTool.navigate reports 0 when the browser is not
+    # initialized and 200 in stub mode.
+    status: int
+    # Page title ("" when navigation did not happen).
+    title: str
+    # Whether the navigation is considered successful.
+    success: bool
+    # Optional message. BrowserTool.navigate sets it on failure and also, in
+    # stub mode, as a notice on a successful result.
+    error: Optional[str] = None
+@dataclass
+class ClickResult:
+    """Result of a click action."""
+    # BrowserTool.fill returns this type as well as BrowserTool.click.
+    # Selector the action was aimed at.
+    selector: str
+    # Whether the action is considered successful.
+    success: bool
+    # Optional message. The stub BrowserTool sets it on failure and also as a
+    # "Stub mode" notice on a successful result.
+    error: Optional[str] = None
+@dataclass
+class ScreenshotResult:
+    """Result of a screenshot action."""
+    # Raw image bytes (b"" when the browser is not initialized).
+    data: bytes
+    # Image format name as passed to BrowserTool.screenshot (png, jpeg).
+    format: str
+    # Image dimensions; presumably pixels - TODO confirm. Both are 0 when the
+    # browser is not initialized.
+    width: int
+    height: int
+    # Whether the screenshot is considered successful.
+    success: bool
+    # Optional message describing a failure.
+    error: Optional[str] = None
+class BrowserTool:
+    """
+    Browser automation tool using Playwright/Selenium.
+    This is a stub implementation that defines the interface.
+    Actual browser automation requires installing playwright or selenium.
+    """
+    def __init__(self, config: Optional[BrowserConfig] = None) -> None:
+        self.config = config or BrowserConfig()
+        self._browser: Any = None
+        self._context: Any = None
+        self._page: Any = None
+        self._initialized: bool = False
+    async def initialize(self) -> None:
+        """
+        Initialize the browser instance.
+        Note: This is a stub. Real implementation requires playwright:
+            pip install playwright
+            playwright install
+        """
+        logger.info(f"Initializing browser: {self.config.browser_type.value}")
+        # Stub: In real implementation, initialize playwright here
+        # from playwright.async_api import async_playwright
+        # self._playwright = await async_playwright().start()
+        # self._browser = await self._playwright.chromium.launch(headless=self.config.headless)
+        self._initialized = True
+        logger.info("Browser initialized (stub mode)")
+    async def shutdown(self) -> None:
+        """Close the browser and cleanup resources."""
+        logger.info("Shutting down browser")
+        if self._page:
+            # await self._page.close()
+            self._page = None
+        if self._context:
+            # await self._context.close()
+            self._context = None
+        if self._browser:
+            # await self._browser.close()
+            self._browser = None
+        self._initialized = False
+        logger.info("Browser shutdown complete")
+    async def navigate(
+        self,
+        url: str,
+        wait_until: str = "domcontentloaded",
+        timeout: Optional[int] = None,
+    ) -> NavigationResult:
+        """
+        Navigate to a URL.
+        Args:
+            url: Target URL
+            wait_until: Navigation wait condition (load, domcontentloaded, networkidle)
+            timeout: Navigation timeout in milliseconds
+        Returns:
+            NavigationResult with status and details
+        """
+        logger.info(f"Navigating to: {url}")
+        if not self._initialized:
+            return NavigationResult(
+                url=url,
+                status=0,
+                title="",
+                success=False,
+                error="Browser not initialized",
+            )
+        # Stub implementation
+        # Real implementation:
+        # response = await self._page.goto(url, wait_until=wait_until, timeout=timeout)
+        # return NavigationResult(
+        #     url=self._page.url,
+        #     status=response.status if response else 0,
+        #     title=await self._page.title(),
+        #     success=True,
+        # )
+        return NavigationResult(
+            url=url,
+            status=200,
+            title="Stub Page Title",
+            success=True,
+            error="Stub mode - no actual navigation",
+        )
+    async def click(
+        self,
+        selector: str,
+        timeout: Optional[int] = None,
+        force: bool = False,
+    ) -> ClickResult:
+        """
+        Click an element on the page.
+        Args:
+            selector: CSS or XPath selector
+            timeout: Click timeout in milliseconds
+            force: Force click even if element is obscured
+        Returns:
+            ClickResult indicating success or failure
+        """
+        logger.info(f"Clicking element: {selector}")
+        if not self._initialized:
+            return ClickResult(
+                selector=selector,
+                success=False,
+                error="Browser not initialized",
+            )
+        # Stub implementation
+        # Real implementation:
+        # await self._page.click(selector, timeout=timeout, force=force)
+        return ClickResult(
+            selector=selector,
+            success=True,
+            error="Stub mode - no actual click",
+        )
+    async def fill(
+        self,
+        selector: str,
+        value: str,
+        timeout: Optional[int] = None,
+    ) -> ClickResult:
+        """
+        Fill a form field with text.
+        Args:
+            selector: CSS or XPath selector
+            value: Text to enter
+            timeout: Action timeout in milliseconds
+        Returns:
+            ClickResult indicating success or failure
+        """
+        logger.info(f"Filling element: {selector} with value")
+        if not self._initialized:
+            return ClickResult(
+                selector=selector,
+                success=False,
+                error="Browser not initialized",
+            )
+        # Stub implementation
+        # Real implementation:
+        # await self._page.fill(selector, value, timeout=timeout)
+        return ClickResult(
+            selector=selector,
+            success=True,
+            error="Stub mode - no actual fill",
+        )
+    async def get_html(
+        self,
+        selector: Optional[str] = None,
+    ) -> str:
+        """
+        Get HTML content of the page or a specific element.
+        Args:
+            selector: Optional selector to get HTML of specific element
+        Returns:
+            HTML content as string
+        """
+        logger.info(f"Getting HTML for: {selector or 'full page'}")
+        if not self._initialized:
+            return ""
+        # Stub implementation
+        # Real implementation:
+        # if selector:
+        #     element = await self._page.query_selector(selector)
+        #     return await element.inner_html() if element else ""
+        # return await self._page.content()
+        return "<html><body><h1>Stub HTML Content</h1></body></html>"
+    async def screenshot(
+        self,
+        selector: Optional[str] = None,
+        full_page: bool = False,
+        format: str = "png",
+    ) -> ScreenshotResult:
+        """
+        Take a screenshot of the page or element.
+        Args:
+            selector: Optional selector to screenshot specific element
+            full_page: Capture full scrollable page
+            format: Image format (png, jpeg)
+        Returns:
+            ScreenshotResult with image data
+        """
+        logger.info(f"Taking screenshot: selector={selector}, full_page={full_page}")
+        if not self._initialized:
+            return ScreenshotResult(
+                data=b"",
+                format=format,
+                width=0,
+                height=0,
+                success=False,
+                error="Browser not initialized",
+            )
+        # Stub implementation
+        # Real implementation:
+        # if selector:
+        #     element = await self._page.query_selector(selector)
+        #     data = await element.screenshot(type=format) if element else b""
+        # else:
+        #     data = await self._page.screenshot(full_page=full_page, type=format)
+        return ScreenshotResult(
+            data=b"stub_screenshot_data",
+            format=format,
+            width=self.config.viewport_width,
+            height=self.config.viewport_height,
+            success=True,
+            error="Stub mode - no actual screenshot",
+        )
+    async def evaluate(self, script: str) -> Any:
+        """
+        Execute JavaScript in the page context.
+        Args:
+            script: JavaScript code to execute
+        Returns:
+            Result of the script execution
+        """
+        logger.info(f"Evaluating script: {script[:50]}...")
+        if not self._initialized:
+            return None
+        # Stub implementation
+        # Real implementation:
+        # return await self._page.evaluate(script)
+        return None
+    async def wait_for_selector(
+        self,
+        selector: str,
+        timeout: Optional[int] = None,
+        state: str = "visible",
+    ) -> bool:
+        """
+        Wait for an element to appear on the page.
+        Args:
+            selector: CSS or XPath selector
+            timeout: Wait timeout in milliseconds
+            state: Element state to wait for (visible, hidden, attached, detached)
+        Returns:
+            True if element found, False otherwise
+        """
+        logger.info(f"Waiting for selector: {selector}")
+        if not self._initialized:
+            return False
+        # Stub implementation
+        # Real implementation:
+        # try:
+        #     await self._page.wait_for_selector(selector, timeout=timeout, state=state)
+        #     return True
+        # except TimeoutError:
+        #     return False
+        return True
+    def health_check(self) -> bool:
+        """
+        Report whether the browser is considered healthy.
+        No probe of the browser is performed here: the result is simply the
+        current value of the ``_initialized`` flag.
+        Returns:
+            The ``_initialized`` flag
+        """
+        return self._initialized
+    @property
+    def is_initialized(self) -> bool:
+        """
+        Whether the browser has been initialized.
+        Read-only view of the ``_initialized`` flag.
+        """
+        return self._initialized

backend/app/tools/html.py ADDED Viewed

	@@ -0,0 +1,22 @@

+"""HTML processing tools for web scraping.
+Re-exports utilities from app.utils.html for tool registration.
+"""
+from app.utils.html import (
+    parse_html,
+    clean_html,
+    extract_text,
+    semantic_chunk,
+    extract_links,
+    extract_tables,
+)
+__all__ = [
+    "parse_html",
+    "clean_html",
+    "extract_text",
+    "semantic_chunk",
+    "extract_links",
+    "extract_tables",
+]

backend/app/tools/registry.py ADDED Viewed

	@@ -0,0 +1,317 @@

+"""MCP Tool Registry for dynamic tool discovery and management."""
+import asyncio
+from typing import Any, Callable, Optional
+from dataclasses import dataclass, field
+from enum import Enum
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+class ToolStatus(Enum):
+    """
+    Status of a registered tool.
+    The registry refuses to execute tools that are UNHEALTHY or SHUTDOWN
+    and hides them from ``list_tools`` unless ``include_unhealthy`` is set.
+    """
+    # Default for a new ToolDefinition; also what the registry's
+    # health_check returns for a name that is not registered.
+    UNKNOWN = "unknown"
+    # Recorded by the registry after a health check succeeds.
+    HEALTHY = "healthy"
+    # Recorded by the registry after a health check fails.
+    UNHEALTHY = "unhealthy"
+    # Assigned by ``register``; stays until a health check updates it.
+    INITIALIZING = "initializing"
+    # Assigned to every tool when the registry shuts down.
+    SHUTDOWN = "shutdown"
+@dataclass
+class ToolDefinition:
+    """
+    Definition of a registered tool.
+    Instances are created by ``MCPToolRegistry.register`` and stored in the
+    registry under ``name``; the registry mutates ``status`` in place.
+    """
+    # Unique key of the tool inside the registry.
+    name: str
+    # Human-readable description, echoed in the tool schema.
+    description: str
+    # Sync or async callable; the registry invokes it with keyword arguments.
+    handler: Callable[..., Any]
+    # JSON schema for the tool parameters (per ``register``); defaults to {}.
+    parameters: dict[str, Any] = field(default_factory=dict)
+    # Lifecycle/health state maintained by the registry.
+    status: ToolStatus = ToolStatus.UNKNOWN
+    # Free-form extra information; not interpreted by the registry.
+    metadata: dict[str, Any] = field(default_factory=dict)
+class MCPToolRegistry:
+    """
+    Registry for MCP tools with dynamic discovery and execution.
+    Manages tool lifecycle including registration, health checks,
+    and execution routing.
+    """
+    def __init__(self) -> None:
+        self._tools: dict[str, ToolDefinition] = {}
+        self._initialized: bool = False
+        self._health_check_interval: float = 30.0
+        self._health_check_task: Optional[asyncio.Task[None]] = None
+    async def initialize(self) -> None:
+        """Initialize the registry and start health monitoring."""
+        if self._initialized:
+            logger.warning("Registry already initialized")
+            return
+        logger.info("Initializing MCP Tool Registry")
+        # Start health check background task
+        self._health_check_task = asyncio.create_task(self._health_check_loop())
+        self._initialized = True
+        logger.info("MCP Tool Registry initialized")
+    async def shutdown(self) -> None:
+        """Shutdown the registry and cleanup resources."""
+        logger.info("Shutting down MCP Tool Registry")
+        # Cancel health check task
+        if self._health_check_task:
+            self._health_check_task.cancel()
+            try:
+                await self._health_check_task
+            except asyncio.CancelledError:
+                pass
+        # Mark all tools as shutdown
+        for tool in self._tools.values():
+            tool.status = ToolStatus.SHUTDOWN
+        self._initialized = False
+        logger.info("MCP Tool Registry shutdown complete")
+    def register(
+        self,
+        name: str,
+        handler: Callable[..., Any],
+        description: str = "",
+        parameters: Optional[dict[str, Any]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+    ) -> ToolDefinition:
+        """
+        Register a new tool with the registry.
+        Args:
+            name: Unique tool name
+            handler: Callable that implements the tool
+            description: Human-readable description
+            parameters: JSON schema for tool parameters
+            metadata: Additional tool metadata
+        Returns:
+            The registered ToolDefinition
+        Raises:
+            ValueError: If a tool with the same name already exists
+        """
+        if name in self._tools:
+            raise ValueError(f"Tool '{name}' is already registered")
+        tool = ToolDefinition(
+            name=name,
+            description=description,
+            handler=handler,
+            parameters=parameters or {},
+            status=ToolStatus.INITIALIZING,
+            metadata=metadata or {},
+        )
+        self._tools[name] = tool
+        logger.info(f"Registered tool: {name}")
+        return tool
+    def unregister(self, name: str) -> bool:
+        """
+        Unregister a tool from the registry.
+        Args:
+            name: Tool name to unregister
+        Returns:
+            True if tool was removed, False if not found
+        """
+        if name in self._tools:
+            del self._tools[name]
+            logger.info(f"Unregistered tool: {name}")
+            return True
+        return False
+    def get(self, name: str) -> Optional[ToolDefinition]:
+        """
+        Get a tool definition by name.
+        Args:
+            name: Tool name to retrieve
+        Returns:
+            ToolDefinition if found, None otherwise
+        """
+        return self._tools.get(name)
+    def list_tools(
+        self,
+        include_unhealthy: bool = False,
+    ) -> list[ToolDefinition]:
+        """
+        List all registered tools.
+        Args:
+            include_unhealthy: Include tools with unhealthy status
+        Returns:
+            List of tool definitions
+        """
+        tools = list(self._tools.values())
+        if not include_unhealthy:
+            tools = [
+                t for t in tools
+                if t.status not in (ToolStatus.UNHEALTHY, ToolStatus.SHUTDOWN)
+            ]
+        return tools
+    async def execute(
+        self,
+        name: str,
+        **kwargs: Any,
+    ) -> Any:
+        """
+        Execute a tool by name with the given parameters.
+        Args:
+            name: Tool name to execute
+            **kwargs: Tool parameters
+        Returns:
+            Tool execution result
+        Raises:
+            KeyError: If tool is not found
+            RuntimeError: If tool is not healthy
+        """
+        tool = self.get(name)
+        if tool is None:
+            raise KeyError(f"Tool '{name}' not found")
+        if tool.status == ToolStatus.UNHEALTHY:
+            raise RuntimeError(f"Tool '{name}' is unhealthy")
+        if tool.status == ToolStatus.SHUTDOWN:
+            raise RuntimeError(f"Tool '{name}' has been shut down")
+        logger.debug(f"Executing tool: {name} with params: {kwargs}")
+        try:
+            # Handle both sync and async handlers
+            if asyncio.iscoroutinefunction(tool.handler):
+                result = await tool.handler(**kwargs)
+            else:
+                result = tool.handler(**kwargs)
+            return result
+        except Exception as e:
+            logger.error(f"Tool execution failed: {name} - {e}")
+            raise
+    async def health_check(self, name: str) -> ToolStatus:
+        """
+        Check the health of a specific tool.
+        Args:
+            name: Tool name to check
+        Returns:
+            Current tool status
+        """
+        tool = self.get(name)
+        if tool is None:
+            return ToolStatus.UNKNOWN
+        try:
+            # Try to call a health check method if available
+            handler = tool.handler
+            if hasattr(handler, "health_check"):
+                health_fn = getattr(handler, "health_check")
+                if asyncio.iscoroutinefunction(health_fn):
+                    await health_fn()
+                else:
+                    health_fn()
+            tool.status = ToolStatus.HEALTHY
+        except Exception as e:
+            logger.warning(f"Health check failed for {name}: {e}")
+            tool.status = ToolStatus.UNHEALTHY
+        return tool.status
+    async def health_check_all(self) -> dict[str, ToolStatus]:
+        """
+        Check health of all registered tools.
+        Returns:
+            Dictionary mapping tool names to their status
+        """
+        results: dict[str, ToolStatus] = {}
+        for name in self._tools:
+            results[name] = await self.health_check(name)
+        return results
+    async def _health_check_loop(self) -> None:
+        """Background task for periodic health checks."""
+        while True:
+            try:
+                await asyncio.sleep(self._health_check_interval)
+                await self.health_check_all()
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Health check loop error: {e}")
+    def get_tool_schema(self, name: str) -> Optional[dict[str, Any]]:
+        """
+        Get the JSON schema for a tool's parameters.
+        Args:
+            name: Tool name
+        Returns:
+            Parameter schema dict or None if not found
+        """
+        tool = self.get(name)
+        if tool is None:
+            return None
+        return {
+            "name": tool.name,
+            "description": tool.description,
+            "parameters": tool.parameters,
+        }
+    def list_schemas(self) -> list[dict[str, Any]]:
+        """
+        Get schemas for all registered tools.
+        Returns:
+            List of tool schema dictionaries
+        """
+        schemas = []
+        for name in self._tools:
+            schema = self.get_tool_schema(name)
+            if schema:
+                schemas.append(schema)
+        return schemas
+    @property
+    def is_initialized(self) -> bool:
+        """Check if the registry has been initialized."""
+        return self._initialized
+    @property
+    def tool_count(self) -> int:
+        """Get the number of registered tools."""
+        return len(self._tools)

backend/app/tools/search.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""Search tool wrapper for search engine providers."""
+from typing import Any, Optional
+from dataclasses import dataclass
+from app.utils.logging import get_logger
+logger = get_logger(__name__)
+@dataclass
+class SearchResult:
+    """
+    Individual search result.
+    Plain data holder; nothing in this module constructs it directly.
+    """
+    title: str
+    url: str
+    snippet: str
+    # NOTE(review): presumably the rank within the result list; whether it
+    # is 0- or 1-based is not established in this module - confirm with the
+    # providers.
+    position: int
+    source: str  # Provider name
+    # Optional extra data; None when absent.
+    metadata: dict[str, Any] | None = None
+@dataclass
+class SearchResponse:
+    """
+    Response from a search query.
+    Built by ``SearchTool.search`` for both successful and failed queries.
+    """
+    # The query string exactly as passed to ``SearchTool.search``.
+    query: str
+    # Results from the engine; empty on failure.
+    results: list[SearchResult]
+    # ``SearchTool.search`` sets this to ``len(results)`` (0 on failure).
+    total_results: int
+    # Provider name the query was routed to.
+    provider: str
+    # False when the tool was not initialized or the engine raised.
+    success: bool
+    # Failure description; left as None on success.
+    error: Optional[str] = None
+class SearchTool:
+    """
+    Search tool that wraps search engine providers.
+    Provides a unified interface for searching across different
+    search engine providers.
+    """
+    def __init__(self, default_provider: str = "duckduckgo") -> None:
+        self.default_provider = default_provider
+        self._engine: Any = None
+        self._initialized: bool = False
+    async def initialize(self, engine: Any = None) -> None:
+        """
+        Initialize the search tool with a search engine.
+        Args:
+            engine: SearchEngineRouter instance to use
+        """
+        logger.info("Initializing SearchTool")
+        self._engine = engine
+        self._initialized = True
+        logger.info("SearchTool initialized")
+    async def shutdown(self) -> None:
+        """Shutdown the search tool."""
+        logger.info("Shutting down SearchTool")
+        self._engine = None
+        self._initialized = False
+    async def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        provider: Optional[str] = None,
+    ) -> SearchResponse:
+        """
+        Perform a search query.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results to return
+            provider: Specific provider to use (optional)
+        Returns:
+            SearchResponse with results
+        """
+        logger.info(f"Searching for: {query}")
+        provider_name = provider or self.default_provider
+        if not self._initialized or self._engine is None:
+            logger.warning("SearchTool not properly initialized, using stub response")
+            return SearchResponse(
+                query=query,
+                results=[],
+                total_results=0,
+                provider=provider_name,
+                success=False,
+                error="Search engine not initialized",
+            )
+        try:
+            # Delegate to search engine router
+            results = await self._engine.search(
+                query=query,
+                max_results=max_results,
+                provider=provider_name,
+            )
+            return SearchResponse(
+                query=query,
+                results=results,
+                total_results=len(results),
+                provider=provider_name,
+                success=True,
+            )
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            return SearchResponse(
+                query=query,
+                results=[],
+                total_results=0,
+                provider=provider_name,
+                success=False,
+                error=str(e),
+            )
+    async def get_results(
+        self,
+        query: str,
+        max_results: int = 10,
+        provider: Optional[str] = None,
+    ) -> list[SearchResult]:
+        """
+        Get search results as a list.
+        Args:
+            query: Search query string
+            max_results: Maximum number of results to return
+            provider: Specific provider to use (optional)
+        Returns:
+            List of SearchResult objects
+        """
+        response = await self.search(query, max_results, provider)
+        return response.results
+    def health_check(self) -> bool:
+        """Check if the search tool is healthy."""
+        return self._initialized and self._engine is not None
+    @property
+    def is_initialized(self) -> bool:
+        """Check if the search tool has been initialized."""
+        return self._initialized