Spaces:
Running
Running
Commit ·
afefaea
1
Parent(s): bb3ee41
feat: add MCP tool registry and search engine integration
Browse files- backend/app/search/__init__.py +17 -0
- backend/app/search/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/search/__pycache__/engine.cpython-314.pyc +0 -0
- backend/app/search/engine.py +261 -0
- backend/app/search/providers/__init__.py +13 -0
- backend/app/search/providers/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/search/providers/__pycache__/base.cpython-314.pyc +0 -0
- backend/app/search/providers/__pycache__/bing.cpython-314.pyc +0 -0
- backend/app/search/providers/__pycache__/duckduckgo.cpython-314.pyc +0 -0
- backend/app/search/providers/__pycache__/google.cpython-314.pyc +0 -0
- backend/app/search/providers/base.py +70 -0
- backend/app/search/providers/bing.py +107 -0
- backend/app/search/providers/duckduckgo.py +126 -0
- backend/app/search/providers/google.py +112 -0
- backend/app/tools/__init__.py +25 -0
- backend/app/tools/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/tools/__pycache__/browser.cpython-314.pyc +0 -0
- backend/app/tools/__pycache__/html.cpython-314.pyc +0 -0
- backend/app/tools/__pycache__/registry.cpython-314.pyc +0 -0
- backend/app/tools/__pycache__/search.cpython-314.pyc +0 -0
- backend/app/tools/browser.py +362 -0
- backend/app/tools/html.py +22 -0
- backend/app/tools/registry.py +317 -0
- backend/app/tools/search.py +152 -0
backend/app/search/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search module for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
from app.search.engine import SearchEngineRouter
|
| 4 |
+
from app.search.providers import (
|
| 5 |
+
BaseSearchProvider,
|
| 6 |
+
GoogleSearchProvider,
|
| 7 |
+
BingSearchProvider,
|
| 8 |
+
DuckDuckGoProvider,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"SearchEngineRouter",
|
| 13 |
+
"BaseSearchProvider",
|
| 14 |
+
"GoogleSearchProvider",
|
| 15 |
+
"BingSearchProvider",
|
| 16 |
+
"DuckDuckGoProvider",
|
| 17 |
+
]
|
backend/app/search/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (443 Bytes). View file
|
|
|
backend/app/search/__pycache__/engine.cpython-314.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
backend/app/search/engine.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search engine router for aggregating multiple search providers."""
|
| 2 |
+
|
| 3 |
+
from typing import Any, Optional
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
|
| 6 |
+
from app.utils.logging import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class SearchResult:
    """A single search hit as handled by the router.

    ``position`` and ``source`` default to ``0`` and ``""``, the same
    defaults the provider-side ``SearchResult`` declares, so a result can be
    built from just ``title``, ``url`` and ``snippet``.
    """

    title: str
    url: str
    snippet: str
    position: int = 0
    source: str = ""
    score: float = 1.0
    metadata: dict[str, Any] = field(default_factory=dict)


class SearchEngineRouter:
    """
    Routes search queries to registered providers and aggregates results.

    Providers are stored by name. One of them is the default, used by
    ``search`` when no provider is named. ``search_all`` queries several
    providers and returns one ranked, URL-deduplicated list.
    """

    def __init__(self) -> None:
        self._providers: dict[str, Any] = {}
        self._default_provider: Optional[str] = None
        self._initialized: bool = False

    async def initialize(self) -> None:
        """Initialize every registered provider that has an ``initialize`` method.

        A provider that raises is logged and skipped; the router is marked
        initialized regardless of individual provider failures.
        """
        logger.info("Initializing SearchEngineRouter")

        for name, provider in self._providers.items():
            try:
                if hasattr(provider, "initialize"):
                    await provider.initialize()
                    logger.info(f"Initialized provider: {name}")
            except Exception as e:
                # Best effort: one broken provider must not block the others.
                logger.error(f"Failed to initialize provider {name}: {e}")

        self._initialized = True
        logger.info("SearchEngineRouter initialized")

    async def shutdown(self) -> None:
        """Shut down every registered provider that has a ``shutdown`` method.

        Errors are logged per provider and do not stop the loop.
        """
        logger.info("Shutting down SearchEngineRouter")

        for name, provider in self._providers.items():
            try:
                if hasattr(provider, "shutdown"):
                    await provider.shutdown()
                    logger.info(f"Shut down provider: {name}")
            except Exception as e:
                logger.error(f"Error shutting down provider {name}: {e}")

        self._initialized = False

    def register_provider(
        self,
        name: str,
        provider: Any,
        set_default: bool = False,
    ) -> None:
        """
        Register a search provider under ``name``.

        An existing provider with the same name is replaced. The first
        provider ever registered becomes the default even when
        ``set_default`` is False.

        Args:
            name: Provider identifier
            provider: Provider instance
            set_default: Set as the default provider
        """
        self._providers[name] = provider
        logger.info(f"Registered search provider: {name}")

        if set_default or self._default_provider is None:
            self._default_provider = name
            logger.info(f"Set default provider: {name}")

    def unregister_provider(self, name: str) -> bool:
        """
        Unregister a search provider.

        If the removed provider was the default, the default falls back to
        the first remaining provider, or ``None`` when none are left.

        Args:
            name: Provider identifier

        Returns:
            True if provider was removed, False if it was not registered
        """
        if name not in self._providers:
            return False

        del self._providers[name]
        if self._default_provider == name:
            self._default_provider = next(iter(self._providers), None)
        logger.info(f"Unregistered provider: {name}")
        return True

    def get_providers(self) -> list[str]:
        """
        Get list of registered provider names.

        Returns:
            List of provider identifiers, in registration order
        """
        return list(self._providers)

    def get_provider(self, name: str) -> Optional[Any]:
        """
        Get a specific provider by name.

        Args:
            name: Provider identifier

        Returns:
            Provider instance or None
        """
        return self._providers.get(name)

    async def search(
        self,
        query: str,
        max_results: int = 10,
        provider: Optional[str] = None,
    ) -> list[SearchResult]:
        """
        Perform a search using a specific provider.

        Each returned result is stamped in place with the provider's
        registered name as ``source`` and a 1-based ``position``. Results
        that are neither dicts nor objects with a ``source`` attribute are
        left untouched.

        Args:
            query: Search query string
            max_results: Maximum results to return (passed to the provider)
            provider: Provider to use (defaults to default provider)

        Returns:
            List of search results

        Raises:
            ValueError: If no provider is configured or the name is unknown
            Exception: Whatever the provider raises is logged and re-raised
        """
        provider_name = provider or self._default_provider

        if provider_name is None:
            raise ValueError("No search provider configured")

        if provider_name not in self._providers:
            raise ValueError(f"Provider '{provider_name}' not found")

        provider_instance = self._providers[provider_name]
        logger.info(f"Searching with provider '{provider_name}': {query}")

        try:
            results = await provider_instance.search(query, max_results)

            # Ensure results have proper source attribution
            for i, result in enumerate(results, start=1):
                if isinstance(result, dict):
                    result["source"] = provider_name
                    result["position"] = i
                elif hasattr(result, "source"):
                    result.source = provider_name
                    result.position = i

            return results

        except Exception as e:
            logger.error(f"Search failed with provider '{provider_name}': {e}")
            raise

    async def search_all(
        self,
        query: str,
        max_results_per_provider: int = 10,
        providers: Optional[list[str]] = None,
    ) -> list[SearchResult]:
        """
        Search across multiple providers and aggregate results.

        Providers are queried one after another. A provider that fails is
        logged with a warning and skipped.

        Args:
            query: Search query string
            max_results_per_provider: Max results from each provider
            providers: Specific providers to use; ``None`` or an empty list
                means every registered provider

        Returns:
            Aggregated and ranked list of results
        """
        provider_names = providers or list(self._providers)
        all_results: list[SearchResult] = []

        for provider_name in provider_names:
            try:
                results = await self.search(
                    query=query,
                    max_results=max_results_per_provider,
                    provider=provider_name,
                )
            except Exception as e:
                logger.warning(f"Provider '{provider_name}' failed: {e}")
                continue
            all_results.extend(results)

        return self._rank_results(all_results)

    @staticmethod
    def _field(result: Any, name: str, default: Any) -> Any:
        """Read ``name`` from an attribute-style result, else via ``.get``."""
        if hasattr(result, name):
            return getattr(result, name)
        return result.get(name, default)

    def _rank_results(
        self,
        results: list[SearchResult],
    ) -> list[SearchResult]:
        """
        Rank and deduplicate search results.

        Results are ordered by score (higher first), then by position (lower
        first). The sort is stable, so ties keep their input order. After
        sorting, only the first (best-ranked) result per URL is kept, so a
        URL returned by several providers is represented by its strongest
        hit rather than by whichever provider happened to be queried first.
        Results with an empty or missing URL are dropped. The survivors'
        ``position`` is rewritten in place to the final 1-based rank.

        Args:
            results: Raw results from multiple providers (objects or dicts)

        Returns:
            Ranked and deduplicated results
        """

        def sort_key(r: Any) -> tuple[float, int]:
            score = self._field(r, "score", 1.0)
            position = self._field(r, "position", 999)
            return (-score, position)

        # Sort BEFORE deduplicating so the copy that survives is the best one.
        ordered = sorted(results, key=sort_key)

        seen_urls: set[str] = set()
        unique_results: list[SearchResult] = []
        for result in ordered:
            url = self._field(result, "url", "")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            unique_results.append(result)

        # Update positions to reflect the merged ranking
        for i, result in enumerate(unique_results, start=1):
            if hasattr(result, "position"):
                result.position = i
            elif isinstance(result, dict):
                result["position"] = i

        return unique_results

    @property
    def is_initialized(self) -> bool:
        """Check if the router is initialized."""
        return self._initialized

    @property
    def default_provider(self) -> Optional[str]:
        """Get the default provider name."""
        return self._default_provider
|
backend/app/search/providers/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search providers for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
from app.search.providers.base import BaseSearchProvider
|
| 4 |
+
from app.search.providers.google import GoogleSearchProvider
|
| 5 |
+
from app.search.providers.bing import BingSearchProvider
|
| 6 |
+
from app.search.providers.duckduckgo import DuckDuckGoProvider
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"BaseSearchProvider",
|
| 10 |
+
"GoogleSearchProvider",
|
| 11 |
+
"BingSearchProvider",
|
| 12 |
+
"DuckDuckGoProvider",
|
| 13 |
+
]
|
backend/app/search/providers/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (516 Bytes). View file
|
|
|
backend/app/search/providers/__pycache__/base.cpython-314.pyc
ADDED
|
Binary file (4.65 kB). View file
|
|
|
backend/app/search/providers/__pycache__/bing.cpython-314.pyc
ADDED
|
Binary file (4.19 kB). View file
|
|
|
backend/app/search/providers/__pycache__/duckduckgo.cpython-314.pyc
ADDED
|
Binary file (6.63 kB). View file
|
|
|
backend/app/search/providers/__pycache__/google.cpython-314.pyc
ADDED
|
Binary file (4.65 kB). View file
|
|
|
backend/app/search/providers/base.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base search provider interface."""
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import Any, Optional
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class SearchResult:
    """Uniform record that a provider returns for one search hit."""

    title: str
    url: str
    snippet: str
    position: int = 0
    source: str = ""
    score: float = 1.0
    metadata: dict[str, Any] = field(default_factory=dict)


class BaseSearchProvider(ABC):
    """
    Common interface for search providers.

    Subclasses must implement ``search``. ``initialize`` and ``shutdown``
    have default implementations that only flip the initialized flag.
    """

    def __init__(self, api_key: Optional[str] = None) -> None:
        self._initialized: bool = False
        self.api_key = api_key

    async def initialize(self) -> None:
        """Mark the provider as initialized (subclasses may override)."""
        self._initialized = True

    async def shutdown(self) -> None:
        """Mark the provider as no longer initialized (subclasses may override)."""
        self._initialized = False

    @abstractmethod
    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Run a search query.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects
        """

    @property
    def name(self) -> str:
        """Class name with the words "Provider" and "Search" removed."""
        label = type(self).__name__
        for word in ("Provider", "Search"):
            label = label.replace(word, "")
        return label

    @property
    def is_initialized(self) -> bool:
        """Whether the initialized flag is currently set."""
        return self._initialized

    def health_check(self) -> bool:
        """Report health; currently this is just the initialized flag."""
        return self._initialized
|
backend/app/search/providers/bing.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bing Search provider (stub implementation)."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from app.search.providers.base import BaseSearchProvider, SearchResult
|
| 6 |
+
from app.utils.logging import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BingSearchProvider(BaseSearchProvider):
    """
    Bing provider placeholder targeting the Bing Web Search API.

    No live API call is made yet: ``search`` always returns stub results,
    whether or not an API key is supplied. To prepare for the real API:
    1. Get an API key from Azure Portal (Bing Search resource)
    2. Set the BING_API_KEY environment variable

    Environment variables:
        BING_API_KEY: Bing Search API key
    """

    def __init__(self, api_key: Optional[str] = None) -> None:
        super().__init__(api_key)
        self._base_url = "https://api.bing.microsoft.com/v7.0/search"

    async def initialize(self) -> None:
        """Mark the provider initialized, warning when no API key is set."""
        logger.info("Initializing BingSearchProvider")

        key_missing = not self.api_key
        if key_missing:
            logger.warning("Bing API key not configured - stub mode enabled")

        self._initialized = True
        logger.info("BingSearchProvider initialized")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Return stub results for ``query``.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects (at most three stub entries)
        """
        logger.info(f"Bing search: {query}")

        if not self.api_key:
            logger.warning("Bing Search not configured, returning stub results")

        # The live call is not wired in. The intended shape is a GET to
        # self._base_url with an "Ocp-Apim-Subscription-Key" header and the
        # params q / count / responseFilter="Webpages", reading
        # data["webPages"]["value"] items (name, url, snippet) into
        # SearchResult(source="bing"). Until then both the configured and
        # unconfigured paths fall through to the stub.
        return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Build up to three placeholder results flagged with ``stub: True``."""
        return [
            SearchResult(
                title=f"Bing Result {i + 1}: {query}",
                url=f"https://example.com/bing/{i + 1}",
                snippet=f"This is a stub Bing search result for '{query}'. "
                f"Configure BING_API_KEY for real results.",
                position=i + 1,
                source="bing",
                metadata={"stub": True},
            )
            for i in range(min(max_results, 3))
        ]
|
backend/app/search/providers/duckduckgo.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DuckDuckGo Search provider using duckduckgo-search library."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from app.search.providers.base import BaseSearchProvider, SearchResult
|
| 6 |
+
from app.utils.logging import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DuckDuckGoProvider(BaseSearchProvider):
    """
    DuckDuckGo Search provider using the duckduckgo-search library.

    This provider works without an API key. When the library is not
    installed the provider still initializes and serves stub results.

    Requirements:
        pip install duckduckgo-search
    """

    def __init__(self) -> None:
        super().__init__(api_key=None)
        # DDGS client instance; stays None in stub mode and after shutdown.
        self._ddgs: Optional[object] = None

    async def initialize(self) -> None:
        """Create the DDGS client, or fall back to stub mode if it is missing."""
        logger.info("Initializing DuckDuckGoProvider")

        try:
            # Imported lazily so the module loads without the optional dependency.
            from duckduckgo_search import DDGS

            self._ddgs = DDGS()
            self._initialized = True
            logger.info("DuckDuckGoProvider initialized with duckduckgo-search")
        except ImportError:
            logger.warning(
                "duckduckgo-search not installed. "
                "Install with: pip install duckduckgo-search"
            )
            self._initialized = True  # Still mark as initialized for stub mode
            logger.info("DuckDuckGoProvider initialized in stub mode")

    async def shutdown(self) -> None:
        """Drop the DDGS client and clear the initialized flag."""
        self._ddgs = None
        self._initialized = False
        logger.info("DuckDuckGoProvider shut down")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Search using DuckDuckGo.

        Falls back to stub results when no client is available or when the
        library call raises.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects
        """
        import asyncio

        logger.info(f"DuckDuckGo search: {query}")

        # Capture the client once: the worker thread must not re-read
        # self._ddgs, which shutdown() may reset to None in the meantime.
        ddgs = self._ddgs
        if ddgs is None:
            logger.warning("DuckDuckGo not available, returning stub results")
            return self._get_stub_results(query, max_results)

        try:
            # duckduckgo-search is synchronous; run it in a worker thread so
            # the event loop is not blocked.
            raw_results = await asyncio.to_thread(
                lambda: list(ddgs.text(query, max_results=max_results))  # type: ignore
            )

            results = [
                SearchResult(
                    title=item.get("title", ""),
                    url=item.get("href", item.get("link", "")),
                    snippet=item.get("body", item.get("snippet", "")),
                    position=i + 1,
                    source="duckduckgo",
                    metadata={
                        "raw": item,
                    },
                )
                for i, item in enumerate(raw_results)
            ]

            logger.info(f"DuckDuckGo returned {len(results)} results")
            return results

        except Exception as e:
            # NOTE(review): failures are masked by placeholder results; callers
            # can only tell them apart via metadata["stub"].
            logger.error(f"DuckDuckGo search failed: {e}")
            return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Build up to three placeholder results flagged with ``stub: True``."""
        return [
            SearchResult(
                title=f"DuckDuckGo Result {i + 1}: {query}",
                url=f"https://example.com/ddg/{i + 1}",
                snippet=f"This is a stub DuckDuckGo search result for '{query}'. "
                f"Install duckduckgo-search for real results.",
                position=i + 1,
                source="duckduckgo",
                metadata={"stub": True},
            )
            for i in range(min(max_results, 3))
        ]

    @property
    def is_available(self) -> bool:
        """Check if a DDGS client is currently held."""
        return self._ddgs is not None
|
backend/app/search/providers/google.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Google Search provider (stub implementation)."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from app.search.providers.base import BaseSearchProvider, SearchResult
|
| 6 |
+
from app.utils.logging import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class GoogleSearchProvider(BaseSearchProvider):
    """
    Google provider placeholder targeting the Custom Search API.

    No live API call is made yet: ``search`` always returns stub results,
    whether or not credentials are supplied. To prepare for the real API:
    1. Get API key from Google Cloud Console
    2. Create a Custom Search Engine (CSE)
    3. Get the Search Engine ID (cx)

    Environment variables:
        GOOGLE_API_KEY: Google Cloud API key
        GOOGLE_CSE_ID: Custom Search Engine ID
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        search_engine_id: Optional[str] = None,
    ) -> None:
        super().__init__(api_key)
        self._base_url = "https://www.googleapis.com/customsearch/v1"
        self.search_engine_id = search_engine_id

    async def initialize(self) -> None:
        """Mark the provider initialized, warning about each missing credential."""
        logger.info("Initializing GoogleSearchProvider")

        key_missing = not self.api_key
        if key_missing:
            logger.warning("Google API key not configured - stub mode enabled")

        cse_missing = not self.search_engine_id
        if cse_missing:
            logger.warning("Google CSE ID not configured - stub mode enabled")

        self._initialized = True
        logger.info("GoogleSearchProvider initialized")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Return stub results for ``query``.

        Args:
            query: Search query string
            max_results: Maximum number of results (max 10 per request)

        Returns:
            List of SearchResult objects (at most three stub entries)
        """
        logger.info(f"Google search: {query}")

        fully_configured = bool(self.api_key and self.search_engine_id)
        if not fully_configured:
            logger.warning("Google Search not configured, returning stub results")

        # The live call is not wired in. The intended shape is a GET to
        # self._base_url with params key / cx / q / num=min(max_results, 10),
        # reading data["items"] (title, link, snippet) into
        # SearchResult(source="google"). Until then both the configured and
        # unconfigured paths fall through to the stub.
        return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Build up to three placeholder results flagged with ``stub: True``."""
        return [
            SearchResult(
                title=f"Google Result {i + 1}: {query}",
                url=f"https://example.com/google/{i + 1}",
                snippet=f"This is a stub Google search result for '{query}'. "
                f"Configure GOOGLE_API_KEY and GOOGLE_CSE_ID for real results.",
                position=i + 1,
                source="google",
                metadata={"stub": True},
            )
            for i in range(min(max_results, 3))
        ]
|
backend/app/tools/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tools module for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
from app.tools.registry import MCPToolRegistry
|
| 4 |
+
from app.tools.browser import BrowserTool
|
| 5 |
+
from app.tools.search import SearchTool
|
| 6 |
+
from app.tools.html import (
|
| 7 |
+
parse_html,
|
| 8 |
+
clean_html,
|
| 9 |
+
extract_text,
|
| 10 |
+
semantic_chunk,
|
| 11 |
+
extract_links,
|
| 12 |
+
extract_tables,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"MCPToolRegistry",
|
| 17 |
+
"BrowserTool",
|
| 18 |
+
"SearchTool",
|
| 19 |
+
"parse_html",
|
| 20 |
+
"clean_html",
|
| 21 |
+
"extract_text",
|
| 22 |
+
"semantic_chunk",
|
| 23 |
+
"extract_links",
|
| 24 |
+
"extract_tables",
|
| 25 |
+
]
|
backend/app/tools/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (580 Bytes). View file
|
|
|
backend/app/tools/__pycache__/browser.cpython-314.pyc
ADDED
|
Binary file (13.2 kB). View file
|
|
|
backend/app/tools/__pycache__/html.cpython-314.pyc
ADDED
|
Binary file (437 Bytes). View file
|
|
|
backend/app/tools/__pycache__/registry.cpython-314.pyc
ADDED
|
Binary file (15.1 kB). View file
|
|
|
backend/app/tools/__pycache__/search.cpython-314.pyc
ADDED
|
Binary file (7.2 kB). View file
|
|
|
backend/app/tools/browser.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Browser automation tool for web scraping."""
|
| 2 |
+
|
| 3 |
+
from typing import Any, Optional
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from enum import Enum
|
| 6 |
+
|
| 7 |
+
from app.utils.logging import get_logger
|
| 8 |
+
|
| 9 |
+
logger = get_logger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BrowserType(Enum):
    """Browser engines that a ``BrowserConfig`` can select."""

    CHROMIUM = "chromium"
    FIREFOX = "firefox"
    WEBKIT = "webkit"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class BrowserConfig:
    """Settings used when creating a browser instance.

    Every field has a default, so ``BrowserConfig()`` is a usable
    configuration on its own.
    """

    browser_type: BrowserType = BrowserType.CHROMIUM  # engine to use
    headless: bool = True
    timeout: int = 30000  # milliseconds
    viewport_width: int = 1920
    viewport_height: int = 1080
    user_agent: Optional[str] = None  # None: no custom user agent given
    proxy: Optional[str] = None  # None: no proxy given
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class NavigationResult:
    """Outcome of asking the browser to navigate to a URL."""

    url: str
    status: int  # 0 is used when no response status is available
    title: str
    success: bool
    error: Optional[str] = None  # explanatory message, if any
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class ClickResult:
    """Outcome of an action aimed at the element matched by ``selector``."""

    selector: str
    success: bool
    error: Optional[str] = None  # explanatory message, if any
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class ScreenshotResult:
    """Outcome of a screenshot request, including the captured bytes."""

    data: bytes  # image payload; empty when nothing was captured
    format: str  # image format name, e.g. "png"
    width: int
    height: int
    success: bool
    error: Optional[str] = None  # explanatory message, if any
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class BrowserTool:
    """
    Browser automation tool using Playwright/Selenium.

    Stub implementation: it defines the interface only. No real browser is
    driven; each action logs the request and returns a canned result. The
    comments inside each method show the Playwright call a real
    implementation would make.
    """

    def __init__(self, config: Optional[BrowserConfig] = None) -> None:
        # Use default settings when no config is supplied.
        self.config = config if config else BrowserConfig()
        self._browser: Any = None
        self._context: Any = None
        self._page: Any = None
        self._initialized: bool = False

    async def initialize(self) -> None:
        """
        Mark the tool as ready for use.

        Note: This is a stub. Real implementation requires playwright:
            pip install playwright
            playwright install
        """
        engine = self.config.browser_type.value
        logger.info(f"Initializing browser: {engine}")
        # A real implementation would start Playwright here:
        #   from playwright.async_api import async_playwright
        #   self._playwright = await async_playwright().start()
        #   self._browser = await self._playwright.chromium.launch(headless=self.config.headless)
        self._initialized = True
        logger.info("Browser initialized (stub mode)")

    async def shutdown(self) -> None:
        """Drop the page, context and browser handles and mark the tool uninitialized."""
        logger.info("Shutting down browser")
        # A real implementation would ``await <handle>.close()`` before
        # clearing each handle, in this same page -> context -> browser order.
        for handle in ("_page", "_context", "_browser"):
            if getattr(self, handle):
                setattr(self, handle, None)
        self._initialized = False
        logger.info("Browser shutdown complete")

    async def navigate(
        self,
        url: str,
        wait_until: str = "domcontentloaded",
        timeout: Optional[int] = None,
    ) -> NavigationResult:
        """
        Navigate to a URL.

        Args:
            url: Target URL
            wait_until: Navigation wait condition (load, domcontentloaded, networkidle)
            timeout: Navigation timeout in milliseconds

        Returns:
            NavigationResult with status and details. In stub mode a
            successful result still carries a note in ``error``.
        """
        logger.info(f"Navigating to: {url}")

        if self._initialized:
            # Real implementation:
            #   response = await self._page.goto(url, wait_until=wait_until, timeout=timeout)
            #   return NavigationResult(
            #       url=self._page.url,
            #       status=response.status if response else 0,
            #       title=await self._page.title(),
            #       success=True,
            #   )
            return NavigationResult(
                url=url,
                status=200,
                title="Stub Page Title",
                success=True,
                error="Stub mode - no actual navigation",
            )

        return NavigationResult(
            url=url,
            status=0,
            title="",
            success=False,
            error="Browser not initialized",
        )

    async def click(
        self,
        selector: str,
        timeout: Optional[int] = None,
        force: bool = False,
    ) -> ClickResult:
        """
        Click an element on the page.

        Args:
            selector: CSS or XPath selector
            timeout: Click timeout in milliseconds
            force: Force click even if element is obscured

        Returns:
            ClickResult indicating success or failure
        """
        logger.info(f"Clicking element: {selector}")

        if self._initialized:
            # Real implementation:
            #   await self._page.click(selector, timeout=timeout, force=force)
            succeeded, note = True, "Stub mode - no actual click"
        else:
            succeeded, note = False, "Browser not initialized"

        return ClickResult(selector=selector, success=succeeded, error=note)

    async def fill(
        self,
        selector: str,
        value: str,
        timeout: Optional[int] = None,
    ) -> ClickResult:
        """
        Fill a form field with text.

        Args:
            selector: CSS or XPath selector
            value: Text to enter
            timeout: Action timeout in milliseconds

        Returns:
            ClickResult indicating success or failure
        """
        # The value itself is not included in the log message.
        logger.info(f"Filling element: {selector} with value")

        if self._initialized:
            # Real implementation:
            #   await self._page.fill(selector, value, timeout=timeout)
            succeeded, note = True, "Stub mode - no actual fill"
        else:
            succeeded, note = False, "Browser not initialized"

        return ClickResult(selector=selector, success=succeeded, error=note)

    async def get_html(
        self,
        selector: Optional[str] = None,
    ) -> str:
        """
        Get HTML content of the page or a specific element.

        Args:
            selector: Optional selector to get HTML of specific element

        Returns:
            HTML content as string; empty when the tool is not initialized
        """
        target = selector or 'full page'
        logger.info(f"Getting HTML for: {target}")

        # Real implementation:
        #   if selector:
        #       element = await self._page.query_selector(selector)
        #       return await element.inner_html() if element else ""
        #   return await self._page.content()
        if self._initialized:
            return "<html><body><h1>Stub HTML Content</h1></body></html>"
        return ""

    async def screenshot(
        self,
        selector: Optional[str] = None,
        full_page: bool = False,
        format: str = "png",
    ) -> ScreenshotResult:
        """
        Take a screenshot of the page or element.

        Args:
            selector: Optional selector to screenshot specific element
            full_page: Capture full scrollable page
            format: Image format (png, jpeg)

        Returns:
            ScreenshotResult with image data
        """
        logger.info(f"Taking screenshot: selector={selector}, full_page={full_page}")

        if self._initialized:
            # Real implementation:
            #   if selector:
            #       element = await self._page.query_selector(selector)
            #       data = await element.screenshot(type=format) if element else b""
            #   else:
            #       data = await self._page.screenshot(full_page=full_page, type=format)
            return ScreenshotResult(
                data=b"stub_screenshot_data",
                format=format,
                width=self.config.viewport_width,
                height=self.config.viewport_height,
                success=True,
                error="Stub mode - no actual screenshot",
            )

        return ScreenshotResult(
            data=b"",
            format=format,
            width=0,
            height=0,
            success=False,
            error="Browser not initialized",
        )

    async def evaluate(self, script: str) -> Any:
        """
        Execute JavaScript in the page context.

        Args:
            script: JavaScript code to execute

        Returns:
            Result of the script execution (always None in stub mode)
        """
        # Only the first 50 characters of the script are logged.
        logger.info(f"Evaluating script: {script[:50]}...")

        if self._initialized:
            # Real implementation:
            #   return await self._page.evaluate(script)
            return None
        return None

    async def wait_for_selector(
        self,
        selector: str,
        timeout: Optional[int] = None,
        state: str = "visible",
    ) -> bool:
        """
        Wait for an element to appear on the page.

        Args:
            selector: CSS or XPath selector
            timeout: Wait timeout in milliseconds
            state: Element state to wait for (visible, hidden, attached, detached)

        Returns:
            True if element found, False otherwise
        """
        logger.info(f"Waiting for selector: {selector}")

        # Real implementation:
        #   try:
        #       await self._page.wait_for_selector(selector, timeout=timeout, state=state)
        #       return True
        #   except TimeoutError:
        #       return False
        return bool(self._initialized)

    def health_check(self) -> bool:
        """Report health; in stub mode this is just the initialized flag."""
        return self._initialized

    @property
    def is_initialized(self) -> bool:
        """Whether ``initialize`` has run without a later ``shutdown``."""
        return self._initialized
|
backend/app/tools/html.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTML processing tools for web scraping.
|
| 2 |
+
|
| 3 |
+
Re-exports utilities from app.utils.html for tool registration.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from app.utils.html import (
|
| 7 |
+
parse_html,
|
| 8 |
+
clean_html,
|
| 9 |
+
extract_text,
|
| 10 |
+
semantic_chunk,
|
| 11 |
+
extract_links,
|
| 12 |
+
extract_tables,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"parse_html",
|
| 17 |
+
"clean_html",
|
| 18 |
+
"extract_text",
|
| 19 |
+
"semantic_chunk",
|
| 20 |
+
"extract_links",
|
| 21 |
+
"extract_tables",
|
| 22 |
+
]
|
backend/app/tools/registry.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MCP Tool Registry for dynamic tool discovery and management."""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
from typing import Any, Callable, Optional
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from enum import Enum
|
| 7 |
+
|
| 8 |
+
from app.utils.logging import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ToolStatus(Enum):
    """Lifecycle and health states a registered tool can be in."""

    UNKNOWN = "unknown"
    HEALTHY = "healthy"
    UNHEALTHY = "unhealthy"
    INITIALIZING = "initializing"
    SHUTDOWN = "shutdown"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ToolDefinition:
    """A tool's handler plus the descriptive data stored alongside it."""

    name: str
    description: str
    handler: Callable[..., Any]  # callable that implements the tool
    # default_factory gives each instance its own dict.
    parameters: dict[str, Any] = field(default_factory=dict)
    status: ToolStatus = ToolStatus.UNKNOWN
    metadata: dict[str, Any] = field(default_factory=dict)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class MCPToolRegistry:
    """
    Registry for MCP tools with dynamic discovery and execution.

    Tools are stored by unique name together with their handler, parameter
    schema and a ``ToolStatus``. ``initialize`` starts a background task that
    re-checks every tool's health at a fixed interval; ``shutdown`` cancels
    that task and marks every tool as shut down.
    """

    def __init__(self) -> None:
        self._tools: dict[str, ToolDefinition] = {}
        self._initialized: bool = False
        # Delay between background health sweeps (passed to asyncio.sleep).
        self._health_check_interval: float = 30.0
        self._health_check_task: Optional[asyncio.Task[None]] = None

    @staticmethod
    async def _resolve(value: Any) -> Any:
        """Return ``value``, awaiting it first when it is awaitable."""
        # Function-scope import: keeps this class self-contained without
        # touching the module's import block.
        import inspect

        if inspect.isawaitable(value):
            return await value
        return value

    async def initialize(self) -> None:
        """Initialize the registry and start health monitoring."""
        if self._initialized:
            logger.warning("Registry already initialized")
            return

        logger.info("Initializing MCP Tool Registry")

        # Start health check background task
        self._health_check_task = asyncio.create_task(self._health_check_loop())
        self._initialized = True

        logger.info("MCP Tool Registry initialized")

    async def shutdown(self) -> None:
        """Shutdown the registry and cleanup resources."""
        logger.info("Shutting down MCP Tool Registry")

        # Cancel the health check task and wait for it to finish.
        task = self._health_check_task
        if task:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
            # Drop the reference so a finished task is never reused.
            self._health_check_task = None

        # Mark all tools as shutdown
        for tool in self._tools.values():
            tool.status = ToolStatus.SHUTDOWN

        self._initialized = False
        logger.info("MCP Tool Registry shutdown complete")

    def register(
        self,
        name: str,
        handler: Callable[..., Any],
        description: str = "",
        parameters: Optional[dict[str, Any]] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> ToolDefinition:
        """
        Register a new tool with the registry.

        The tool starts in ``ToolStatus.INITIALIZING`` and keeps that status
        until a health check updates it.

        Args:
            name: Unique tool name
            handler: Callable that implements the tool
            description: Human-readable description
            parameters: JSON schema for tool parameters
            metadata: Additional tool metadata

        Returns:
            The registered ToolDefinition

        Raises:
            ValueError: If a tool with the same name already exists
        """
        if name in self._tools:
            raise ValueError(f"Tool '{name}' is already registered")

        tool = ToolDefinition(
            name=name,
            description=description,
            handler=handler,
            parameters=parameters or {},
            status=ToolStatus.INITIALIZING,
            metadata=metadata or {},
        )

        self._tools[name] = tool
        logger.info(f"Registered tool: {name}")

        return tool

    def unregister(self, name: str) -> bool:
        """
        Unregister a tool from the registry.

        Args:
            name: Tool name to unregister

        Returns:
            True if tool was removed, False if not found
        """
        if name in self._tools:
            del self._tools[name]
            logger.info(f"Unregistered tool: {name}")
            return True
        return False

    def get(self, name: str) -> Optional[ToolDefinition]:
        """
        Get a tool definition by name.

        Args:
            name: Tool name to retrieve

        Returns:
            ToolDefinition if found, None otherwise
        """
        return self._tools.get(name)

    def list_tools(
        self,
        include_unhealthy: bool = False,
    ) -> list[ToolDefinition]:
        """
        List all registered tools.

        Args:
            include_unhealthy: Also include tools whose status is
                UNHEALTHY or SHUTDOWN

        Returns:
            List of tool definitions
        """
        tools = list(self._tools.values())

        if not include_unhealthy:
            tools = [
                t for t in tools
                if t.status not in (ToolStatus.UNHEALTHY, ToolStatus.SHUTDOWN)
            ]

        return tools

    async def execute(
        self,
        name: str,
        **kwargs: Any,
    ) -> Any:
        """
        Execute a tool by name with the given parameters.

        The handler is called with ``**kwargs``. If the call returns an
        awaitable it is awaited, so ``async def`` functions, partials of them
        and callable objects with an async ``__call__`` all work.

        Args:
            name: Tool name to execute
            **kwargs: Tool parameters

        Returns:
            Tool execution result

        Raises:
            KeyError: If tool is not found
            RuntimeError: If tool is unhealthy or has been shut down
        """
        tool = self.get(name)

        if tool is None:
            raise KeyError(f"Tool '{name}' not found")

        if tool.status == ToolStatus.UNHEALTHY:
            raise RuntimeError(f"Tool '{name}' is unhealthy")

        if tool.status == ToolStatus.SHUTDOWN:
            raise RuntimeError(f"Tool '{name}' has been shut down")

        logger.debug(f"Executing tool: {name} with params: {kwargs}")

        try:
            # Decide on the call's result, not the handler's type: that also
            # covers callables that only return an awaitable at call time.
            return await self._resolve(tool.handler(**kwargs))
        except Exception as e:
            # Log for visibility, then let the caller see the original error.
            logger.error(f"Tool execution failed: {name} - {e}")
            raise

    async def health_check(self, name: str) -> ToolStatus:
        """
        Check the health of a specific tool.

        If the handler exposes a ``health_check`` attribute it is called (and
        awaited when awaitable). The tool becomes UNHEALTHY when that call
        raises or explicitly returns ``False``; otherwise it becomes HEALTHY.
        Handlers without ``health_check`` are treated as healthy.

        Args:
            name: Tool name to check

        Returns:
            Current tool status (UNKNOWN when the tool is not registered)
        """
        tool = self.get(name)
        if tool is None:
            return ToolStatus.UNKNOWN

        try:
            handler = tool.handler
            verdict: Any = None
            if hasattr(handler, "health_check"):
                verdict = await self._resolve(handler.health_check())
        except Exception as e:
            logger.warning(f"Health check failed for {name}: {e}")
            tool.status = ToolStatus.UNHEALTHY
        else:
            # Only an explicit False counts as failure; a check that returns
            # nothing (None) signals success simply by not raising.
            if verdict is False:
                logger.warning(f"Health check failed for {name}: check returned False")
                tool.status = ToolStatus.UNHEALTHY
            else:
                tool.status = ToolStatus.HEALTHY

        return tool.status

    async def health_check_all(self) -> dict[str, ToolStatus]:
        """
        Check health of all registered tools.

        Returns:
            Dictionary mapping tool names to their status
        """
        results: dict[str, ToolStatus] = {}

        # Iterate over a snapshot of the names: each check awaits, and a tool
        # registered or unregistered meanwhile would otherwise raise
        # "dictionary changed size during iteration".
        for name in list(self._tools):
            results[name] = await self.health_check(name)

        return results

    async def _health_check_loop(self) -> None:
        """Background task for periodic health checks."""
        while True:
            try:
                await asyncio.sleep(self._health_check_interval)
                await self.health_check_all()
            except asyncio.CancelledError:
                # Cancellation (from shutdown) ends the loop.
                break
            except Exception as e:
                # Keep monitoring even if one sweep fails.
                logger.error(f"Health check loop error: {e}")

    def get_tool_schema(self, name: str) -> Optional[dict[str, Any]]:
        """
        Get the JSON schema for a tool's parameters.

        Args:
            name: Tool name

        Returns:
            Dict with ``name``, ``description`` and ``parameters`` keys, or
            None if the tool is not found
        """
        tool = self.get(name)
        if tool is None:
            return None

        return {
            "name": tool.name,
            "description": tool.description,
            "parameters": tool.parameters,
        }

    def list_schemas(self) -> list[dict[str, Any]]:
        """
        Get schemas for all registered tools.

        Returns:
            List of tool schema dictionaries
        """
        return [
            schema
            for name in self._tools
            if (schema := self.get_tool_schema(name))
        ]

    @property
    def is_initialized(self) -> bool:
        """Check if the registry has been initialized."""
        return self._initialized

    @property
    def tool_count(self) -> int:
        """Get the number of registered tools."""
        return len(self._tools)
|
backend/app/tools/search.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search tool wrapper for search engine providers."""
|
| 2 |
+
|
| 3 |
+
from typing import Any, Optional
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
from app.utils.logging import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
|
| 12 |
+
class SearchResult:
|
| 13 |
+
"""Individual search result."""
|
| 14 |
+
|
| 15 |
+
title: str
|
| 16 |
+
url: str
|
| 17 |
+
snippet: str
|
| 18 |
+
position: int
|
| 19 |
+
source: str # Provider name
|
| 20 |
+
metadata: dict[str, Any] | None = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class SearchResponse:
    """Everything returned to the caller for one search query."""

    query: str
    results: list[SearchResult]
    total_results: int
    provider: str
    success: bool
    error: Optional[str] = None  # explanatory message, if any
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class SearchTool:
|
| 36 |
+
"""
|
| 37 |
+
Search tool that wraps search engine providers.
|
| 38 |
+
|
| 39 |
+
Provides a unified interface for searching across different
|
| 40 |
+
search engine providers.
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(self, default_provider: str = "duckduckgo") -> None:
|
| 44 |
+
self.default_provider = default_provider
|
| 45 |
+
self._engine: Any = None
|
| 46 |
+
self._initialized: bool = False
|
| 47 |
+
|
| 48 |
+
async def initialize(self, engine: Any = None) -> None:
    """
    Attach a search engine and mark the tool as initialized.

    Args:
        engine: SearchEngineRouter instance to use. The tool is flagged
            initialized even when this is left as None.
    """
    logger.info("Initializing SearchTool")
    self._engine, self._initialized = engine, True
    logger.info("SearchTool initialized")
|
| 59 |
+
|
| 60 |
+
async def shutdown(self) -> None:
    """Detach the search engine and mark the tool as uninitialized."""
    logger.info("Shutting down SearchTool")
    self._engine, self._initialized = None, False
|
| 65 |
+
|
| 66 |
+
async def search(
|
| 67 |
+
self,
|
| 68 |
+
query: str,
|
| 69 |
+
max_results: int = 10,
|
| 70 |
+
provider: Optional[str] = None,
|
| 71 |
+
) -> SearchResponse:
|
| 72 |
+
"""
|
| 73 |
+
Perform a search query.
|
| 74 |
+
|
| 75 |
+
Args:
|
| 76 |
+
query: Search query string
|
| 77 |
+
max_results: Maximum number of results to return
|
| 78 |
+
provider: Specific provider to use (optional)
|
| 79 |
+
|
| 80 |
+
Returns:
|
| 81 |
+
SearchResponse with results
|
| 82 |
+
"""
|
| 83 |
+
logger.info(f"Searching for: {query}")
|
| 84 |
+
|
| 85 |
+
provider_name = provider or self.default_provider
|
| 86 |
+
|
| 87 |
+
if not self._initialized or self._engine is None:
|
| 88 |
+
logger.warning("SearchTool not properly initialized, using stub response")
|
| 89 |
+
return SearchResponse(
|
| 90 |
+
query=query,
|
| 91 |
+
results=[],
|
| 92 |
+
total_results=0,
|
| 93 |
+
provider=provider_name,
|
| 94 |
+
success=False,
|
| 95 |
+
error="Search engine not initialized",
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
# Delegate to search engine router
|
| 100 |
+
results = await self._engine.search(
|
| 101 |
+
query=query,
|
| 102 |
+
max_results=max_results,
|
| 103 |
+
provider=provider_name,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
return SearchResponse(
|
| 107 |
+
query=query,
|
| 108 |
+
results=results,
|
| 109 |
+
total_results=len(results),
|
| 110 |
+
provider=provider_name,
|
| 111 |
+
success=True,
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
except Exception as e:
|
| 115 |
+
logger.error(f"Search failed: {e}")
|
| 116 |
+
return SearchResponse(
|
| 117 |
+
query=query,
|
| 118 |
+
results=[],
|
| 119 |
+
total_results=0,
|
| 120 |
+
provider=provider_name,
|
| 121 |
+
success=False,
|
| 122 |
+
error=str(e),
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
async def get_results(
|
| 126 |
+
self,
|
| 127 |
+
query: str,
|
| 128 |
+
max_results: int = 10,
|
| 129 |
+
provider: Optional[str] = None,
|
| 130 |
+
) -> list[SearchResult]:
|
| 131 |
+
"""
|
| 132 |
+
Get search results as a list.
|
| 133 |
+
|
| 134 |
+
Args:
|
| 135 |
+
query: Search query string
|
| 136 |
+
max_results: Maximum number of results to return
|
| 137 |
+
provider: Specific provider to use (optional)
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
List of SearchResult objects
|
| 141 |
+
"""
|
| 142 |
+
response = await self.search(query, max_results, provider)
|
| 143 |
+
return response.results
|
| 144 |
+
|
| 145 |
+
def health_check(self) -> bool:
|
| 146 |
+
"""Check if the search tool is healthy."""
|
| 147 |
+
return self._initialized and self._engine is not None
|
| 148 |
+
|
| 149 |
+
@property
|
| 150 |
+
def is_initialized(self) -> bool:
|
| 151 |
+
"""Check if the search tool has been initialized."""
|
| 152 |
+
return self._initialized
|