NeerajCodz committed on
Commit
afefaea
·
1 Parent(s): bb3ee41

feat: add MCP tool registry and search engine integration

Browse files
backend/app/search/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search module for ScrapeRL backend."""
2
+
3
+ from app.search.engine import SearchEngineRouter
4
+ from app.search.providers import (
5
+ BaseSearchProvider,
6
+ GoogleSearchProvider,
7
+ BingSearchProvider,
8
+ DuckDuckGoProvider,
9
+ )
10
+
11
+ __all__ = [
12
+ "SearchEngineRouter",
13
+ "BaseSearchProvider",
14
+ "GoogleSearchProvider",
15
+ "BingSearchProvider",
16
+ "DuckDuckGoProvider",
17
+ ]
backend/app/search/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (443 Bytes). View file
 
backend/app/search/__pycache__/engine.cpython-314.pyc ADDED
Binary file (13.4 kB). View file
 
backend/app/search/engine.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search engine router for aggregating multiple search providers."""
2
+
3
+ from typing import Any, Optional
4
+ from dataclasses import dataclass, field
5
+
6
+ from app.utils.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
@dataclass
class SearchResult:
    """Individual search result.

    Attributes:
        title: Title of the result.
        url: Address of the result.
        snippet: Text excerpt shown for the result.
        position: Rank of the result within its result list.
        source: Name of the provider the result is attributed to.
        score: Ranking score; defaults to 1.0.
        metadata: Free-form extra data; every instance gets its own dict.
    """

    title: str
    url: str
    snippet: str
    position: int
    source: str
    score: float = 1.0
    # default_factory avoids sharing one dict between all instances.
    metadata: dict[str, Any] = field(default_factory=dict)
22
+
23
+
24
class SearchEngineRouter:
    """
    Routes search queries to registered providers and aggregates results.

    Providers are registered under a name; one of them is the default that
    ``search`` uses when no provider is named. ``search_all`` queries several
    providers and merges their results into one ranked, de-duplicated list.
    """

    def __init__(self) -> None:
        # Maps provider name -> provider instance.
        self._providers: dict[str, Any] = {}
        self._default_provider: Optional[str] = None
        self._initialized: bool = False

    async def initialize(self) -> None:
        """Initialize the search engine router and all providers.

        A provider whose ``initialize`` raises is logged and skipped; the
        router is marked initialized regardless.
        """
        logger.info("Initializing SearchEngineRouter")

        # Iterate over a snapshot: the registry may be modified while a
        # provider's initialize() is being awaited.
        for name, provider in list(self._providers.items()):
            try:
                if hasattr(provider, "initialize"):
                    await provider.initialize()
                    logger.info(f"Initialized provider: {name}")
            except Exception as e:
                logger.error(f"Failed to initialize provider {name}: {e}")

        self._initialized = True
        logger.info("SearchEngineRouter initialized")

    async def shutdown(self) -> None:
        """Shutdown the router and all providers.

        Errors raised by a provider's ``shutdown`` are logged and do not stop
        the remaining providers from being shut down.
        """
        logger.info("Shutting down SearchEngineRouter")

        # Snapshot for the same reason as in initialize().
        for name, provider in list(self._providers.items()):
            try:
                if hasattr(provider, "shutdown"):
                    await provider.shutdown()
                    logger.info(f"Shut down provider: {name}")
            except Exception as e:
                logger.error(f"Error shutting down provider {name}: {e}")

        self._initialized = False

    def register_provider(
        self,
        name: str,
        provider: Any,
        set_default: bool = False,
    ) -> None:
        """
        Register a search provider.

        The first provider ever registered becomes the default even when
        ``set_default`` is False. Registering under an existing name replaces
        the previous provider.

        Args:
            name: Provider identifier
            provider: Provider instance
            set_default: Set as the default provider
        """
        self._providers[name] = provider
        logger.info(f"Registered search provider: {name}")

        if set_default or self._default_provider is None:
            self._default_provider = name
            logger.info(f"Set default provider: {name}")

    def unregister_provider(self, name: str) -> bool:
        """
        Unregister a search provider.

        If the removed provider was the default, the first remaining provider
        (in registration order) becomes the default, or None if none remain.

        Args:
            name: Provider identifier

        Returns:
            True if provider was removed
        """
        if name not in self._providers:
            return False

        del self._providers[name]
        if self._default_provider == name:
            self._default_provider = next(iter(self._providers), None)
        logger.info(f"Unregistered provider: {name}")
        return True

    def get_providers(self) -> list[str]:
        """
        Get list of registered provider names.

        Returns:
            List of provider identifiers
        """
        return list(self._providers)

    def get_provider(self, name: str) -> Optional[Any]:
        """
        Get a specific provider by name.

        Args:
            name: Provider identifier

        Returns:
            Provider instance or None
        """
        return self._providers.get(name)

    async def search(
        self,
        query: str,
        max_results: int = 10,
        provider: Optional[str] = None,
    ) -> list[SearchResult]:
        """
        Perform a search using a specific provider.

        Every returned result is re-attributed to the provider name used by
        the router and renumbered 1..n. At most ``max_results`` results are
        returned even if the provider yields more.

        Args:
            query: Search query string
            max_results: Maximum results to return
            provider: Provider to use (defaults to default provider)

        Returns:
            List of search results

        Raises:
            ValueError: If no provider is configured or the provider is
                not registered. Exceptions raised by the provider itself
                are logged and re-raised unchanged.
        """
        provider_name = provider or self._default_provider

        if provider_name is None:
            raise ValueError("No search provider configured")

        if provider_name not in self._providers:
            raise ValueError(f"Provider '{provider_name}' not found")

        provider_instance = self._providers[provider_name]
        logger.info(f"Searching with provider '{provider_name}': {query}")

        try:
            raw_results = await provider_instance.search(query, max_results)

            # Enforce the documented cap; max(…, 0) keeps a negative limit
            # from turning into a "drop the last k items" slice.
            results = list(raw_results)[: max(max_results, 0)]

            # Ensure results have proper source attribution
            for rank, result in enumerate(results, start=1):
                if isinstance(result, dict):
                    result["source"] = provider_name
                    result["position"] = rank
                elif hasattr(result, "source"):
                    result.source = provider_name
                    result.position = rank

            return results

        except Exception as e:
            logger.error(f"Search failed with provider '{provider_name}': {e}")
            raise

    async def search_all(
        self,
        query: str,
        max_results_per_provider: int = 10,
        providers: Optional[list[str]] = None,
    ) -> list[SearchResult]:
        """
        Search across multiple providers and aggregate results.

        Providers are queried concurrently. A provider that fails is logged
        and left out of the aggregate; it does not fail the whole call.

        Args:
            query: Search query string
            max_results_per_provider: Max results from each provider
            providers: Specific providers to use (defaults to all)

        Returns:
            Aggregated and ranked list of results
        """
        import asyncio

        provider_names = providers or list(self._providers)

        # gather() returns outcomes in the order of provider_names, so the
        # merged list is built in the same order as a sequential loop would.
        outcomes = await asyncio.gather(
            *(
                self.search(
                    query=query,
                    max_results=max_results_per_provider,
                    provider=provider_name,
                )
                for provider_name in provider_names
            ),
            return_exceptions=True,
        )

        all_results: list[SearchResult] = []
        for provider_name, outcome in zip(provider_names, outcomes):
            if isinstance(outcome, Exception):
                logger.warning(f"Provider '{provider_name}' failed: {outcome}")
                continue
            if isinstance(outcome, BaseException):
                # e.g. CancelledError: not a provider failure, propagate it.
                raise outcome
            all_results.extend(outcome)

        # Rank and deduplicate results
        return self._rank_results(all_results)

    @staticmethod
    def _result_field(result: Any, name: str, default: Any) -> Any:
        """Read ``name`` from a dict-style or attribute-style result."""
        if isinstance(result, dict):
            return result.get(name, default)
        return getattr(result, name, default)

    def _rank_results(
        self,
        results: list[SearchResult],
    ) -> list[SearchResult]:
        """
        Rank and deduplicate search results.

        Results are ordered by score (higher first), then by position (lower
        first). For results sharing a URL only the best-ranked copy is kept.
        Results without a URL are dropped. Positions of the returned results
        are rewritten to 1..n.

        Args:
            results: Raw results from multiple providers

        Returns:
            Ranked and deduplicated results
        """
        field_of = self._result_field

        def sort_key(r: Any) -> tuple[float, int]:
            return (-field_of(r, "score", 1.0), field_of(r, "position", 999))

        # Sort before de-duplicating so that, for a URL returned by several
        # providers, the best-ranked copy is the one that survives. sorted()
        # is stable, so ties keep their input order.
        seen_urls: set[str] = set()
        unique_results: list[SearchResult] = []
        for result in sorted(results, key=sort_key):
            url = field_of(result, "url", "")
            if url and url not in seen_urls:
                seen_urls.add(url)
                unique_results.append(result)

        # Update positions
        for rank, result in enumerate(unique_results, start=1):
            if isinstance(result, dict):
                result["position"] = rank
            elif hasattr(result, "position"):
                result.position = rank

        return unique_results

    @property
    def is_initialized(self) -> bool:
        """Check if the router is initialized."""
        return self._initialized

    @property
    def default_provider(self) -> Optional[str]:
        """Get the default provider name."""
        return self._default_provider
backend/app/search/providers/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search providers for ScrapeRL backend."""
2
+
3
+ from app.search.providers.base import BaseSearchProvider
4
+ from app.search.providers.google import GoogleSearchProvider
5
+ from app.search.providers.bing import BingSearchProvider
6
+ from app.search.providers.duckduckgo import DuckDuckGoProvider
7
+
8
+ __all__ = [
9
+ "BaseSearchProvider",
10
+ "GoogleSearchProvider",
11
+ "BingSearchProvider",
12
+ "DuckDuckGoProvider",
13
+ ]
backend/app/search/providers/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (516 Bytes). View file
 
backend/app/search/providers/__pycache__/base.cpython-314.pyc ADDED
Binary file (4.65 kB). View file
 
backend/app/search/providers/__pycache__/bing.cpython-314.pyc ADDED
Binary file (4.19 kB). View file
 
backend/app/search/providers/__pycache__/duckduckgo.cpython-314.pyc ADDED
Binary file (6.63 kB). View file
 
backend/app/search/providers/__pycache__/google.cpython-314.pyc ADDED
Binary file (4.65 kB). View file
 
backend/app/search/providers/base.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base search provider interface."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class SearchResult:
    """Standard search result format.

    Attributes:
        title: Title of the result.
        url: Address of the result.
        snippet: Text excerpt shown for the result.
        position: Rank of the result within its result list; defaults to 0.
        source: Name of the provider that produced the result; defaults
            to an empty string.
        score: Ranking score; defaults to 1.0.
        metadata: Free-form extra data; every instance gets its own dict.
    """

    title: str
    url: str
    snippet: str
    position: int = 0
    source: str = ""
    score: float = 1.0
    # default_factory avoids sharing one dict between all instances.
    metadata: dict[str, Any] = field(default_factory=dict)
19
+
20
+
21
class BaseSearchProvider(ABC):
    """
    Abstract base class for search providers.

    Subclasses must implement ``search``. ``initialize`` and ``shutdown``
    have default implementations that only toggle the initialized flag.
    """

    def __init__(self, api_key: Optional[str] = None) -> None:
        self.api_key = api_key
        self._initialized: bool = False

    async def initialize(self) -> None:
        """Initialize the provider (optional override)."""
        self._initialized = True

    async def shutdown(self) -> None:
        """Shutdown the provider (optional override)."""
        self._initialized = False

    @abstractmethod
    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Perform a search query.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects
        """

    @property
    def name(self) -> str:
        """Provider name for identification.

        Derived from the class name with every occurrence of "Provider" and
        "Search" removed, e.g. ``GoogleSearchProvider`` -> ``"Google"``.
        """
        return self.__class__.__name__.replace("Provider", "").replace("Search", "")

    @property
    def is_initialized(self) -> bool:
        """Check if provider is initialized."""
        return self._initialized

    def health_check(self) -> bool:
        """Check provider health (currently just the initialized flag)."""
        return self._initialized
backend/app/search/providers/bing.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bing Search provider (stub implementation)."""
2
+
3
+ from typing import Optional
4
+
5
+ from app.search.providers.base import BaseSearchProvider, SearchResult
6
+ from app.utils.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class BingSearchProvider(BaseSearchProvider):
    """
    Bing Search provider using Bing Web Search API.

    This is a stub implementation: ``search`` always returns placeholder
    results, whether or not an API key is set. To use Bing Search API:
    1. Get API key from Azure Portal (Bing Search resource)
    2. Set the BING_API_KEY environment variable

    Environment variables:
        BING_API_KEY: Bing Search API key

    NOTE(review): this class only receives the key through ``api_key``;
    reading BING_API_KEY from the environment is presumably done by the
    caller - confirm.
    """

    def __init__(self, api_key: Optional[str] = None) -> None:
        super().__init__(api_key)
        self._base_url = "https://api.bing.microsoft.com/v7.0/search"

    async def initialize(self) -> None:
        """Initialize the Bing Search provider."""
        logger.info("Initializing BingSearchProvider")

        if not self.api_key:
            logger.warning("Bing API key not configured - stub mode enabled")

        self._initialized = True
        logger.info("BingSearchProvider initialized")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Search using Bing Web Search API.

        Currently returns stub results in every case; a warning is logged
        so that stub data is never served silently.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects
        """
        logger.info(f"Bing search: {query}")

        if not self.api_key:
            logger.warning("Bing Search not configured, returning stub results")
            return self._get_stub_results(query, max_results)

        # TODO: live implementation. Sketch: GET self._base_url with header
        # "Ocp-Apim-Subscription-Key": self.api_key and params
        # {"q": query, "count": max_results, "responseFilter": "Webpages"},
        # then map each item of data["webPages"]["value"] to a SearchResult
        # (name -> title, url -> url, snippet -> snippet, source="bing").
        logger.warning(
            "Bing live search is not implemented yet, returning stub results"
        )
        return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Generate at most three stub results for testing."""
        return [
            SearchResult(
                title=f"Bing Result {rank}: {query}",
                url=f"https://example.com/bing/{rank}",
                snippet=f"This is a stub Bing search result for '{query}'. "
                f"Configure BING_API_KEY for real results.",
                position=rank,
                source="bing",
                # Lets consumers tell placeholder data from real hits.
                metadata={"stub": True},
            )
            for rank in range(1, min(max_results, 3) + 1)
        ]
backend/app/search/providers/duckduckgo.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DuckDuckGo Search provider using duckduckgo-search library."""
2
+
3
+ from typing import Optional
4
+
5
+ from app.search.providers.base import BaseSearchProvider, SearchResult
6
+ from app.utils.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class DuckDuckGoProvider(BaseSearchProvider):
    """
    DuckDuckGo Search provider using the duckduckgo-search library.

    This provider works without an API key. When the library is not
    installed the provider still initializes and serves stub results.

    Requirements:
        pip install duckduckgo-search
    """

    def __init__(self) -> None:
        super().__init__(api_key=None)
        # DDGS client; stays None until initialize() succeeds in importing
        # the library (None means "stub mode").
        self._ddgs: Optional[object] = None

    async def initialize(self) -> None:
        """Initialize the DuckDuckGo Search provider."""
        logger.info("Initializing DuckDuckGoProvider")

        try:
            from duckduckgo_search import DDGS
        except ImportError:
            logger.warning(
                "duckduckgo-search not installed. "
                "Install with: pip install duckduckgo-search"
            )
            self._initialized = True  # Still mark as initialized for stub mode
            logger.info("DuckDuckGoProvider initialized in stub mode")
            return

        self._ddgs = DDGS()
        self._initialized = True
        logger.info("DuckDuckGoProvider initialized with duckduckgo-search")

    async def shutdown(self) -> None:
        """Shutdown the DuckDuckGo provider."""
        self._ddgs = None
        self._initialized = False
        logger.info("DuckDuckGoProvider shut down")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Search using DuckDuckGo.

        Falls back to stub results when the library is unavailable or when
        the lookup raises.

        Args:
            query: Search query string
            max_results: Maximum number of results

        Returns:
            List of SearchResult objects
        """
        import asyncio

        logger.info(f"DuckDuckGo search: {query}")

        # Bind the client locally so a concurrent shutdown() cannot replace
        # it with None between this check and the call in the worker thread.
        ddgs = self._ddgs
        if ddgs is None:
            logger.warning("DuckDuckGo not available, returning stub results")
            return self._get_stub_results(query, max_results)

        def fetch() -> list:
            return list(ddgs.text(query, max_results=max_results))  # type: ignore

        try:
            # duckduckgo-search is synchronous; run it in a worker thread so
            # the event loop is not blocked.
            raw_results = await asyncio.to_thread(fetch)

            results = [
                SearchResult(
                    title=item.get("title", ""),
                    url=item.get("href", item.get("link", "")),
                    snippet=item.get("body", item.get("snippet", "")),
                    position=rank,
                    source="duckduckgo",
                    metadata={
                        "raw": item,
                    },
                )
                for rank, item in enumerate(raw_results, start=1)
            ]

            logger.info(f"DuckDuckGo returned {len(results)} results")
            return results

        except Exception as e:
            # NOTE(review): a real failure is answered with placeholder
            # results (flagged only via metadata["stub"]); callers that
            # aggregate providers may prefer an exception here - confirm.
            logger.error(f"DuckDuckGo search failed: {e}")
            return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Generate at most three stub results for testing."""
        return [
            SearchResult(
                title=f"DuckDuckGo Result {rank}: {query}",
                url=f"https://example.com/ddg/{rank}",
                snippet=f"This is a stub DuckDuckGo search result for '{query}'. "
                f"Install duckduckgo-search for real results.",
                position=rank,
                source="duckduckgo",
                # Lets consumers tell placeholder data from real hits.
                metadata={"stub": True},
            )
            for rank in range(1, min(max_results, 3) + 1)
        ]

    @property
    def is_available(self) -> bool:
        """Check if DuckDuckGo search is available."""
        return self._ddgs is not None
backend/app/search/providers/google.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Google Search provider (stub implementation)."""
2
+
3
+ from typing import Optional
4
+
5
+ from app.search.providers.base import BaseSearchProvider, SearchResult
6
+ from app.utils.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class GoogleSearchProvider(BaseSearchProvider):
    """
    Google Search provider using Custom Search API.

    This is a stub implementation: ``search`` always returns placeholder
    results, whether or not credentials are set. To use Google Search API:
    1. Get API key from Google Cloud Console
    2. Create a Custom Search Engine (CSE)
    3. Get the Search Engine ID (cx)

    Environment variables:
        GOOGLE_API_KEY: Google Cloud API key
        GOOGLE_CSE_ID: Custom Search Engine ID

    NOTE(review): this class only receives the credentials through its
    constructor; reading the environment variables is presumably done by
    the caller - confirm.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        search_engine_id: Optional[str] = None,
    ) -> None:
        super().__init__(api_key)
        self.search_engine_id = search_engine_id
        self._base_url = "https://www.googleapis.com/customsearch/v1"

    async def initialize(self) -> None:
        """Initialize the Google Search provider."""
        logger.info("Initializing GoogleSearchProvider")

        if not self.api_key:
            logger.warning("Google API key not configured - stub mode enabled")

        if not self.search_engine_id:
            logger.warning("Google CSE ID not configured - stub mode enabled")

        self._initialized = True
        logger.info("GoogleSearchProvider initialized")

    async def search(
        self,
        query: str,
        max_results: int = 10,
    ) -> list[SearchResult]:
        """
        Search using Google Custom Search API.

        Currently returns stub results in every case; a warning is logged
        so that stub data is never served silently.

        Args:
            query: Search query string
            max_results: Maximum number of results (max 10 per request)

        Returns:
            List of SearchResult objects
        """
        logger.info(f"Google search: {query}")

        if not self.api_key or not self.search_engine_id:
            logger.warning("Google Search not configured, returning stub results")
            return self._get_stub_results(query, max_results)

        # TODO: live implementation. Sketch: GET self._base_url with params
        # {"key": self.api_key, "cx": self.search_engine_id, "q": query,
        # "num": min(max_results, 10)}, then map each entry of data["items"]
        # to a SearchResult (title -> title, link -> url,
        # snippet -> snippet, source="google").
        logger.warning(
            "Google live search is not implemented yet, returning stub results"
        )
        return self._get_stub_results(query, max_results)

    def _get_stub_results(
        self,
        query: str,
        max_results: int,
    ) -> list[SearchResult]:
        """Generate at most three stub results for testing."""
        return [
            SearchResult(
                title=f"Google Result {rank}: {query}",
                url=f"https://example.com/google/{rank}",
                snippet=f"This is a stub Google search result for '{query}'. "
                f"Configure GOOGLE_API_KEY and GOOGLE_CSE_ID for real results.",
                position=rank,
                source="google",
                # Lets consumers tell placeholder data from real hits.
                metadata={"stub": True},
            )
            for rank in range(1, min(max_results, 3) + 1)
        ]
backend/app/tools/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tools module for ScrapeRL backend."""
2
+
3
+ from app.tools.registry import MCPToolRegistry
4
+ from app.tools.browser import BrowserTool
5
+ from app.tools.search import SearchTool
6
+ from app.tools.html import (
7
+ parse_html,
8
+ clean_html,
9
+ extract_text,
10
+ semantic_chunk,
11
+ extract_links,
12
+ extract_tables,
13
+ )
14
+
15
+ __all__ = [
16
+ "MCPToolRegistry",
17
+ "BrowserTool",
18
+ "SearchTool",
19
+ "parse_html",
20
+ "clean_html",
21
+ "extract_text",
22
+ "semantic_chunk",
23
+ "extract_links",
24
+ "extract_tables",
25
+ ]
backend/app/tools/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (580 Bytes). View file
 
backend/app/tools/__pycache__/browser.cpython-314.pyc ADDED
Binary file (13.2 kB). View file
 
backend/app/tools/__pycache__/html.cpython-314.pyc ADDED
Binary file (437 Bytes). View file
 
backend/app/tools/__pycache__/registry.cpython-314.pyc ADDED
Binary file (15.1 kB). View file
 
backend/app/tools/__pycache__/search.cpython-314.pyc ADDED
Binary file (7.2 kB). View file
 
backend/app/tools/browser.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browser automation tool for web scraping."""
2
+
3
+ from typing import Any, Optional
4
+ from dataclasses import dataclass
5
+ from enum import Enum
6
+
7
+ from app.utils.logging import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
class BrowserType(Enum):
    """Supported browser types.

    Each member's value is the lowercase engine name as a string.
    """

    CHROMIUM = "chromium"
    FIREFOX = "firefox"
    WEBKIT = "webkit"
18
+
19
+
20
@dataclass
class BrowserConfig:
    """Configuration for browser instance.

    Attributes:
        browser_type: Browser engine to use; defaults to Chromium.
        headless: Whether to run without a visible window.
        timeout: Timeout in milliseconds.
        viewport_width: Viewport width; defaults to 1920.
        viewport_height: Viewport height; defaults to 1080.
        user_agent: Optional user-agent override.
        proxy: Optional proxy setting.
    """

    browser_type: BrowserType = BrowserType.CHROMIUM
    headless: bool = True
    timeout: int = 30000  # milliseconds
    viewport_width: int = 1920
    viewport_height: int = 1080
    user_agent: Optional[str] = None
    proxy: Optional[str] = None
31
+
32
+
33
@dataclass
class NavigationResult:
    """Result of a navigation action.

    Attributes:
        url: URL the navigation was performed against.
        status: Status code of the navigation (0 when none is available).
        title: Page title.
        success: Whether the navigation succeeded.
        error: Optional message with details about the outcome.
    """

    url: str
    status: int
    title: str
    success: bool
    error: Optional[str] = None
42
+
43
+
44
@dataclass
class ClickResult:
    """Result of a click action.

    Attributes:
        selector: Selector the action was applied to.
        success: Whether the action succeeded.
        error: Optional message with details about the outcome.
    """

    selector: str
    success: bool
    error: Optional[str] = None
51
+
52
+
53
@dataclass
class ScreenshotResult:
    """Result of a screenshot action.

    Attributes:
        data: Raw image bytes (empty when no screenshot was taken).
        format: Image format name, e.g. "png".
        width: Image width.
        height: Image height.
        success: Whether the screenshot succeeded.
        error: Optional message with details about the outcome.
    """

    data: bytes
    format: str
    width: int
    height: int
    success: bool
    error: Optional[str] = None
63
+
64
+
65
class BrowserTool:
    """
    Browser automation tool using Playwright/Selenium.

    This is a stub implementation that defines the interface.
    Actual browser automation requires installing playwright or selenium.
    Until then every action returns fixed placeholder data once the tool
    has been initialized, and a failure result (or empty value) before.
    """

    def __init__(self, config: Optional[BrowserConfig] = None) -> None:
        self.config = config or BrowserConfig()
        # Handles for the real browser objects; never assigned in stub mode.
        self._browser: Any = None
        self._context: Any = None
        self._page: Any = None
        self._initialized: bool = False

    async def initialize(self) -> None:
        """
        Initialize the browser instance.

        Note: This is a stub. Real implementation requires playwright:
            pip install playwright
            playwright install
        """
        logger.info(f"Initializing browser: {self.config.browser_type.value}")
        # Stub: a real implementation would start async_playwright() here
        # and launch the configured browser with headless=self.config.headless.
        self._initialized = True
        logger.info("Browser initialized (stub mode)")

    async def shutdown(self) -> None:
        """Close the browser and cleanup resources."""
        logger.info("Shutting down browser")
        # Stub: a real implementation would await close() on each handle
        # before dropping the reference.
        if self._page:
            self._page = None
        if self._context:
            self._context = None
        if self._browser:
            self._browser = None
        self._initialized = False
        logger.info("Browser shutdown complete")

    async def navigate(
        self,
        url: str,
        wait_until: str = "domcontentloaded",
        timeout: Optional[int] = None,
    ) -> NavigationResult:
        """
        Navigate to a URL.

        Args:
            url: Target URL
            wait_until: Navigation wait condition (load, domcontentloaded, networkidle)
            timeout: Navigation timeout in milliseconds

        Returns:
            NavigationResult with status and details. In stub mode this is
            a fixed success result (status 200) whose ``error`` field notes
            that no navigation took place.
        """
        logger.info(f"Navigating to: {url}")

        if not self._initialized:
            return NavigationResult(
                url=url,
                status=0,
                title="",
                success=False,
                error="Browser not initialized",
            )

        # Stub: a real implementation would call
        # self._page.goto(url, wait_until=wait_until, timeout=timeout) and
        # report the page's final URL, response status and title.
        return NavigationResult(
            url=url,
            status=200,
            title="Stub Page Title",
            success=True,
            error="Stub mode - no actual navigation",
        )

    async def click(
        self,
        selector: str,
        timeout: Optional[int] = None,
        force: bool = False,
    ) -> ClickResult:
        """
        Click an element on the page.

        Args:
            selector: CSS or XPath selector
            timeout: Click timeout in milliseconds
            force: Force click even if element is obscured

        Returns:
            ClickResult indicating success or failure
        """
        logger.info(f"Clicking element: {selector}")

        if not self._initialized:
            return ClickResult(
                selector=selector,
                success=False,
                error="Browser not initialized",
            )

        # Stub: a real implementation would call
        # self._page.click(selector, timeout=timeout, force=force).
        return ClickResult(
            selector=selector,
            success=True,
            error="Stub mode - no actual click",
        )

    async def fill(
        self,
        selector: str,
        value: str,
        timeout: Optional[int] = None,
    ) -> ClickResult:
        """
        Fill a form field with text.

        Args:
            selector: CSS or XPath selector
            value: Text to enter
            timeout: Action timeout in milliseconds

        Returns:
            ClickResult indicating success or failure
        """
        # The value itself is deliberately not written to the log.
        logger.info(f"Filling element: {selector} with value")

        if not self._initialized:
            return ClickResult(
                selector=selector,
                success=False,
                error="Browser not initialized",
            )

        # Stub: a real implementation would call
        # self._page.fill(selector, value, timeout=timeout).
        return ClickResult(
            selector=selector,
            success=True,
            error="Stub mode - no actual fill",
        )

    async def get_html(
        self,
        selector: Optional[str] = None,
    ) -> str:
        """
        Get HTML content of the page or a specific element.

        Args:
            selector: Optional selector to get HTML of specific element

        Returns:
            HTML content as string; empty when the browser is not
            initialized
        """
        logger.info(f"Getting HTML for: {selector or 'full page'}")

        if not self._initialized:
            return ""

        # Stub: a real implementation would return the element's inner HTML
        # when a selector is given, otherwise the full page content.
        return "<html><body><h1>Stub HTML Content</h1></body></html>"

    async def screenshot(
        self,
        selector: Optional[str] = None,
        full_page: bool = False,
        format: str = "png",
    ) -> ScreenshotResult:
        """
        Take a screenshot of the page or element.

        Args:
            selector: Optional selector to screenshot specific element
            full_page: Capture full scrollable page
            format: Image format (png, jpeg)

        Returns:
            ScreenshotResult with image data. In stub mode the data is a
            fixed placeholder and the dimensions are the configured viewport.
        """
        logger.info(f"Taking screenshot: selector={selector}, full_page={full_page}")

        if not self._initialized:
            return ScreenshotResult(
                data=b"",
                format=format,
                width=0,
                height=0,
                success=False,
                error="Browser not initialized",
            )

        # Stub: a real implementation would screenshot the matched element
        # when a selector is given, otherwise the page (honouring full_page).
        return ScreenshotResult(
            data=b"stub_screenshot_data",
            format=format,
            width=self.config.viewport_width,
            height=self.config.viewport_height,
            success=True,
            error="Stub mode - no actual screenshot",
        )

    async def evaluate(self, script: str) -> Any:
        """
        Execute JavaScript in the page context.

        Args:
            script: JavaScript code to execute

        Returns:
            Result of the script execution; always None in stub mode
        """
        # Only the first 50 characters of the script are logged.
        logger.info(f"Evaluating script: {script[:50]}...")

        if not self._initialized:
            return None

        # Stub: a real implementation would return
        # await self._page.evaluate(script).
        return None

    async def wait_for_selector(
        self,
        selector: str,
        timeout: Optional[int] = None,
        state: str = "visible",
    ) -> bool:
        """
        Wait for an element to appear on the page.

        Args:
            selector: CSS or XPath selector
            timeout: Wait timeout in milliseconds
            state: Element state to wait for (visible, hidden, attached, detached)

        Returns:
            True if element found, False otherwise. In stub mode this is
            True whenever the browser is initialized.
        """
        logger.info(f"Waiting for selector: {selector}")

        if not self._initialized:
            return False

        # Stub: a real implementation would await
        # self._page.wait_for_selector(selector, timeout=timeout, state=state)
        # and return False on timeout.
        return True

    def health_check(self) -> bool:
        """Check if the browser is healthy and responsive."""
        return self._initialized

    @property
    def is_initialized(self) -> bool:
        """Check if the browser has been initialized."""
        return self._initialized
backend/app/tools/html.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTML processing tools for web scraping.
2
+
3
+ Re-exports utilities from app.utils.html for tool registration.
4
+ """
5
+
6
+ from app.utils.html import (
7
+ parse_html,
8
+ clean_html,
9
+ extract_text,
10
+ semantic_chunk,
11
+ extract_links,
12
+ extract_tables,
13
+ )
14
+
15
+ __all__ = [
16
+ "parse_html",
17
+ "clean_html",
18
+ "extract_text",
19
+ "semantic_chunk",
20
+ "extract_links",
21
+ "extract_tables",
22
+ ]
backend/app/tools/registry.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MCP Tool Registry for dynamic tool discovery and management."""
2
+
3
+ import asyncio
4
+ from typing import Any, Callable, Optional
5
+ from dataclasses import dataclass, field
6
+ from enum import Enum
7
+
8
+ from app.utils.logging import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
13
class ToolStatus(Enum):
    """Lifecycle / health state of a tool held by the registry."""

    # Returned by the registry's health check when the tool name is not
    # registered; also the dataclass default for a ToolDefinition.
    UNKNOWN = "unknown"
    # Last health check completed without raising.
    HEALTHY = "healthy"
    # Last health check raised; the registry refuses to execute the tool.
    UNHEALTHY = "unhealthy"
    # Assigned at registration time, before any health check has run.
    INITIALIZING = "initializing"
    # Assigned to every tool when the registry shuts down.
    SHUTDOWN = "shutdown"
21
+
22
+
23
@dataclass
class ToolDefinition:
    """Record describing one tool held by the registry."""

    # Key under which the registry stores the tool.
    name: str
    # Human-readable description, surfaced in the tool schema.
    description: str
    # Callable invoked on execution; may be sync or async.
    handler: Callable[..., Any]
    # Parameter schema for the tool (a fresh dict per instance by default).
    parameters: dict[str, Any] = field(default_factory=dict)
    # Current lifecycle state; updated by registry health checks and shutdown.
    status: ToolStatus = ToolStatus.UNKNOWN
    # Free-form extra information (a fresh dict per instance by default).
    metadata: dict[str, Any] = field(default_factory=dict)
33
+
34
+
35
class MCPToolRegistry:
    """
    Registry for MCP tools with dynamic discovery and execution.

    Manages tool lifecycle including registration, health checks,
    and execution routing. A background task re-checks every registered
    tool at a fixed interval while the registry is initialized.
    """

    def __init__(self) -> None:
        self._tools: dict[str, ToolDefinition] = {}
        self._initialized: bool = False
        # Seconds between two rounds of background health checks.
        self._health_check_interval: float = 30.0
        self._health_check_task: Optional[asyncio.Task[None]] = None

    @staticmethod
    async def _invoke(fn: Callable[..., Any], /, **kwargs: Any) -> Any:
        """Call ``fn`` and await the result when it turns out to be awaitable.

        Checking the *result* instead of the callable also covers
        ``functools.partial`` wrappers, objects with an async ``__call__``
        and plain functions that return a coroutine, and avoids
        ``asyncio.iscoroutinefunction`` (deprecated in Python 3.14).
        ``fn`` is positional-only so a tool parameter named ``fn`` cannot
        collide with it.
        """
        import inspect  # local import keeps this block self-contained

        result = fn(**kwargs)
        if inspect.isawaitable(result):
            result = await result
        return result

    async def initialize(self) -> None:
        """Initialize the registry and start health monitoring.

        Calling this on an already initialized registry only logs a warning.
        Must run inside a running event loop because it spawns a task.
        """
        if self._initialized:
            logger.warning("Registry already initialized")
            return

        logger.info("Initializing MCP Tool Registry")

        # Start health check background task
        self._health_check_task = asyncio.create_task(self._health_check_loop())
        self._initialized = True

        logger.info("MCP Tool Registry initialized")

    async def shutdown(self) -> None:
        """Shutdown the registry and cleanup resources.

        Cancels the background health-check task, waits for it to finish,
        and marks every registered tool as SHUTDOWN.
        """
        logger.info("Shutting down MCP Tool Registry")

        # Cancel health check task
        task = self._health_check_task
        # Drop the reference first so a finished task is never cancelled twice.
        self._health_check_task = None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        # Mark all tools as shutdown
        for tool in self._tools.values():
            tool.status = ToolStatus.SHUTDOWN

        self._initialized = False
        logger.info("MCP Tool Registry shutdown complete")

    def register(
        self,
        name: str,
        handler: Callable[..., Any],
        description: str = "",
        parameters: Optional[dict[str, Any]] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> ToolDefinition:
        """
        Register a new tool with the registry.

        The tool starts in the INITIALIZING state and keeps it until a
        health check assigns a new status.

        Args:
            name: Unique tool name
            handler: Callable that implements the tool
            description: Human-readable description
            parameters: JSON schema for tool parameters
            metadata: Additional tool metadata

        Returns:
            The registered ToolDefinition

        Raises:
            ValueError: If a tool with the same name already exists
        """
        if name in self._tools:
            raise ValueError(f"Tool '{name}' is already registered")

        tool = ToolDefinition(
            name=name,
            description=description,
            handler=handler,
            parameters=parameters or {},
            status=ToolStatus.INITIALIZING,
            metadata=metadata or {},
        )

        self._tools[name] = tool
        logger.info(f"Registered tool: {name}")

        return tool

    def unregister(self, name: str) -> bool:
        """
        Unregister a tool from the registry.

        Args:
            name: Tool name to unregister

        Returns:
            True if tool was removed, False if not found
        """
        if self._tools.pop(name, None) is None:
            return False

        logger.info(f"Unregistered tool: {name}")
        return True

    def get(self, name: str) -> Optional[ToolDefinition]:
        """
        Get a tool definition by name.

        Args:
            name: Tool name to retrieve

        Returns:
            ToolDefinition if found, None otherwise
        """
        return self._tools.get(name)

    def list_tools(
        self,
        include_unhealthy: bool = False,
    ) -> list[ToolDefinition]:
        """
        List all registered tools.

        Args:
            include_unhealthy: Include tools whose status is UNHEALTHY
                or SHUTDOWN

        Returns:
            List of tool definitions
        """
        if include_unhealthy:
            return list(self._tools.values())

        hidden = (ToolStatus.UNHEALTHY, ToolStatus.SHUTDOWN)
        return [t for t in self._tools.values() if t.status not in hidden]

    async def execute(
        self,
        name: str,
        **kwargs: Any,
    ) -> Any:
        """
        Execute a tool by name with the given parameters.

        Sync handlers are called directly; any awaitable a handler returns
        is awaited before the result is handed back.

        Args:
            name: Tool name to execute
            **kwargs: Tool parameters

        Returns:
            Tool execution result

        Raises:
            KeyError: If tool is not found
            RuntimeError: If tool is unhealthy or has been shut down
        """
        tool = self.get(name)

        if tool is None:
            raise KeyError(f"Tool '{name}' not found")

        if tool.status == ToolStatus.UNHEALTHY:
            raise RuntimeError(f"Tool '{name}' is unhealthy")

        if tool.status == ToolStatus.SHUTDOWN:
            raise RuntimeError(f"Tool '{name}' has been shut down")

        logger.debug(f"Executing tool: {name} with params: {kwargs}")

        try:
            return await self._invoke(tool.handler, **kwargs)
        except Exception as e:
            # Log for observability, then let the caller see the real error.
            logger.error(f"Tool execution failed: {name} - {e}")
            raise

    async def health_check(self, name: str) -> ToolStatus:
        """
        Check the health of a specific tool.

        If the handler exposes a ``health_check`` attribute it is called
        (and awaited when needed). The tool becomes UNHEALTHY when that
        call raises or explicitly returns ``False``; otherwise, and for
        handlers without a probe, it becomes HEALTHY.

        Args:
            name: Tool name to check

        Returns:
            Current tool status (UNKNOWN if the tool is not registered)
        """
        tool = self.get(name)
        if tool is None:
            return ToolStatus.UNKNOWN

        try:
            # Try to call a health check method if available
            handler = tool.handler
            outcome: Any = None
            if hasattr(handler, "health_check"):
                outcome = await self._invoke(getattr(handler, "health_check"))
        except Exception as e:
            logger.warning(f"Health check failed for {name}: {e}")
            tool.status = ToolStatus.UNHEALTHY
        else:
            # Bool-returning probes signal failure with False, not an exception.
            if outcome is False:
                logger.warning(
                    f"Health check failed for {name}: probe returned False"
                )
                tool.status = ToolStatus.UNHEALTHY
            else:
                tool.status = ToolStatus.HEALTHY

        return tool.status

    async def health_check_all(self) -> dict[str, ToolStatus]:
        """
        Check health of all registered tools.

        Returns:
            Dictionary mapping tool names to their status
        """
        results: dict[str, ToolStatus] = {}

        # Snapshot the names: each check awaits, so tools may be registered
        # or unregistered meanwhile, which would break live dict iteration.
        for name in list(self._tools):
            results[name] = await self.health_check(name)

        return results

    async def _health_check_loop(self) -> None:
        """Background task for periodic health checks.

        Sleeps for the configured interval, checks every tool, and repeats
        until cancelled. Unexpected errors are logged and the loop goes on.
        """
        while True:
            try:
                await asyncio.sleep(self._health_check_interval)
                await self.health_check_all()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Health check loop error: {e}")

    def get_tool_schema(self, name: str) -> Optional[dict[str, Any]]:
        """
        Get the schema description of a tool.

        Args:
            name: Tool name

        Returns:
            Dict with the tool's name, description and parameters,
            or None if not found
        """
        tool = self.get(name)
        if tool is None:
            return None

        return {
            "name": tool.name,
            "description": tool.description,
            "parameters": tool.parameters,
        }

    def list_schemas(self) -> list[dict[str, Any]]:
        """
        Get schemas for all registered tools.

        Returns:
            List of tool schema dictionaries
        """
        return [
            {
                "name": tool.name,
                "description": tool.description,
                "parameters": tool.parameters,
            }
            for tool in self._tools.values()
        ]

    @property
    def is_initialized(self) -> bool:
        """Check if the registry has been initialized."""
        return self._initialized

    @property
    def tool_count(self) -> int:
        """Get the number of registered tools."""
        return len(self._tools)
backend/app/tools/search.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search tool wrapper for search engine providers."""
2
+
3
+ from typing import Any, Optional
4
+ from dataclasses import dataclass
5
+
6
+ from app.utils.logging import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
@dataclass
class SearchResult:
    """Individual search result."""

    # NOTE(review): app.search.engine defines its own SearchResult with an
    # extra ``score`` field — confirm which type callers actually receive.

    # Title of the result.
    title: str
    # Address of the result.
    url: str
    # Short text excerpt for the result.
    snippet: str
    # Position of the result in the list — presumably its rank; confirm base.
    position: int
    source: str  # Provider name
    # Optional extra data; None when nothing was attached.
    metadata: dict[str, Any] | None = None
21
+
22
+
23
@dataclass
class SearchResponse:
    """Response from a search query."""

    # The query string the response belongs to.
    query: str
    # Results of the query; empty when the search failed.
    results: list[SearchResult]
    # Number of results (SearchTool fills this with len(results)).
    total_results: int
    # Name of the provider the query was addressed to.
    provider: str
    # False when the tool was not initialized or the engine raised.
    success: bool
    # Failure description; None on success.
    error: Optional[str] = None
33
+
34
+
35
class SearchTool:
    """
    Unified front-end over the search engine providers.

    The tool keeps a reference to an engine object and forwards queries
    to it, packaging every outcome (success or failure) in a
    SearchResponse instead of raising.
    """

    def __init__(self, default_provider: str = "duckduckgo") -> None:
        self.default_provider = default_provider
        self._engine: Any = None
        self._initialized: bool = False

    @staticmethod
    def _failure(query: str, provider_name: str, message: str) -> SearchResponse:
        """Build the empty, unsuccessful response used on every error path."""
        return SearchResponse(
            query=query,
            results=[],
            total_results=0,
            provider=provider_name,
            success=False,
            error=message,
        )

    async def initialize(self, engine: Any = None) -> None:
        """
        Attach a search engine and mark the tool as initialized.

        Args:
            engine: SearchEngineRouter instance to use
        """
        logger.info("Initializing SearchTool")
        self._engine, self._initialized = engine, True
        logger.info("SearchTool initialized")

    async def shutdown(self) -> None:
        """Detach the engine and mark the tool as not initialized."""
        logger.info("Shutting down SearchTool")
        self._engine, self._initialized = None, False

    async def search(
        self,
        query: str,
        max_results: int = 10,
        provider: Optional[str] = None,
    ) -> SearchResponse:
        """
        Run a query through the attached engine.

        Errors never propagate: a missing engine or an exception raised
        by the engine yields a response with ``success=False``.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            provider: Specific provider to use (optional)

        Returns:
            SearchResponse with results
        """
        logger.info(f"Searching for: {query}")

        # Fall back to the configured default when no provider is named.
        provider_name = provider if provider else self.default_provider

        if self._engine is None or not self._initialized:
            logger.warning("SearchTool not properly initialized, using stub response")
            return self._failure(query, provider_name, "Search engine not initialized")

        try:
            # Delegate to search engine router
            hits = await self._engine.search(
                query=query,
                max_results=max_results,
                provider=provider_name,
            )
            return SearchResponse(
                query=query,
                results=hits,
                total_results=len(hits),
                provider=provider_name,
                success=True,
            )
        except Exception as exc:
            logger.error(f"Search failed: {exc}")
            return self._failure(query, provider_name, str(exc))

    async def get_results(
        self,
        query: str,
        max_results: int = 10,
        provider: Optional[str] = None,
    ) -> list[SearchResult]:
        """
        Run a query and return only its result list.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            provider: Specific provider to use (optional)

        Returns:
            List of SearchResult objects (empty when the search failed)
        """
        return (await self.search(query, max_results, provider)).results

    def health_check(self) -> bool:
        """Report whether the tool is initialized and has an engine attached."""
        return self._initialized and self._engine is not None

    @property
    def is_initialized(self) -> bool:
        """Whether initialize() has run since construction or the last shutdown."""
        return self._initialized