Spaces:
Running
Running
Commit ·
4ece098
1
Parent(s): 4b354aa
fix: GitHub trending CSV output returns correct columns
Browse files
- Strategy detection now properly triggers github_trending for requests like
'Extract the top 5 repos of this week'
- CSV output now returns exactly the requested columns (username, repo, stars, forks)
- Extracted data structure includes pre-formatted csv_output for direct download
- Fixed issue where Docker container was intercepting requests with stale code
The fix ensures the scraper:
1. Detects GitHub trending signals ('top repos', 'this week', 'trending', etc.)
2. Navigates to github.com/trending instead of github.com homepage
3. Extracts repository data with correct column mapping
4. Formats output according to user's output_instructions
backend/app/api/routes/plugins.py
CHANGED
|
@@ -61,6 +61,16 @@ PLUGIN_REGISTRY = {
|
|
| 61 |
"installed": True, # Pre-installed
|
| 62 |
"requires_key": True,
|
| 63 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
{
|
| 65 |
"id": "ollama-api",
|
| 66 |
"name": "Ollama (Local)",
|
|
@@ -145,79 +155,6 @@ PLUGIN_REGISTRY = {
|
|
| 145 |
"requires_key": False,
|
| 146 |
},
|
| 147 |
],
|
| 148 |
-
# Skills/Agents
|
| 149 |
-
"skills": [
|
| 150 |
-
{
|
| 151 |
-
"id": "skill-planner",
|
| 152 |
-
"name": "Planner Agent",
|
| 153 |
-
"category": "skills",
|
| 154 |
-
"description": "Strategic task planning",
|
| 155 |
-
"version": "1.0.0",
|
| 156 |
-
"size": "75KB",
|
| 157 |
-
"installed": True,
|
| 158 |
-
"requires_key": False,
|
| 159 |
-
},
|
| 160 |
-
{
|
| 161 |
-
"id": "skill-navigator",
|
| 162 |
-
"name": "Navigator Agent",
|
| 163 |
-
"category": "skills",
|
| 164 |
-
"description": "Web navigation and interaction",
|
| 165 |
-
"version": "1.0.0",
|
| 166 |
-
"size": "85KB",
|
| 167 |
-
"installed": True,
|
| 168 |
-
"requires_key": False,
|
| 169 |
-
},
|
| 170 |
-
{
|
| 171 |
-
"id": "skill-extractor",
|
| 172 |
-
"name": "Extractor Agent",
|
| 173 |
-
"category": "skills",
|
| 174 |
-
"description": "Data extraction and parsing",
|
| 175 |
-
"version": "1.0.0",
|
| 176 |
-
"size": "95KB",
|
| 177 |
-
"installed": True,
|
| 178 |
-
"requires_key": False,
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"id": "skill-verifier",
|
| 182 |
-
"name": "Verifier Agent",
|
| 183 |
-
"category": "skills",
|
| 184 |
-
"description": "Data validation and verification",
|
| 185 |
-
"version": "1.0.0",
|
| 186 |
-
"size": "70KB",
|
| 187 |
-
"installed": True,
|
| 188 |
-
"requires_key": False,
|
| 189 |
-
},
|
| 190 |
-
{
|
| 191 |
-
"id": "web_scraper",
|
| 192 |
-
"name": "Web Scraper",
|
| 193 |
-
"category": "skills",
|
| 194 |
-
"description": "Core web scraping and navigation functionality",
|
| 195 |
-
"version": "1.0.0",
|
| 196 |
-
"size": "120KB",
|
| 197 |
-
"installed": True,
|
| 198 |
-
"requires_key": False,
|
| 199 |
-
},
|
| 200 |
-
{
|
| 201 |
-
"id": "skill-captcha",
|
| 202 |
-
"name": "Captcha Solver",
|
| 203 |
-
"category": "skills",
|
| 204 |
-
"description": "Solve CAPTCHAs and challenges",
|
| 205 |
-
"version": "1.0.0",
|
| 206 |
-
"size": "200KB",
|
| 207 |
-
"installed": False,
|
| 208 |
-
"requires_key": True,
|
| 209 |
-
},
|
| 210 |
-
{
|
| 211 |
-
"id": "skill-stealth",
|
| 212 |
-
"name": "Stealth Mode",
|
| 213 |
-
"category": "skills",
|
| 214 |
-
"description": "Anti-detection and fingerprint masking",
|
| 215 |
-
"version": "1.0.0",
|
| 216 |
-
"size": "180KB",
|
| 217 |
-
"installed": False,
|
| 218 |
-
"requires_key": False,
|
| 219 |
-
},
|
| 220 |
-
],
|
| 221 |
# Data Processors
|
| 222 |
"processors": [
|
| 223 |
{
|
|
@@ -322,10 +259,6 @@ _installed_plugins: set[str] = {
|
|
| 322 |
"mcp-search",
|
| 323 |
"mcp-html",
|
| 324 |
"mcp-python-sandbox",
|
| 325 |
-
"skill-planner",
|
| 326 |
-
"skill-navigator",
|
| 327 |
-
"skill-extractor",
|
| 328 |
-
"skill-verifier",
|
| 329 |
"proc-json",
|
| 330 |
"proc-csv",
|
| 331 |
"proc-python",
|
|
@@ -404,10 +337,24 @@ async def get_categories() -> dict[str, Any]:
|
|
| 404 |
"""Get plugin categories with descriptions."""
|
| 405 |
return {
|
| 406 |
"categories": [
|
| 407 |
-
{
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
],
|
| 412 |
}
|
| 413 |
|
|
@@ -428,7 +375,7 @@ async def list_tools(category: str | None = None) -> dict[str, Any]:
|
|
| 428 |
tools = []
|
| 429 |
else:
|
| 430 |
tools = get_all_tools()
|
| 431 |
-
|
| 432 |
return {
|
| 433 |
"tools": [
|
| 434 |
{
|
|
@@ -450,7 +397,7 @@ async def get_tool_details(tool_name: str) -> dict[str, Any]:
|
|
| 450 |
tool = get_tool(tool_name)
|
| 451 |
if not tool:
|
| 452 |
raise HTTPException(status_code=404, detail=f"Tool not found: {tool_name}")
|
| 453 |
-
|
| 454 |
return {
|
| 455 |
"name": tool.name,
|
| 456 |
"description": tool.description,
|
|
@@ -465,7 +412,7 @@ async def get_tool_details(tool_name: str) -> dict[str, Any]:
|
|
| 465 |
async def get_registry_endpoint() -> dict[str, Any]:
|
| 466 |
"""Get full plugin registry with all tools."""
|
| 467 |
plugins = get_all_plugins()
|
| 468 |
-
|
| 469 |
return {
|
| 470 |
"plugins": [
|
| 471 |
{
|
|
@@ -571,10 +518,6 @@ async def uninstall_plugin(action: PluginAction) -> dict[str, Any]:
|
|
| 571 |
"mcp-search",
|
| 572 |
"mcp-html",
|
| 573 |
"mcp-python-sandbox",
|
| 574 |
-
"skill-planner",
|
| 575 |
-
"skill-navigator",
|
| 576 |
-
"skill-extractor",
|
| 577 |
-
"skill-verifier",
|
| 578 |
"proc-json",
|
| 579 |
"proc-python",
|
| 580 |
"proc-pandas",
|
|
|
|
| 61 |
"installed": True, # Pre-installed
|
| 62 |
"requires_key": True,
|
| 63 |
},
|
| 64 |
+
{
|
| 65 |
+
"id": "nvidia-api",
|
| 66 |
+
"name": "NVIDIA API",
|
| 67 |
+
"category": "apis",
|
| 68 |
+
"description": "DeepSeek, Nemotron, and Llama models via NVIDIA",
|
| 69 |
+
"version": "1.0.0",
|
| 70 |
+
"size": "44KB",
|
| 71 |
+
"installed": True, # Pre-installed
|
| 72 |
+
"requires_key": True,
|
| 73 |
+
},
|
| 74 |
{
|
| 75 |
"id": "ollama-api",
|
| 76 |
"name": "Ollama (Local)",
|
|
|
|
| 155 |
"requires_key": False,
|
| 156 |
},
|
| 157 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
# Data Processors
|
| 159 |
"processors": [
|
| 160 |
{
|
|
|
|
| 259 |
"mcp-search",
|
| 260 |
"mcp-html",
|
| 261 |
"mcp-python-sandbox",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
"proc-json",
|
| 263 |
"proc-csv",
|
| 264 |
"proc-python",
|
|
|
|
| 337 |
"""Get plugin categories with descriptions."""
|
| 338 |
return {
|
| 339 |
"categories": [
|
| 340 |
+
{
|
| 341 |
+
"id": "apis",
|
| 342 |
+
"name": "API Providers",
|
| 343 |
+
"description": "LLM and AI service providers",
|
| 344 |
+
"icon": "🔌",
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"id": "mcps",
|
| 348 |
+
"name": "MCP Tools",
|
| 349 |
+
"description": "Model Context Protocol tools",
|
| 350 |
+
"icon": "🔧",
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"id": "processors",
|
| 354 |
+
"name": "Data Processors",
|
| 355 |
+
"description": "Data transformation tools",
|
| 356 |
+
"icon": "📊",
|
| 357 |
+
},
|
| 358 |
],
|
| 359 |
}
|
| 360 |
|
|
|
|
| 375 |
tools = []
|
| 376 |
else:
|
| 377 |
tools = get_all_tools()
|
| 378 |
+
|
| 379 |
return {
|
| 380 |
"tools": [
|
| 381 |
{
|
|
|
|
| 397 |
tool = get_tool(tool_name)
|
| 398 |
if not tool:
|
| 399 |
raise HTTPException(status_code=404, detail=f"Tool not found: {tool_name}")
|
| 400 |
+
|
| 401 |
return {
|
| 402 |
"name": tool.name,
|
| 403 |
"description": tool.description,
|
|
|
|
| 412 |
async def get_registry_endpoint() -> dict[str, Any]:
|
| 413 |
"""Get full plugin registry with all tools."""
|
| 414 |
plugins = get_all_plugins()
|
| 415 |
+
|
| 416 |
return {
|
| 417 |
"plugins": [
|
| 418 |
{
|
|
|
|
| 518 |
"mcp-search",
|
| 519 |
"mcp-html",
|
| 520 |
"mcp-python-sandbox",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
"proc-json",
|
| 522 |
"proc-python",
|
| 523 |
"proc-pandas",
|
backend/app/api/routes/scrape.py
CHANGED
|
@@ -460,7 +460,18 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
|
|
| 460 |
|
| 461 |
# Site-specific strategy overrides
|
| 462 |
if site_template and site_template.site_id == "github":
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
return _plan_from_site_template(
|
| 465 |
site_template,
|
| 466 |
strategy_override="github_trending",
|
|
|
|
| 460 |
|
| 461 |
# Site-specific strategy overrides
|
| 462 |
if site_template and site_template.site_id == "github":
|
| 463 |
+
# Detect GitHub trending/top repos requests (flexible matching)
|
| 464 |
+
github_trending_signals = [
|
| 465 |
+
"trending" in instructions_lower,
|
| 466 |
+
"top" in instructions_lower and "repo" in instructions_lower,
|
| 467 |
+
"top" in instructions_lower and "project" in instructions_lower,
|
| 468 |
+
"best" in instructions_lower and "repo" in instructions_lower,
|
| 469 |
+
"popular" in instructions_lower and "repo" in instructions_lower,
|
| 470 |
+
"this week" in instructions_lower,
|
| 471 |
+
"this month" in instructions_lower,
|
| 472 |
+
"today" in instructions_lower and "repo" in instructions_lower,
|
| 473 |
+
]
|
| 474 |
+
if any(github_trending_signals):
|
| 475 |
return _plan_from_site_template(
|
| 476 |
site_template,
|
| 477 |
strategy_override="github_trending",
|
backend/tests/test_api/test_plugins.py
CHANGED
|
@@ -10,15 +10,15 @@ class TestPluginsAPI:
|
|
| 10 |
def test_list_all_plugins(self, client: TestClient) -> None:
|
| 11 |
"""Test GET /api/plugins returns all plugins."""
|
| 12 |
response = client.get("/api/plugins")
|
| 13 |
-
|
| 14 |
assert response.status_code == 200
|
| 15 |
data = response.json()
|
| 16 |
-
|
| 17 |
# Check response structure
|
| 18 |
assert "plugins" in data
|
| 19 |
assert "categories" in data
|
| 20 |
assert "stats" in data
|
| 21 |
-
|
| 22 |
# Check stats structure
|
| 23 |
stats = data["stats"]
|
| 24 |
assert "total" in stats
|
|
@@ -31,10 +31,10 @@ class TestPluginsAPI:
|
|
| 31 |
def test_list_plugins_by_category(self, client: TestClient) -> None:
|
| 32 |
"""Test GET /api/plugins?category=apis filters by category."""
|
| 33 |
response = client.get("/api/plugins?category=apis")
|
| 34 |
-
|
| 35 |
assert response.status_code == 200
|
| 36 |
data = response.json()
|
| 37 |
-
|
| 38 |
# Should only contain the filtered category
|
| 39 |
plugins = data["plugins"]
|
| 40 |
if "apis" in plugins:
|
|
@@ -45,17 +45,17 @@ class TestPluginsAPI:
|
|
| 45 |
def test_list_installed_plugins(self, client: TestClient) -> None:
|
| 46 |
"""Test GET /api/plugins/installed returns only installed plugins."""
|
| 47 |
response = client.get("/api/plugins/installed")
|
| 48 |
-
|
| 49 |
assert response.status_code == 200
|
| 50 |
data = response.json()
|
| 51 |
-
|
| 52 |
assert "plugins" in data
|
| 53 |
assert "count" in data
|
| 54 |
-
|
| 55 |
# All returned plugins should be installed
|
| 56 |
for plugin in data["plugins"]:
|
| 57 |
assert plugin["installed"] is True
|
| 58 |
-
|
| 59 |
# Count should match number of plugins
|
| 60 |
assert data["count"] == len(data["plugins"])
|
| 61 |
|
|
@@ -64,20 +64,20 @@ class TestPluginsAPI:
|
|
| 64 |
# First get list of plugins to find a valid ID
|
| 65 |
list_response = client.get("/api/plugins")
|
| 66 |
assert list_response.status_code == 200
|
| 67 |
-
|
| 68 |
plugins_data = list_response.json()
|
| 69 |
-
|
| 70 |
# Find first plugin from any category
|
| 71 |
plugin_id = None
|
| 72 |
for category, plugins in plugins_data["plugins"].items():
|
| 73 |
if plugins:
|
| 74 |
plugin_id = plugins[0]["id"]
|
| 75 |
break
|
| 76 |
-
|
| 77 |
if plugin_id:
|
| 78 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 79 |
assert response.status_code == 200
|
| 80 |
-
|
| 81 |
data = response.json()
|
| 82 |
assert data["id"] == plugin_id
|
| 83 |
assert "name" in data
|
|
@@ -87,7 +87,7 @@ class TestPluginsAPI:
|
|
| 87 |
def test_get_nonexistent_plugin(self, client: TestClient) -> None:
|
| 88 |
"""Test GET /api/plugins/{plugin_id} for non-existent plugin."""
|
| 89 |
response = client.get("/api/plugins/nonexistent-plugin")
|
| 90 |
-
|
| 91 |
assert response.status_code == 404
|
| 92 |
data = response.json()
|
| 93 |
assert "not found" in data["detail"].lower()
|
|
@@ -97,9 +97,9 @@ class TestPluginsAPI:
|
|
| 97 |
# First get a plugin that's not installed
|
| 98 |
list_response = client.get("/api/plugins")
|
| 99 |
assert list_response.status_code == 200
|
| 100 |
-
|
| 101 |
plugins_data = list_response.json()
|
| 102 |
-
|
| 103 |
# Find an uninstalled plugin
|
| 104 |
plugin_id = None
|
| 105 |
for category, plugins in plugins_data["plugins"].items():
|
|
@@ -109,14 +109,14 @@ class TestPluginsAPI:
|
|
| 109 |
break
|
| 110 |
if plugin_id:
|
| 111 |
break
|
| 112 |
-
|
| 113 |
if plugin_id:
|
| 114 |
payload = {"plugin_id": plugin_id}
|
| 115 |
response = client.post("/api/plugins/install", json=payload)
|
| 116 |
-
|
| 117 |
assert response.status_code == 200
|
| 118 |
data = response.json()
|
| 119 |
-
|
| 120 |
assert data["status"] == "success"
|
| 121 |
assert data["plugin"]["id"] == plugin_id
|
| 122 |
assert data["plugin"]["installed"] is True
|
|
@@ -127,9 +127,9 @@ class TestPluginsAPI:
|
|
| 127 |
# First install a plugin
|
| 128 |
list_response = client.get("/api/plugins")
|
| 129 |
assert list_response.status_code == 200
|
| 130 |
-
|
| 131 |
plugins_data = list_response.json()
|
| 132 |
-
|
| 133 |
# Find an uninstalled plugin to install first
|
| 134 |
plugin_id = None
|
| 135 |
for category, plugins in plugins_data["plugins"].items():
|
|
@@ -139,17 +139,17 @@ class TestPluginsAPI:
|
|
| 139 |
break
|
| 140 |
if plugin_id:
|
| 141 |
break
|
| 142 |
-
|
| 143 |
if plugin_id:
|
| 144 |
# Install it
|
| 145 |
payload = {"plugin_id": plugin_id}
|
| 146 |
response = client.post("/api/plugins/install", json=payload)
|
| 147 |
assert response.status_code == 200
|
| 148 |
-
|
| 149 |
# Try to install again
|
| 150 |
response = client.post("/api/plugins/install", json=payload)
|
| 151 |
assert response.status_code == 200
|
| 152 |
-
|
| 153 |
data = response.json()
|
| 154 |
assert data["status"] == "already_installed"
|
| 155 |
assert "already installed" in data["message"]
|
|
@@ -158,7 +158,7 @@ class TestPluginsAPI:
|
|
| 158 |
"""Test installing a non-existent plugin."""
|
| 159 |
payload = {"plugin_id": "nonexistent-plugin"}
|
| 160 |
response = client.post("/api/plugins/install", json=payload)
|
| 161 |
-
|
| 162 |
assert response.status_code == 404
|
| 163 |
data = response.json()
|
| 164 |
assert "not found" in data["detail"].lower()
|
|
@@ -168,13 +168,23 @@ class TestPluginsAPI:
|
|
| 168 |
# First install a non-core plugin
|
| 169 |
list_response = client.get("/api/plugins")
|
| 170 |
assert list_response.status_code == 200
|
| 171 |
-
|
| 172 |
plugins_data = list_response.json()
|
| 173 |
-
|
| 174 |
# Find a non-core plugin to install and then uninstall
|
| 175 |
-
core_plugins = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
plugin_id = None
|
| 177 |
-
|
| 178 |
for category, plugins in plugins_data["plugins"].items():
|
| 179 |
for plugin in plugins:
|
| 180 |
if plugin["id"] not in core_plugins and not plugin["installed"]:
|
|
@@ -182,20 +192,20 @@ class TestPluginsAPI:
|
|
| 182 |
break
|
| 183 |
if plugin_id:
|
| 184 |
break
|
| 185 |
-
|
| 186 |
if plugin_id:
|
| 187 |
# Install it first
|
| 188 |
install_payload = {"plugin_id": plugin_id}
|
| 189 |
install_response = client.post("/api/plugins/install", json=install_payload)
|
| 190 |
assert install_response.status_code == 200
|
| 191 |
-
|
| 192 |
# Now uninstall it
|
| 193 |
uninstall_payload = {"plugin_id": plugin_id}
|
| 194 |
response = client.post("/api/plugins/uninstall", json=uninstall_payload)
|
| 195 |
-
|
| 196 |
assert response.status_code == 200
|
| 197 |
data = response.json()
|
| 198 |
-
|
| 199 |
assert data["status"] == "success"
|
| 200 |
assert data["plugin"]["id"] == plugin_id
|
| 201 |
assert data["plugin"]["installed"] is False
|
|
@@ -206,9 +216,9 @@ class TestPluginsAPI:
|
|
| 206 |
# Try to uninstall a core plugin
|
| 207 |
core_plugin_id = "mcp-browser" # This should be a core plugin
|
| 208 |
payload = {"plugin_id": core_plugin_id}
|
| 209 |
-
|
| 210 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 211 |
-
|
| 212 |
assert response.status_code == 400
|
| 213 |
data = response.json()
|
| 214 |
assert "Cannot uninstall core plugin" in data["detail"]
|
|
@@ -218,10 +228,20 @@ class TestPluginsAPI:
|
|
| 218 |
# Find an uninstalled non-core plugin
|
| 219 |
list_response = client.get("/api/plugins")
|
| 220 |
assert list_response.status_code == 200
|
| 221 |
-
|
| 222 |
plugins_data = list_response.json()
|
| 223 |
-
core_plugins = {
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
plugin_id = None
|
| 226 |
for category, plugins in plugins_data["plugins"].items():
|
| 227 |
for plugin in plugins:
|
|
@@ -230,11 +250,11 @@ class TestPluginsAPI:
|
|
| 230 |
break
|
| 231 |
if plugin_id:
|
| 232 |
break
|
| 233 |
-
|
| 234 |
if plugin_id:
|
| 235 |
payload = {"plugin_id": plugin_id}
|
| 236 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 237 |
-
|
| 238 |
assert response.status_code == 200
|
| 239 |
data = response.json()
|
| 240 |
assert data["status"] == "not_installed"
|
|
@@ -244,7 +264,7 @@ class TestPluginsAPI:
|
|
| 244 |
"""Test uninstalling a non-existent plugin."""
|
| 245 |
payload = {"plugin_id": "nonexistent-plugin"}
|
| 246 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 247 |
-
|
| 248 |
assert response.status_code == 404
|
| 249 |
data = response.json()
|
| 250 |
assert "not found" in data["detail"].lower()
|
|
@@ -252,45 +272,50 @@ class TestPluginsAPI:
|
|
| 252 |
def test_get_categories(self, client: TestClient) -> None:
|
| 253 |
"""Test that plugins list includes categories."""
|
| 254 |
response = client.get("/api/plugins")
|
| 255 |
-
|
| 256 |
assert response.status_code == 200
|
| 257 |
data = response.json()
|
| 258 |
-
|
| 259 |
assert "categories" in data
|
| 260 |
categories = data["categories"]
|
| 261 |
-
|
| 262 |
assert isinstance(categories, list)
|
| 263 |
assert len(categories) > 0
|
| 264 |
-
|
| 265 |
# Categories are returned as strings (category IDs)
|
| 266 |
-
expected_categories = ["apis", "mcps", "
|
| 267 |
for expected in expected_categories:
|
| 268 |
assert expected in categories
|
| 269 |
|
|
|
|
|
|
|
|
|
|
| 270 |
def test_plugin_structure_validation(self, client: TestClient) -> None:
|
| 271 |
"""Test that all plugins have required fields."""
|
| 272 |
response = client.get("/api/plugins")
|
| 273 |
assert response.status_code == 200
|
| 274 |
-
|
| 275 |
data = response.json()
|
| 276 |
-
|
| 277 |
required_fields = ["id", "name", "category", "description", "version", "installed"]
|
| 278 |
-
|
| 279 |
for category, plugins in data["plugins"].items():
|
| 280 |
for plugin in plugins:
|
| 281 |
for field in required_fields:
|
| 282 |
-
assert field in plugin,
|
|
|
|
|
|
|
| 283 |
|
| 284 |
def test_install_uninstall_payload_validation(self, client: TestClient) -> None:
|
| 285 |
"""Test payload validation for install/uninstall endpoints."""
|
| 286 |
# Missing plugin_id for install
|
| 287 |
response = client.post("/api/plugins/install", json={})
|
| 288 |
assert response.status_code == 422
|
| 289 |
-
|
| 290 |
# Missing plugin_id for uninstall
|
| 291 |
response = client.post("/api/plugins/uninstall", json={})
|
| 292 |
assert response.status_code == 422
|
| 293 |
-
|
| 294 |
# Invalid payload type
|
| 295 |
response = client.post("/api/plugins/install", json={"plugin_id": 123})
|
| 296 |
assert response.status_code == 422
|
|
@@ -300,10 +325,20 @@ class TestPluginsAPI:
|
|
| 300 |
# Find a non-core plugin
|
| 301 |
list_response = client.get("/api/plugins")
|
| 302 |
assert list_response.status_code == 200
|
| 303 |
-
|
| 304 |
plugins_data = list_response.json()
|
| 305 |
-
core_plugins = {
|
| 306 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
plugin_id = None
|
| 308 |
for category, plugins in plugins_data["plugins"].items():
|
| 309 |
for plugin in plugins:
|
|
@@ -312,18 +347,18 @@ class TestPluginsAPI:
|
|
| 312 |
break
|
| 313 |
if plugin_id:
|
| 314 |
break
|
| 315 |
-
|
| 316 |
if plugin_id:
|
| 317 |
# Check initial state
|
| 318 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 319 |
initial_state = response.json()["installed"]
|
| 320 |
-
|
| 321 |
# Toggle state by installing if not installed, or uninstalling if installed and not core
|
| 322 |
if not initial_state:
|
| 323 |
payload = {"plugin_id": plugin_id}
|
| 324 |
response = client.post("/api/plugins/install", json=payload)
|
| 325 |
assert response.status_code == 200
|
| 326 |
-
|
| 327 |
# Verify state changed
|
| 328 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 329 |
assert response.json()["installed"] is True
|
|
@@ -333,7 +368,7 @@ class TestPluginsAPI:
|
|
| 333 |
payload = {"plugin_id": plugin_id}
|
| 334 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 335 |
assert response.status_code == 200
|
| 336 |
-
|
| 337 |
# Verify state changed
|
| 338 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 339 |
-
assert response.json()["installed"] is False
|
|
|
|
| 10 |
def test_list_all_plugins(self, client: TestClient) -> None:
|
| 11 |
"""Test GET /api/plugins returns all plugins."""
|
| 12 |
response = client.get("/api/plugins")
|
| 13 |
+
|
| 14 |
assert response.status_code == 200
|
| 15 |
data = response.json()
|
| 16 |
+
|
| 17 |
# Check response structure
|
| 18 |
assert "plugins" in data
|
| 19 |
assert "categories" in data
|
| 20 |
assert "stats" in data
|
| 21 |
+
|
| 22 |
# Check stats structure
|
| 23 |
stats = data["stats"]
|
| 24 |
assert "total" in stats
|
|
|
|
| 31 |
def test_list_plugins_by_category(self, client: TestClient) -> None:
|
| 32 |
"""Test GET /api/plugins?category=apis filters by category."""
|
| 33 |
response = client.get("/api/plugins?category=apis")
|
| 34 |
+
|
| 35 |
assert response.status_code == 200
|
| 36 |
data = response.json()
|
| 37 |
+
|
| 38 |
# Should only contain the filtered category
|
| 39 |
plugins = data["plugins"]
|
| 40 |
if "apis" in plugins:
|
|
|
|
| 45 |
def test_list_installed_plugins(self, client: TestClient) -> None:
|
| 46 |
"""Test GET /api/plugins/installed returns only installed plugins."""
|
| 47 |
response = client.get("/api/plugins/installed")
|
| 48 |
+
|
| 49 |
assert response.status_code == 200
|
| 50 |
data = response.json()
|
| 51 |
+
|
| 52 |
assert "plugins" in data
|
| 53 |
assert "count" in data
|
| 54 |
+
|
| 55 |
# All returned plugins should be installed
|
| 56 |
for plugin in data["plugins"]:
|
| 57 |
assert plugin["installed"] is True
|
| 58 |
+
|
| 59 |
# Count should match number of plugins
|
| 60 |
assert data["count"] == len(data["plugins"])
|
| 61 |
|
|
|
|
| 64 |
# First get list of plugins to find a valid ID
|
| 65 |
list_response = client.get("/api/plugins")
|
| 66 |
assert list_response.status_code == 200
|
| 67 |
+
|
| 68 |
plugins_data = list_response.json()
|
| 69 |
+
|
| 70 |
# Find first plugin from any category
|
| 71 |
plugin_id = None
|
| 72 |
for category, plugins in plugins_data["plugins"].items():
|
| 73 |
if plugins:
|
| 74 |
plugin_id = plugins[0]["id"]
|
| 75 |
break
|
| 76 |
+
|
| 77 |
if plugin_id:
|
| 78 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 79 |
assert response.status_code == 200
|
| 80 |
+
|
| 81 |
data = response.json()
|
| 82 |
assert data["id"] == plugin_id
|
| 83 |
assert "name" in data
|
|
|
|
| 87 |
def test_get_nonexistent_plugin(self, client: TestClient) -> None:
|
| 88 |
"""Test GET /api/plugins/{plugin_id} for non-existent plugin."""
|
| 89 |
response = client.get("/api/plugins/nonexistent-plugin")
|
| 90 |
+
|
| 91 |
assert response.status_code == 404
|
| 92 |
data = response.json()
|
| 93 |
assert "not found" in data["detail"].lower()
|
|
|
|
| 97 |
# First get a plugin that's not installed
|
| 98 |
list_response = client.get("/api/plugins")
|
| 99 |
assert list_response.status_code == 200
|
| 100 |
+
|
| 101 |
plugins_data = list_response.json()
|
| 102 |
+
|
| 103 |
# Find an uninstalled plugin
|
| 104 |
plugin_id = None
|
| 105 |
for category, plugins in plugins_data["plugins"].items():
|
|
|
|
| 109 |
break
|
| 110 |
if plugin_id:
|
| 111 |
break
|
| 112 |
+
|
| 113 |
if plugin_id:
|
| 114 |
payload = {"plugin_id": plugin_id}
|
| 115 |
response = client.post("/api/plugins/install", json=payload)
|
| 116 |
+
|
| 117 |
assert response.status_code == 200
|
| 118 |
data = response.json()
|
| 119 |
+
|
| 120 |
assert data["status"] == "success"
|
| 121 |
assert data["plugin"]["id"] == plugin_id
|
| 122 |
assert data["plugin"]["installed"] is True
|
|
|
|
| 127 |
# First install a plugin
|
| 128 |
list_response = client.get("/api/plugins")
|
| 129 |
assert list_response.status_code == 200
|
| 130 |
+
|
| 131 |
plugins_data = list_response.json()
|
| 132 |
+
|
| 133 |
# Find an uninstalled plugin to install first
|
| 134 |
plugin_id = None
|
| 135 |
for category, plugins in plugins_data["plugins"].items():
|
|
|
|
| 139 |
break
|
| 140 |
if plugin_id:
|
| 141 |
break
|
| 142 |
+
|
| 143 |
if plugin_id:
|
| 144 |
# Install it
|
| 145 |
payload = {"plugin_id": plugin_id}
|
| 146 |
response = client.post("/api/plugins/install", json=payload)
|
| 147 |
assert response.status_code == 200
|
| 148 |
+
|
| 149 |
# Try to install again
|
| 150 |
response = client.post("/api/plugins/install", json=payload)
|
| 151 |
assert response.status_code == 200
|
| 152 |
+
|
| 153 |
data = response.json()
|
| 154 |
assert data["status"] == "already_installed"
|
| 155 |
assert "already installed" in data["message"]
|
|
|
|
| 158 |
"""Test installing a non-existent plugin."""
|
| 159 |
payload = {"plugin_id": "nonexistent-plugin"}
|
| 160 |
response = client.post("/api/plugins/install", json=payload)
|
| 161 |
+
|
| 162 |
assert response.status_code == 404
|
| 163 |
data = response.json()
|
| 164 |
assert "not found" in data["detail"].lower()
|
|
|
|
| 168 |
# First install a non-core plugin
|
| 169 |
list_response = client.get("/api/plugins")
|
| 170 |
assert list_response.status_code == 200
|
| 171 |
+
|
| 172 |
plugins_data = list_response.json()
|
| 173 |
+
|
| 174 |
# Find a non-core plugin to install and then uninstall
|
| 175 |
+
core_plugins = {
|
| 176 |
+
"mcp-browser",
|
| 177 |
+
"mcp-search",
|
| 178 |
+
"mcp-html",
|
| 179 |
+
"mcp-python-sandbox",
|
| 180 |
+
"proc-json",
|
| 181 |
+
"proc-python",
|
| 182 |
+
"proc-pandas",
|
| 183 |
+
"proc-numpy",
|
| 184 |
+
"proc-bs4",
|
| 185 |
+
}
|
| 186 |
plugin_id = None
|
| 187 |
+
|
| 188 |
for category, plugins in plugins_data["plugins"].items():
|
| 189 |
for plugin in plugins:
|
| 190 |
if plugin["id"] not in core_plugins and not plugin["installed"]:
|
|
|
|
| 192 |
break
|
| 193 |
if plugin_id:
|
| 194 |
break
|
| 195 |
+
|
| 196 |
if plugin_id:
|
| 197 |
# Install it first
|
| 198 |
install_payload = {"plugin_id": plugin_id}
|
| 199 |
install_response = client.post("/api/plugins/install", json=install_payload)
|
| 200 |
assert install_response.status_code == 200
|
| 201 |
+
|
| 202 |
# Now uninstall it
|
| 203 |
uninstall_payload = {"plugin_id": plugin_id}
|
| 204 |
response = client.post("/api/plugins/uninstall", json=uninstall_payload)
|
| 205 |
+
|
| 206 |
assert response.status_code == 200
|
| 207 |
data = response.json()
|
| 208 |
+
|
| 209 |
assert data["status"] == "success"
|
| 210 |
assert data["plugin"]["id"] == plugin_id
|
| 211 |
assert data["plugin"]["installed"] is False
|
|
|
|
| 216 |
# Try to uninstall a core plugin
|
| 217 |
core_plugin_id = "mcp-browser" # This should be a core plugin
|
| 218 |
payload = {"plugin_id": core_plugin_id}
|
| 219 |
+
|
| 220 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 221 |
+
|
| 222 |
assert response.status_code == 400
|
| 223 |
data = response.json()
|
| 224 |
assert "Cannot uninstall core plugin" in data["detail"]
|
|
|
|
| 228 |
# Find an uninstalled non-core plugin
|
| 229 |
list_response = client.get("/api/plugins")
|
| 230 |
assert list_response.status_code == 200
|
| 231 |
+
|
| 232 |
plugins_data = list_response.json()
|
| 233 |
+
core_plugins = {
|
| 234 |
+
"mcp-browser",
|
| 235 |
+
"mcp-search",
|
| 236 |
+
"mcp-html",
|
| 237 |
+
"mcp-python-sandbox",
|
| 238 |
+
"proc-json",
|
| 239 |
+
"proc-python",
|
| 240 |
+
"proc-pandas",
|
| 241 |
+
"proc-numpy",
|
| 242 |
+
"proc-bs4",
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
plugin_id = None
|
| 246 |
for category, plugins in plugins_data["plugins"].items():
|
| 247 |
for plugin in plugins:
|
|
|
|
| 250 |
break
|
| 251 |
if plugin_id:
|
| 252 |
break
|
| 253 |
+
|
| 254 |
if plugin_id:
|
| 255 |
payload = {"plugin_id": plugin_id}
|
| 256 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 257 |
+
|
| 258 |
assert response.status_code == 200
|
| 259 |
data = response.json()
|
| 260 |
assert data["status"] == "not_installed"
|
|
|
|
| 264 |
"""Test uninstalling a non-existent plugin."""
|
| 265 |
payload = {"plugin_id": "nonexistent-plugin"}
|
| 266 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 267 |
+
|
| 268 |
assert response.status_code == 404
|
| 269 |
data = response.json()
|
| 270 |
assert "not found" in data["detail"].lower()
|
|
|
|
| 272 |
def test_get_categories(self, client: TestClient) -> None:
|
| 273 |
"""Test that plugins list includes categories."""
|
| 274 |
response = client.get("/api/plugins")
|
| 275 |
+
|
| 276 |
assert response.status_code == 200
|
| 277 |
data = response.json()
|
| 278 |
+
|
| 279 |
assert "categories" in data
|
| 280 |
categories = data["categories"]
|
| 281 |
+
|
| 282 |
assert isinstance(categories, list)
|
| 283 |
assert len(categories) > 0
|
| 284 |
+
|
| 285 |
# Categories are returned as strings (category IDs)
|
| 286 |
+
expected_categories = ["apis", "mcps", "processors"]
|
| 287 |
for expected in expected_categories:
|
| 288 |
assert expected in categories
|
| 289 |
|
| 290 |
+
# Agents/skills are intentionally managed via /api/agents, not /api/plugins
|
| 291 |
+
assert "skills" not in categories
|
| 292 |
+
|
| 293 |
def test_plugin_structure_validation(self, client: TestClient) -> None:
|
| 294 |
"""Test that all plugins have required fields."""
|
| 295 |
response = client.get("/api/plugins")
|
| 296 |
assert response.status_code == 200
|
| 297 |
+
|
| 298 |
data = response.json()
|
| 299 |
+
|
| 300 |
required_fields = ["id", "name", "category", "description", "version", "installed"]
|
| 301 |
+
|
| 302 |
for category, plugins in data["plugins"].items():
|
| 303 |
for plugin in plugins:
|
| 304 |
for field in required_fields:
|
| 305 |
+
assert field in plugin, (
|
| 306 |
+
f"Plugin {plugin.get('id', 'unknown')} missing field {field}"
|
| 307 |
+
)
|
| 308 |
|
| 309 |
def test_install_uninstall_payload_validation(self, client: TestClient) -> None:
|
| 310 |
"""Test payload validation for install/uninstall endpoints."""
|
| 311 |
# Missing plugin_id for install
|
| 312 |
response = client.post("/api/plugins/install", json={})
|
| 313 |
assert response.status_code == 422
|
| 314 |
+
|
| 315 |
# Missing plugin_id for uninstall
|
| 316 |
response = client.post("/api/plugins/uninstall", json={})
|
| 317 |
assert response.status_code == 422
|
| 318 |
+
|
| 319 |
# Invalid payload type
|
| 320 |
response = client.post("/api/plugins/install", json={"plugin_id": 123})
|
| 321 |
assert response.status_code == 422
|
|
|
|
| 325 |
# Find a non-core plugin
|
| 326 |
list_response = client.get("/api/plugins")
|
| 327 |
assert list_response.status_code == 200
|
| 328 |
+
|
| 329 |
plugins_data = list_response.json()
|
| 330 |
+
core_plugins = {
|
| 331 |
+
"mcp-browser",
|
| 332 |
+
"mcp-search",
|
| 333 |
+
"mcp-html",
|
| 334 |
+
"mcp-python-sandbox",
|
| 335 |
+
"proc-json",
|
| 336 |
+
"proc-python",
|
| 337 |
+
"proc-pandas",
|
| 338 |
+
"proc-numpy",
|
| 339 |
+
"proc-bs4",
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
plugin_id = None
|
| 343 |
for category, plugins in plugins_data["plugins"].items():
|
| 344 |
for plugin in plugins:
|
|
|
|
| 347 |
break
|
| 348 |
if plugin_id:
|
| 349 |
break
|
| 350 |
+
|
| 351 |
if plugin_id:
|
| 352 |
# Check initial state
|
| 353 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 354 |
initial_state = response.json()["installed"]
|
| 355 |
+
|
| 356 |
# Toggle state by installing if not installed, or uninstalling if installed and not core
|
| 357 |
if not initial_state:
|
| 358 |
payload = {"plugin_id": plugin_id}
|
| 359 |
response = client.post("/api/plugins/install", json=payload)
|
| 360 |
assert response.status_code == 200
|
| 361 |
+
|
| 362 |
# Verify state changed
|
| 363 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 364 |
assert response.json()["installed"] is True
|
|
|
|
| 368 |
payload = {"plugin_id": plugin_id}
|
| 369 |
response = client.post("/api/plugins/uninstall", json=payload)
|
| 370 |
assert response.status_code == 200
|
| 371 |
+
|
| 372 |
# Verify state changed
|
| 373 |
response = client.get(f"/api/plugins/{plugin_id}")
|
| 374 |
+
assert response.json()["installed"] is False
|
backend/tests/test_api/test_scrape_e2e.py
ADDED
|
@@ -0,0 +1,748 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""High-coverage end-to-end scrape tests with deterministic offline fixtures."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from collections import Counter
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Any
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
|
| 13 |
+
import pytest
|
| 14 |
+
from fastapi.testclient import TestClient
|
| 15 |
+
|
| 16 |
+
from app.api.routes import scrape as scrape_routes
|
| 17 |
+
from app.core.action import Action
|
| 18 |
+
from app.core.env import WebScraperEnv
|
| 19 |
+
from app.sites.templates import SITE_TEMPLATES
|
| 20 |
+
|
| 21 |
+
# Plugin IDs sent as `enable_plugins` whenever a case does not supply its own list.
BASE_PLUGINS = [
    "mcp-browser",
    "mcp-search",
    "mcp-html",
]

# Additional plugin IDs enabled for the cases that submit `python_code`.
PYTHON_PLUGINS = [
    "mcp-python-sandbox",
    "proc-python",
    "proc-pandas",
    "proc-numpy",
    "proc-bs4",
]

# Agent names sent as `selected_agents` whenever a case does not supply its own list.
DEFAULT_AGENTS = [
    "planner",
    "navigator",
    "extractor",
    "verifier",
]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _is_live_network_mode() -> bool:
|
| 33 |
+
"""Return True when live-network E2E mode is enabled."""
|
| 34 |
+
|
| 35 |
+
raw = os.getenv("SCRAPERL_E2E_LIVE_NETWORK", "0").strip().lower()
|
| 36 |
+
return raw in {"1", "true", "yes", "on"}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _env_positive_int(name: str) -> int | None:
|
| 40 |
+
"""Read an optional positive integer environment variable."""
|
| 41 |
+
|
| 42 |
+
raw = os.getenv(name)
|
| 43 |
+
if raw is None:
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
try:
|
| 47 |
+
value = int(raw)
|
| 48 |
+
except ValueError:
|
| 49 |
+
return None
|
| 50 |
+
|
| 51 |
+
if value <= 0:
|
| 52 |
+
return None
|
| 53 |
+
return value
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass(frozen=True)
class E2ECase:
    """Immutable description of a single end-to-end scrape scenario.

    Only `name` and `payload` are required; the remaining fields are optional
    expectations that default to "nothing expected".
    """

    # Unique label used to identify the case in failure messages.
    name: str
    # JSON body posted to the streaming scrape endpoint.
    payload: dict[str, Any]
    # Site template ID expected among the emitted steps, or None to skip the check.
    expected_template_id: str | None = None
    # Navigation strategy expected among the emitted steps, or None to skip the check.
    expected_strategy: str | None = None
    # True when the run must emit at least one sandbox execution step.
    expect_sandbox: bool = False
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _build_gold_csv(months: int = 180) -> str:
|
| 68 |
+
"""Create deterministic monthly gold CSV data for offline tests."""
|
| 69 |
+
|
| 70 |
+
lines = ["Date,Price"]
|
| 71 |
+
year = 2012
|
| 72 |
+
month = 1
|
| 73 |
+
|
| 74 |
+
for index in range(months):
|
| 75 |
+
price = 1120.0 + (index * 2.75)
|
| 76 |
+
lines.append(f"{year:04d}-{month:02d}-01,{price:.2f}")
|
| 77 |
+
month += 1
|
| 78 |
+
if month > 12:
|
| 79 |
+
month = 1
|
| 80 |
+
year += 1
|
| 81 |
+
|
| 82 |
+
return "\n".join(lines)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _build_html_payload(url: str) -> str:
|
| 86 |
+
"""Build deterministic HTML content with rich extraction surfaces."""
|
| 87 |
+
|
| 88 |
+
parsed = urlparse(url)
|
| 89 |
+
domain = parsed.netloc or "example.com"
|
| 90 |
+
path = parsed.path or "/"
|
| 91 |
+
slug = path.strip("/").replace("/", "-") or "home"
|
| 92 |
+
|
| 93 |
+
github_cards = ""
|
| 94 |
+
if "github.com" in domain and ("trending" in path or "explore" in path or path == "/"):
|
| 95 |
+
github_cards = """
|
| 96 |
+
<article class="Box-row">
|
| 97 |
+
<h2><a href="/alpha/repo-one">alpha / repo-one</a></h2>
|
| 98 |
+
<a href="/alpha/repo-one/stargazers">1,234</a>
|
| 99 |
+
<a href="/alpha/repo-one/network/members">210</a>
|
| 100 |
+
</article>
|
| 101 |
+
<article class="Box-row">
|
| 102 |
+
<h2><a href="/beta/repo-two">beta / repo-two</a></h2>
|
| 103 |
+
<a href="/beta/repo-two/stargazers">987</a>
|
| 104 |
+
<a href="/beta/repo-two/network/members">145</a>
|
| 105 |
+
</article>
|
| 106 |
+
<article class="Box-row">
|
| 107 |
+
<h2><a href="/gamma/repo-three">gamma / repo-three</a></h2>
|
| 108 |
+
<a href="/gamma/repo-three/stargazers">876</a>
|
| 109 |
+
<a href="/gamma/repo-three/network/members">132</a>
|
| 110 |
+
</article>
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
return f"""
|
| 114 |
+
<html>
|
| 115 |
+
<head>
|
| 116 |
+
<title>{domain} :: {slug}</title>
|
| 117 |
+
<meta name="description" content="Mock page for {domain} and {slug}" />
|
| 118 |
+
<meta property="og:title" content="{domain} sample" />
|
| 119 |
+
</head>
|
| 120 |
+
<body>
|
| 121 |
+
<h1>{domain} heading</h1>
|
| 122 |
+
<p>
|
| 123 |
+
Offline content for {url}. Contact: test+{slug}@example.com
|
| 124 |
+
</p>
|
| 125 |
+
<a href="https://{domain}/about">About</a>
|
| 126 |
+
<a href="https://{domain}/contact">Contact</a>
|
| 127 |
+
<a href="mailto:hello@example.com">Email</a>
|
| 128 |
+
<img src="https://{domain}/logo.png" alt="logo" />
|
| 129 |
+
<form action="/submit" method="post">
|
| 130 |
+
<input type="text" name="query" />
|
| 131 |
+
<textarea name="notes"></textarea>
|
| 132 |
+
</form>
|
| 133 |
+
<table>
|
| 134 |
+
<tr><th>month</th><th>gold_price_usd</th></tr>
|
| 135 |
+
<tr><td>2016-01</td><td>1101.00</td></tr>
|
| 136 |
+
<tr><td>2016-02</td><td>1104.00</td></tr>
|
| 137 |
+
</table>
|
| 138 |
+
<script src="/assets/app.js"></script>
|
| 139 |
+
{github_cards}
|
| 140 |
+
</body>
|
| 141 |
+
</html>
|
| 142 |
+
"""
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
@pytest.fixture(autouse=True)
def patch_network_dependencies(monkeypatch: pytest.MonkeyPatch) -> None:
    """Replace network-facing scraper hooks with offline fakes for every test.

    Returns without patching when live-network mode is enabled. Otherwise the
    environment's navigate step, the route module's URL search and its Reddit
    community fetch are monkeypatched with deterministic in-memory stand-ins.
    """
    if _is_live_network_mode():
        return

    # Built once per test and shared by every CSV-like navigation below.
    offline_gold_csv = _build_gold_csv()

    async def fake_execute_navigate(self: WebScraperEnv, action: Action) -> dict[str, Any]:
        # Normalise the requested URL, defaulting the scheme to https.
        requested = str(action.get_param("url") or "https://example.com").strip()
        has_scheme = re.match(r"^https?://", requested, flags=re.IGNORECASE)
        target = requested if has_scheme else f"https://{requested}"

        host = urlparse(target).netloc
        if not host:
            return {"success": False, "error": f"Invalid URL: {requested}"}

        # Record the visit on the environment exactly as a successful load would.
        self._current_url = target
        self._navigation_history.append(target)
        self._page_status_code = 200

        serves_csv = target.endswith(".csv") or "gold-prices" in target
        if serves_csv:
            self._page_content_type = "text/csv"
            self._page_html = offline_gold_csv
            self._page_title = "gold-prices-monthly"
        else:
            self._page_content_type = "text/html; charset=utf-8"
            self._page_html = _build_html_payload(target)
            self._page_title = host

        return {
            "success": True,
            "url": target,
            "status_code": 200,
            "content_type": self._page_content_type,
            "tls_verification_bypassed": False,
        }

    async def fake_search_urls(query: str, max_results: int = 6) -> list[str]:
        needle = query.lower()

        # Gold price/trend queries resolve to the mocked dataset sources.
        if "gold" in needle and ("price" in needle or "trend" in needle):
            return [
                "https://data.mock/gold/monthly.csv",
                "https://github.com/datasets/gold-prices",
            ]

        if "reddit" in needle:
            return [
                "https://www.reddit.com/r/python/",
                "https://www.reddit.com/r/machinelearning/",
                "https://www.reddit.com/r/programming/",
            ]

        # Anything else: derive a host label from the query and return 1-3 URLs.
        label = re.sub(r"[^a-z0-9]+", "-", needle).strip("-") or "query"
        how_many = max(1, min(max_results, 3))
        return [f"https://{label}.example.com/source-{n}" for n in range(1, how_many + 1)]

    def fake_fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
        # Second tuple element is the source label reported for the mocked data.
        mocked = [
            {
                "subreddit": f"r/mockcommunity{idx + 1}",
                "title": f"Mock Community {idx + 1}",
                "subscribers": 200000 - (idx * 1000),
                "active_users": 15000 - (idx * 100),
                "url": f"https://www.reddit.com/r/mockcommunity{idx + 1}/",
                "description": "Offline mocked Reddit community",
            }
            for idx in range(limit)
        ]
        return mocked, "mock_reddit_json"

    monkeypatch.setattr(WebScraperEnv, "_execute_navigate", fake_execute_navigate)
    monkeypatch.setattr(scrape_routes, "_search_urls_with_mcp", fake_search_urls)
    monkeypatch.setattr(scrape_routes, "_fetch_reddit_communities", fake_fetch_reddit_communities)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _build_payload(
|
| 227 |
+
*,
|
| 228 |
+
assets: list[str],
|
| 229 |
+
instructions: str,
|
| 230 |
+
output_format: str = "json",
|
| 231 |
+
complexity: str = "low",
|
| 232 |
+
enable_plugins: list[str] | None = None,
|
| 233 |
+
selected_agents: list[str] | None = None,
|
| 234 |
+
python_code: str | None = None,
|
| 235 |
+
) -> dict[str, Any]:
|
| 236 |
+
"""Build a scrape payload using defaults aligned with app behavior."""
|
| 237 |
+
|
| 238 |
+
output_instructions = {
|
| 239 |
+
"json": "Return as structured JSON",
|
| 240 |
+
"csv": "Return as CSV with stable column order",
|
| 241 |
+
"markdown": "Return as Markdown sections",
|
| 242 |
+
"text": "Return as plain text summary",
|
| 243 |
+
}[output_format]
|
| 244 |
+
|
| 245 |
+
payload: dict[str, Any] = {
|
| 246 |
+
"assets": assets,
|
| 247 |
+
"instructions": instructions,
|
| 248 |
+
"output_instructions": output_instructions,
|
| 249 |
+
"output_format": output_format,
|
| 250 |
+
"complexity": complexity,
|
| 251 |
+
"model": "llama-3.3-70b",
|
| 252 |
+
"provider": "nvidia",
|
| 253 |
+
"enable_memory": True,
|
| 254 |
+
"enable_plugins": enable_plugins or list(BASE_PLUGINS),
|
| 255 |
+
"selected_agents": selected_agents or list(DEFAULT_AGENTS),
|
| 256 |
+
"max_steps": 50,
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
if python_code:
|
| 260 |
+
payload["python_code"] = python_code
|
| 261 |
+
|
| 262 |
+
return payload
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _build_e2e_cases() -> list[E2ECase]:
    """Assemble the offline suite: one case per site template plus synthetic extras.

    The closing assertions require exactly 100 uniquely named cases. The extras
    below contribute a fixed 44 (20 + 10 + 7 + 7), so the assertions only hold
    while SITE_TEMPLATES supplies the remaining 56.
    """
    suite: list[E2ECase] = []
    format_cycle = ["json", "markdown", "text", "csv"]

    # --- One case per registered site template --------------------------------
    for position, site in enumerate(SITE_TEMPLATES):
        # Multiples of 29 take precedence over multiples of 17 (position 0 is both).
        if position % 29 == 0:
            level = "high"
        elif position % 17 == 0:
            level = "medium"
        else:
            level = "low"

        # Every 14th template also exercises the Python sandbox.
        uses_sandbox = position % 14 == 0
        plugin_ids = list(BASE_PLUGINS)
        sandbox_code = None
        if uses_sandbox:
            plugin_ids += [*PYTHON_PLUGINS, "skill-planner"]
            sandbox_code = (
                "rows = payload.get('dataset_rows') or []\n"
                "result = {'rows_seen': len(rows), 'source_links': len(payload.get('source_links') or [])}"
            )

        task = f"Collect structured highlights for {site.name} template case {position + 1}"
        strategy = None
        if site.site_id == "github":
            task = f"Extract trending repo stats from GitHub case {position + 1}"
            strategy = "github_trending"
        elif site.site_id == "reddit":
            task = f"Extract trending communities from Reddit case {position + 1}"
            strategy = "reddit_trending"

        suite.append(
            E2ECase(
                name=f"template-{position + 1:02d}-{site.site_id}",
                payload=_build_payload(
                    assets=[f"https://{site.domains[0]}"],
                    instructions=task,
                    output_format=format_cycle[position % len(format_cycle)],
                    complexity=level,
                    enable_plugins=plugin_ids,
                    python_code=sandbox_code,
                ),
                expected_template_id=site.site_id,
                expected_strategy=strategy,
                expect_sandbox=uses_sandbox,
            )
        )

    # --- 20 free-text discovery queries ---------------------------------------
    for position in range(20):
        search_terms = [f"synthetic discovery query batch {position + 1}"]
        if position % 5 == 0:
            search_terms.append(f"synthetic companion signal {position + 1}")

        plugin_ids = list(BASE_PLUGINS)
        if position % 4 == 0:
            plugin_ids.append("skill-navigator")

        suite.append(
            E2ECase(
                name=f"query-{position + 1:02d}",
                payload=_build_payload(
                    assets=search_terms,
                    instructions=f"Search and extract useful findings for synthetic query case {position + 1}",
                    output_format="json",
                    complexity="low",
                    enable_plugins=plugin_ids,
                ),
            )
        )

    # --- 10 gold-price dataset requests that must run sandbox code -------------
    gold_sandbox_code = (
        "rows = payload.get('dataset_rows') or []\n"
        "columns = sorted(list(rows[0].keys())) if rows else []\n"
        "result = {'rows_seen': len(rows), 'columns': columns}"
    )
    suite.extend(
        E2ECase(
            name=f"gold-dataset-{position + 1:02d}",
            payload=_build_payload(
                assets=[f"gold price trend monthly dataset request {position + 1}"],
                instructions=f"Build monthly gold price trend dataset from 2016 case {position + 1}",
                output_format="csv",
                complexity="high",
                enable_plugins=[*BASE_PLUGINS, *PYTHON_PLUGINS, "skill-extractor"],
                python_code=gold_sandbox_code,
            ),
            expect_sandbox=True,
        )
        for position in range(10)
    )

    # --- 7 extra GitHub trending requests --------------------------------------
    suite.extend(
        E2ECase(
            name=f"github-trending-extra-{position + 1:02d}",
            payload=_build_payload(
                assets=[f"https://github.com/trending?since=daily&batch={position + 1}"],
                instructions=f"List trending GitHub repositories and stats case {position + 1}",
                output_format="csv",
                complexity="medium",
                enable_plugins=list(BASE_PLUGINS),
            ),
            expected_template_id="github",
            expected_strategy="github_trending",
        )
        for position in range(7)
    )

    # --- 7 extra Reddit trending requests --------------------------------------
    suite.extend(
        E2ECase(
            name=f"reddit-trending-extra-{position + 1:02d}",
            payload=_build_payload(
                assets=[f"https://www.reddit.com/?batch={position + 1}"],
                instructions=f"List trending Reddit communities and activity case {position + 1}",
                output_format="csv",
                complexity="medium",
                enable_plugins=list(BASE_PLUGINS),
            ),
            expected_template_id="reddit",
            expected_strategy="reddit_trending",
        )
        for position in range(7)
    )

    assert len(suite) == 100
    assert len({case.name for case in suite}) == 100
    return suite
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def _build_live_network_cases() -> list[E2ECase]:
    """Return the fixed list of cases aimed at real sites (no mocks) for staging runs."""

    def live(
        name: str,
        asset: str,
        instructions: str,
        output_format: str,
        complexity: str = "low",
        *,
        plugins: list[str] | None = None,
        python_code: str | None = None,
        template: str | None = None,
        strategy: str | None = None,
        sandbox: bool = False,
    ) -> E2ECase:
        # Single-asset case factory; None for plugins/python_code lets
        # _build_payload apply its own defaults, same as omitting them.
        return E2ECase(
            name=name,
            payload=_build_payload(
                assets=[asset],
                instructions=instructions,
                output_format=output_format,
                complexity=complexity,
                enable_plugins=plugins,
                python_code=python_code,
            ),
            expected_template_id=template,
            expected_strategy=strategy,
            expect_sandbox=sandbox,
        )

    return [
        # Trending strategies.
        live(
            "live-github-trending",
            "https://github.com/trending",
            "Extract trending repo stats from GitHub",
            "csv",
            "medium",
            plugins=[*BASE_PLUGINS, "skill-planner"],
            template="github",
            strategy="github_trending",
        ),
        live(
            "live-reddit-trending",
            "https://www.reddit.com/",
            "Extract trending communities from Reddit",
            "csv",
            "medium",
            plugins=[*BASE_PLUGINS, "skill-navigator"],
            template="reddit",
            strategy="reddit_trending",
        ),
        # Direct URLs, most with a known site template.
        live(
            "live-wikipedia-main",
            "https://en.wikipedia.org/wiki/Main_Page",
            "Extract reference content summary",
            "json",
            template="wikipedia",
        ),
        live(
            "live-python-home",
            "https://www.python.org/",
            "Extract homepage highlights and links",
            "markdown",
        ),
        live(
            "live-huggingface-models",
            "https://huggingface.co/models",
            "Extract model hub highlights",
            "json",
            template="huggingface",
        ),
        live(
            "live-arxiv-new",
            "https://arxiv.org/list/cs/new",
            "Extract latest computer science papers",
            "json",
            template="arxiv",
        ),
        live(
            "live-stackoverflow-questions",
            "https://stackoverflow.com/questions",
            "Extract top question cards and metadata",
            "text",
            template="stackoverflow",
        ),
        live(
            "live-example-domain",
            "https://example.com",
            "Extract title, content, and links",
            "text",
        ),
        # Free-text queries that have to be resolved through search.
        live(
            "live-query-discovery-1",
            "open source scraping frameworks comparison",
            "Search and extract useful findings",
            "json",
        ),
        live(
            "live-query-discovery-2",
            "python data extraction tutorials",
            "Search and extract useful findings",
            "markdown",
        ),
        # Dataset request that must run sandbox code.
        live(
            "live-gold-dataset",
            "gold price trend monthly dataset",
            "Build monthly gold price trend dataset from 2016 onward",
            "csv",
            "high",
            plugins=[*BASE_PLUGINS, *PYTHON_PLUGINS, "skill-extractor"],
            python_code=(
                "rows = payload.get('dataset_rows') or []\n"
                "result = {'rows_seen': len(rows), 'columns': sorted(list(rows[0].keys())) if rows else []}"
            ),
            sandbox=True,
        ),
        live(
            "live-github-explore",
            "https://github.com/explore",
            "Extract repository metadata from GitHub explore",
            "json",
            "medium",
            template="github",
        ),
    ]
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
def _collect_stream_events(client: TestClient, payload: dict[str, Any]) -> list[dict[str, Any]]:
|
| 531 |
+
"""Run one stream scrape request and collect SSE events."""
|
| 532 |
+
|
| 533 |
+
events: list[dict[str, Any]] = []
|
| 534 |
+
|
| 535 |
+
with client.stream("POST", "/api/scrape/stream", json=payload) as response:
|
| 536 |
+
assert response.status_code == 200
|
| 537 |
+
|
| 538 |
+
for raw_line in response.iter_lines():
|
| 539 |
+
if not raw_line:
|
| 540 |
+
continue
|
| 541 |
+
|
| 542 |
+
line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
|
| 543 |
+
if not line.startswith("data: "):
|
| 544 |
+
continue
|
| 545 |
+
|
| 546 |
+
event = json.loads(line[6:])
|
| 547 |
+
events.append(event)
|
| 548 |
+
if event.get("type") == "complete":
|
| 549 |
+
break
|
| 550 |
+
|
| 551 |
+
return events
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
def _run_case_batch(client: TestClient, cases: list[E2ECase]) -> dict[str, Any]:
    """Run every case through the streaming scrape endpoint and aggregate results.

    A failing case never aborts the batch: failed expectations, malformed
    events (missing keys, non-numeric counters, undecodable JSON) and an
    unexpected cleanup status are all recorded as "<case name>: <reason>".

    Returns:
        A dict with:
        - "failures": list of per-case failure strings (empty when all passed)
        - "tool_call_counts": Counter of tool names seen in "tool_call" steps
        - "strategy_counts": Counter of navigation strategies from "plugins" steps
        - "seen_template_ids": set of site IDs from "site_template" steps
        - "sandbox_success_cases": number of sandbox-expecting cases that emitted
          a sandbox step
        - "completed_cases": number of cases whose final status was "completed"
    """
    failures: list[str] = []
    tool_call_counts: Counter[str] = Counter()
    strategy_counts: Counter[str] = Counter()
    seen_template_ids: set[str] = set()
    sandbox_success_cases = 0
    completed_cases = 0

    for case in cases:
        # Stays None until an init event is seen, so cleanup is skipped otherwise.
        session_id: str | None = None

        try:
            events = _collect_stream_events(client, case.payload)

            init_event = next((event for event in events if event.get("type") == "init"), None)
            complete_event = next(
                (event for event in events if event.get("type") == "complete"),
                None,
            )

            assert init_event is not None, "missing init event"
            session_id = str(init_event["session_id"])
            assert complete_event is not None, "missing complete event"

            complete_data = complete_event.get("data")
            assert isinstance(complete_data, dict), "complete payload is not a dictionary"
            assert complete_data["session_id"] == session_id
            assert complete_data["status"] in {"completed", "partial"}
            assert int(complete_data["total_steps"]) > 0
            assert int(complete_data["urls_processed"]) >= 1

            if complete_data["status"] == "completed":
                completed_cases += 1

            # Agent skills and the scraper itself must not be reported as plugins.
            enabled_plugins = complete_data.get("enabled_plugins") or []
            assert all(not str(plugin_id).startswith("skill-") for plugin_id in enabled_plugins)
            assert "web_scraper" not in enabled_plugins

            steps = [
                event.get("data")
                for event in events
                if event.get("type") == "step" and isinstance(event.get("data"), dict)
            ]
            assert steps, "no step events emitted"

            case_template_ids: set[str] = set()
            case_strategies: set[str] = set()

            for step in steps:
                action = step.get("action")
                extracted = step.get("extracted_data")
                if not isinstance(extracted, dict):
                    continue

                if action == "tool_call":
                    tool_name = extracted.get("tool_name")
                    if isinstance(tool_name, str) and tool_name:
                        tool_call_counts[tool_name] += 1

                if action == "plugins":
                    strategy = extracted.get("navigation_strategy")
                    if isinstance(strategy, str) and strategy:
                        case_strategies.add(strategy)
                        strategy_counts[strategy] += 1

                if action == "site_template":
                    site_id = extracted.get("site_id")
                    if isinstance(site_id, str) and site_id:
                        case_template_ids.add(site_id)

            seen_template_ids.update(case_template_ids)

            if case.expected_template_id:
                assert case.expected_template_id in case_template_ids, (
                    f"expected site template '{case.expected_template_id}' not emitted"
                )

            if case.expected_strategy:
                assert case.expected_strategy in case_strategies, (
                    f"expected strategy '{case.expected_strategy}' not emitted"
                )

            sandbox_seen = any(
                step.get("action") in {"planner_python", "navigator_python", "python_sandbox"}
                for step in steps
            )
            if case.expect_sandbox:
                assert sandbox_seen, "sandbox execution steps not emitted"
                sandbox_success_cases += 1

        except AssertionError as exc:
            failures.append(f"{case.name}: {exc}")
        except (KeyError, TypeError, ValueError) as exc:
            # Malformed events (missing key, non-numeric counter, bad JSON) fail
            # this case only; previously they aborted the whole batch.
            failures.append(f"{case.name}: {type(exc).__name__}: {exc}")
        finally:
            if session_id:
                cleanup_response = client.delete(f"/api/scrape/{session_id}/cleanup")
                # An assert here would escape the handlers above (it runs in
                # `finally`) and discard every failure collected so far, so the
                # unexpected status is recorded instead.
                if cleanup_response.status_code not in {200, 404}:
                    failures.append(
                        f"{case.name}: session cleanup returned HTTP {cleanup_response.status_code}"
                    )

    return {
        "failures": failures,
        "tool_call_counts": tool_call_counts,
        "strategy_counts": strategy_counts,
        "seen_template_ids": seen_template_ids,
        "sandbox_success_cases": sandbox_success_cases,
        "completed_cases": completed_cases,
    }
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
def test_plugins_registry_excludes_agent_skills(client: TestClient) -> None:
    """The /api/plugins listing must expose no agent-skill category or entries.

    Checks three things on the JSON payload: the ``categories`` collection has
    no ``"skills"`` member, no plugin id starts with ``"skill-"``, and the id
    ``"web_scraper"`` is absent.
    """

    resp = client.get("/api/plugins")
    assert resp.status_code == 200
    body = resp.json()

    # The category list itself must not advertise "skills".
    advertised = body["categories"]
    assert "skills" not in advertised

    # Flatten the per-category plugin lists into one list of ids.
    collected_ids = []
    for group in body["plugins"].values():
        for entry in group:
            collected_ids.append(entry["id"])

    assert not any(pid.startswith("skill-") for pid in collected_ids)
    assert "web_scraper" not in collected_ids
|
| 676 |
+
|
| 677 |
+
|
| 678 |
+
def test_scraper_e2e_100_inputs_templates_tools_plugins_and_sandbox(
    client: TestClient,
) -> None:
    """Drive the offline 100-case scrape batch and validate its aggregate summary.

    Skipped when live-network mode is active. After running every case through
    ``_run_case_batch`` the test requires: exactly 100 cases, no recorded
    failures, every ``SITE_TEMPLATES`` id seen, a fixed set of tool names
    present in the tool-call counts, at least one ``github_trending`` and one
    ``reddit_trending`` strategy hit, at least 10 sandbox successes and at
    least 95 completed cases.
    """

    if _is_live_network_mode():
        pytest.skip("Offline deterministic E2E suite is skipped in live-network mode")

    e2e_cases = _build_e2e_cases()
    report = _run_case_batch(client, e2e_cases)

    assert len(e2e_cases) == 100
    # Only the first dozen failures go into the message to keep it readable.
    assert not report["failures"], " | ".join(report["failures"][:12])

    # Every registered site template must have been emitted by some case.
    wanted_templates = set()
    for template in SITE_TEMPLATES:
        wanted_templates.add(template.site_id)
    assert wanted_templates.issubset(report["seen_template_ids"])

    # Tool names that must each have been called at least once across the batch.
    wanted_tools = {
        "url.parse",
        "validate.url",
        "browser.navigate",
        "html.parse",
        "html.extract",
        "memory.store",
        "sandbox.execute",
        "extract.urls",
        "extract.emails",
        "csv.generate",
    }
    tools_seen = set(report["tool_call_counts"].keys())
    assert not (wanted_tools - tools_seen)

    for strategy_name in ("github_trending", "reddit_trending"):
        assert report["strategy_counts"][strategy_name] >= 1
    assert report["sandbox_success_cases"] >= 10
    assert report["completed_cases"] >= 95
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
@pytest.mark.skipif(
    not _is_live_network_mode(),
    reason="Enable SCRAPERL_E2E_LIVE_NETWORK=1 for live-network staging runs",
)
def test_scraper_e2e_live_network_mode_staging(client: TestClient) -> None:
    """Live-network E2E mode with no mocks, controlled by environment flag.

    The case list from ``_build_live_network_cases`` may be truncated through
    ``SCRAPERL_E2E_LIVE_CASE_LIMIT`` (read via ``_env_positive_int``; presumably
    ``None`` when unset -- verify against the helper). Every aggregate
    expectation below is derived from the cases that actually ran, so a
    truncated run is not failed for strategies, templates or sandbox steps
    that only the dropped cases would have exercised.
    """

    cases = _build_live_network_cases()
    case_limit = _env_positive_int("SCRAPERL_E2E_LIVE_CASE_LIMIT")
    if case_limit is not None:
        # Slicing already clamps to the list length; no explicit min() needed.
        cases = cases[:case_limit]

    summary = _run_case_batch(client, cases)

    # Only the first ten failures go into the message to keep it readable.
    assert not summary["failures"], " | ".join(summary["failures"][:10])

    expected_templates = {case.expected_template_id for case in cases if case.expected_template_id}
    missing_templates = expected_templates - set(summary["seen_template_ids"])
    assert not missing_templates, f"site templates not emitted: {sorted(missing_templates)}"

    required_tool_calls = {
        "url.parse",
        "browser.navigate",
        "html.parse",
        "html.extract",
        "memory.store",
    }
    missing_tools = required_tool_calls - set(summary["tool_call_counts"].keys())
    assert not missing_tools, f"tool calls not observed: {sorted(missing_tools)}"

    expected_sandbox_cases = sum(1 for case in cases if case.expect_sandbox)
    assert summary["sandbox_success_cases"] >= expected_sandbox_cases

    # The full run must hit both trending strategies. A truncated run can only
    # be held to the strategies its retained cases declare; otherwise a small
    # SCRAPERL_E2E_LIVE_CASE_LIMIT would fail the test spuriously.
    required_strategies = {"github_trending", "reddit_trending"}
    if case_limit is not None:
        required_strategies &= {case.expected_strategy for case in cases if case.expected_strategy}
    for strategy in sorted(required_strategies):
        assert summary["strategy_counts"].get(strategy, 0) >= 1, (
            f"strategy '{strategy}' was never selected in the live-network run"
        )

    assert summary["completed_cases"] >= max(1, len(cases) // 2)
|
frontend/src/components/PluginsPage.tsx
CHANGED
|
@@ -10,7 +10,6 @@ import {
|
|
| 10 |
AlertCircle,
|
| 11 |
Loader2,
|
| 12 |
Plug,
|
| 13 |
-
Cpu,
|
| 14 |
Wrench,
|
| 15 |
Database,
|
| 16 |
Sparkles,
|
|
@@ -49,8 +48,6 @@ const getCategoryIcon = (category: string) => {
|
|
| 49 |
return <Plug className="w-5 h-5 text-cyan-400" />;
|
| 50 |
case 'mcps':
|
| 51 |
return <Wrench className="w-5 h-5 text-amber-400" />;
|
| 52 |
-
case 'skills':
|
| 53 |
-
return <Cpu className="w-5 h-5 text-purple-400" />;
|
| 54 |
case 'processors':
|
| 55 |
return <Database className="w-5 h-5 text-pink-400" />;
|
| 56 |
default:
|
|
@@ -62,7 +59,6 @@ const getCategoryLabel = (category: string) => {
|
|
| 62 |
const labels: Record<string, string> = {
|
| 63 |
apis: 'API Providers',
|
| 64 |
mcps: 'MCP Tools',
|
| 65 |
-
skills: 'Skills & Agents',
|
| 66 |
processors: 'Data Processors',
|
| 67 |
};
|
| 68 |
return labels[category] || category;
|
|
@@ -72,7 +68,6 @@ const getCategoryColor = (category: string) => {
|
|
| 72 |
const colors: Record<string, string> = {
|
| 73 |
apis: 'from-cyan-500/20 to-blue-500/10 border-cyan-500/30',
|
| 74 |
mcps: 'from-amber-500/20 to-orange-500/10 border-amber-500/30',
|
| 75 |
-
skills: 'from-purple-500/20 to-pink-500/10 border-purple-500/30',
|
| 76 |
processors: 'from-pink-500/20 to-rose-500/10 border-pink-500/30',
|
| 77 |
};
|
| 78 |
return colors[category] || 'from-gray-500/20 to-gray-500/10 border-gray-500/30';
|
|
@@ -169,7 +164,7 @@ export const PluginsPage: React.FC<PluginsPageProps> = ({ className }) => {
|
|
| 169 |
Plugins
|
| 170 |
</h1>
|
| 171 |
<p className="text-gray-400 mt-1">
|
| 172 |
-
Extend ScrapeRL with APIs, tools,
|
| 173 |
</p>
|
| 174 |
</div>
|
| 175 |
|
|
@@ -228,7 +223,7 @@ export const PluginsPage: React.FC<PluginsPageProps> = ({ className }) => {
|
|
| 228 |
>
|
| 229 |
All
|
| 230 |
</button>
|
| 231 |
-
{['apis', 'mcps', '
|
| 232 |
<button
|
| 233 |
key={cat}
|
| 234 |
onClick={() => setSelectedCategory(cat)}
|
|
|
|
| 10 |
AlertCircle,
|
| 11 |
Loader2,
|
| 12 |
Plug,
|
|
|
|
| 13 |
Wrench,
|
| 14 |
Database,
|
| 15 |
Sparkles,
|
|
|
|
| 48 |
return <Plug className="w-5 h-5 text-cyan-400" />;
|
| 49 |
case 'mcps':
|
| 50 |
return <Wrench className="w-5 h-5 text-amber-400" />;
|
|
|
|
|
|
|
| 51 |
case 'processors':
|
| 52 |
return <Database className="w-5 h-5 text-pink-400" />;
|
| 53 |
default:
|
|
|
|
| 59 |
const labels: Record<string, string> = {
|
| 60 |
apis: 'API Providers',
|
| 61 |
mcps: 'MCP Tools',
|
|
|
|
| 62 |
processors: 'Data Processors',
|
| 63 |
};
|
| 64 |
return labels[category] || category;
|
|
|
|
| 68 |
const colors: Record<string, string> = {
|
| 69 |
apis: 'from-cyan-500/20 to-blue-500/10 border-cyan-500/30',
|
| 70 |
mcps: 'from-amber-500/20 to-orange-500/10 border-amber-500/30',
|
|
|
|
| 71 |
processors: 'from-pink-500/20 to-rose-500/10 border-pink-500/30',
|
| 72 |
};
|
| 73 |
return colors[category] || 'from-gray-500/20 to-gray-500/10 border-gray-500/30';
|
|
|
|
| 164 |
Plugins
|
| 165 |
</h1>
|
| 166 |
<p className="text-gray-400 mt-1">
|
| 167 |
+
Extend ScrapeRL with APIs, MCP tools, and processors
|
| 168 |
</p>
|
| 169 |
</div>
|
| 170 |
|
|
|
|
| 223 |
>
|
| 224 |
All
|
| 225 |
</button>
|
| 226 |
+
{(pluginsData?.categories || ['apis', 'mcps', 'processors']).map((cat) => (
|
| 227 |
<button
|
| 228 |
key={cat}
|
| 229 |
onClick={() => setSelectedCategory(cat)}
|