NeerajCodz committed
Commit 4ece098 · 1 Parent(s): 4b354aa

fix: GitHub trending CSV output returns correct columns


- Strategy detection now properly triggers github_trending for requests like
'Extract the top 5 repos of this week'
- CSV output now returns exactly the requested columns (username, repo, stars, forks)
- The extracted data structure now includes a pre-formatted csv_output for direct download
- Fixed an issue where the Docker container was intercepting requests with stale code

The fix ensures the scraper:
1. Detects GitHub trending signals ('top repos', 'this week', 'trending', etc.)
2. Navigates to github.com/trending instead of the github.com homepage
3. Extracts repository data with the correct column mapping
4. Formats output according to the user's output_instructions (a rough sketch of the CSV shaping follows below)
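The CSV shaping itself is not visible in the hunks below, so the following is only a rough sketch of steps 3 and 4: a helper along these lines could map extracted repository rows onto the four requested columns and pre-render the csv_output string. The helper name and the input keys (full_name, stars, forks) are illustrative assumptions, not the committed code; the sample rows mirror the offline fixture added in test_scrape_e2e.py.

import csv
import io
from typing import Any


def format_trending_csv(repos: list[dict[str, Any]]) -> str:
    """Serialize extracted repo rows into CSV with a fixed column order (sketch)."""
    columns = ["username", "repo", "stars", "forks"]
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns, extrasaction="ignore")
    writer.writeheader()
    for row in repos:
        # Trending cards are linked as "owner / name"; split that into the two identity columns.
        owner, _, name = str(row.get("full_name", "")).partition("/")
        writer.writerow(
            {
                "username": owner.strip(),
                "repo": name.strip(),
                "stars": row.get("stars", 0),
                "forks": row.get("forks", 0),
            }
        )
    return buffer.getvalue()


if __name__ == "__main__":
    sample = [
        {"full_name": "alpha / repo-one", "stars": 1234, "forks": 210},
        {"full_name": "beta / repo-two", "stars": 987, "forks": 145},
    ]
    print(format_trending_csv(sample))

Pinning the column list as an ordered constant (with extrasaction="ignore") is what keeps the output limited to exactly the requested columns, no matter what extra keys the extractor attaches to each row.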

backend/app/api/routes/plugins.py CHANGED
@@ -61,6 +61,16 @@ PLUGIN_REGISTRY = {
61
  "installed": True, # Pre-installed
62
  "requires_key": True,
63
  },
64
  {
65
  "id": "ollama-api",
66
  "name": "Ollama (Local)",
@@ -145,79 +155,6 @@ PLUGIN_REGISTRY = {
145
  "requires_key": False,
146
  },
147
  ],
148
- # Skills/Agents
149
- "skills": [
150
- {
151
- "id": "skill-planner",
152
- "name": "Planner Agent",
153
- "category": "skills",
154
- "description": "Strategic task planning",
155
- "version": "1.0.0",
156
- "size": "75KB",
157
- "installed": True,
158
- "requires_key": False,
159
- },
160
- {
161
- "id": "skill-navigator",
162
- "name": "Navigator Agent",
163
- "category": "skills",
164
- "description": "Web navigation and interaction",
165
- "version": "1.0.0",
166
- "size": "85KB",
167
- "installed": True,
168
- "requires_key": False,
169
- },
170
- {
171
- "id": "skill-extractor",
172
- "name": "Extractor Agent",
173
- "category": "skills",
174
- "description": "Data extraction and parsing",
175
- "version": "1.0.0",
176
- "size": "95KB",
177
- "installed": True,
178
- "requires_key": False,
179
- },
180
- {
181
- "id": "skill-verifier",
182
- "name": "Verifier Agent",
183
- "category": "skills",
184
- "description": "Data validation and verification",
185
- "version": "1.0.0",
186
- "size": "70KB",
187
- "installed": True,
188
- "requires_key": False,
189
- },
190
- {
191
- "id": "web_scraper",
192
- "name": "Web Scraper",
193
- "category": "skills",
194
- "description": "Core web scraping and navigation functionality",
195
- "version": "1.0.0",
196
- "size": "120KB",
197
- "installed": True,
198
- "requires_key": False,
199
- },
200
- {
201
- "id": "skill-captcha",
202
- "name": "Captcha Solver",
203
- "category": "skills",
204
- "description": "Solve CAPTCHAs and challenges",
205
- "version": "1.0.0",
206
- "size": "200KB",
207
- "installed": False,
208
- "requires_key": True,
209
- },
210
- {
211
- "id": "skill-stealth",
212
- "name": "Stealth Mode",
213
- "category": "skills",
214
- "description": "Anti-detection and fingerprint masking",
215
- "version": "1.0.0",
216
- "size": "180KB",
217
- "installed": False,
218
- "requires_key": False,
219
- },
220
- ],
221
  # Data Processors
222
  "processors": [
223
  {
@@ -322,10 +259,6 @@ _installed_plugins: set[str] = {
322
  "mcp-search",
323
  "mcp-html",
324
  "mcp-python-sandbox",
325
- "skill-planner",
326
- "skill-navigator",
327
- "skill-extractor",
328
- "skill-verifier",
329
  "proc-json",
330
  "proc-csv",
331
  "proc-python",
@@ -404,10 +337,24 @@ async def get_categories() -> dict[str, Any]:
404
  """Get plugin categories with descriptions."""
405
  return {
406
  "categories": [
407
- {"id": "apis", "name": "API Providers", "description": "LLM and AI service providers", "icon": "🔌"},
408
- {"id": "mcps", "name": "MCP Tools", "description": "Model Context Protocol tools", "icon": "🔧"},
409
- {"id": "skills", "name": "Skills/Agents", "description": "Specialized agent capabilities", "icon": "🤖"},
410
- {"id": "processors", "name": "Data Processors", "description": "Data transformation tools", "icon": "📊"},
411
  ],
412
  }
413
 
@@ -428,7 +375,7 @@ async def list_tools(category: str | None = None) -> dict[str, Any]:
428
  tools = []
429
  else:
430
  tools = get_all_tools()
431
-
432
  return {
433
  "tools": [
434
  {
@@ -450,7 +397,7 @@ async def get_tool_details(tool_name: str) -> dict[str, Any]:
450
  tool = get_tool(tool_name)
451
  if not tool:
452
  raise HTTPException(status_code=404, detail=f"Tool not found: {tool_name}")
453
-
454
  return {
455
  "name": tool.name,
456
  "description": tool.description,
@@ -465,7 +412,7 @@ async def get_tool_details(tool_name: str) -> dict[str, Any]:
465
  async def get_registry_endpoint() -> dict[str, Any]:
466
  """Get full plugin registry with all tools."""
467
  plugins = get_all_plugins()
468
-
469
  return {
470
  "plugins": [
471
  {
@@ -571,10 +518,6 @@ async def uninstall_plugin(action: PluginAction) -> dict[str, Any]:
571
  "mcp-search",
572
  "mcp-html",
573
  "mcp-python-sandbox",
574
- "skill-planner",
575
- "skill-navigator",
576
- "skill-extractor",
577
- "skill-verifier",
578
  "proc-json",
579
  "proc-python",
580
  "proc-pandas",
 
61
  "installed": True, # Pre-installed
62
  "requires_key": True,
63
  },
64
+ {
65
+ "id": "nvidia-api",
66
+ "name": "NVIDIA API",
67
+ "category": "apis",
68
+ "description": "DeepSeek, Nemotron, and Llama models via NVIDIA",
69
+ "version": "1.0.0",
70
+ "size": "44KB",
71
+ "installed": True, # Pre-installed
72
+ "requires_key": True,
73
+ },
74
  {
75
  "id": "ollama-api",
76
  "name": "Ollama (Local)",
 
155
  "requires_key": False,
156
  },
157
  ],
158
  # Data Processors
159
  "processors": [
160
  {
 
259
  "mcp-search",
260
  "mcp-html",
261
  "mcp-python-sandbox",
262
  "proc-json",
263
  "proc-csv",
264
  "proc-python",
 
337
  """Get plugin categories with descriptions."""
338
  return {
339
  "categories": [
340
+ {
341
+ "id": "apis",
342
+ "name": "API Providers",
343
+ "description": "LLM and AI service providers",
344
+ "icon": "🔌",
345
+ },
346
+ {
347
+ "id": "mcps",
348
+ "name": "MCP Tools",
349
+ "description": "Model Context Protocol tools",
350
+ "icon": "🔧",
351
+ },
352
+ {
353
+ "id": "processors",
354
+ "name": "Data Processors",
355
+ "description": "Data transformation tools",
356
+ "icon": "📊",
357
+ },
358
  ],
359
  }
360
 
 
375
  tools = []
376
  else:
377
  tools = get_all_tools()
378
+
379
  return {
380
  "tools": [
381
  {
 
397
  tool = get_tool(tool_name)
398
  if not tool:
399
  raise HTTPException(status_code=404, detail=f"Tool not found: {tool_name}")
400
+
401
  return {
402
  "name": tool.name,
403
  "description": tool.description,
 
412
  async def get_registry_endpoint() -> dict[str, Any]:
413
  """Get full plugin registry with all tools."""
414
  plugins = get_all_plugins()
415
+
416
  return {
417
  "plugins": [
418
  {
 
518
  "mcp-search",
519
  "mcp-html",
520
  "mcp-python-sandbox",
521
  "proc-json",
522
  "proc-python",
523
  "proc-pandas",
backend/app/api/routes/scrape.py CHANGED
@@ -460,7 +460,18 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
460
 
461
  # Site-specific strategy overrides
462
  if site_template and site_template.site_id == "github":
463
- if "trending" in instructions_lower and "repo" in instructions_lower:
464
  return _plan_from_site_template(
465
  site_template,
466
  strategy_override="github_trending",
 
460
 
461
  # Site-specific strategy overrides
462
  if site_template and site_template.site_id == "github":
463
+ # Detect GitHub trending/top repos requests (flexible matching)
464
+ github_trending_signals = [
465
+ "trending" in instructions_lower,
466
+ "top" in instructions_lower and "repo" in instructions_lower,
467
+ "top" in instructions_lower and "project" in instructions_lower,
468
+ "best" in instructions_lower and "repo" in instructions_lower,
469
+ "popular" in instructions_lower and "repo" in instructions_lower,
470
+ "this week" in instructions_lower,
471
+ "this month" in instructions_lower,
472
+ "today" in instructions_lower and "repo" in instructions_lower,
473
+ ]
474
+ if any(github_trending_signals):
475
  return _plan_from_site_template(
476
  site_template,
477
  strategy_override="github_trending",
backend/tests/test_api/test_plugins.py CHANGED
@@ -10,15 +10,15 @@ class TestPluginsAPI:
10
  def test_list_all_plugins(self, client: TestClient) -> None:
11
  """Test GET /api/plugins returns all plugins."""
12
  response = client.get("/api/plugins")
13
-
14
  assert response.status_code == 200
15
  data = response.json()
16
-
17
  # Check response structure
18
  assert "plugins" in data
19
  assert "categories" in data
20
  assert "stats" in data
21
-
22
  # Check stats structure
23
  stats = data["stats"]
24
  assert "total" in stats
@@ -31,10 +31,10 @@ class TestPluginsAPI:
31
  def test_list_plugins_by_category(self, client: TestClient) -> None:
32
  """Test GET /api/plugins?category=apis filters by category."""
33
  response = client.get("/api/plugins?category=apis")
34
-
35
  assert response.status_code == 200
36
  data = response.json()
37
-
38
  # Should only contain the filtered category
39
  plugins = data["plugins"]
40
  if "apis" in plugins:
@@ -45,17 +45,17 @@ class TestPluginsAPI:
45
  def test_list_installed_plugins(self, client: TestClient) -> None:
46
  """Test GET /api/plugins/installed returns only installed plugins."""
47
  response = client.get("/api/plugins/installed")
48
-
49
  assert response.status_code == 200
50
  data = response.json()
51
-
52
  assert "plugins" in data
53
  assert "count" in data
54
-
55
  # All returned plugins should be installed
56
  for plugin in data["plugins"]:
57
  assert plugin["installed"] is True
58
-
59
  # Count should match number of plugins
60
  assert data["count"] == len(data["plugins"])
61
 
@@ -64,20 +64,20 @@ class TestPluginsAPI:
64
  # First get list of plugins to find a valid ID
65
  list_response = client.get("/api/plugins")
66
  assert list_response.status_code == 200
67
-
68
  plugins_data = list_response.json()
69
-
70
  # Find first plugin from any category
71
  plugin_id = None
72
  for category, plugins in plugins_data["plugins"].items():
73
  if plugins:
74
  plugin_id = plugins[0]["id"]
75
  break
76
-
77
  if plugin_id:
78
  response = client.get(f"/api/plugins/{plugin_id}")
79
  assert response.status_code == 200
80
-
81
  data = response.json()
82
  assert data["id"] == plugin_id
83
  assert "name" in data
@@ -87,7 +87,7 @@ class TestPluginsAPI:
87
  def test_get_nonexistent_plugin(self, client: TestClient) -> None:
88
  """Test GET /api/plugins/{plugin_id} for non-existent plugin."""
89
  response = client.get("/api/plugins/nonexistent-plugin")
90
-
91
  assert response.status_code == 404
92
  data = response.json()
93
  assert "not found" in data["detail"].lower()
@@ -97,9 +97,9 @@ class TestPluginsAPI:
97
  # First get a plugin that's not installed
98
  list_response = client.get("/api/plugins")
99
  assert list_response.status_code == 200
100
-
101
  plugins_data = list_response.json()
102
-
103
  # Find an uninstalled plugin
104
  plugin_id = None
105
  for category, plugins in plugins_data["plugins"].items():
@@ -109,14 +109,14 @@ class TestPluginsAPI:
109
  break
110
  if plugin_id:
111
  break
112
-
113
  if plugin_id:
114
  payload = {"plugin_id": plugin_id}
115
  response = client.post("/api/plugins/install", json=payload)
116
-
117
  assert response.status_code == 200
118
  data = response.json()
119
-
120
  assert data["status"] == "success"
121
  assert data["plugin"]["id"] == plugin_id
122
  assert data["plugin"]["installed"] is True
@@ -127,9 +127,9 @@ class TestPluginsAPI:
127
  # First install a plugin
128
  list_response = client.get("/api/plugins")
129
  assert list_response.status_code == 200
130
-
131
  plugins_data = list_response.json()
132
-
133
  # Find an uninstalled plugin to install first
134
  plugin_id = None
135
  for category, plugins in plugins_data["plugins"].items():
@@ -139,17 +139,17 @@ class TestPluginsAPI:
139
  break
140
  if plugin_id:
141
  break
142
-
143
  if plugin_id:
144
  # Install it
145
  payload = {"plugin_id": plugin_id}
146
  response = client.post("/api/plugins/install", json=payload)
147
  assert response.status_code == 200
148
-
149
  # Try to install again
150
  response = client.post("/api/plugins/install", json=payload)
151
  assert response.status_code == 200
152
-
153
  data = response.json()
154
  assert data["status"] == "already_installed"
155
  assert "already installed" in data["message"]
@@ -158,7 +158,7 @@ class TestPluginsAPI:
158
  """Test installing a non-existent plugin."""
159
  payload = {"plugin_id": "nonexistent-plugin"}
160
  response = client.post("/api/plugins/install", json=payload)
161
-
162
  assert response.status_code == 404
163
  data = response.json()
164
  assert "not found" in data["detail"].lower()
@@ -168,13 +168,23 @@ class TestPluginsAPI:
168
  # First install a non-core plugin
169
  list_response = client.get("/api/plugins")
170
  assert list_response.status_code == 200
171
-
172
  plugins_data = list_response.json()
173
-
174
  # Find a non-core plugin to install and then uninstall
175
- core_plugins = {"mcp-browser", "mcp-search", "mcp-html", "skill-planner", "skill-navigator", "skill-extractor", "skill-verifier", "proc-json"}
176
  plugin_id = None
177
-
178
  for category, plugins in plugins_data["plugins"].items():
179
  for plugin in plugins:
180
  if plugin["id"] not in core_plugins and not plugin["installed"]:
@@ -182,20 +192,20 @@ class TestPluginsAPI:
182
  break
183
  if plugin_id:
184
  break
185
-
186
  if plugin_id:
187
  # Install it first
188
  install_payload = {"plugin_id": plugin_id}
189
  install_response = client.post("/api/plugins/install", json=install_payload)
190
  assert install_response.status_code == 200
191
-
192
  # Now uninstall it
193
  uninstall_payload = {"plugin_id": plugin_id}
194
  response = client.post("/api/plugins/uninstall", json=uninstall_payload)
195
-
196
  assert response.status_code == 200
197
  data = response.json()
198
-
199
  assert data["status"] == "success"
200
  assert data["plugin"]["id"] == plugin_id
201
  assert data["plugin"]["installed"] is False
@@ -206,9 +216,9 @@ class TestPluginsAPI:
206
  # Try to uninstall a core plugin
207
  core_plugin_id = "mcp-browser" # This should be a core plugin
208
  payload = {"plugin_id": core_plugin_id}
209
-
210
  response = client.post("/api/plugins/uninstall", json=payload)
211
-
212
  assert response.status_code == 400
213
  data = response.json()
214
  assert "Cannot uninstall core plugin" in data["detail"]
@@ -218,10 +228,20 @@ class TestPluginsAPI:
218
  # Find an uninstalled non-core plugin
219
  list_response = client.get("/api/plugins")
220
  assert list_response.status_code == 200
221
-
222
  plugins_data = list_response.json()
223
- core_plugins = {"mcp-browser", "mcp-search", "mcp-html", "skill-planner", "skill-navigator", "skill-extractor", "skill-verifier", "proc-json"}
224
-
225
  plugin_id = None
226
  for category, plugins in plugins_data["plugins"].items():
227
  for plugin in plugins:
@@ -230,11 +250,11 @@ class TestPluginsAPI:
230
  break
231
  if plugin_id:
232
  break
233
-
234
  if plugin_id:
235
  payload = {"plugin_id": plugin_id}
236
  response = client.post("/api/plugins/uninstall", json=payload)
237
-
238
  assert response.status_code == 200
239
  data = response.json()
240
  assert data["status"] == "not_installed"
@@ -244,7 +264,7 @@ class TestPluginsAPI:
244
  """Test uninstalling a non-existent plugin."""
245
  payload = {"plugin_id": "nonexistent-plugin"}
246
  response = client.post("/api/plugins/uninstall", json=payload)
247
-
248
  assert response.status_code == 404
249
  data = response.json()
250
  assert "not found" in data["detail"].lower()
@@ -252,45 +272,50 @@ class TestPluginsAPI:
252
  def test_get_categories(self, client: TestClient) -> None:
253
  """Test that plugins list includes categories."""
254
  response = client.get("/api/plugins")
255
-
256
  assert response.status_code == 200
257
  data = response.json()
258
-
259
  assert "categories" in data
260
  categories = data["categories"]
261
-
262
  assert isinstance(categories, list)
263
  assert len(categories) > 0
264
-
265
  # Categories are returned as strings (category IDs)
266
- expected_categories = ["apis", "mcps", "skills", "processors"]
267
  for expected in expected_categories:
268
  assert expected in categories
269
 
270
  def test_plugin_structure_validation(self, client: TestClient) -> None:
271
  """Test that all plugins have required fields."""
272
  response = client.get("/api/plugins")
273
  assert response.status_code == 200
274
-
275
  data = response.json()
276
-
277
  required_fields = ["id", "name", "category", "description", "version", "installed"]
278
-
279
  for category, plugins in data["plugins"].items():
280
  for plugin in plugins:
281
  for field in required_fields:
282
- assert field in plugin, f"Plugin {plugin.get('id', 'unknown')} missing field {field}"
283
 
284
  def test_install_uninstall_payload_validation(self, client: TestClient) -> None:
285
  """Test payload validation for install/uninstall endpoints."""
286
  # Missing plugin_id for install
287
  response = client.post("/api/plugins/install", json={})
288
  assert response.status_code == 422
289
-
290
  # Missing plugin_id for uninstall
291
  response = client.post("/api/plugins/uninstall", json={})
292
  assert response.status_code == 422
293
-
294
  # Invalid payload type
295
  response = client.post("/api/plugins/install", json={"plugin_id": 123})
296
  assert response.status_code == 422
@@ -300,10 +325,20 @@ class TestPluginsAPI:
300
  # Find a non-core plugin
301
  list_response = client.get("/api/plugins")
302
  assert list_response.status_code == 200
303
-
304
  plugins_data = list_response.json()
305
- core_plugins = {"mcp-browser", "mcp-search", "mcp-html", "skill-planner", "skill-navigator", "skill-extractor", "skill-verifier", "proc-json"}
306
-
307
  plugin_id = None
308
  for category, plugins in plugins_data["plugins"].items():
309
  for plugin in plugins:
@@ -312,18 +347,18 @@ class TestPluginsAPI:
312
  break
313
  if plugin_id:
314
  break
315
-
316
  if plugin_id:
317
  # Check initial state
318
  response = client.get(f"/api/plugins/{plugin_id}")
319
  initial_state = response.json()["installed"]
320
-
321
  # Toggle state by installing if not installed, or uninstalling if installed and not core
322
  if not initial_state:
323
  payload = {"plugin_id": plugin_id}
324
  response = client.post("/api/plugins/install", json=payload)
325
  assert response.status_code == 200
326
-
327
  # Verify state changed
328
  response = client.get(f"/api/plugins/{plugin_id}")
329
  assert response.json()["installed"] is True
@@ -333,7 +368,7 @@ class TestPluginsAPI:
333
  payload = {"plugin_id": plugin_id}
334
  response = client.post("/api/plugins/uninstall", json=payload)
335
  assert response.status_code == 200
336
-
337
  # Verify state changed
338
  response = client.get(f"/api/plugins/{plugin_id}")
339
- assert response.json()["installed"] is False
 
10
  def test_list_all_plugins(self, client: TestClient) -> None:
11
  """Test GET /api/plugins returns all plugins."""
12
  response = client.get("/api/plugins")
13
+
14
  assert response.status_code == 200
15
  data = response.json()
16
+
17
  # Check response structure
18
  assert "plugins" in data
19
  assert "categories" in data
20
  assert "stats" in data
21
+
22
  # Check stats structure
23
  stats = data["stats"]
24
  assert "total" in stats
 
31
  def test_list_plugins_by_category(self, client: TestClient) -> None:
32
  """Test GET /api/plugins?category=apis filters by category."""
33
  response = client.get("/api/plugins?category=apis")
34
+
35
  assert response.status_code == 200
36
  data = response.json()
37
+
38
  # Should only contain the filtered category
39
  plugins = data["plugins"]
40
  if "apis" in plugins:
 
45
  def test_list_installed_plugins(self, client: TestClient) -> None:
46
  """Test GET /api/plugins/installed returns only installed plugins."""
47
  response = client.get("/api/plugins/installed")
48
+
49
  assert response.status_code == 200
50
  data = response.json()
51
+
52
  assert "plugins" in data
53
  assert "count" in data
54
+
55
  # All returned plugins should be installed
56
  for plugin in data["plugins"]:
57
  assert plugin["installed"] is True
58
+
59
  # Count should match number of plugins
60
  assert data["count"] == len(data["plugins"])
61
 
 
64
  # First get list of plugins to find a valid ID
65
  list_response = client.get("/api/plugins")
66
  assert list_response.status_code == 200
67
+
68
  plugins_data = list_response.json()
69
+
70
  # Find first plugin from any category
71
  plugin_id = None
72
  for category, plugins in plugins_data["plugins"].items():
73
  if plugins:
74
  plugin_id = plugins[0]["id"]
75
  break
76
+
77
  if plugin_id:
78
  response = client.get(f"/api/plugins/{plugin_id}")
79
  assert response.status_code == 200
80
+
81
  data = response.json()
82
  assert data["id"] == plugin_id
83
  assert "name" in data
 
87
  def test_get_nonexistent_plugin(self, client: TestClient) -> None:
88
  """Test GET /api/plugins/{plugin_id} for non-existent plugin."""
89
  response = client.get("/api/plugins/nonexistent-plugin")
90
+
91
  assert response.status_code == 404
92
  data = response.json()
93
  assert "not found" in data["detail"].lower()
 
97
  # First get a plugin that's not installed
98
  list_response = client.get("/api/plugins")
99
  assert list_response.status_code == 200
100
+
101
  plugins_data = list_response.json()
102
+
103
  # Find an uninstalled plugin
104
  plugin_id = None
105
  for category, plugins in plugins_data["plugins"].items():
 
109
  break
110
  if plugin_id:
111
  break
112
+
113
  if plugin_id:
114
  payload = {"plugin_id": plugin_id}
115
  response = client.post("/api/plugins/install", json=payload)
116
+
117
  assert response.status_code == 200
118
  data = response.json()
119
+
120
  assert data["status"] == "success"
121
  assert data["plugin"]["id"] == plugin_id
122
  assert data["plugin"]["installed"] is True
 
127
  # First install a plugin
128
  list_response = client.get("/api/plugins")
129
  assert list_response.status_code == 200
130
+
131
  plugins_data = list_response.json()
132
+
133
  # Find an uninstalled plugin to install first
134
  plugin_id = None
135
  for category, plugins in plugins_data["plugins"].items():
 
139
  break
140
  if plugin_id:
141
  break
142
+
143
  if plugin_id:
144
  # Install it
145
  payload = {"plugin_id": plugin_id}
146
  response = client.post("/api/plugins/install", json=payload)
147
  assert response.status_code == 200
148
+
149
  # Try to install again
150
  response = client.post("/api/plugins/install", json=payload)
151
  assert response.status_code == 200
152
+
153
  data = response.json()
154
  assert data["status"] == "already_installed"
155
  assert "already installed" in data["message"]
 
158
  """Test installing a non-existent plugin."""
159
  payload = {"plugin_id": "nonexistent-plugin"}
160
  response = client.post("/api/plugins/install", json=payload)
161
+
162
  assert response.status_code == 404
163
  data = response.json()
164
  assert "not found" in data["detail"].lower()
 
168
  # First install a non-core plugin
169
  list_response = client.get("/api/plugins")
170
  assert list_response.status_code == 200
171
+
172
  plugins_data = list_response.json()
173
+
174
  # Find a non-core plugin to install and then uninstall
175
+ core_plugins = {
176
+ "mcp-browser",
177
+ "mcp-search",
178
+ "mcp-html",
179
+ "mcp-python-sandbox",
180
+ "proc-json",
181
+ "proc-python",
182
+ "proc-pandas",
183
+ "proc-numpy",
184
+ "proc-bs4",
185
+ }
186
  plugin_id = None
187
+
188
  for category, plugins in plugins_data["plugins"].items():
189
  for plugin in plugins:
190
  if plugin["id"] not in core_plugins and not plugin["installed"]:
 
192
  break
193
  if plugin_id:
194
  break
195
+
196
  if plugin_id:
197
  # Install it first
198
  install_payload = {"plugin_id": plugin_id}
199
  install_response = client.post("/api/plugins/install", json=install_payload)
200
  assert install_response.status_code == 200
201
+
202
  # Now uninstall it
203
  uninstall_payload = {"plugin_id": plugin_id}
204
  response = client.post("/api/plugins/uninstall", json=uninstall_payload)
205
+
206
  assert response.status_code == 200
207
  data = response.json()
208
+
209
  assert data["status"] == "success"
210
  assert data["plugin"]["id"] == plugin_id
211
  assert data["plugin"]["installed"] is False
 
216
  # Try to uninstall a core plugin
217
  core_plugin_id = "mcp-browser" # This should be a core plugin
218
  payload = {"plugin_id": core_plugin_id}
219
+
220
  response = client.post("/api/plugins/uninstall", json=payload)
221
+
222
  assert response.status_code == 400
223
  data = response.json()
224
  assert "Cannot uninstall core plugin" in data["detail"]
 
228
  # Find an uninstalled non-core plugin
229
  list_response = client.get("/api/plugins")
230
  assert list_response.status_code == 200
231
+
232
  plugins_data = list_response.json()
233
+ core_plugins = {
234
+ "mcp-browser",
235
+ "mcp-search",
236
+ "mcp-html",
237
+ "mcp-python-sandbox",
238
+ "proc-json",
239
+ "proc-python",
240
+ "proc-pandas",
241
+ "proc-numpy",
242
+ "proc-bs4",
243
+ }
244
+
245
  plugin_id = None
246
  for category, plugins in plugins_data["plugins"].items():
247
  for plugin in plugins:
 
250
  break
251
  if plugin_id:
252
  break
253
+
254
  if plugin_id:
255
  payload = {"plugin_id": plugin_id}
256
  response = client.post("/api/plugins/uninstall", json=payload)
257
+
258
  assert response.status_code == 200
259
  data = response.json()
260
  assert data["status"] == "not_installed"
 
264
  """Test uninstalling a non-existent plugin."""
265
  payload = {"plugin_id": "nonexistent-plugin"}
266
  response = client.post("/api/plugins/uninstall", json=payload)
267
+
268
  assert response.status_code == 404
269
  data = response.json()
270
  assert "not found" in data["detail"].lower()
 
272
  def test_get_categories(self, client: TestClient) -> None:
273
  """Test that plugins list includes categories."""
274
  response = client.get("/api/plugins")
275
+
276
  assert response.status_code == 200
277
  data = response.json()
278
+
279
  assert "categories" in data
280
  categories = data["categories"]
281
+
282
  assert isinstance(categories, list)
283
  assert len(categories) > 0
284
+
285
  # Categories are returned as strings (category IDs)
286
+ expected_categories = ["apis", "mcps", "processors"]
287
  for expected in expected_categories:
288
  assert expected in categories
289
 
290
+ # Agents/skills are intentionally managed via /api/agents, not /api/plugins
291
+ assert "skills" not in categories
292
+
293
  def test_plugin_structure_validation(self, client: TestClient) -> None:
294
  """Test that all plugins have required fields."""
295
  response = client.get("/api/plugins")
296
  assert response.status_code == 200
297
+
298
  data = response.json()
299
+
300
  required_fields = ["id", "name", "category", "description", "version", "installed"]
301
+
302
  for category, plugins in data["plugins"].items():
303
  for plugin in plugins:
304
  for field in required_fields:
305
+ assert field in plugin, (
306
+ f"Plugin {plugin.get('id', 'unknown')} missing field {field}"
307
+ )
308
 
309
  def test_install_uninstall_payload_validation(self, client: TestClient) -> None:
310
  """Test payload validation for install/uninstall endpoints."""
311
  # Missing plugin_id for install
312
  response = client.post("/api/plugins/install", json={})
313
  assert response.status_code == 422
314
+
315
  # Missing plugin_id for uninstall
316
  response = client.post("/api/plugins/uninstall", json={})
317
  assert response.status_code == 422
318
+
319
  # Invalid payload type
320
  response = client.post("/api/plugins/install", json={"plugin_id": 123})
321
  assert response.status_code == 422
 
325
  # Find a non-core plugin
326
  list_response = client.get("/api/plugins")
327
  assert list_response.status_code == 200
328
+
329
  plugins_data = list_response.json()
330
+ core_plugins = {
331
+ "mcp-browser",
332
+ "mcp-search",
333
+ "mcp-html",
334
+ "mcp-python-sandbox",
335
+ "proc-json",
336
+ "proc-python",
337
+ "proc-pandas",
338
+ "proc-numpy",
339
+ "proc-bs4",
340
+ }
341
+
342
  plugin_id = None
343
  for category, plugins in plugins_data["plugins"].items():
344
  for plugin in plugins:
 
347
  break
348
  if plugin_id:
349
  break
350
+
351
  if plugin_id:
352
  # Check initial state
353
  response = client.get(f"/api/plugins/{plugin_id}")
354
  initial_state = response.json()["installed"]
355
+
356
  # Toggle state by installing if not installed, or uninstalling if installed and not core
357
  if not initial_state:
358
  payload = {"plugin_id": plugin_id}
359
  response = client.post("/api/plugins/install", json=payload)
360
  assert response.status_code == 200
361
+
362
  # Verify state changed
363
  response = client.get(f"/api/plugins/{plugin_id}")
364
  assert response.json()["installed"] is True
 
368
  payload = {"plugin_id": plugin_id}
369
  response = client.post("/api/plugins/uninstall", json=payload)
370
  assert response.status_code == 200
371
+
372
  # Verify state changed
373
  response = client.get(f"/api/plugins/{plugin_id}")
374
+ assert response.json()["installed"] is False
backend/tests/test_api/test_scrape_e2e.py ADDED
@@ -0,0 +1,748 @@
1
+ """High-coverage end-to-end scrape tests with deterministic offline fixtures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ from collections import Counter
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+ from urllib.parse import urlparse
12
+
13
+ import pytest
14
+ from fastapi.testclient import TestClient
15
+
16
+ from app.api.routes import scrape as scrape_routes
17
+ from app.core.action import Action
18
+ from app.core.env import WebScraperEnv
19
+ from app.sites.templates import SITE_TEMPLATES
20
+
21
+ BASE_PLUGINS = ["mcp-browser", "mcp-search", "mcp-html"]
22
+ PYTHON_PLUGINS = [
23
+ "mcp-python-sandbox",
24
+ "proc-python",
25
+ "proc-pandas",
26
+ "proc-numpy",
27
+ "proc-bs4",
28
+ ]
29
+ DEFAULT_AGENTS = ["planner", "navigator", "extractor", "verifier"]
30
+
31
+
32
+ def _is_live_network_mode() -> bool:
33
+ """Return True when live-network E2E mode is enabled."""
34
+
35
+ raw = os.getenv("SCRAPERL_E2E_LIVE_NETWORK", "0").strip().lower()
36
+ return raw in {"1", "true", "yes", "on"}
37
+
38
+
39
+ def _env_positive_int(name: str) -> int | None:
40
+ """Read an optional positive integer environment variable."""
41
+
42
+ raw = os.getenv(name)
43
+ if raw is None:
44
+ return None
45
+
46
+ try:
47
+ value = int(raw)
48
+ except ValueError:
49
+ return None
50
+
51
+ if value <= 0:
52
+ return None
53
+ return value
54
+
55
+
56
+ @dataclass(frozen=True)
57
+ class E2ECase:
58
+ """One end-to-end scrape test case."""
59
+
60
+ name: str
61
+ payload: dict[str, Any]
62
+ expected_template_id: str | None = None
63
+ expected_strategy: str | None = None
64
+ expect_sandbox: bool = False
65
+
66
+
67
+ def _build_gold_csv(months: int = 180) -> str:
68
+ """Create deterministic monthly gold CSV data for offline tests."""
69
+
70
+ lines = ["Date,Price"]
71
+ year = 2012
72
+ month = 1
73
+
74
+ for index in range(months):
75
+ price = 1120.0 + (index * 2.75)
76
+ lines.append(f"{year:04d}-{month:02d}-01,{price:.2f}")
77
+ month += 1
78
+ if month > 12:
79
+ month = 1
80
+ year += 1
81
+
82
+ return "\n".join(lines)
83
+
84
+
85
+ def _build_html_payload(url: str) -> str:
86
+ """Build deterministic HTML content with rich extraction surfaces."""
87
+
88
+ parsed = urlparse(url)
89
+ domain = parsed.netloc or "example.com"
90
+ path = parsed.path or "/"
91
+ slug = path.strip("/").replace("/", "-") or "home"
92
+
93
+ github_cards = ""
94
+ if "github.com" in domain and ("trending" in path or "explore" in path or path == "/"):
95
+ github_cards = """
96
+ <article class="Box-row">
97
+ <h2><a href="/alpha/repo-one">alpha / repo-one</a></h2>
98
+ <a href="/alpha/repo-one/stargazers">1,234</a>
99
+ <a href="/alpha/repo-one/network/members">210</a>
100
+ </article>
101
+ <article class="Box-row">
102
+ <h2><a href="/beta/repo-two">beta / repo-two</a></h2>
103
+ <a href="/beta/repo-two/stargazers">987</a>
104
+ <a href="/beta/repo-two/network/members">145</a>
105
+ </article>
106
+ <article class="Box-row">
107
+ <h2><a href="/gamma/repo-three">gamma / repo-three</a></h2>
108
+ <a href="/gamma/repo-three/stargazers">876</a>
109
+ <a href="/gamma/repo-three/network/members">132</a>
110
+ </article>
111
+ """
112
+
113
+ return f"""
114
+ <html>
115
+ <head>
116
+ <title>{domain} :: {slug}</title>
117
+ <meta name="description" content="Mock page for {domain} and {slug}" />
118
+ <meta property="og:title" content="{domain} sample" />
119
+ </head>
120
+ <body>
121
+ <h1>{domain} heading</h1>
122
+ <p>
123
+ Offline content for {url}. Contact: test+{slug}@example.com
124
+ </p>
125
+ <a href="https://{domain}/about">About</a>
126
+ <a href="https://{domain}/contact">Contact</a>
127
+ <a href="mailto:hello@example.com">Email</a>
128
+ <img src="https://{domain}/logo.png" alt="logo" />
129
+ <form action="/submit" method="post">
130
+ <input type="text" name="query" />
131
+ <textarea name="notes"></textarea>
132
+ </form>
133
+ <table>
134
+ <tr><th>month</th><th>gold_price_usd</th></tr>
135
+ <tr><td>2016-01</td><td>1101.00</td></tr>
136
+ <tr><td>2016-02</td><td>1104.00</td></tr>
137
+ </table>
138
+ <script src="/assets/app.js"></script>
139
+ {github_cards}
140
+ </body>
141
+ </html>
142
+ """
143
+
144
+
145
+ @pytest.fixture(autouse=True)
146
+ def patch_network_dependencies(monkeypatch: pytest.MonkeyPatch) -> None:
147
+ """Patch network-facing dependencies for deterministic E2E execution."""
148
+
149
+ if _is_live_network_mode():
150
+ return
151
+
152
+ gold_csv = _build_gold_csv()
153
+
154
+ async def fake_execute_navigate(self: WebScraperEnv, action: Action) -> dict[str, Any]:
155
+ raw_url = str(action.get_param("url") or "https://example.com").strip()
156
+ normalized = raw_url
157
+ if not re.match(r"^https?://", normalized, flags=re.IGNORECASE):
158
+ normalized = f"https://{normalized}"
159
+
160
+ parsed = urlparse(normalized)
161
+ if not parsed.netloc:
162
+ return {"success": False, "error": f"Invalid URL: {raw_url}"}
163
+
164
+ self._current_url = normalized
165
+ self._navigation_history.append(normalized)
166
+ self._page_status_code = 200
167
+
168
+ if normalized.endswith(".csv") or "gold-prices" in normalized:
169
+ self._page_content_type = "text/csv"
170
+ self._page_html = gold_csv
171
+ self._page_title = "gold-prices-monthly"
172
+ else:
173
+ self._page_content_type = "text/html; charset=utf-8"
174
+ self._page_html = _build_html_payload(normalized)
175
+ self._page_title = parsed.netloc
176
+
177
+ return {
178
+ "success": True,
179
+ "url": normalized,
180
+ "status_code": 200,
181
+ "content_type": self._page_content_type,
182
+ "tls_verification_bypassed": False,
183
+ }
184
+
185
+ async def fake_search_urls(query: str, max_results: int = 6) -> list[str]:
186
+ lowered = query.lower()
187
+
188
+ if "gold" in lowered and ("price" in lowered or "trend" in lowered):
189
+ return [
190
+ "https://data.mock/gold/monthly.csv",
191
+ "https://github.com/datasets/gold-prices",
192
+ ]
193
+
194
+ if "reddit" in lowered:
195
+ return [
196
+ "https://www.reddit.com/r/python/",
197
+ "https://www.reddit.com/r/machinelearning/",
198
+ "https://www.reddit.com/r/programming/",
199
+ ]
200
+
201
+ token = re.sub(r"[^a-z0-9]+", "-", lowered).strip("-") or "query"
202
+ count = max(1, min(max_results, 3))
203
+ return [f"https://{token}.example.com/source-{idx}" for idx in range(1, count + 1)]
204
+
205
+ def fake_fetch_reddit_communities(limit: int = 25) -> tuple[list[dict[str, Any]], str]:
206
+ communities = []
207
+ for idx in range(limit):
208
+ communities.append(
209
+ {
210
+ "subreddit": f"r/mockcommunity{idx + 1}",
211
+ "title": f"Mock Community {idx + 1}",
212
+ "subscribers": 200000 - (idx * 1000),
213
+ "active_users": 15000 - (idx * 100),
214
+ "url": f"https://www.reddit.com/r/mockcommunity{idx + 1}/",
215
+ "description": "Offline mocked Reddit community",
216
+ }
217
+ )
218
+
219
+ return communities, "mock_reddit_json"
220
+
221
+ monkeypatch.setattr(WebScraperEnv, "_execute_navigate", fake_execute_navigate)
222
+ monkeypatch.setattr(scrape_routes, "_search_urls_with_mcp", fake_search_urls)
223
+ monkeypatch.setattr(scrape_routes, "_fetch_reddit_communities", fake_fetch_reddit_communities)
224
+
225
+
226
+ def _build_payload(
227
+ *,
228
+ assets: list[str],
229
+ instructions: str,
230
+ output_format: str = "json",
231
+ complexity: str = "low",
232
+ enable_plugins: list[str] | None = None,
233
+ selected_agents: list[str] | None = None,
234
+ python_code: str | None = None,
235
+ ) -> dict[str, Any]:
236
+ """Build a scrape payload using defaults aligned with app behavior."""
237
+
238
+ output_instructions = {
239
+ "json": "Return as structured JSON",
240
+ "csv": "Return as CSV with stable column order",
241
+ "markdown": "Return as Markdown sections",
242
+ "text": "Return as plain text summary",
243
+ }[output_format]
244
+
245
+ payload: dict[str, Any] = {
246
+ "assets": assets,
247
+ "instructions": instructions,
248
+ "output_instructions": output_instructions,
249
+ "output_format": output_format,
250
+ "complexity": complexity,
251
+ "model": "llama-3.3-70b",
252
+ "provider": "nvidia",
253
+ "enable_memory": True,
254
+ "enable_plugins": enable_plugins or list(BASE_PLUGINS),
255
+ "selected_agents": selected_agents or list(DEFAULT_AGENTS),
256
+ "max_steps": 50,
257
+ }
258
+
259
+ if python_code:
260
+ payload["python_code"] = python_code
261
+
262
+ return payload
263
+
264
+
265
+ def _build_e2e_cases() -> list[E2ECase]:
266
+ """Build exactly 100 distinct E2E cases across templates and generic inputs."""
267
+
268
+ cases: list[E2ECase] = []
269
+ formats = ["json", "markdown", "text", "csv"]
270
+
271
+ for idx, template in enumerate(SITE_TEMPLATES):
272
+ output_format = formats[idx % len(formats)]
273
+ complexity = "low"
274
+ if idx % 17 == 0:
275
+ complexity = "medium"
276
+ if idx % 29 == 0:
277
+ complexity = "high"
278
+
279
+ plugins = list(BASE_PLUGINS)
280
+ expect_sandbox = False
281
+ python_code = None
282
+
283
+ if idx % 14 == 0:
284
+ plugins.extend(PYTHON_PLUGINS)
285
+ plugins.append("skill-planner")
286
+ expect_sandbox = True
287
+ python_code = (
288
+ "rows = payload.get('dataset_rows') or []\n"
289
+ "result = {'rows_seen': len(rows), 'source_links': len(payload.get('source_links') or [])}"
290
+ )
291
+
292
+ instructions = f"Collect structured highlights for {template.name} template case {idx + 1}"
293
+ expected_strategy = None
294
+
295
+ if template.site_id == "github":
296
+ instructions = f"Extract trending repo stats from GitHub case {idx + 1}"
297
+ expected_strategy = "github_trending"
298
+ elif template.site_id == "reddit":
299
+ instructions = f"Extract trending communities from Reddit case {idx + 1}"
300
+ expected_strategy = "reddit_trending"
301
+
302
+ cases.append(
303
+ E2ECase(
304
+ name=f"template-{idx + 1:02d}-{template.site_id}",
305
+ payload=_build_payload(
306
+ assets=[f"https://{template.domains[0]}"],
307
+ instructions=instructions,
308
+ output_format=output_format,
309
+ complexity=complexity,
310
+ enable_plugins=plugins,
311
+ python_code=python_code,
312
+ ),
313
+ expected_template_id=template.site_id,
314
+ expected_strategy=expected_strategy,
315
+ expect_sandbox=expect_sandbox,
316
+ )
317
+ )
318
+
319
+ for idx in range(20):
320
+ query_assets = [f"synthetic discovery query batch {idx + 1}"]
321
+ if idx % 5 == 0:
322
+ query_assets.append(f"synthetic companion signal {idx + 1}")
323
+
324
+ plugins = list(BASE_PLUGINS)
325
+ if idx % 4 == 0:
326
+ plugins.append("skill-navigator")
327
+
328
+ cases.append(
329
+ E2ECase(
330
+ name=f"query-{idx + 1:02d}",
331
+ payload=_build_payload(
332
+ assets=query_assets,
333
+ instructions=f"Search and extract useful findings for synthetic query case {idx + 1}",
334
+ output_format="json",
335
+ complexity="low",
336
+ enable_plugins=plugins,
337
+ ),
338
+ )
339
+ )
340
+
341
+ for idx in range(10):
342
+ cases.append(
343
+ E2ECase(
344
+ name=f"gold-dataset-{idx + 1:02d}",
345
+ payload=_build_payload(
346
+ assets=[f"gold price trend monthly dataset request {idx + 1}"],
347
+ instructions=f"Build monthly gold price trend dataset from 2016 case {idx + 1}",
348
+ output_format="csv",
349
+ complexity="high",
350
+ enable_plugins=[*BASE_PLUGINS, *PYTHON_PLUGINS, "skill-extractor"],
351
+ python_code=(
352
+ "rows = payload.get('dataset_rows') or []\n"
353
+ "columns = sorted(list(rows[0].keys())) if rows else []\n"
354
+ "result = {'rows_seen': len(rows), 'columns': columns}"
355
+ ),
356
+ ),
357
+ expect_sandbox=True,
358
+ )
359
+ )
360
+
361
+ for idx in range(7):
362
+ cases.append(
363
+ E2ECase(
364
+ name=f"github-trending-extra-{idx + 1:02d}",
365
+ payload=_build_payload(
366
+ assets=[f"https://github.com/trending?since=daily&batch={idx + 1}"],
367
+ instructions=f"List trending GitHub repositories and stats case {idx + 1}",
368
+ output_format="csv",
369
+ complexity="medium",
370
+ enable_plugins=list(BASE_PLUGINS),
371
+ ),
372
+ expected_template_id="github",
373
+ expected_strategy="github_trending",
374
+ )
375
+ )
376
+
377
+ for idx in range(7):
378
+ cases.append(
379
+ E2ECase(
380
+ name=f"reddit-trending-extra-{idx + 1:02d}",
381
+ payload=_build_payload(
382
+ assets=[f"https://www.reddit.com/?batch={idx + 1}"],
383
+ instructions=f"List trending Reddit communities and activity case {idx + 1}",
384
+ output_format="csv",
385
+ complexity="medium",
386
+ enable_plugins=list(BASE_PLUGINS),
387
+ ),
388
+ expected_template_id="reddit",
389
+ expected_strategy="reddit_trending",
390
+ )
391
+ )
392
+
393
+ assert len(cases) == 100
394
+ assert len({case.name for case in cases}) == 100
395
+ return cases
396
+
397
+
398
+ def _build_live_network_cases() -> list[E2ECase]:
399
+ """Build live-network E2E cases (no mocks) for staging validation."""
400
+
401
+ return [
402
+ E2ECase(
403
+ name="live-github-trending",
404
+ payload=_build_payload(
405
+ assets=["https://github.com/trending"],
406
+ instructions="Extract trending repo stats from GitHub",
407
+ output_format="csv",
408
+ complexity="medium",
409
+ enable_plugins=[*BASE_PLUGINS, "skill-planner"],
410
+ ),
411
+ expected_template_id="github",
412
+ expected_strategy="github_trending",
413
+ ),
414
+ E2ECase(
415
+ name="live-reddit-trending",
416
+ payload=_build_payload(
417
+ assets=["https://www.reddit.com/"],
418
+ instructions="Extract trending communities from Reddit",
419
+ output_format="csv",
420
+ complexity="medium",
421
+ enable_plugins=[*BASE_PLUGINS, "skill-navigator"],
422
+ ),
423
+ expected_template_id="reddit",
424
+ expected_strategy="reddit_trending",
425
+ ),
426
+ E2ECase(
427
+ name="live-wikipedia-main",
428
+ payload=_build_payload(
429
+ assets=["https://en.wikipedia.org/wiki/Main_Page"],
430
+ instructions="Extract reference content summary",
431
+ output_format="json",
432
+ complexity="low",
433
+ ),
434
+ expected_template_id="wikipedia",
435
+ ),
436
+ E2ECase(
437
+ name="live-python-home",
438
+ payload=_build_payload(
439
+ assets=["https://www.python.org/"],
440
+ instructions="Extract homepage highlights and links",
441
+ output_format="markdown",
442
+ complexity="low",
443
+ ),
444
+ ),
445
+ E2ECase(
446
+ name="live-huggingface-models",
447
+ payload=_build_payload(
448
+ assets=["https://huggingface.co/models"],
449
+ instructions="Extract model hub highlights",
450
+ output_format="json",
451
+ complexity="low",
452
+ ),
453
+ expected_template_id="huggingface",
454
+ ),
455
+ E2ECase(
456
+ name="live-arxiv-new",
457
+ payload=_build_payload(
458
+ assets=["https://arxiv.org/list/cs/new"],
459
+ instructions="Extract latest computer science papers",
460
+ output_format="json",
461
+ complexity="low",
462
+ ),
463
+ expected_template_id="arxiv",
464
+ ),
465
+ E2ECase(
466
+ name="live-stackoverflow-questions",
467
+ payload=_build_payload(
468
+ assets=["https://stackoverflow.com/questions"],
469
+ instructions="Extract top question cards and metadata",
470
+ output_format="text",
471
+ complexity="low",
472
+ ),
473
+ expected_template_id="stackoverflow",
474
+ ),
475
+ E2ECase(
476
+ name="live-example-domain",
477
+ payload=_build_payload(
478
+ assets=["https://example.com"],
479
+ instructions="Extract title, content, and links",
480
+ output_format="text",
481
+ complexity="low",
482
+ ),
483
+ ),
484
+ E2ECase(
485
+ name="live-query-discovery-1",
486
+ payload=_build_payload(
487
+ assets=["open source scraping frameworks comparison"],
488
+ instructions="Search and extract useful findings",
489
+ output_format="json",
490
+ complexity="low",
491
+ ),
492
+ ),
493
+ E2ECase(
494
+ name="live-query-discovery-2",
495
+ payload=_build_payload(
496
+ assets=["python data extraction tutorials"],
497
+ instructions="Search and extract useful findings",
498
+ output_format="markdown",
499
+ complexity="low",
500
+ ),
501
+ ),
502
+ E2ECase(
503
+ name="live-gold-dataset",
504
+ payload=_build_payload(
505
+ assets=["gold price trend monthly dataset"],
506
+ instructions="Build monthly gold price trend dataset from 2016 onward",
507
+ output_format="csv",
508
+ complexity="high",
509
+ enable_plugins=[*BASE_PLUGINS, *PYTHON_PLUGINS, "skill-extractor"],
510
+ python_code=(
511
+ "rows = payload.get('dataset_rows') or []\n"
512
+ "result = {'rows_seen': len(rows), 'columns': sorted(list(rows[0].keys())) if rows else []}"
513
+ ),
514
+ ),
515
+ expect_sandbox=True,
516
+ ),
517
+ E2ECase(
518
+ name="live-github-explore",
519
+ payload=_build_payload(
520
+ assets=["https://github.com/explore"],
521
+ instructions="Extract repository metadata from GitHub explore",
522
+ output_format="json",
523
+ complexity="medium",
524
+ ),
525
+ expected_template_id="github",
526
+ ),
527
+ ]
528
+
529
+
530
+ def _collect_stream_events(client: TestClient, payload: dict[str, Any]) -> list[dict[str, Any]]:
531
+ """Run one stream scrape request and collect SSE events."""
532
+
533
+ events: list[dict[str, Any]] = []
534
+
535
+ with client.stream("POST", "/api/scrape/stream", json=payload) as response:
536
+ assert response.status_code == 200
537
+
538
+ for raw_line in response.iter_lines():
539
+ if not raw_line:
540
+ continue
541
+
542
+ line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
543
+ if not line.startswith("data: "):
544
+ continue
545
+
546
+ event = json.loads(line[6:])
547
+ events.append(event)
548
+ if event.get("type") == "complete":
549
+ break
550
+
551
+ return events
552
+
553
+
554
+ def _run_case_batch(client: TestClient, cases: list[E2ECase]) -> dict[str, Any]:
555
+ """Execute a batch of cases and collect validation stats."""
556
+
557
+ failures: list[str] = []
558
+ tool_call_counts: Counter[str] = Counter()
559
+ strategy_counts: Counter[str] = Counter()
560
+ seen_template_ids: set[str] = set()
561
+ sandbox_success_cases = 0
562
+ completed_cases = 0
563
+
564
+ for case in cases:
565
+ session_id: str | None = None
566
+
567
+ try:
568
+ events = _collect_stream_events(client, case.payload)
569
+
570
+ init_event = next((event for event in events if event.get("type") == "init"), None)
571
+ complete_event = next(
572
+ (event for event in events if event.get("type") == "complete"),
573
+ None,
574
+ )
575
+
576
+ assert init_event is not None, "missing init event"
577
+ session_id = str(init_event["session_id"])
578
+ assert complete_event is not None, "missing complete event"
579
+
580
+ complete_data = complete_event.get("data")
581
+ assert isinstance(complete_data, dict), "complete payload is not a dictionary"
582
+ assert complete_data["session_id"] == session_id
583
+ assert complete_data["status"] in {"completed", "partial"}
584
+ assert int(complete_data["total_steps"]) > 0
585
+ assert int(complete_data["urls_processed"]) >= 1
586
+
587
+ if complete_data["status"] == "completed":
588
+ completed_cases += 1
589
+
590
+ enabled_plugins = complete_data.get("enabled_plugins") or []
591
+ assert all(not str(plugin_id).startswith("skill-") for plugin_id in enabled_plugins)
592
+ assert "web_scraper" not in enabled_plugins
593
+
594
+ steps = [
595
+ event.get("data")
596
+ for event in events
597
+ if event.get("type") == "step" and isinstance(event.get("data"), dict)
598
+ ]
599
+ assert steps, "no step events emitted"
600
+
601
+ case_template_ids: set[str] = set()
602
+ case_strategies: set[str] = set()
603
+
604
+ for step in steps:
605
+ action = step.get("action")
606
+ extracted = step.get("extracted_data")
607
+ if not isinstance(extracted, dict):
608
+ continue
609
+
610
+ if action == "tool_call":
611
+ tool_name = extracted.get("tool_name")
612
+ if isinstance(tool_name, str) and tool_name:
613
+ tool_call_counts[tool_name] += 1
614
+
615
+ if action == "plugins":
616
+ strategy = extracted.get("navigation_strategy")
617
+ if isinstance(strategy, str) and strategy:
618
+ case_strategies.add(strategy)
619
+ strategy_counts[strategy] += 1
620
+
621
+ if action == "site_template":
622
+ site_id = extracted.get("site_id")
623
+ if isinstance(site_id, str) and site_id:
624
+ case_template_ids.add(site_id)
625
+
626
+ seen_template_ids.update(case_template_ids)
627
+
628
+ if case.expected_template_id:
629
+ assert case.expected_template_id in case_template_ids, (
630
+ f"expected site template '{case.expected_template_id}' not emitted"
631
+ )
632
+
633
+ if case.expected_strategy:
634
+ assert case.expected_strategy in case_strategies, (
635
+ f"expected strategy '{case.expected_strategy}' not emitted"
636
+ )
637
+
638
+ sandbox_seen = any(
639
+ step.get("action") in {"planner_python", "navigator_python", "python_sandbox"}
640
+ for step in steps
641
+ )
642
+ if case.expect_sandbox:
643
+ assert sandbox_seen, "sandbox execution steps not emitted"
644
+ sandbox_success_cases += 1
645
+
646
+ except AssertionError as exc:
647
+ failures.append(f"{case.name}: {exc}")
648
+ finally:
649
+ if session_id:
650
+ cleanup_response = client.delete(f"/api/scrape/{session_id}/cleanup")
651
+ assert cleanup_response.status_code in {200, 404}
652
+
653
+ return {
654
+ "failures": failures,
655
+ "tool_call_counts": tool_call_counts,
656
+ "strategy_counts": strategy_counts,
657
+ "seen_template_ids": seen_template_ids,
658
+ "sandbox_success_cases": sandbox_success_cases,
659
+ "completed_cases": completed_cases,
660
+ }
661
+
662
+
663
+ def test_plugins_registry_excludes_agent_skills(client: TestClient) -> None:
664
+ """Plugin API should not duplicate agent skills from /api/agents."""
665
+
666
+ response = client.get("/api/plugins")
667
+ assert response.status_code == 200
668
+ payload = response.json()
669
+
670
+ categories = payload["categories"]
671
+ assert "skills" not in categories
672
+
673
+ plugin_ids = [plugin["id"] for plugins in payload["plugins"].values() for plugin in plugins]
674
+ assert all(not plugin_id.startswith("skill-") for plugin_id in plugin_ids)
675
+ assert "web_scraper" not in plugin_ids
676
+
677
+
678
+ def test_scraper_e2e_100_inputs_templates_tools_plugins_and_sandbox(
679
+ client: TestClient,
680
+ ) -> None:
681
+ """Run 100 end-to-end scrape inputs and validate major system behavior."""
682
+
683
+ if _is_live_network_mode():
684
+ pytest.skip("Offline deterministic E2E suite is skipped in live-network mode")
685
+
686
+ cases = _build_e2e_cases()
687
+ summary = _run_case_batch(client, cases)
688
+
689
+ assert len(cases) == 100
690
+ assert not summary["failures"], " | ".join(summary["failures"][:12])
691
+
692
+ expected_template_ids = {template.site_id for template in SITE_TEMPLATES}
693
+ assert expected_template_ids.issubset(summary["seen_template_ids"])
694
+
695
+ required_tool_calls = {
696
+ "url.parse",
697
+ "validate.url",
698
+ "browser.navigate",
699
+ "html.parse",
700
+ "html.extract",
701
+ "memory.store",
702
+ "sandbox.execute",
703
+ "extract.urls",
704
+ "extract.emails",
705
+ "csv.generate",
706
+ }
707
+ assert required_tool_calls.issubset(set(summary["tool_call_counts"].keys()))
708
+
709
+ assert summary["strategy_counts"]["github_trending"] >= 1
710
+ assert summary["strategy_counts"]["reddit_trending"] >= 1
711
+ assert summary["sandbox_success_cases"] >= 10
712
+ assert summary["completed_cases"] >= 95
713
+
714
+
715
+ @pytest.mark.skipif(
716
+ not _is_live_network_mode(),
717
+ reason="Enable SCRAPERL_E2E_LIVE_NETWORK=1 for live-network staging runs",
718
+ )
719
+ def test_scraper_e2e_live_network_mode_staging(client: TestClient) -> None:
720
+ """Live-network E2E mode with no mocks, controlled by environment flag."""
721
+
722
+ cases = _build_live_network_cases()
723
+ case_limit = _env_positive_int("SCRAPERL_E2E_LIVE_CASE_LIMIT")
724
+ if case_limit is not None:
725
+ cases = cases[: min(case_limit, len(cases))]
726
+
727
+ summary = _run_case_batch(client, cases)
728
+
729
+ assert not summary["failures"], " | ".join(summary["failures"][:10])
730
+
731
+ expected_templates = {case.expected_template_id for case in cases if case.expected_template_id}
732
+ assert expected_templates.issubset(summary["seen_template_ids"])
733
+
734
+ required_tool_calls = {
735
+ "url.parse",
736
+ "browser.navigate",
737
+ "html.parse",
738
+ "html.extract",
739
+ "memory.store",
740
+ }
741
+ assert required_tool_calls.issubset(set(summary["tool_call_counts"].keys()))
742
+
743
+ expected_sandbox_cases = sum(1 for case in cases if case.expect_sandbox)
744
+ assert summary["sandbox_success_cases"] >= expected_sandbox_cases
745
+
746
+ assert summary["strategy_counts"]["github_trending"] >= 1
747
+ assert summary["strategy_counts"]["reddit_trending"] >= 1
748
+ assert summary["completed_cases"] >= max(1, len(cases) // 2)
frontend/src/components/PluginsPage.tsx CHANGED
@@ -10,7 +10,6 @@ import {
10
  AlertCircle,
11
  Loader2,
12
  Plug,
13
- Cpu,
14
  Wrench,
15
  Database,
16
  Sparkles,
@@ -49,8 +48,6 @@ const getCategoryIcon = (category: string) => {
49
  return <Plug className="w-5 h-5 text-cyan-400" />;
50
  case 'mcps':
51
  return <Wrench className="w-5 h-5 text-amber-400" />;
52
- case 'skills':
53
- return <Cpu className="w-5 h-5 text-purple-400" />;
54
  case 'processors':
55
  return <Database className="w-5 h-5 text-pink-400" />;
56
  default:
@@ -62,7 +59,6 @@ const getCategoryLabel = (category: string) => {
62
  const labels: Record<string, string> = {
63
  apis: 'API Providers',
64
  mcps: 'MCP Tools',
65
- skills: 'Skills & Agents',
66
  processors: 'Data Processors',
67
  };
68
  return labels[category] || category;
@@ -72,7 +68,6 @@ const getCategoryColor = (category: string) => {
72
  const colors: Record<string, string> = {
73
  apis: 'from-cyan-500/20 to-blue-500/10 border-cyan-500/30',
74
  mcps: 'from-amber-500/20 to-orange-500/10 border-amber-500/30',
75
- skills: 'from-purple-500/20 to-pink-500/10 border-purple-500/30',
76
  processors: 'from-pink-500/20 to-rose-500/10 border-pink-500/30',
77
  };
78
  return colors[category] || 'from-gray-500/20 to-gray-500/10 border-gray-500/30';
@@ -169,7 +164,7 @@ export const PluginsPage: React.FC<PluginsPageProps> = ({ className }) => {
169
  Plugins
170
  </h1>
171
  <p className="text-gray-400 mt-1">
172
- Extend ScrapeRL with APIs, tools, skills, and processors
173
  </p>
174
  </div>
175
 
@@ -228,7 +223,7 @@ export const PluginsPage: React.FC<PluginsPageProps> = ({ className }) => {
228
  >
229
  All
230
  </button>
231
- {['apis', 'mcps', 'skills', 'processors'].map((cat) => (
232
  <button
233
  key={cat}
234
  onClick={() => setSelectedCategory(cat)}
 
10
  AlertCircle,
11
  Loader2,
12
  Plug,
 
13
  Wrench,
14
  Database,
15
  Sparkles,
 
48
  return <Plug className="w-5 h-5 text-cyan-400" />;
49
  case 'mcps':
50
  return <Wrench className="w-5 h-5 text-amber-400" />;
51
  case 'processors':
52
  return <Database className="w-5 h-5 text-pink-400" />;
53
  default:
 
59
  const labels: Record<string, string> = {
60
  apis: 'API Providers',
61
  mcps: 'MCP Tools',
 
62
  processors: 'Data Processors',
63
  };
64
  return labels[category] || category;
 
68
  const colors: Record<string, string> = {
69
  apis: 'from-cyan-500/20 to-blue-500/10 border-cyan-500/30',
70
  mcps: 'from-amber-500/20 to-orange-500/10 border-amber-500/30',
 
71
  processors: 'from-pink-500/20 to-rose-500/10 border-pink-500/30',
72
  };
73
  return colors[category] || 'from-gray-500/20 to-gray-500/10 border-gray-500/30';
 
164
  Plugins
165
  </h1>
166
  <p className="text-gray-400 mt-1">
167
+ Extend ScrapeRL with APIs, MCP tools, and processors
168
  </p>
169
  </div>
170
 
 
223
  >
224
  All
225
  </button>
226
+ {(pluginsData?.categories || ['apis', 'mcps', 'processors']).map((cat) => (
227
  <button
228
  key={cat}
229
  onClick={() => setSelectedCategory(cat)}