NeerajCodz Copilot committed on
Commit
4b354aa
·
1 Parent(s): b136a9f

fix: resolve bare-domain assets to direct URLs

Browse files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/app/api/routes/scrape.py CHANGED
@@ -282,7 +282,12 @@ def _create_tool_call_step(
282
  ) -> dict[str, Any]:
283
  """Create a tool call step event."""
284
  step_number = len(session.get("steps", [])) + 1
285
- message = f"{tool_name}({', '.join(f'{k}={repr(v)[:20]}' for k, v in parameters.items())})"
 
 
 
 
 
286
  if status == "completed" and result:
287
  result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
288
  message = f"{tool_name}() → {result_preview[:50]}"
@@ -515,8 +520,42 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
515
  def _is_url_asset(asset: str) -> bool:
516
  """Check whether an asset string is a URL."""
517
 
518
- parsed = urlparse(asset.strip())
519
- return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
 
521
 
522
  def _discover_assets_for_query(query: str) -> list[str]:
@@ -610,8 +649,11 @@ async def _resolve_assets(
610
  candidate = asset.strip()
611
  if not candidate:
612
  continue
613
- if _is_url_asset(candidate):
614
- resolved.append(candidate)
 
 
 
615
  continue
616
 
617
  discovered: list[str] = []
 
282
  ) -> dict[str, Any]:
283
  """Create a tool call step event."""
284
  step_number = len(session.get("steps", [])) + 1
285
+
286
+ def _format_arg(value: Any) -> str:
287
+ rendered = json.dumps(value, default=str)
288
+ return rendered if len(rendered) <= 40 else f"{rendered[:37]}..."
289
+
290
+ message = f"{tool_name}({', '.join(f'{k}={_format_arg(v)}' for k, v in parameters.items())})"
291
  if status == "completed" and result:
292
  result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
293
  message = f"{tool_name}() → {result_preview[:50]}"
 
520
  def _is_url_asset(asset: str) -> bool:
521
  """Check whether an asset string is a URL."""
522
 
523
+ return _coerce_url_asset(asset) is not None
524
+
525
+
526
+ def _looks_like_host(host: str) -> bool:
527
+ """Return True when host resembles a real domain, localhost, or IPv4."""
528
+
529
+ lowered = host.lower()
530
+ if lowered == "localhost":
531
+ return True
532
+
533
+ if re.match(r"^\d{1,3}(?:\.\d{1,3}){3}$", lowered):
534
+ return True
535
+
536
+ return bool(re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", lowered))
537
+
538
+
539
+ def _coerce_url_asset(asset: str) -> str | None:
540
+ """Normalize URL-like asset strings (supports bare domains such as github.com)."""
541
+
542
+ candidate = asset.strip()
543
+ if not candidate or any(ch.isspace() for ch in candidate):
544
+ return None
545
+
546
+ normalized = candidate
547
+ if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
548
+ normalized = f"https://{normalized}"
549
+
550
+ parsed = urlparse(normalized)
551
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
552
+ return None
553
+
554
+ host = (parsed.hostname or "").strip().lower()
555
+ if not host or not _looks_like_host(host):
556
+ return None
557
+
558
+ return normalized
559
 
560
 
561
  def _discover_assets_for_query(query: str) -> list[str]:
 
649
  candidate = asset.strip()
650
  if not candidate:
651
  continue
652
+
653
+ normalized_url = _coerce_url_asset(candidate)
654
+ if normalized_url:
655
+ if normalized_url not in resolved:
656
+ resolved.append(normalized_url)
657
  continue
658
 
659
  discovered: list[str] = []
backend/app/sites/registry.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  from __future__ import annotations
4
 
 
5
  from typing import Any
6
  from urllib.parse import urlparse
7
 
@@ -49,15 +50,41 @@ def _normalize_domain(value: str) -> str:
49
  return lowered
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def _extract_domains_from_assets(assets: list[str]) -> list[str]:
53
  """Extract normalized domains from URL assets."""
54
 
55
  domains: list[str] = []
56
  for asset in assets:
57
- parsed = urlparse(asset.strip())
58
- if parsed.scheme not in {"http", "https"} or not parsed.netloc:
59
  continue
60
- domain = _normalize_domain(parsed.netloc)
 
61
  if domain not in domains:
62
  domains.append(domain)
63
  return domains
 
2
 
3
  from __future__ import annotations
4
 
5
+ import re
6
  from typing import Any
7
  from urllib.parse import urlparse
8
 
 
50
  return lowered
51
 
52
 
53
+ def _coerce_asset_to_url(asset: str) -> str | None:
54
+ """Normalize URL-like assets, including bare domains such as github.com."""
55
+
56
+ candidate = asset.strip()
57
+ if not candidate or any(ch.isspace() for ch in candidate):
58
+ return None
59
+
60
+ normalized = candidate
61
+ if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
62
+ normalized = f"https://{normalized}"
63
+
64
+ parsed = urlparse(normalized)
65
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
66
+ return None
67
+
68
+ host = (parsed.hostname or "").lower()
69
+ if host != "localhost" and not re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", host) and not re.match(
70
+ r"^\d{1,3}(?:\.\d{1,3}){3}$",
71
+ host,
72
+ ):
73
+ return None
74
+
75
+ return normalized
76
+
77
+
78
  def _extract_domains_from_assets(assets: list[str]) -> list[str]:
79
  """Extract normalized domains from URL assets."""
80
 
81
  domains: list[str] = []
82
  for asset in assets:
83
+ normalized_url = _coerce_asset_to_url(asset)
84
+ if not normalized_url:
85
  continue
86
+ parsed = urlparse(normalized_url)
87
+ domain = _normalize_domain(parsed.hostname or parsed.netloc)
88
  if domain not in domains:
89
  domains.append(domain)
90
  return domains
backend/tests/test_api/test_asset_resolution.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression tests for asset URL resolution and site-template matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from app.api.routes import scrape as scrape_routes
8
+ from app.sites import match_site_template
9
+
10
+
11
+ @pytest.mark.asyncio
12
+ async def test_resolve_assets_treats_bare_domain_as_url() -> None:
13
+ """Bare domains should resolve directly, without query fallback/search discovery."""
14
+
15
+ resolved, discoveries = await scrape_routes._resolve_assets(["github.com"], enabled_plugins=[])
16
+
17
+ assert resolved == ["https://github.com"]
18
+ assert discoveries == []
19
+
20
+
21
+ def test_match_site_template_with_bare_domain() -> None:
22
+ """Site template matching should work when assets omit URL scheme."""
23
+
24
+ template = match_site_template("top 10 trending repos", ["github.com"])
25
+
26
+ assert template is not None
27
+ assert template.site_id == "github"
28
+