NeerajCodz Copilot committed on
Commit
4b354aa
·
1 Parent(s): b136a9f

fix: resolve bare-domain assets to direct URLs

Browse files

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

backend/app/api/routes/scrape.py CHANGED
@@ -282,7 +282,12 @@ def _create_tool_call_step(
282
  ) -> dict[str, Any]:
283
  """Create a tool call step event."""
284
  step_number = len(session.get("steps", [])) + 1
285
- message = f"{tool_name}({', '.join(f'{k}={repr(v)[:20]}' for k, v in parameters.items())})"
 
 
 
 
 
286
  if status == "completed" and result:
287
  result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
288
  message = f"{tool_name}() → {result_preview[:50]}"
@@ -515,8 +520,42 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
515
  def _is_url_asset(asset: str) -> bool:
516
  """Check whether an asset string is a URL."""
517
 
518
- parsed = urlparse(asset.strip())
519
- return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
 
521
 
522
  def _discover_assets_for_query(query: str) -> list[str]:
@@ -610,8 +649,11 @@ async def _resolve_assets(
610
  candidate = asset.strip()
611
  if not candidate:
612
  continue
613
- if _is_url_asset(candidate):
614
- resolved.append(candidate)
 
 
 
615
  continue
616
 
617
  discovered: list[str] = []
 
282
  ) -> dict[str, Any]:
283
  """Create a tool call step event."""
284
  step_number = len(session.get("steps", [])) + 1
285
+
286
+ def _format_arg(value: Any) -> str:
287
+ rendered = json.dumps(value, default=str)
288
+ return rendered if len(rendered) <= 40 else f"{rendered[:37]}..."
289
+
290
+ message = f"{tool_name}({', '.join(f'{k}={_format_arg(v)}' for k, v in parameters.items())})"
291
  if status == "completed" and result:
292
  result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
293
  message = f"{tool_name}() → {result_preview[:50]}"
 
520
  def _is_url_asset(asset: str) -> bool:
521
  """Check whether an asset string is a URL."""
522
 
523
+ return _coerce_url_asset(asset) is not None
524
+
525
+
526
+ def _looks_like_host(host: str) -> bool:
527
+ """Return True when host resembles a real domain, localhost, or IPv4."""
528
+
529
+ lowered = host.lower()
530
+ if lowered == "localhost":
531
+ return True
532
+
533
+ if re.match(r"^\d{1,3}(?:\.\d{1,3}){3}$", lowered):
534
+ return True
535
+
536
+ return bool(re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", lowered))
537
+
538
+
539
+ def _coerce_url_asset(asset: str) -> str | None:
540
+ """Normalize URL-like asset strings (supports bare domains such as github.com)."""
541
+
542
+ candidate = asset.strip()
543
+ if not candidate or any(ch.isspace() for ch in candidate):
544
+ return None
545
+
546
+ normalized = candidate
547
+ if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
548
+ normalized = f"https://{normalized}"
549
+
550
+ parsed = urlparse(normalized)
551
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
552
+ return None
553
+
554
+ host = (parsed.hostname or "").strip().lower()
555
+ if not host or not _looks_like_host(host):
556
+ return None
557
+
558
+ return normalized
559
 
560
 
561
  def _discover_assets_for_query(query: str) -> list[str]:
 
649
  candidate = asset.strip()
650
  if not candidate:
651
  continue
652
+
653
+ normalized_url = _coerce_url_asset(candidate)
654
+ if normalized_url:
655
+ if normalized_url not in resolved:
656
+ resolved.append(normalized_url)
657
  continue
658
 
659
  discovered: list[str] = []
backend/app/sites/registry.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  from __future__ import annotations
4
 
 
5
  from typing import Any
6
  from urllib.parse import urlparse
7
 
@@ -49,15 +50,41 @@ def _normalize_domain(value: str) -> str:
49
  return lowered
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def _extract_domains_from_assets(assets: list[str]) -> list[str]:
53
  """Extract normalized domains from URL assets."""
54
 
55
  domains: list[str] = []
56
  for asset in assets:
57
- parsed = urlparse(asset.strip())
58
- if parsed.scheme not in {"http", "https"} or not parsed.netloc:
59
  continue
60
- domain = _normalize_domain(parsed.netloc)
 
61
  if domain not in domains:
62
  domains.append(domain)
63
  return domains
 
2
 
3
  from __future__ import annotations
4
 
5
+ import re
6
  from typing import Any
7
  from urllib.parse import urlparse
8
 
 
50
  return lowered
51
 
52
 
53
+ def _coerce_asset_to_url(asset: str) -> str | None:
54
+ """Normalize URL-like assets, including bare domains such as github.com."""
55
+
56
+ candidate = asset.strip()
57
+ if not candidate or any(ch.isspace() for ch in candidate):
58
+ return None
59
+
60
+ normalized = candidate
61
+ if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
62
+ normalized = f"https://{normalized}"
63
+
64
+ parsed = urlparse(normalized)
65
+ if parsed.scheme not in {"http", "https"} or not parsed.netloc:
66
+ return None
67
+
68
+ host = (parsed.hostname or "").lower()
69
+ if host != "localhost" and not re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", host) and not re.match(
70
+ r"^\d{1,3}(?:\.\d{1,3}){3}$",
71
+ host,
72
+ ):
73
+ return None
74
+
75
+ return normalized
76
+
77
+
78
  def _extract_domains_from_assets(assets: list[str]) -> list[str]:
79
  """Extract normalized domains from URL assets."""
80
 
81
  domains: list[str] = []
82
  for asset in assets:
83
+ normalized_url = _coerce_asset_to_url(asset)
84
+ if not normalized_url:
85
  continue
86
+ parsed = urlparse(normalized_url)
87
+ domain = _normalize_domain(parsed.hostname or parsed.netloc)
88
  if domain not in domains:
89
  domains.append(domain)
90
  return domains
backend/tests/test_api/test_asset_resolution.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression tests for asset URL resolution and site-template matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from app.api.routes import scrape as scrape_routes
8
+ from app.sites import match_site_template
9
+
10
+
11
+ @pytest.mark.asyncio
12
+ async def test_resolve_assets_treats_bare_domain_as_url() -> None:
13
+ """Bare domains should resolve directly, without query fallback/search discovery."""
14
+
15
+ resolved, discoveries = await scrape_routes._resolve_assets(["github.com"], enabled_plugins=[])
16
+
17
+ assert resolved == ["https://github.com"]
18
+ assert discoveries == []
19
+
20
+
21
+ def test_match_site_template_with_bare_domain() -> None:
22
+ """Site template matching should work when assets omit URL scheme."""
23
+
24
+ template = match_site_template("top 10 trending repos", ["github.com"])
25
+
26
+ assert template is not None
27
+ assert template.site_id == "github"
28
+