Spaces:
Running
Running
Commit ·
4b354aa
1
Parent(s): b136a9f
fix: resolve bare-domain assets to direct URLs
Browse files
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
backend/app/api/routes/scrape.py
CHANGED
|
@@ -282,7 +282,12 @@ def _create_tool_call_step(
|
|
| 282 |
) -> dict[str, Any]:
|
| 283 |
"""Create a tool call step event."""
|
| 284 |
step_number = len(session.get("steps", [])) + 1
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
if status == "completed" and result:
|
| 287 |
result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
|
| 288 |
message = f"{tool_name}() → {result_preview[:50]}"
|
|
@@ -515,8 +520,42 @@ def _create_intelligent_navigation_plan(instructions: str, assets: list[str]) ->
|
|
| 515 |
def _is_url_asset(asset: str) -> bool:
|
| 516 |
"""Check whether an asset string is a URL."""
|
| 517 |
|
| 518 |
-
|
| 519 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
|
| 521 |
|
| 522 |
def _discover_assets_for_query(query: str) -> list[str]:
|
|
@@ -610,8 +649,11 @@ async def _resolve_assets(
|
|
| 610 |
candidate = asset.strip()
|
| 611 |
if not candidate:
|
| 612 |
continue
|
| 613 |
-
|
| 614 |
-
|
|
|
|
|
|
|
|
|
|
| 615 |
continue
|
| 616 |
|
| 617 |
discovered: list[str] = []
|
|
|
|
| 282 |
) -> dict[str, Any]:
|
| 283 |
"""Create a tool call step event."""
|
| 284 |
step_number = len(session.get("steps", [])) + 1
|
| 285 |
+
|
| 286 |
+
def _format_arg(value: Any) -> str:
|
| 287 |
+
rendered = json.dumps(value, default=str)
|
| 288 |
+
return rendered if len(rendered) <= 40 else f"{rendered[:37]}..."
|
| 289 |
+
|
| 290 |
+
message = f"{tool_name}({', '.join(f'{k}={_format_arg(v)}' for k, v in parameters.items())})"
|
| 291 |
if status == "completed" and result:
|
| 292 |
result_preview = ", ".join(f"{k}={v}" for k, v in list(result.items())[:2])
|
| 293 |
message = f"{tool_name}() → {result_preview[:50]}"
|
|
|
|
| 520 |
def _is_url_asset(asset: str) -> bool:
    """Check whether an asset string is a URL."""
    # Delegate to the normalizer: any asset it can coerce counts as a URL.
    normalized = _coerce_url_asset(asset)
    return normalized is not None
| 525 |
+
|
| 526 |
+
def _looks_like_host(host: str) -> bool:
|
| 527 |
+
"""Return True when host resembles a real domain, localhost, or IPv4."""
|
| 528 |
+
|
| 529 |
+
lowered = host.lower()
|
| 530 |
+
if lowered == "localhost":
|
| 531 |
+
return True
|
| 532 |
+
|
| 533 |
+
if re.match(r"^\d{1,3}(?:\.\d{1,3}){3}$", lowered):
|
| 534 |
+
return True
|
| 535 |
+
|
| 536 |
+
return bool(re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", lowered))
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
def _coerce_url_asset(asset: str) -> str | None:
    """Normalize URL-like asset strings (supports bare domains such as github.com)."""
    candidate = asset.strip()
    # Reject empty strings and anything containing interior whitespace.
    if not candidate or any(ch.isspace() for ch in candidate):
        return None

    # Bare domains get an https:// prefix so urlparse produces a real netloc.
    has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", candidate)
    normalized = candidate if has_scheme else f"https://{candidate}"

    parsed = urlparse(normalized)
    if parsed.scheme not in {"http", "https"} or not parsed.netloc:
        return None

    host = (parsed.hostname or "").strip().lower()
    if not host or not _looks_like_host(host):
        return None

    return normalized
|
| 559 |
|
| 560 |
|
| 561 |
def _discover_assets_for_query(query: str) -> list[str]:
|
|
|
|
| 649 |
candidate = asset.strip()
|
| 650 |
if not candidate:
|
| 651 |
continue
|
| 652 |
+
|
| 653 |
+
normalized_url = _coerce_url_asset(candidate)
|
| 654 |
+
if normalized_url:
|
| 655 |
+
if normalized_url not in resolved:
|
| 656 |
+
resolved.append(normalized_url)
|
| 657 |
continue
|
| 658 |
|
| 659 |
discovered: list[str] = []
|
backend/app/sites/registry.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 5 |
from typing import Any
|
| 6 |
from urllib.parse import urlparse
|
| 7 |
|
|
@@ -49,15 +50,41 @@ def _normalize_domain(value: str) -> str:
|
|
| 49 |
return lowered
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def _extract_domains_from_assets(assets: list[str]) -> list[str]:
|
| 53 |
"""Extract normalized domains from URL assets."""
|
| 54 |
|
| 55 |
domains: list[str] = []
|
| 56 |
for asset in assets:
|
| 57 |
-
|
| 58 |
-
if
|
| 59 |
continue
|
| 60 |
-
|
|
|
|
| 61 |
if domain not in domains:
|
| 62 |
domains.append(domain)
|
| 63 |
return domains
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
import re
|
| 6 |
from typing import Any
|
| 7 |
from urllib.parse import urlparse
|
| 8 |
|
|
|
|
| 50 |
return lowered
|
| 51 |
|
| 52 |
|
| 53 |
+
def _coerce_asset_to_url(asset: str) -> str | None:
|
| 54 |
+
"""Normalize URL-like assets, including bare domains such as github.com."""
|
| 55 |
+
|
| 56 |
+
candidate = asset.strip()
|
| 57 |
+
if not candidate or any(ch.isspace() for ch in candidate):
|
| 58 |
+
return None
|
| 59 |
+
|
| 60 |
+
normalized = candidate
|
| 61 |
+
if not re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", normalized):
|
| 62 |
+
normalized = f"https://{normalized}"
|
| 63 |
+
|
| 64 |
+
parsed = urlparse(normalized)
|
| 65 |
+
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
|
| 66 |
+
return None
|
| 67 |
+
|
| 68 |
+
host = (parsed.hostname or "").lower()
|
| 69 |
+
if host != "localhost" and not re.match(r"^(?:[a-z0-9-]+\.)+[a-z]{2,63}$", host) and not re.match(
|
| 70 |
+
r"^\d{1,3}(?:\.\d{1,3}){3}$",
|
| 71 |
+
host,
|
| 72 |
+
):
|
| 73 |
+
return None
|
| 74 |
+
|
| 75 |
+
return normalized
|
| 76 |
+
|
| 77 |
+
|
| 78 |
def _extract_domains_from_assets(assets: list[str]) -> list[str]:
    """Extract normalized domains from URL assets."""
    # Preserve first-seen order while dropping duplicates and non-URL assets.
    seen: list[str] = []
    for raw_asset in assets:
        url = _coerce_asset_to_url(raw_asset)
        if url is None:
            continue
        parsed = urlparse(url)
        candidate_domain = _normalize_domain(parsed.hostname or parsed.netloc)
        if candidate_domain not in seen:
            seen.append(candidate_domain)
    return seen
|
backend/tests/test_api/test_asset_resolution.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Regression tests for asset URL resolution and site-template matching."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from app.api.routes import scrape as scrape_routes
|
| 8 |
+
from app.sites import match_site_template
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@pytest.mark.asyncio
async def test_resolve_assets_treats_bare_domain_as_url() -> None:
    """Bare domains should resolve directly, without query fallback/search discovery."""
    bare_domain_assets = ["github.com"]

    resolved, discoveries = await scrape_routes._resolve_assets(
        bare_domain_assets, enabled_plugins=[]
    )

    assert resolved == ["https://github.com"]
    assert discoveries == []
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_match_site_template_with_bare_domain() -> None:
    """Site template matching should work when assets omit URL scheme."""
    query = "top 10 trending repos"
    bare_domain_assets = ["github.com"]

    template = match_site_template(query, bare_domain_assets)

    assert template is not None
    assert template.site_id == "github"
|
| 28 |
+
|