Spaces:
Running
Running
Commit ·
b136a9f
1
Parent(s): fa40af9
feat: add validate.url and html.parse tool calls for comprehensive visibility
Browse files
backend/app/api/routes/scrape.py
CHANGED
|
@@ -1746,6 +1746,55 @@ async def _scrape_single_page(
|
|
| 1746 |
),
|
| 1747 |
)
|
| 1748 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1749 |
# Tool call: browser.navigate
|
| 1750 |
step_num += 1
|
| 1751 |
yield _record_step(
|
|
@@ -1808,6 +1857,42 @@ async def _scrape_single_page(
|
|
| 1808 |
if not nav_success or not nav_obs.page_html:
|
| 1809 |
session["errors"].append(f"Failed to navigate to {url}")
|
| 1810 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1811 |
|
| 1812 |
# Extract fields
|
| 1813 |
extracted = {}
|
|
|
|
| 1746 |
),
|
| 1747 |
)
|
| 1748 |
|
| 1749 |
+
# Pre-navigation check: the URL is validated first; the browser.navigate tool call follows below
|
| 1750 |
+
# Tool call: validate.url (check URL before navigating)
|
| 1751 |
+
step_num += 1
|
| 1752 |
+
yield _record_step(
|
| 1753 |
+
session,
|
| 1754 |
+
ScrapeStep(
|
| 1755 |
+
step_number=step_num,
|
| 1756 |
+
action="tool_call",
|
| 1757 |
+
url=url,
|
| 1758 |
+
status="running",
|
| 1759 |
+
message="validate.url(url)",
|
| 1760 |
+
extracted_data={
|
| 1761 |
+
"tool_name": "validate.url",
|
| 1762 |
+
"tool_description": "Validate URL format before navigation",
|
| 1763 |
+
"parameters": {"url": url},
|
| 1764 |
+
},
|
| 1765 |
+
timestamp=_now_iso(),
|
| 1766 |
+
),
|
| 1767 |
+
)
|
| 1768 |
+
|
| 1769 |
+
# Simple URL validation
|
| 1770 |
+
parsed_url = urlparse(url)
|
| 1771 |
+
url_valid = bool(parsed_url.scheme and parsed_url.netloc)
|
| 1772 |
+
|
| 1773 |
+
yield _record_step(
|
| 1774 |
+
session,
|
| 1775 |
+
ScrapeStep(
|
| 1776 |
+
step_number=step_num,
|
| 1777 |
+
action="tool_call",
|
| 1778 |
+
url=url,
|
| 1779 |
+
status="completed" if url_valid else "failed",
|
| 1780 |
+
message=f"validate.url() → {'valid' if url_valid else 'invalid'}",
|
| 1781 |
+
reward=0.02 if url_valid else 0.0,
|
| 1782 |
+
extracted_data={
|
| 1783 |
+
"tool_name": "validate.url",
|
| 1784 |
+
"result": {
|
| 1785 |
+
"valid": url_valid,
|
| 1786 |
+
"scheme": parsed_url.scheme,
|
| 1787 |
+
"domain": parsed_url.netloc,
|
| 1788 |
+
},
|
| 1789 |
+
},
|
| 1790 |
+
timestamp=_now_iso(),
|
| 1791 |
+
),
|
| 1792 |
+
)
|
| 1793 |
+
|
| 1794 |
+
if not url_valid:
|
| 1795 |
+
session["errors"].append(f"Invalid URL: {url}")
|
| 1796 |
+
return
|
| 1797 |
+
|
| 1798 |
# Tool call: browser.navigate
|
| 1799 |
step_num += 1
|
| 1800 |
yield _record_step(
|
|
|
|
| 1857 |
if not nav_success or not nav_obs.page_html:
|
| 1858 |
session["errors"].append(f"Failed to navigate to {url}")
|
| 1859 |
return
|
| 1860 |
+
|
| 1861 |
+
# Tool call: html.parse (parse HTML into DOM)
|
| 1862 |
+
step_num += 1
|
| 1863 |
+
yield _record_step(
|
| 1864 |
+
session,
|
| 1865 |
+
ScrapeStep(
|
| 1866 |
+
step_number=step_num,
|
| 1867 |
+
action="tool_call",
|
| 1868 |
+
url=url,
|
| 1869 |
+
status="running",
|
| 1870 |
+
message="html.parse(content)",
|
| 1871 |
+
extracted_data={
|
| 1872 |
+
"tool_name": "html.parse",
|
| 1873 |
+
"tool_description": "Parse HTML document into DOM structure",
|
| 1874 |
+
"parameters": {"parser": "html.parser", "content_length": len(nav_obs.page_html)},
|
| 1875 |
+
},
|
| 1876 |
+
timestamp=_now_iso(),
|
| 1877 |
+
),
|
| 1878 |
+
)
|
| 1879 |
+
|
| 1880 |
+
yield _record_step(
|
| 1881 |
+
session,
|
| 1882 |
+
ScrapeStep(
|
| 1883 |
+
step_number=step_num,
|
| 1884 |
+
action="tool_call",
|
| 1885 |
+
url=url,
|
| 1886 |
+
status="completed",
|
| 1887 |
+
message="html.parse() → DOM ready",
|
| 1888 |
+
reward=0.05,
|
| 1889 |
+
extracted_data={
|
| 1890 |
+
"tool_name": "html.parse",
|
| 1891 |
+
"result": {"parsed": True, "html_length": len(nav_obs.page_html)},
|
| 1892 |
+
},
|
| 1893 |
+
timestamp=_now_iso(),
|
| 1894 |
+
),
|
| 1895 |
+
)
|
| 1896 |
|
| 1897 |
# Extract fields
|
| 1898 |
extracted = {}
|