NeerajCodz committed on
Commit
b136a9f
·
1 Parent(s): fa40af9

feat: add validate.url and html.parse tool calls for comprehensive visibility

Browse files
Files changed (1) hide show
  1. backend/app/api/routes/scrape.py +85 -0
backend/app/api/routes/scrape.py CHANGED
@@ -1746,6 +1746,55 @@ async def _scrape_single_page(
1746
  ),
1747
  )
1748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1749
  # Tool call: browser.navigate
1750
  step_num += 1
1751
  yield _record_step(
@@ -1808,6 +1857,42 @@ async def _scrape_single_page(
1808
  if not nav_success or not nav_obs.page_html:
1809
  session["errors"].append(f"Failed to navigate to {url}")
1810
  return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1811
 
1812
  # Extract fields
1813
  extracted = {}
 
1746
  ),
1747
  )
1748
 
1749
+ # Tool call: browser.navigate
1750
+ # Tool call: validate.url (check URL before navigating)
1751
+ step_num += 1
1752
+ yield _record_step(
1753
+ session,
1754
+ ScrapeStep(
1755
+ step_number=step_num,
1756
+ action="tool_call",
1757
+ url=url,
1758
+ status="running",
1759
+ message="validate.url(url)",
1760
+ extracted_data={
1761
+ "tool_name": "validate.url",
1762
+ "tool_description": "Validate URL format before navigation",
1763
+ "parameters": {"url": url},
1764
+ },
1765
+ timestamp=_now_iso(),
1766
+ ),
1767
+ )
1768
+
1769
+ # Simple URL validation
1770
+ parsed_url = urlparse(url)
1771
+ url_valid = bool(parsed_url.scheme and parsed_url.netloc)
1772
+
1773
+ yield _record_step(
1774
+ session,
1775
+ ScrapeStep(
1776
+ step_number=step_num,
1777
+ action="tool_call",
1778
+ url=url,
1779
+ status="completed" if url_valid else "failed",
1780
+ message=f"validate.url() → {'valid' if url_valid else 'invalid'}",
1781
+ reward=0.02 if url_valid else 0.0,
1782
+ extracted_data={
1783
+ "tool_name": "validate.url",
1784
+ "result": {
1785
+ "valid": url_valid,
1786
+ "scheme": parsed_url.scheme,
1787
+ "domain": parsed_url.netloc,
1788
+ },
1789
+ },
1790
+ timestamp=_now_iso(),
1791
+ ),
1792
+ )
1793
+
1794
+ if not url_valid:
1795
+ session["errors"].append(f"Invalid URL: {url}")
1796
+ return
1797
+
1798
  # Tool call: browser.navigate
1799
  step_num += 1
1800
  yield _record_step(
 
1857
  if not nav_success or not nav_obs.page_html:
1858
  session["errors"].append(f"Failed to navigate to {url}")
1859
  return
1860
+
1861
+ # Tool call: html.parse (parse HTML into DOM)
1862
+ step_num += 1
1863
+ yield _record_step(
1864
+ session,
1865
+ ScrapeStep(
1866
+ step_number=step_num,
1867
+ action="tool_call",
1868
+ url=url,
1869
+ status="running",
1870
+ message="html.parse(content)",
1871
+ extracted_data={
1872
+ "tool_name": "html.parse",
1873
+ "tool_description": "Parse HTML document into DOM structure",
1874
+ "parameters": {"parser": "html.parser", "content_length": len(nav_obs.page_html)},
1875
+ },
1876
+ timestamp=_now_iso(),
1877
+ ),
1878
+ )
1879
+
1880
+ yield _record_step(
1881
+ session,
1882
+ ScrapeStep(
1883
+ step_number=step_num,
1884
+ action="tool_call",
1885
+ url=url,
1886
+ status="completed",
1887
+ message="html.parse() → DOM ready",
1888
+ reward=0.05,
1889
+ extracted_data={
1890
+ "tool_name": "html.parse",
1891
+ "result": {"parsed": True, "html_length": len(nav_obs.page_html)},
1892
+ },
1893
+ timestamp=_now_iso(),
1894
+ ),
1895
+ )
1896
 
1897
  # Extract fields
1898
  extracted = {}