meirk-brd
fix coerce url
59e2cd4
raw
history blame
2.68 kB
from __future__ import annotations
import json
import os
from typing import Optional
import requests
from smolagents.tools import Tool
class BrightDataScraperTool(Tool):
    """Tool that fetches a webpage as Markdown through the Bright Data request API.

    Configuration is read from the environment on every call:

    * ``BRIGHT_DATA_API_TOKEN`` -- bearer token sent in the ``Authorization``
      header (required).
    * ``BRIGHT_DATA_UNLOCKER_ZONE`` -- zone name placed in the request payload;
      defaults to ``"web_unlocker1"``.
    """

    name = "brightdata_web_scraper"
    description = """
    Scrape any webpage and return content in Markdown format.
    This tool can bypass bot detection and CAPTCHAs.
    Use this when you need to extract content from websites.
    """
    output_type = "string"

    def __init__(self) -> None:
        """Declare the single ``url`` input, then run the base ``Tool`` initializer."""
        # ``inputs`` is assigned before ``super().__init__()`` so the attribute
        # already exists when the base initializer runs.
        self.inputs = {
            "url": {
                "type": "string",
                "description": "The URL of the webpage to scrape",
            }
        }
        super().__init__()

    def forward(self, url) -> str:
        """Scrape ``url`` and return the response body.

        Args:
            url: A URL string, or a dict carrying the URL under ``"orig_name"``
                or ``"url"`` (see ``_coerce_url_input``).

        Returns:
            The raw response text on success. On failure, a JSON string with an
            ``"error"`` key (plus ``"details"`` for HTTP/network failures).

        Raises:
            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set.
        """
        url_str = self._coerce_url_input(url)
        if not url_str:
            return json.dumps({"error": "No valid URL provided"})

        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")

        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url_str,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }

        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as exc:
            # Connection-level failures carry no response object; HTTP errors do.
            details = exc.response.text if getattr(exc, "response", None) is not None else ""
            return json.dumps({"error": str(exc), "details": details})

    def _coerce_url_input(self, raw) -> Optional[str]:
        """Reduce the raw tool input to a single URL string.

        Accepted shapes:

        * ``str`` -- used as the URL; ``https://`` is prepended when no
          http(s) scheme is present. A string that starts with ``{`` is
          treated as a stringified dict and handled like the dict case.
        * ``dict`` -- a non-empty string under ``"orig_name"`` wins (scheme
          added if missing); otherwise a string under ``"url"`` is used only
          when it already starts with ``http://`` or ``https://``.

        Returns:
            The URL, or ``None`` when nothing usable was found.
        """
        if isinstance(raw, str):
            text = raw.strip()
            if text.startswith("{"):
                # A URL cannot start with a brace, so this can only be a dict
                # that was stringified on its way here.
                parsed = self._parse_file_dict_string(text)
                if parsed is None:
                    return None
                # The dict branch below never recurses, so this terminates.
                return self._coerce_url_input(parsed)
            return self._ensure_scheme(text)

        if isinstance(raw, dict):
            orig_name = raw.get("orig_name")
            if isinstance(orig_name, str) and orig_name:
                return self._ensure_scheme(orig_name)
            url_value = raw.get("url")
            if isinstance(url_value, str):
                if url_value.startswith(("http://", "https://")):
                    return url_value
                return None

        return None

    def _ensure_scheme(self, url: str) -> Optional[str]:
        """Return ``url`` with an http(s) scheme, defaulting to ``https://``.

        Surrounding whitespace is removed first. Returns ``None`` for an empty
        or whitespace-only value so the caller reports "no valid URL".
        """
        candidate = url.strip()
        if not candidate:
            return None
        # Compare case-insensitively so "HTTP://host" is not double-prefixed.
        if candidate.lower().startswith(("http://", "https://")):
            return candidate
        return f"https://{candidate}"

    def _parse_file_dict_string(self, value: str) -> Optional[dict]:
        """Parse a Python-literal dict from ``value``.

        Returns:
            The parsed dict, or ``None`` when ``value`` is not a valid literal
            or evaluates to something other than a dict.
        """
        import ast

        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval is documented to raise any of these on malformed
            # input (e.g. TypeError for an unhashable dict key).
            return None
        return parsed if isinstance(parsed, dict) else None