# Source: Hugging Face Hub tool upload by MHamdan (commit 3058953, verified).
# The three lines below were page residue captured during download and have
# been converted to comments so the file remains valid Python.
from smolagents import Tool
from typing import Any, Optional
class SimpleTool(Tool):
    """smolagents tool that fetches a webpage and returns selected content as text."""

    name = "extract_web_content"
    description = "Extracts and processes content from a given webpage."
    inputs = {"url":{"type":"string","description":"The webpage URL to scrape."},"content_type":{"type":"string","nullable":True,"description":"Type of content to extract ('all', 'text', 'links', 'headers'). Defaults to 'all'."}}
    output_type = "string"

    def forward(self, url: str, content_type: Optional[str] = "all") -> str:
        """Extracts and processes content from a given webpage.

        Args:
            url: The webpage URL to scrape. Must include a scheme and host.
            content_type: Type of content to extract ('all', 'text', 'links',
                'headers'). Defaults to 'all'; any unrecognized value also
                falls back to the 'all' behavior.

        Returns:
            str: Extracted and processed content from the webpage, or a
            human-readable "Error: ..." string on failure (this tool reports
            errors as strings rather than raising, so the agent can read them).
        """
        # Imports are function-scoped so the tool module stays importable
        # even where requests/bs4 are not installed.
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urlparse
        import re

        try:
            # Validate URL: require both a scheme (http/https/...) and a host.
            parsed_url = urlparse(url)
            if not all([parsed_url.scheme, parsed_url.netloc]):
                return "Error: Invalid URL format. Please provide a valid URL."

            # Fetch webpage. A browser-like User-Agent avoids trivial bot
            # blocks; the timeout prevents the agent from hanging forever.
            # (Renamed from `headers` so the 'headers' branch below does not
            # shadow this dict with its list of page headings.)
            request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            response = requests.get(url, headers=request_headers, timeout=10)
            response.raise_for_status()

            # Parse content and drop script/style nodes so get_text() only
            # returns human-visible text.
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()

            if content_type == "text":
                # Collapse all runs of whitespace to single spaces.
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                # Only mark truncation when text was actually cut off.
                suffix = "..." if len(text) > 2000 else ""
                return f"Text Content:\n{text[:2000]}{suffix}"

            elif content_type == "links":
                links = []
                for link in soup.find_all('a', href=True):
                    # Keep only absolute http(s) links with visible anchor
                    # text. Full scheme prefixes are checked: the previous
                    # ('http', 'https') tuple was redundant ('https' starts
                    # with 'http') and matched bogus schemes like httpfoo://.
                    if link.text.strip() and link['href'].startswith(('http://', 'https://')):
                        text = re.sub(r'\s+', ' ', link.text).strip()
                        links.append(f"- {text}: {link['href']}")
                if not links:
                    return "Found Links:\nNo links found."
                return "Found Links:\n" + "\n".join(links[:10])

            elif content_type == "headers":
                page_headers = []
                for h in soup.find_all(['h1', 'h2', 'h3']):
                    text = re.sub(r'\s+', ' ', h.text).strip()
                    if text:
                        page_headers.append(f"- {text}")
                if not page_headers:
                    return "Page Headers:\nNo headers found."
                return "Page Headers:\n" + "\n".join(page_headers)

            else:
                # 'all' (and any unrecognized content_type): title + preview.
                # soup.title.string is None for an empty <title>, so guard twice.
                title = soup.title.string if soup.title else "No title found"
                title = re.sub(r'\s+', ' ', title).strip() if title else "No title found"
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                suffix = "..." if len(text) > 1000 else ""
                output = [
                    f"URL: {url}",
                    f"Title: {title}",
                    "\nContent Preview:",
                    text[:1000] + suffix,
                ]
                return "\n".join(output)

        except requests.exceptions.RequestException as e:
            # Network-level failures (DNS, timeout, non-2xx via raise_for_status).
            return f"Error accessing webpage: {str(e)}"
        except Exception as e:
            # Last-resort guard: report parsing problems instead of crashing the agent.
            return f"Error processing webpage: {str(e)}"