""" |
|
Web scraping tools for extracting content from web pages. |
|
""" |
|
|
|
from smolagents import tool |
|
import requests |
|
from bs4 import BeautifulSoup |
|
import urllib.parse |
|
|
|
|
|
@tool
def scrape_webpage_content(url: str, content_selector: Optional[str] = None) -> str:
    """
    Scrape content from a webpage and extract the main text content.

    Args:
        url: The URL of the webpage to scrape
        content_selector: Optional CSS selector to target specific content
            (e.g., '.article__content', '#main-content')

    Returns:
        The extracted text content from the webpage
    """
|
    try:
        # Basic URL validation before making any request
        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Invalid URL: {url}"

        # Browser-like headers to reduce the chance of being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that rarely contain main content
        for element in soup(["script", "style", "nav", "header", "footer", "aside"]):
            element.decompose()

        if content_selector:
            # Use the caller-provided CSS selector
            content_element = soup.select_one(content_selector)
            if content_element:
                text_content = content_element.get_text(strip=True, separator=' ')
            else:
                return f"No content found with selector '{content_selector}' on {url}"
        else:
            # Try common content containers, most specific first
            content_selectors = [
                'article',
                '.article__content',
                '.content',
                '.post-content',
                '.entry-content',
                '#content',
                'main',
                '.main-content',
                '[role="main"]'
            ]

            text_content = None
            for selector in content_selectors:
                element = soup.select_one(selector)
                if element:
                    text_content = element.get_text(strip=True, separator=' ')
                    break

            # Fall back to the whole body (or the whole document) if nothing matched
            if not text_content:
                body = soup.find('body')
                if body:
                    text_content = body.get_text(strip=True, separator=' ')
                else:
                    text_content = soup.get_text(strip=True, separator=' ')

        if text_content:
            # Collapse runs of whitespace into single spaces
            cleaned_text = ' '.join(text_content.split())

            # Keep the result short enough for an agent context window
            if len(cleaned_text) > 5000:
                cleaned_text = cleaned_text[:5000] + "... [Content truncated]"

            return f"Content from {url}:\n\n{cleaned_text}"
        else:
            return f"No readable content found on {url}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"
|
|
|
|
|
@tool
def extract_links_from_webpage(url: str, link_text_filter: Optional[str] = None) -> str:
    """
    Extract links from a webpage, optionally filtering by link text.

    Args:
        url: The URL of the webpage to scrape
        link_text_filter: Optional text to filter links by (case-insensitive)

    Returns:
        A formatted string containing the extracted links
    """
|
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        links = soup.find_all('a', href=True)

        extracted_links = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True)

            # Skip in-page anchors and resolve relative URLs against the page URL
            if href.startswith('#'):
                continue
            href = urllib.parse.urljoin(url, href)

            # Apply the optional case-insensitive text filter
            if link_text_filter and link_text_filter.lower() not in text.lower():
                continue

            if text and href.startswith(('http://', 'https://')):
                extracted_links.append(f"• {text}: {href}")

        if extracted_links:
            # Cap the output at 20 links to keep the result readable
            result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20])
            if len(extracted_links) > 20:
                result += f"\n... and {len(extracted_links) - 20} more links"
            return result
        else:
            return f"No links found on {url}"

    except Exception as e:
        return f"Error extracting links from {url}: {str(e)}"
|
|
|
|
|
@tool
def get_webpage_metadata(url: str) -> str:
    """
    Extract metadata from a webpage (title, description, etc.).

    Args:
        url: The URL of the webpage to analyze

    Returns:
        A formatted string containing the webpage metadata
    """
|
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        metadata = []

        # Page title
        title = soup.find('title')
        if title:
            metadata.append(f"Title: {title.get_text(strip=True)}")

        # Standard meta tags
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            metadata.append(f"Description: {meta_desc['content']}")

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            metadata.append(f"Keywords: {meta_keywords['content']}")

        meta_author = soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            metadata.append(f"Author: {meta_author['content']}")

        # Open Graph tags
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title and og_title.get('content'):
            metadata.append(f"OG Title: {og_title['content']}")

        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            metadata.append(f"OG Description: {og_desc['content']}")

        if metadata:
            return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
        else:
            return f"No metadata found on {url}"

    except Exception as e:
        return f"Error extracting metadata from {url}: {str(e)}"
|
|