File size: 8,079 Bytes
8d4d62e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
"""
Web scraping tools for extracting content from web pages.
"""
from smolagents import tool
import requests
from bs4 import BeautifulSoup
import urllib.parse
@tool
def scrape_webpage_content(url: str, content_selector: str = None) -> str:
"""
Scrape content from a webpage and extract the main text content.
Args:
url: The URL of the webpage to scrape
content_selector: Optional CSS selector to target specific content (e.g., '.article__content', '#main-content')
Returns:
The extracted text content from the webpage
"""
try:
# Validate URL
parsed_url = urllib.parse.urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
return f"Invalid URL: {url}"
# Set headers to mimic a real browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
}
# Make the request
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
script.decompose()
# Extract content based on selector or find main content
if content_selector:
# Use the provided CSS selector
content_element = soup.select_one(content_selector)
if content_element:
text_content = content_element.get_text(strip=True, separator=' ')
else:
return f"No content found with selector '{content_selector}' on {url}"
else:
# Try common content selectors
content_selectors = [
'article',
'.article__content',
'.content',
'.post-content',
'.entry-content',
'#content',
'main',
'.main-content',
'[role="main"]'
]
text_content = None
for selector in content_selectors:
element = soup.select_one(selector)
if element:
text_content = element.get_text(strip=True, separator=' ')
break
# If no specific content area found, get body text
if not text_content:
body = soup.find('body')
if body:
text_content = body.get_text(strip=True, separator=' ')
else:
text_content = soup.get_text(strip=True, separator=' ')
# Clean up the text
if text_content:
# Remove excessive whitespace
lines = [line.strip() for line in text_content.split('\n') if line.strip()]
cleaned_text = '\n'.join(lines)
# Limit length to prevent overwhelming responses
if len(cleaned_text) > 5000:
cleaned_text = cleaned_text[:5000] + "... [Content truncated]"
return f"Content from {url}:\n\n{cleaned_text}"
else:
return f"No readable content found on {url}"
except requests.exceptions.RequestException as e:
return f"Error fetching webpage {url}: {str(e)}"
except Exception as e:
return f"Error scraping webpage {url}: {str(e)}"
@tool
def extract_links_from_webpage(url: str, link_text_filter: str = None) -> str:
"""
Extract links from a webpage, optionally filtering by link text.
Args:
url: The URL of the webpage to scrape
link_text_filter: Optional text to filter links by (case-insensitive)
Returns:
A formatted string containing the extracted links
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find all links
links = soup.find_all('a', href=True)
extracted_links = []
for link in links:
href = link['href']
text = link.get_text(strip=True)
# Convert relative URLs to absolute
if href.startswith('/'):
parsed_base = urllib.parse.urlparse(url)
href = f"{parsed_base.scheme}://{parsed_base.netloc}{href}"
elif href.startswith('#'):
continue # Skip anchor links
# Filter by text if specified
if link_text_filter:
if link_text_filter.lower() not in text.lower():
continue
if text and href.startswith('http'):
extracted_links.append(f"β’ {text}: {href}")
if extracted_links:
result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20]) # Limit to 20 links
if len(extracted_links) > 20:
result += f"\n... and {len(extracted_links) - 20} more links"
return result
else:
return f"No links found on {url}"
except Exception as e:
return f"Error extracting links from {url}: {str(e)}"
@tool
def get_webpage_metadata(url: str) -> str:
"""
Extract metadata from a webpage (title, description, etc.).
Args:
url: The URL of the webpage to analyze
Returns:
A formatted string containing the webpage metadata
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
metadata = []
# Title
title = soup.find('title')
if title:
metadata.append(f"Title: {title.get_text(strip=True)}")
# Meta description
meta_desc = soup.find('meta', attrs={'name': 'description'})
if meta_desc and meta_desc.get('content'):
metadata.append(f"Description: {meta_desc['content']}")
# Meta keywords
meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
if meta_keywords and meta_keywords.get('content'):
metadata.append(f"Keywords: {meta_keywords['content']}")
# Author
meta_author = soup.find('meta', attrs={'name': 'author'})
if meta_author and meta_author.get('content'):
metadata.append(f"Author: {meta_author['content']}")
# Open Graph metadata
og_title = soup.find('meta', attrs={'property': 'og:title'})
if og_title and og_title.get('content'):
metadata.append(f"OG Title: {og_title['content']}")
og_desc = soup.find('meta', attrs={'property': 'og:description'})
if og_desc and og_desc.get('content'):
metadata.append(f"OG Description: {og_desc['content']}")
if metadata:
return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
else:
return f"No metadata found on {url}"
except Exception as e:
return f"Error extracting metadata from {url}: {str(e)}"
|