qa-agent/tools/web_scraping.py
"""
Web scraping tools for extracting content from web pages.
"""

import urllib.parse
from typing import Optional

import requests
from bs4 import BeautifulSoup
from smolagents import tool


@tool
def scrape_webpage_content(url: str, content_selector: Optional[str] = None) -> str:
"""
Scrape content from a webpage and extract the main text content.
Args:
url: The URL of the webpage to scrape
content_selector: Optional CSS selector to target specific content (e.g., '.article__content', '#main-content')
Returns:
The extracted text content from the webpage
"""
try:
# Validate URL
parsed_url = urllib.parse.urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
return f"Invalid URL: {url}"
# Set headers to mimic a real browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
}
# Make the request
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
# Parse the HTML
soup = BeautifulSoup(response.content, 'html.parser')
        # Remove scripts, styles, and page chrome (nav, header, footer, aside)
        for element in soup(["script", "style", "nav", "header", "footer", "aside"]):
            element.decompose()
# Extract content based on selector or find main content
if content_selector:
# Use the provided CSS selector
content_element = soup.select_one(content_selector)
if content_element:
text_content = content_element.get_text(strip=True, separator=' ')
else:
return f"No content found with selector '{content_selector}' on {url}"
else:
# Try common content selectors
content_selectors = [
'article',
'.article__content',
'.content',
'.post-content',
'.entry-content',
'#content',
'main',
'.main-content',
'[role="main"]'
]
text_content = None
for selector in content_selectors:
element = soup.select_one(selector)
if element:
text_content = element.get_text(strip=True, separator=' ')
break
# If no specific content area found, get body text
if not text_content:
body = soup.find('body')
if body:
text_content = body.get_text(strip=True, separator=' ')
else:
text_content = soup.get_text(strip=True, separator=' ')
# Clean up the text
if text_content:
            # Collapse runs of whitespace; get_text joins nodes with single spaces,
            # but individual text nodes can still contain newlines and tabs
            cleaned_text = ' '.join(text_content.split())
# Limit length to prevent overwhelming responses
if len(cleaned_text) > 5000:
cleaned_text = cleaned_text[:5000] + "... [Content truncated]"
return f"Content from {url}:\n\n{cleaned_text}"
else:
return f"No readable content found on {url}"
except requests.exceptions.RequestException as e:
return f"Error fetching webpage {url}: {str(e)}"
except Exception as e:
return f"Error scraping webpage {url}: {str(e)}"


@tool
def extract_links_from_webpage(url: str, link_text_filter: Optional[str] = None) -> str:
"""
Extract links from a webpage, optionally filtering by link text.
Args:
url: The URL of the webpage to scrape
link_text_filter: Optional text to filter links by (case-insensitive)
Returns:
A formatted string containing the extracted links
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find all links
links = soup.find_all('a', href=True)
extracted_links = []
for link in links:
href = link['href']
text = link.get_text(strip=True)
            # Skip in-page anchors, then resolve relative URLs (with or without a
            # leading slash) against the page URL
            if href.startswith('#'):
                continue
            href = urllib.parse.urljoin(url, href)
# Filter by text if specified
if link_text_filter:
if link_text_filter.lower() not in text.lower():
continue
if text and href.startswith('http'):
extracted_links.append(f"• {text}: {href}")
if extracted_links:
result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20]) # Limit to 20 links
if len(extracted_links) > 20:
result += f"\n... and {len(extracted_links) - 20} more links"
return result
else:
return f"No links found on {url}"
except Exception as e:
return f"Error extracting links from {url}: {str(e)}"


@tool
def get_webpage_metadata(url: str) -> str:
"""
Extract metadata from a webpage (title, description, etc.).
Args:
url: The URL of the webpage to analyze
Returns:
A formatted string containing the webpage metadata
"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
metadata = []
# Title
title = soup.find('title')
if title:
metadata.append(f"Title: {title.get_text(strip=True)}")
# Meta description
meta_desc = soup.find('meta', attrs={'name': 'description'})
if meta_desc and meta_desc.get('content'):
metadata.append(f"Description: {meta_desc['content']}")
# Meta keywords
meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
if meta_keywords and meta_keywords.get('content'):
metadata.append(f"Keywords: {meta_keywords['content']}")
# Author
meta_author = soup.find('meta', attrs={'name': 'author'})
if meta_author and meta_author.get('content'):
metadata.append(f"Author: {meta_author['content']}")
# Open Graph metadata
og_title = soup.find('meta', attrs={'property': 'og:title'})
if og_title and og_title.get('content'):
metadata.append(f"OG Title: {og_title['content']}")
og_desc = soup.find('meta', attrs={'property': 'og:description'})
if og_desc and og_desc.get('content'):
metadata.append(f"OG Description: {og_desc['content']}")
if metadata:
return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
else:
return f"No metadata found on {url}"
except Exception as e:
return f"Error extracting metadata from {url}: {str(e)}"