""" Web scraping tools for extracting content from web pages. """ from smolagents import tool import requests from bs4 import BeautifulSoup import urllib.parse @tool def scrape_webpage_content(url: str, content_selector: str = None) -> str: """ Scrape content from a webpage and extract the main text content. Args: url: The URL of the webpage to scrape content_selector: Optional CSS selector to target specific content (e.g., '.article__content', '#main-content') Returns: The extracted text content from the webpage """ try: # Validate URL parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme or not parsed_url.netloc: return f"Invalid URL: {url}" # Set headers to mimic a real browser headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', } # Make the request response = requests.get(url, headers=headers, timeout=15) response.raise_for_status() # Parse the HTML soup = BeautifulSoup(response.content, 'html.parser') # Remove script and style elements for script in soup(["script", "style", "nav", "header", "footer", "aside"]): script.decompose() # Extract content based on selector or find main content if content_selector: # Use the provided CSS selector content_element = soup.select_one(content_selector) if content_element: text_content = content_element.get_text(strip=True, separator=' ') else: return f"No content found with selector '{content_selector}' on {url}" else: # Try common content selectors content_selectors = [ 'article', '.article__content', '.content', '.post-content', '.entry-content', '#content', 'main', '.main-content', '[role="main"]' ] text_content = None for selector in content_selectors: element = soup.select_one(selector) if element: text_content = element.get_text(strip=True, separator=' ') break # If no specific content area found, get body text if not text_content: body = soup.find('body') if body: text_content = body.get_text(strip=True, separator=' ') else: text_content = soup.get_text(strip=True, separator=' ') # Clean up the text if text_content: # Remove excessive whitespace lines = [line.strip() for line in text_content.split('\n') if line.strip()] cleaned_text = '\n'.join(lines) # Limit length to prevent overwhelming responses if len(cleaned_text) > 5000: cleaned_text = cleaned_text[:5000] + "... [Content truncated]" return f"Content from {url}:\n\n{cleaned_text}" else: return f"No readable content found on {url}" except requests.exceptions.RequestException as e: return f"Error fetching webpage {url}: {str(e)}" except Exception as e: return f"Error scraping webpage {url}: {str(e)}" @tool def extract_links_from_webpage(url: str, link_text_filter: str = None) -> str: """ Extract links from a webpage, optionally filtering by link text. 
    Args:
        url: The URL of the webpage to scrape
        link_text_filter: Optional text to filter links by (case-insensitive)

    Returns:
        A formatted string containing the extracted links
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all links
        links = soup.find_all('a', href=True)

        extracted_links = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True)

            # Skip in-page anchor links
            if href.startswith('#'):
                continue

            # Convert relative URLs (including protocol-relative ones) to absolute
            href = urllib.parse.urljoin(url, href)

            # Filter by text if specified
            if link_text_filter and link_text_filter.lower() not in text.lower():
                continue

            if text and href.startswith('http'):
                extracted_links.append(f"• {text}: {href}")

        if extracted_links:
            # Limit to 20 links
            result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20])
            if len(extracted_links) > 20:
                result += f"\n... and {len(extracted_links) - 20} more links"
            return result
        else:
            return f"No links found on {url}"

    except Exception as e:
        return f"Error extracting links from {url}: {str(e)}"


@tool
def get_webpage_metadata(url: str) -> str:
    """
    Extract metadata from a webpage (title, description, etc.).

    Args:
        url: The URL of the webpage to analyze

    Returns:
        A formatted string containing the webpage metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        metadata = []

        # Title
        title = soup.find('title')
        if title:
            metadata.append(f"Title: {title.get_text(strip=True)}")

        # Meta description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            metadata.append(f"Description: {meta_desc['content']}")

        # Meta keywords
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            metadata.append(f"Keywords: {meta_keywords['content']}")

        # Author
        meta_author = soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            metadata.append(f"Author: {meta_author['content']}")

        # Open Graph metadata
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title and og_title.get('content'):
            metadata.append(f"OG Title: {og_title['content']}")

        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            metadata.append(f"OG Description: {og_desc['content']}")

        if metadata:
            return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
        else:
            return f"No metadata found on {url}"

    except Exception as e:
        return f"Error extracting metadata from {url}: {str(e)}"
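

# Minimal usage sketch (not part of the tool module itself): it assumes the
# @tool-decorated objects can be invoked directly as callables, and it uses
# https://example.com and the "more" filter purely as illustrative placeholders.
# In an agent setup, these tools would instead be passed to a smolagents agent
# via its `tools` argument.
if __name__ == "__main__":
    demo_url = "https://example.com"  # hypothetical URL, for illustration only

    # Fetch title/description/Open Graph metadata for the page
    print(get_webpage_metadata(demo_url))

    # List outgoing links, keeping only those whose text mentions "more"
    print(extract_links_from_webpage(demo_url, link_text_filter="more"))

    # Pull the main readable text, falling back through common content selectors
    print(scrape_webpage_content(demo_url))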