qa-agent

Runtime error

File size: 8,079 Bytes

8d4d62e

"""
Web scraping tools for extracting content from web pages.
"""

from smolagents import tool
import requests
from bs4 import BeautifulSoup
import urllib.parse


@tool
def scrape_webpage_content(url: str, content_selector: str = None) -> str:
    """
    Scrape content from a webpage and extract the main text content.
    
    Args:
        url: The URL of the webpage to scrape
        content_selector: Optional CSS selector to target specific content (e.g., '.article__content', '#main-content')
        
    Returns:
        The extracted text content from the webpage
    """
    try:
        # Validate URL
        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Invalid URL: {url}"
        
        # Set headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make the request
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        # Parse the HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style", "nav", "header", "footer", "aside"]):
            script.decompose()
        
        # Extract content based on selector or find main content
        if content_selector:
            # Use the provided CSS selector
            content_element = soup.select_one(content_selector)
            if content_element:
                text_content = content_element.get_text(strip=True, separator=' ')
            else:
                return f"No content found with selector '{content_selector}' on {url}"
        else:
            # Try common content selectors
            content_selectors = [
                'article',
                '.article__content',
                '.content',
                '.post-content',
                '.entry-content',
                '#content',
                'main',
                '.main-content',
                '[role="main"]'
            ]
            
            text_content = None
            for selector in content_selectors:
                element = soup.select_one(selector)
                if element:
                    text_content = element.get_text(strip=True, separator=' ')
                    break
            
            # If no specific content area found, get body text
            if not text_content:
                body = soup.find('body')
                if body:
                    text_content = body.get_text(strip=True, separator=' ')
                else:
                    text_content = soup.get_text(strip=True, separator=' ')
        
        # Clean up the text
        if text_content:
            # Remove excessive whitespace
            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
            cleaned_text = '\n'.join(lines)
            
            # Limit length to prevent overwhelming responses
            if len(cleaned_text) > 5000:
                cleaned_text = cleaned_text[:5000] + "... [Content truncated]"
            
            return f"Content from {url}:\n\n{cleaned_text}"
        else:
            return f"No readable content found on {url}"
            
    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"


@tool
def extract_links_from_webpage(url: str, link_text_filter: str = None) -> str:
    """
    Extract links from a webpage, optionally filtering by link text.
    
    Args:
        url: The URL of the webpage to scrape
        link_text_filter: Optional text to filter links by (case-insensitive)
        
    Returns:
        A formatted string containing the extracted links
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all links
        links = soup.find_all('a', href=True)
        
        extracted_links = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True)
            
            # Convert relative URLs to absolute
            if href.startswith('/'):
                parsed_base = urllib.parse.urlparse(url)
                href = f"{parsed_base.scheme}://{parsed_base.netloc}{href}"
            elif href.startswith('#'):
                continue  # Skip anchor links
            
            # Filter by text if specified
            if link_text_filter:
                if link_text_filter.lower() not in text.lower():
                    continue
            
            if text and href.startswith('http'):
                extracted_links.append(f"• {text}: {href}")
        
        if extracted_links:
            result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20])  # Limit to 20 links
            if len(extracted_links) > 20:
                result += f"\n... and {len(extracted_links) - 20} more links"
            return result
        else:
            return f"No links found on {url}"
            
    except Exception as e:
        return f"Error extracting links from {url}: {str(e)}"


@tool
def get_webpage_metadata(url: str) -> str:
    """
    Extract metadata from a webpage (title, description, etc.).
    
    Args:
        url: The URL of the webpage to analyze
        
    Returns:
        A formatted string containing the webpage metadata
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        metadata = []
        
        # Title
        title = soup.find('title')
        if title:
            metadata.append(f"Title: {title.get_text(strip=True)}")
        
        # Meta description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            metadata.append(f"Description: {meta_desc['content']}")
        
        # Meta keywords
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            metadata.append(f"Keywords: {meta_keywords['content']}")
        
        # Author
        meta_author = soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            metadata.append(f"Author: {meta_author['content']}")
        
        # Open Graph metadata
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title and og_title.get('content'):
            metadata.append(f"OG Title: {og_title['content']}")
        
        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            metadata.append(f"OG Description: {og_desc['content']}")
        
        if metadata:
            return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
        else:
            return f"No metadata found on {url}"
            
    except Exception as e:
        return f"Error extracting metadata from {url}: {str(e)}"