"""Blog scraper: extracts structured metadata (title, date, author, categories, tags, summary) from blog pages."""
import logging
import re
from typing import Dict, Any, List
from bs4 import BeautifulSoup
from .base_scraper import BaseScraper
logger = logging.getLogger(__name__)
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Turns rendered blog HTML plus its plain-text body into a structured
    dict of metadata: title, publish date, author, categories, tags,
    summary, and source domain. Every extractor is best-effort and falls
    back to an "Unknown ..." placeholder (or an empty list) when the
    page lacks the expected markup.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page body.
            url: Page URL; used only to derive the source domain.

        Returns:
            Dict with the extracted fields, or a minimal
            ``{"type": "blog", "error_parsing": ...}`` dict on failure.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Broad catch is deliberate: a scrape failure must not crash the
            # caller; the error is reported inside the result instead.
            # Lazy %s args avoid formatting the message when the level is off.
            logger.error("Error parsing blog content: %s", e)
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title.

        Tries, in order: an <h1> inside a post/entry/article container,
        the first <h1> anywhere, then the Open Graph ``og:title`` meta tag.
        Empty candidates fall through to the next method, matching the
        original fallback chain.
        """
        # Method 1: <h1> scoped to the post container (most reliable).
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                text = h1.get_text().strip()
                if text:
                    return text
        # Method 2: first <h1> anywhere on the page.
        h1 = soup.find('h1')
        if h1:
            text = h1.get_text().strip()
            if text:
                return text
        # Method 3: Open Graph title meta tag.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            text = og_title['content'].strip()
            if text:
                return text
        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as raw text.

        Prefers the standard ``article:published_time`` meta tag, then
        falls back to common blog-theme CSS class names. Returns
        "Unknown Date" when nothing usable is found.
        """
        # Method 1: standard article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']
        # Method 2: first element matching a common date class; only the
        # first match is consulted, as in the original implementation.
        for class_name in ('date', 'post-date', 'entry-date', 'published', 'post-meta'):
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                return date_element.get_text().strip() or "Unknown Date"
        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name.

        Prefers the ``article:author`` meta tag, then common byline CSS
        class names. Returns "Unknown Author" when nothing is found.
        """
        # Method 1: standard article meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']
        # Method 2: first element matching a common byline class.
        for class_name in ('author', 'byline', 'entry-author', 'post-author'):
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                return author_element.get_text().strip() or "Unknown Author"
        return "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories, de-duplicated in encounter order."""
        categories: List[str] = []
        # Method 1: anchor links whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: article:section meta tag, only as a fallback.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                section = category_meta['content'].strip()
                # Guard against whitespace-only content producing a "" entry.
                if section:
                    categories.append(section)
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags from tag links, de-duplicated in encounter order."""
        tags: List[str] = []
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Return the first non-empty paragraph, truncated to 300 chars.

        Skipping blank leading paragraphs fixes the original behavior of
        returning an empty summary when the text starts with blank lines.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    # Truncate to 300 total chars including the ellipsis.
                    summary = summary[:297] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Return the host portion of *url*, or "Unknown Source".

        Relative or malformed URLs (empty netloc) now also yield the
        fallback instead of an empty string.
        """
        try:
            from urllib.parse import urlparse
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"