import logging
import re
from typing import Any, Dict, List
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .base_scraper import BaseScraper

logger = logging.getLogger(__name__)

class BlogScraper(BaseScraper):
    """Scraper for blog websites"""
    
    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            
            # Extract metadata
            result = {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
            
            return result
        except Exception as e:
            logger.error(f"Error parsing blog content: {str(e)}")
            return {"type": "blog", "error_parsing": str(e)}
    
    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract title from blog post"""
        # Try different methods to find title
        title = None
        
        # Method 1: Look for <h1> tags in article or entry
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                title = h1.get_text().strip()
        
        # Method 2: Fall back to the first <h1> on the page
        if not title:
            h1 = soup.find('h1')
            if h1:
                title = h1.get_text().strip()
        
        # Method 3: Look for blog titles in meta tags
        if not title:
            og_title = soup.find('meta', property='og:title')
            if og_title and og_title.get('content'):
                title = og_title['content'].strip()
        
        return title or "Unknown Title"
    
    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract publication date"""
        # Try various methods to find date
        date = None
        
        # Method 1: Look for common date meta tags
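        #   e.g. <meta property="article:published_time" content="2024-01-15T09:00:00Z">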
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            date = date_meta['content']
        
        # Method 2: Look for common blog date classes
        if not date:
            date_classes = ['date', 'post-date', 'entry-date', 'published', 'post-meta']
            for class_name in date_classes:
                date_element = soup.find(class_=re.compile(class_name, re.I))
                if date_element:
                    date = date_element.get_text().strip()
                    break
        
        return date or "Unknown Date"
    
    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract author information"""
        # Try various methods to find author
        author = None
        
        # Method 1: Look for author meta tags
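        #   e.g. <meta property="article:author" content="https://example.com/authors/jane">
        #   (per the Open Graph spec this is a profile URL, though many sites put a plain name here)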
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            author = author_meta['content']
        
        # Method 2: Look for blog-specific author classes
        if not author:
            author_classes = ['author', 'byline', 'entry-author', 'post-author']
            for class_name in author_classes:
                author_element = soup.find(class_=re.compile(class_name, re.I))
                if author_element:
                    author = author_element.get_text().strip()
                    break
        
        return author or "Unknown Author"
    
    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract blog post categories"""
        categories = []
        
        # Method 1: Look for category links
        category_elements = soup.find_all('a', class_=re.compile('category', re.I))
        if category_elements:
            for element in category_elements:
                cat_text = element.get_text().strip()
                if cat_text and cat_text not in categories:
                    categories.append(cat_text)
        
        # Method 2: Look for category meta tag
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                categories.append(category_meta['content'].strip())
        
        return categories
    
    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract blog post tags"""
        tags = []
        
        # Look for tag links
        tag_elements = soup.find_all('a', class_=re.compile('tag', re.I))
        if tag_elements:
            for element in tag_elements:
                tag_text = element.get_text().strip()
                if tag_text and tag_text not in tags:
                    tags.append(tag_text)
        
        return tags
    
    def _extract_summary(self, text_content: str) -> str:
        """Extract or create a summary from the blog post text"""
        if not text_content:
            return "No summary available"
        
        # Take the first non-empty paragraph, truncated to 300 chars
        # (skipping empty leading paragraphs, which would yield a blank summary)
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    summary = summary[:297] + "..."
                return summary
        
        return "No summary available"
    
    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            # netloc is empty for relative URLs, so fall back explicitly
            return urlparse(url).netloc or "Unknown Source"
        except ValueError:
            return "Unknown Source"