"""Blog scraper: extracts structured metadata (title, date, author, categories, tags, summary) from blog pages."""
import logging
import re
from typing import Dict, Any, List
from bs4 import BeautifulSoup
from .base_scraper import BaseScraper
logger = logging.getLogger(__name__)
class BlogScraper(BaseScraper):
    """Scraper for blog websites.

    Turns rendered blog HTML plus its plain-text body into a structured
    dict of metadata: title, publish date, author, categories, tags,
    summary, and source domain. Every extractor is best-effort and falls
    back to an "Unknown ..." placeholder (or an empty list) when the
    page lacks the expected markup.
    """

    def parse_content(self, html_content: str, text_content: str, url: str) -> Dict[str, Any]:
        """Parse blog content and extract structured data.

        Args:
            html_content: Raw HTML of the blog page.
            text_content: Plain-text rendering of the page body.
            url: Page URL; used only to derive the source domain.

        Returns:
            Dict with the extracted fields, or a minimal
            ``{"type": "blog", "error_parsing": ...}`` dict on failure.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return {
                "type": "blog",
                "title": self._extract_title(soup),
                "publish_date": self._extract_publish_date(soup),
                "author": self._extract_author(soup),
                "categories": self._extract_categories(soup),
                "tags": self._extract_tags(soup),
                "summary": self._extract_summary(text_content),
                "source": self._extract_domain(url),
            }
        except Exception as e:
            # Broad catch is deliberate: a scrape failure must not crash the
            # caller; the error is reported inside the result instead.
            # Lazy %s args avoid formatting the message when the level is off.
            logger.error("Error parsing blog content: %s", e)
            return {"type": "blog", "error_parsing": str(e)}

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract the post title.

        Tries, in order: an <h1> inside a post/entry/article container,
        the first <h1> anywhere, then the Open Graph ``og:title`` meta tag.
        Empty candidates fall through to the next method, matching the
        original fallback chain.
        """
        # Method 1: <h1> scoped to the post container (most reliable).
        article = soup.find(['article', 'div'], class_=re.compile('(post|entry|article)'))
        if article:
            h1 = article.find('h1')
            if h1:
                text = h1.get_text().strip()
                if text:
                    return text
        # Method 2: first <h1> anywhere on the page.
        h1 = soup.find('h1')
        if h1:
            text = h1.get_text().strip()
            if text:
                return text
        # Method 3: Open Graph title meta tag.
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            text = og_title['content'].strip()
            if text:
                return text
        return "Unknown Title"

    def _extract_publish_date(self, soup: BeautifulSoup) -> str:
        """Extract the publication date as raw text.

        Prefers the standard ``article:published_time`` meta tag, then
        falls back to common blog-theme CSS class names. Returns
        "Unknown Date" when nothing usable is found.
        """
        # Method 1: standard article meta tag.
        date_meta = soup.find('meta', property='article:published_time')
        if date_meta and date_meta.get('content'):
            return date_meta['content']
        # Method 2: first element matching a common date class; only the
        # first match is consulted, as in the original implementation.
        for class_name in ('date', 'post-date', 'entry-date', 'published', 'post-meta'):
            date_element = soup.find(class_=re.compile(class_name, re.I))
            if date_element:
                return date_element.get_text().strip() or "Unknown Date"
        return "Unknown Date"

    def _extract_author(self, soup: BeautifulSoup) -> str:
        """Extract the author name.

        Prefers the ``article:author`` meta tag, then common byline CSS
        class names. Returns "Unknown Author" when nothing is found.
        """
        # Method 1: standard article meta tag.
        author_meta = soup.find('meta', property='article:author')
        if author_meta and author_meta.get('content'):
            return author_meta['content']
        # Method 2: first element matching a common byline class.
        for class_name in ('author', 'byline', 'entry-author', 'post-author'):
            author_element = soup.find(class_=re.compile(class_name, re.I))
            if author_element:
                return author_element.get_text().strip() or "Unknown Author"
        return "Unknown Author"

    def _extract_categories(self, soup: BeautifulSoup) -> List[str]:
        """Extract post categories, de-duplicated in encounter order."""
        categories: List[str] = []
        # Method 1: anchor links whose class mentions "category".
        for element in soup.find_all('a', class_=re.compile('category')):
            cat_text = element.get_text().strip()
            if cat_text and cat_text not in categories:
                categories.append(cat_text)
        # Method 2: article:section meta tag, only as a fallback.
        if not categories:
            category_meta = soup.find('meta', property='article:section')
            if category_meta and category_meta.get('content'):
                section = category_meta['content'].strip()
                # Guard against whitespace-only content producing a "" entry.
                if section:
                    categories.append(section)
        return categories

    def _extract_tags(self, soup: BeautifulSoup) -> List[str]:
        """Extract post tags from tag links, de-duplicated in encounter order."""
        tags: List[str] = []
        for element in soup.find_all('a', class_=re.compile('tag')):
            tag_text = element.get_text().strip()
            if tag_text and tag_text not in tags:
                tags.append(tag_text)
        return tags

    def _extract_summary(self, text_content: str) -> str:
        """Return the first non-empty paragraph, truncated to 300 chars.

        Skipping blank leading paragraphs fixes the original behavior of
        returning an empty summary when the text starts with blank lines.
        """
        if not text_content:
            return "No summary available"
        for paragraph in text_content.split('\n\n'):
            summary = paragraph.strip()
            if summary:
                if len(summary) > 300:
                    # Truncate to 300 total chars including the ellipsis.
                    summary = summary[:297] + "..."
                return summary
        return "No summary available"

    def _extract_domain(self, url: str) -> str:
        """Return the host portion of *url*, or "Unknown Source".

        Relative or malformed URLs (empty netloc) now also yield the
        fallback instead of an empty string.
        """
        try:
            from urllib.parse import urlparse
            return urlparse(url).netloc or "Unknown Source"
        except Exception:
            return "Unknown Source"