""" | |
Web scraper for collecting Iain Morris articles from Light Reading | |
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import time | |
import re | |
from urllib.parse import urljoin, urlparse | |
from typing import List, Dict, Optional | |
import logging | |
from tqdm import tqdm | |
# Set up logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
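
# Third-party dependencies assumed to be installed: requests, beautifulsoup4
# and tqdm (e.g. `pip install requests beautifulsoup4 tqdm`).
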
class LightReadingScraper:
    def __init__(self, delay: float = 2.0):
        """
        Initialize the scraper with respectful rate limiting

        Args:
            delay: Delay between requests in seconds
        """
        self.base_url = "https://www.lightreading.com"
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def search_author_articles(self, author_name: str, max_pages: int = 10) -> List[str]:
        """
        Search for articles by a specific author

        Args:
            author_name: Name of the author to search for
            max_pages: Maximum number of search result pages to process

        Returns:
            List of article URLs
        """
        article_urls = []

        # Try different search approaches
        search_queries = [
            f'author:"{author_name}"',
            f'"{author_name}"',
            author_name.replace(' ', '+')
        ]

        for query in search_queries:
            logger.info(f"Searching with query: {query}")

            for page in range(1, max_pages + 1):
                # Note: this assumes Light Reading exposes a /search?q=...&page=...
                # endpoint; adjust the URL pattern if the site's search differs.
                search_url = f"{self.base_url}/search?q={query}&page={page}"

                try:
                    response = self.session.get(search_url)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Find article links in search results
                    article_links = soup.find_all('a', href=True)
                    page_urls = []

                    for link in article_links:
                        href = link.get('href')
                        if href and ('/news/' in href or '/blog/' in href or '/opinion/' in href):
                            full_url = urljoin(self.base_url, href)
                            if full_url not in article_urls:
                                page_urls.append(full_url)

                    if not page_urls:
                        logger.info(f"No more articles found on page {page}")
                        break

                    article_urls.extend(page_urls)
                    logger.info(f"Found {len(page_urls)} articles on page {page}")

                    time.sleep(self.delay)

                except requests.RequestException as e:
                    logger.error(f"Error searching page {page}: {e}")
                    continue

        # Remove duplicates while preserving order
        unique_urls = list(dict.fromkeys(article_urls))
        logger.info(f"Total unique articles found: {len(unique_urls)}")
        return unique_urls

    def get_author_page_articles(self, author_name: str) -> List[str]:
        """
        Try to find articles from the author's dedicated page

        Args:
            author_name: Name of the author

        Returns:
            List of article URLs
        """
        article_urls = []

        # Try common author page patterns
        author_slug = author_name.lower().replace(' ', '-')
        author_pages = [
            f"{self.base_url}/author/{author_slug}",
            f"{self.base_url}/authors/{author_slug}",
            f"{self.base_url}/contributor/{author_slug}"
        ]

        for author_url in author_pages:
            try:
                response = self.session.get(author_url)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Find article links
                    article_links = soup.find_all('a', href=True)
                    for link in article_links:
                        href = link.get('href')
                        if href and ('/news/' in href or '/blog/' in href or '/opinion/' in href):
                            full_url = urljoin(self.base_url, href)
                            article_urls.append(full_url)

                    logger.info(f"Found {len(article_urls)} articles from author page")
                    break
            except requests.RequestException as e:
                logger.debug(f"Author page {author_url} not accessible: {e}")
                continue

            time.sleep(self.delay)

        return list(dict.fromkeys(article_urls))  # Remove duplicates

    def scrape_article(self, url: str) -> Optional[Dict]:
        """
        Scrape a single article

        Args:
            url: URL of the article to scrape

        Returns:
            Dictionary containing article data or None if failed
        """
        try:
            response = self.session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract article data
            article_data = {
                'url': url,
                'title': '',
                'author': '',
                'date': '',
                'content': '',
                'summary': ''
            }

            # Title
            title_selectors = [
                'h1.article-title',
                'h1.entry-title',
                'h1.post-title',
                'h1',
                '.article-header h1',
                '.post-header h1'
            ]
            for selector in title_selectors:
                title_elem = soup.select_one(selector)
                if title_elem:
                    article_data['title'] = title_elem.get_text().strip()
                    break

            # Author
            author_selectors = [
                '.author-name',
                '.byline',
                '.article-author',
                '.post-author',
                '[rel="author"]'
            ]
            for selector in author_selectors:
                author_elem = soup.select_one(selector)
                if author_elem:
                    article_data['author'] = author_elem.get_text().strip()
                    break

            # Date
            date_selectors = [
                '.article-date',
                '.post-date',
                '.published',
                'time',
                '.date'
            ]
            for selector in date_selectors:
                date_elem = soup.select_one(selector)
                if date_elem:
                    article_data['date'] = date_elem.get_text().strip()
                    break

            # Content
            content_selectors = [
                '.article-content',
                '.post-content',
                '.entry-content',
                '.article-body',
                '.content'
            ]
            content_text = ""
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    # Remove script and style elements
                    for script in content_elem(["script", "style"]):
                        script.decompose()
                    content_text = content_elem.get_text()
                    break

            if not content_text:
                # Fallback: try to get all paragraph text
                paragraphs = soup.find_all('p')
                content_text = '\n'.join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])

            article_data['content'] = self.clean_text(content_text)

            # Summary (first paragraph or meta description)
            summary_elem = soup.select_one('meta[name="description"]')
            if summary_elem:
                article_data['summary'] = summary_elem.get('content', '').strip()
            elif article_data['content']:
                # Use first paragraph as summary
                first_para = article_data['content'].split('\n')[0]
                article_data['summary'] = first_para[:300] + '...' if len(first_para) > 300 else first_para

            # Validate article has minimum required content
            if len(article_data['content']) < 200:
                logger.warning(f"Article too short, skipping: {url}")
                return None

            # Note: Removed author matching check since we're scraping specific URLs
            # that may include articles by various authors

            return article_data

        except requests.RequestException as e:
            logger.error(f"Error scraping {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error scraping {url}: {e}")
            return None

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text content

        Args:
            text: Raw text to clean

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common artifacts
        text = re.sub(r'\[.*?\]', '', text)  # Remove [brackets]
        text = re.sub(r'Share this article.*$', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Related articles.*$', '', text, flags=re.IGNORECASE)

        return text.strip()
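
    # For example (illustrative input), clean_text("Telcos face\n\n5G costs.  Share this article on LinkedIn")
    # collapses the whitespace and drops the trailing share prompt, returning
    # "Telcos face 5G costs."
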
    def scrape_author_articles(self, author_name: str, max_articles: int = 200) -> List[Dict]:
        """
        Scrape all articles by a specific author

        Args:
            author_name: Name of the author
            max_articles: Maximum number of articles to scrape

        Returns:
            List of article dictionaries
        """
        logger.info(f"Starting to scrape articles by {author_name}")

        # Get article URLs from multiple sources
        all_urls = []

        # Try author page first
        author_page_urls = self.get_author_page_articles(author_name)
        all_urls.extend(author_page_urls)

        # Then try search
        search_urls = self.search_author_articles(author_name)
        all_urls.extend(search_urls)

        # Remove duplicates
        unique_urls = list(dict.fromkeys(all_urls))
        if len(unique_urls) > max_articles:
            unique_urls = unique_urls[:max_articles]

        logger.info(f"Found {len(unique_urls)} unique article URLs to scrape")

        # Scrape articles
        articles = []
        failed_count = 0

        for url in tqdm(unique_urls, desc="Scraping articles"):
            article_data = self.scrape_article(url)
            if article_data:
                articles.append(article_data)
                logger.debug(f"Successfully scraped: {article_data['title']}")
            else:
                failed_count += 1

            time.sleep(self.delay)

        logger.info(f"Successfully scraped {len(articles)} articles")
        logger.info(f"Failed to scrape {failed_count} articles")

        return articles

    def load_urls_from_file(self, filename: str) -> List[str]:
        """
        Load URLs from a text file

        Args:
            filename: Path to the file containing URLs (one per line)

        Returns:
            List of URLs
        """
        urls = []
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    url = line.strip()
                    if url and not url.startswith('#'):  # Skip empty lines and comments
                        urls.append(url)
            logger.info(f"Loaded {len(urls)} URLs from {filename}")
            return urls
        except FileNotFoundError:
            logger.error(f"URL file not found: {filename}")
            return []
        except Exception as e:
            logger.error(f"Error reading URL file {filename}: {e}")
            return []
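
    # Expected format of the URL file (illustrative; the real urls.txt is not
    # shown here). One URL per line, with blank lines and '#' comments ignored:
    #
    #   # Iain Morris articles to scrape
    #   https://www.lightreading.com/opinion/example-article-slug
    #   https://www.lightreading.com/news/another-example-slug
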
    def scrape_urls_from_file(self, filename: str) -> List[Dict]:
        """
        Scrape articles from URLs listed in a file

        Args:
            filename: Path to the file containing URLs

        Returns:
            List of article dictionaries
        """
        urls = self.load_urls_from_file(filename)
        if not urls:
            logger.error("No URLs to scrape")
            return []

        logger.info(f"Starting to scrape {len(urls)} articles from URL file")

        articles = []
        failed_count = 0

        for url in tqdm(urls, desc="Scraping articles"):
            article_data = self.scrape_article(url)
            if article_data:
                articles.append(article_data)
                logger.debug(f"Successfully scraped: {article_data['title']}")
            else:
                failed_count += 1
                logger.warning(f"Failed to scrape: {url}")

            time.sleep(self.delay)

        logger.info(f"Successfully scraped {len(articles)} articles")
        logger.info(f"Failed to scrape {failed_count} articles")

        return articles

    def save_articles(self, articles: List[Dict], filename: str):
        """
        Save articles to JSON file

        Args:
            articles: List of article dictionaries
            filename: Output filename
        """
        # Make sure the output directory exists before writing
        output_dir = os.path.dirname(filename)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)

        logger.info(f"Saved {len(articles)} articles to {filename}")
def main():
    """
    Main function to run the scraper
    """
    scraper = LightReadingScraper(delay=2.0)

    # Scrape articles from URLs in urls.txt
    articles = scraper.scrape_urls_from_file("urls.txt")

    if articles:
        # Save raw articles
        scraper.save_articles(articles, "data/raw_articles.json")

        # Print summary
        print(f"\nScraping Summary:")
        print(f"Total articles collected: {len(articles)}")
        print(f"Average article length: {sum(len(a['content']) for a in articles) // len(articles)} characters")

        # Show sample titles
        print(f"\nSample article titles:")
        for i, article in enumerate(articles[:5]):
            print(f"{i+1}. {article['title']}")
    else:
        print("No articles were successfully scraped.")


if __name__ == "__main__":
    main()