LibRAG / image_scraper.py
bmv2021's picture
upload correct file
f78d694
import requests from bs4 import BeautifulSoup import os import json import re from typing import List, Dict import logging from urllib.parse import urljoin, urlparse class DigitalCommonwealthScraper: def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"): """ Initialize the scraper with base URL and logging :param base_url: Base URL for Digital Commonwealth """ self.base_url = base_url logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger(__name__) # Headers to mimic browser request self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } def fetch_page(self, url: str) -> requests.Response: """ Fetch webpage content with error handling :param url: URL to fetch :return: Response object """ try: response = requests.get(url, headers=self.headers) response.raise_for_status() return response except requests.RequestException as e: self.logger.error(f"Error fetching {url}: {e}") return None def extract_json_metadata(self, url: str) -> Dict: """ Extract JSON metadata from the page :param url: URL of the page :return: Dictionary of metadata """ json_url = f"{url}.json" response = self.fetch_page(json_url) if response: try: return response.json() except json.JSONDecodeError: self.logger.error(f"Could not parse JSON from {json_url}") return {} return {} def extract_images(self, url: str) -> List[Dict]: """ Extract images from the page :param url: URL of the page to scrape :return: List of image dictionaries """ # Fetch page content response = self.fetch_page(url) if not response: return [] # Parse HTML soup = BeautifulSoup(response.text, 'html.parser') # Extract JSON metadata metadata = self.extract_json_metadata(url) # List to store images images = [] # Strategy 1: Look for image viewers or specific image containers image_containers = [ soup.find('div', class_='viewer-container'), soup.find('div', class_='image-viewer'), soup.find('div', id='image-container') ] # Strategy 2: Find all image tags img_tags = soup.find_all('img') # Combine image sources for img in img_tags: # Get image source src = img.get('src') if not src: continue # Resolve relative URLs full_src = urljoin(url, src) # Extract alt text or use filename alt = img.get('alt', os.path.basename(urlparse(full_src).path)) # Create image dictionary image_info = { 'url': full_src, 'alt': alt, 'source_page': url } # Try to add metadata if available if metadata: try: # Extract relevant metadata from JSON if possible image_info['metadata'] = { 'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'), 'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'), 'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim') } except Exception as e: self.logger.warning(f"Error extracting metadata: {e}") images.append(image_info) return images def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]: """ Download images to local directory :param images: List of image dictionaries :param output_dir: Directory to save images :return: List of downloaded file paths """ # Create output directory os.makedirs(output_dir, exist_ok=True) downloaded_files = [] for i, image in enumerate(images): try: response = requests.get(image['url'], headers=self.headers) response.raise_for_status() # Generate filename ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg' filename = os.path.join(output_dir, f'image_{i}{ext}') with open(filename, 'wb') as f: f.write(response.content) downloaded_files.append(filename) self.logger.info(f"Downloaded: {filename}") except Exception as e: self.logger.error(f"Error downloading {image['url']}: {e}") return downloaded_files