"""Scraper for images and JSON metadata from Digital Commonwealth item pages."""
import json
import logging
import os
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class DigitalCommonwealthScraper:
    """Scrape image URLs and item metadata from Digital Commonwealth pages."""

    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
        """
        Initialize the scraper with base URL and logging.

        :param base_url: Base URL for Digital Commonwealth
        """
        self.base_url = base_url
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        # Headers to mimic a browser request so the site serves normal HTML.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self, url: str) -> Optional[requests.Response]:
        """
        Fetch webpage content with error handling.

        :param url: URL to fetch
        :return: Response object, or None if the request failed
        """
        try:
            # timeout bounds the request so a stalled server cannot hang us.
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata from the page via the site's ".json" endpoint.

        :param url: URL of the page
        :return: Dictionary of metadata (empty dict on any failure)
        """
        json_url = f"{url}.json"
        response = self.fetch_page(json_url)
        if response is None:
            return {}
        try:
            return response.json()
        except ValueError:
            # ValueError covers json.JSONDecodeError and the simplejson
            # variant some requests installs raise from Response.json().
            self.logger.error(f"Could not parse JSON from {json_url}")
            return {}

    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images referenced by a page.

        :param url: URL of the page to scrape
        :return: List of image dictionaries with 'url', 'alt', 'source_page'
                 and, when JSON metadata is available, a 'metadata' sub-dict
        """
        response = self.fetch_page(url)
        if not response:
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        metadata = self.extract_json_metadata(url)

        # Walk data -> attributes once instead of once per image per field.
        # NOTE(review): field names assume the Digital Commonwealth JSON:API
        # layout — confirm against a live record.
        attributes = metadata.get('data', {}).get('attributes', {}) if metadata else {}

        images = []
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            # Resolve relative image paths against the page URL.
            full_src = urljoin(url, src)
            # Prefer the alt text; fall back to the file name in the URL path.
            alt = img.get('alt', os.path.basename(urlparse(full_src).path))
            image_info = {
                'url': full_src,
                'alt': alt,
                'source_page': url
            }
            if metadata:
                image_info['metadata'] = {
                    'title': attributes.get('title_info_primary_tsi'),
                    'description': attributes.get('abstract_tsi'),
                    'subject': attributes.get('subject_geographic_sim')
                }
            images.append(image_info)
        return images

    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to a local directory.

        :param images: List of image dictionaries (each must have a 'url' key)
        :param output_dir: Directory to save images into (created if missing)
        :return: List of downloaded file paths
        """
        os.makedirs(output_dir, exist_ok=True)
        downloaded_files = []
        for i, image in enumerate(images):
            try:
                response = requests.get(image['url'], headers=self.headers, timeout=30)
                response.raise_for_status()
                # Keep the URL's extension when it has one; default to .jpg.
                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')
                with open(filename, 'wb') as f:
                    f.write(response.content)
                downloaded_files.append(filename)
                # Fixed: previously logged a literal "(unknown)" placeholder.
                self.logger.info(f"Downloaded: {filename}")
            except Exception as e:
                # Best-effort: skip this image and keep downloading the rest.
                self.logger.error(f"Error downloading {image['url']}: {e}")
        return downloaded_files
# Example usage (kept commented out; the module is meant to be imported):
#
#     scraper = DigitalCommonwealthScraper()
#     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
#     images = scraper.extract_images(url)
#     for img in images:
#         print(json.dumps(img, indent=2))
#     scraper.download_images(images)  # optional: save files locally