# Web Scraper by Peter Chibuikem Idoko
# 2025-02-03
# Scrapes 1,000 images to be quantized before use in a mosaic generator
import os
import time
import logging
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Collect info, warning, and error logs in scraper.log
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Constants for this scraper
BASE_URL = "https://www.shopify.com/stock-photos/photos"
OUTPUT_DIR = "scraped_photos"
MAX_IMAGES = 1000
NUM_PAGES = 16
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0)"}
# Initialize Selenium's WebDriver in headless mode
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# Function to handle clicking next
def click_next_page():
    """Clicks the Next button to navigate pages, returning False if no button exists."""
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
        )
        next_button.click()
        logging.info("Navigated to the next page.")
        return True
    except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
        logging.warning("Next button not found or not clickable.")
        return False
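# Note: if clicks are intercepted by overlays (the ElementClickInterceptedException
# case above), one common mitigation (a sketch, not in the original script) is to
# scroll the button into view before clicking:
#     driver.execute_script("arguments[0].scrollIntoView(true);", next_button)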
# Function to download images efficiently
def download_image(img_url, img_name):
    """Downloads an image with error handling"""
    try:
        response = requests.get(img_url, headers=HEADERS, stream=True, timeout=10)
        response.raise_for_status()
        with open(os.path.join(OUTPUT_DIR, img_name), "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        logging.info(f"Downloaded image: {img_name}")
        return True
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {img_url}: {e}")
        return False
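# Optional guard (not in the original script): some <img> src attributes point to
# SVGs or tracking pixels rather than photos. A minimal sketch of a filter, assuming
# the server sets a Content-Type header; the helper name is an illustrative choice.
def is_image_response(response):
    """Returns True if the response's Content-Type header looks like a raster image."""
    content_type = response.headers.get("Content-Type", "")
    return content_type.startswith("image/") and "svg" not in content_type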
# Start scraping
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create the output directory if needed
downloaded = len(os.listdir(OUTPUT_DIR))  # Track existing images
driver.get(BASE_URL)
# Loop through pages and images and track progress
for page in range(NUM_PAGES):
    logging.info(f"Processing page {page + 1}/{NUM_PAGES}...")
    # Extract image elements
    images = driver.find_elements(By.TAG_NAME, "img")
    # Download images
    for img in images:
        if downloaded >= MAX_IMAGES:
            break
        img_url = img.get_attribute("src")
        if img_url and img_url.startswith("https"):
            img_name = f"image_{downloaded + 1}.jpg"
            if download_image(img_url, img_name):
                downloaded += 1
    # Stop here once the limit is hit; the inner break alone only ends the current page
    if downloaded >= MAX_IMAGES:
        logging.info("Reached max image limit.")
        break
    if not click_next_page():
        logging.info("No more pages available.")
        break
driver.quit()
logging.info("Web scraping completed successfully!")
print("Web scraping completed. Check scraper.log for details.")
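# Sketch of the downstream quantization step mentioned in the header comment.
# Assumes Pillow is installed; the function name and the 64-color palette size are
# illustrative choices, not part of the original scraper.
def quantize_images(directory=OUTPUT_DIR, colors=64):
    """Reduces each scraped image to a small color palette for the mosaic generator."""
    from PIL import Image
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # quantize() yields a palette ("P") image; convert back to RGB to re-save as JPEG
        Image.open(path).convert("RGB").quantize(colors=colors).convert("RGB").save(path)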