# Web Scraper by Peter Chibuikem Idoko
# 2025-02-03
# Scrapes 1,000 images to be quantized before use in a mosaic generator
import os
import time
import logging
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Collect info, warning, and error logs in scraper.log
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
# Constants for this scraper
BASE_URL = "https://www.shopify.com/stock-photos/photos"
OUTPUT_DIR = "scraped_photos"
MAX_IMAGES = 1000
NUM_PAGES = 16
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0)"}
# Initialize Selenium's WebDriver in headless mode
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# Function to handle clicking next
def click_next_page():
    """Clicks the Next button to navigate pages, returning False if no button exists."""
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
        )
        next_button.click()
        logging.info("Navigated to the next page.")
        return True
    except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
        logging.warning("Next button not found or not clickable.")
        return False
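# Note: if clicks are intercepted by overlays (the ElementClickInterceptedException
# case above), one common mitigation (a sketch, not in the original script) is to
# scroll the button into view before clicking:
#     driver.execute_script("arguments[0].scrollIntoView(true);", next_button)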
# Function to download images efficiently
def download_image(img_url, img_name):
    """Downloads an image with error handling"""
    try:
        response = requests.get(img_url, headers=HEADERS, stream=True, timeout=10)
        response.raise_for_status()
        with open(os.path.join(OUTPUT_DIR, img_name), "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        logging.info(f"Downloaded image: {img_name}")
        return True
    except requests.exceptions.RequestException as e:
        logging.error(f"Error downloading {img_url}: {e}")
        return False
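# Optional guard (not in the original script): some <img> src attributes point to
# SVGs or tracking pixels rather than photos. A minimal sketch of a filter, assuming
# the server sets a Content-Type header; the helper name is an illustrative choice.
def is_image_response(response):
    """Returns True if the response's Content-Type header looks like a raster image."""
    content_type = response.headers.get("Content-Type", "")
    return content_type.startswith("image/") and "svg" not in content_type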
# Start scraping
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Create the output directory if needed
downloaded = len(os.listdir(OUTPUT_DIR))  # Track existing images
driver.get(BASE_URL)
# Loop through pages and images and track progress
for page in range(NUM_PAGES):
    logging.info(f"Processing page {page + 1}/{NUM_PAGES}...")
    # Extract image elements
    images = driver.find_elements(By.TAG_NAME, "img")
    # Download images
    for img in images:
        if downloaded >= MAX_IMAGES:
            break
        img_url = img.get_attribute("src")
        if img_url and img_url.startswith("https"):
            img_name = f"image_{downloaded + 1}.jpg"
            if download_image(img_url, img_name):
                downloaded += 1
    # Stop here once the limit is hit; the inner break alone only ends the current page
    if downloaded >= MAX_IMAGES:
        logging.info("Reached max image limit.")
        break
    if not click_next_page():
        logging.info("No more pages available.")
        break
driver.quit()
logging.info("Web scraping completed successfully!")
print("Web scraping completed. Check scraper.log for details.")
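# Sketch of the downstream quantization step mentioned in the header comment.
# Assumes Pillow is installed; the function name and the 64-color palette size are
# illustrative choices, not part of the original scraper.
def quantize_images(directory=OUTPUT_DIR, colors=64):
    """Reduces each scraped image to a small color palette for the mosaic generator."""
    from PIL import Image
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # quantize() yields a palette ("P") image; convert back to RGB to re-save as JPEG
        Image.open(path).convert("RGB").quantize(colors=colors).convert("RGB").save(path)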