#!/usr/bin/env python3
"""Mirror the images and metadata of a Hugging Face dataset repository locally.

Configuration comes from environment variables:

* ``HF_USERNAME`` (required): owner of the dataset repository.
* ``HF_TOKEN`` (required): access token passed to the Hugging Face API.
* ``HF_DATASET_REPO`` (optional): repository name, defaults to
  ``image-uploader-data``.

The script downloads ``metadata/image_metadata.json`` and every file under
``images/`` from the repository, copying the images into ``static/uploads``
and the metadata into ``static/metadata`` (or ``$HOME/.image_uploader`` when
the primary location is not writable).

The process exits with status 1 when a required variable is missing. Any
later failure is reported on stdout and leaves an empty metadata file behind
instead of aborting.
"""
import io
import json
import os
import shutil
import sys
import tempfile
from pathlib import Path

# Point the Hugging Face cache at a writable location. huggingface_hub reads
# these variables when it is imported, so they have to be exported *before*
# the import below; setting them afterwards has no effect on the cache path.
os.environ["HF_HOME"] = os.path.join(tempfile.gettempdir(), "huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(
    tempfile.gettempdir(), "huggingface", "hub"
)

from huggingface_hub import HfApi, HfFolder, hf_hub_download  # noqa: E402

print("Starting image download from Hugging Face dataset")

# Get environment variables
HF_USERNAME = os.environ.get("HF_USERNAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data")

# Validate required environment variables.
# sys.exit is used rather than the bare exit() helper, which is only injected
# by the site module and is absent under "python -S" or in frozen builds.
if not HF_USERNAME:
    print("ERROR: HF_USERNAME environment variable is not set")
    sys.exit(1)

if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set")
    sys.exit(1)

print(f"Using Hugging Face credentials for user: {HF_USERNAME}")
print(f"Dataset repository: {DATASET_REPO}")

# Create the cache directories configured above.
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
os.makedirs(os.environ["HUGGINGFACE_HUB_CACHE"], exist_ok=True)

# Constants
IMAGES_PATH = "images"
METADATA_PATH = "metadata"
UPLOAD_DIR = Path("static/uploads")
METADATA_DIR = Path("static/metadata")
METADATA_FILE = METADATA_DIR / "image_metadata.json"

# Fully qualified repository id and the scratch directory for raw downloads;
# both are used by every Hub call below.
REPO_ID = f"{HF_USERNAME}/{DATASET_REPO}"
DOWNLOAD_DIR = os.path.join(tempfile.gettempdir(), "hf_downloads")

# Alternative metadata location, used when the primary one is not writable
HOME_DIR = Path(os.environ.get("HOME", "/tmp"))
ALT_METADATA_DIR = HOME_DIR / ".image_uploader"
ALT_METADATA_DIR.mkdir(parents=True, exist_ok=True)
ALT_METADATA_FILE = ALT_METADATA_DIR / "image_metadata.json"

# Create directories if they don't exist
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
METADATA_DIR.mkdir(parents=True, exist_ok=True)


def get_metadata_file() -> Path:
    """Return the metadata file path that this process can write to.

    The primary file (``METADATA_FILE``) is created with an empty JSON object
    when it does not exist yet. If it cannot be created or is not writable,
    a warning is printed and ``ALT_METADATA_FILE`` is returned instead.

    Returns:
        ``METADATA_FILE`` when it is writable, otherwise ``ALT_METADATA_FILE``.
    """
    try:
        if not METADATA_FILE.exists():
            with open(METADATA_FILE, "w", encoding="utf-8") as f:
                json.dump({}, f)

        # Test write permission
        if os.access(METADATA_FILE, os.W_OK):
            return METADATA_FILE
        raise PermissionError(f"No write permission for {METADATA_FILE}")
    except (PermissionError, OSError) as e:
        print(f"Warning: Cannot use {METADATA_FILE}: {e}")
        print(f"Using alternative location: {ALT_METADATA_FILE}")
        return ALT_METADATA_FILE


def _write_empty_metadata() -> None:
    """Reset the local metadata file to an empty JSON object and report it."""
    print("Creating empty metadata file")
    save_path = get_metadata_file()
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump({}, f)
    print(f"Created empty metadata file at {save_path}")


# Initialize HfApi
hf_api = HfApi(token=HF_TOKEN)

try:
    # Check if repo exists; repo_info raises when it does not, which sends us
    # to the outer handler at the bottom of the script.
    print(f"Checking if repository {REPO_ID} exists")
    hf_api.repo_info(repo_id=REPO_ID, repo_type="dataset")
    print(f"Repository {REPO_ID} exists")

    # Download metadata first
    print(f"Downloading metadata from {REPO_ID}")
    try:
        metadata_file_path = hf_api.hf_hub_download(
            repo_id=REPO_ID,
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=DOWNLOAD_DIR,
        )

        print(f"Metadata downloaded to {metadata_file_path}")
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(metadata_file_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        # Save metadata locally to the appropriate file
        save_path = get_metadata_file()
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f)
        print(f"Metadata saved to {save_path}")
    except Exception as e:
        # Best effort: missing or unreadable metadata must not stop the
        # image download, so fall back to an empty metadata file.
        print(f"Error downloading metadata: {e}")
        metadata = {}
        _write_empty_metadata()

    # List all files in the dataset
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=REPO_ID, repo_type="dataset", token=HF_TOKEN
    )

    # Keep only the files stored under the images folder
    image_files = [f for f in files if f.startswith(f"{IMAGES_PATH}/")]
    print(f"Found {len(image_files)} images")

    # Download each image; one failing image does not abort the others.
    success_count = 0
    for i, image_file in enumerate(image_files):
        try:
            # Only the base name is used locally, so files land directly in
            # UPLOAD_DIR regardless of their path inside the repository.
            filename = os.path.basename(image_file)
            print(f"[{i+1}/{len(image_files)}] Downloading {filename}")

            # Download file
            download_path = hf_api.hf_hub_download(
                repo_id=REPO_ID,
                filename=image_file,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=DOWNLOAD_DIR,
            )

            # Copy to uploads directory without loading the whole image
            # into memory.
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)

            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")

    print(
        f"Image download completed. Successfully downloaded {success_count}/{len(image_files)} images."
    )

except Exception as e:
    # Top-level boundary: report the failure and leave a valid (empty)
    # metadata file behind rather than crashing.
    print(f"Error: {e}")
    _write_empty_metadata()