Spaces:
Running
Running
File size: 4,305 Bytes
57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b 69b0c8e 57de00b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
#!/usr/bin/env python3
import io
import json
import os
import shutil
import tempfile
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, hf_hub_download
print("Starting image download from Hugging Face dataset")

# Get environment variables. Only the dataset repository name has a default;
# the username and token fall back to an empty string, which is rejected below.
HF_USERNAME = os.environ.get("HF_USERNAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data")

# Validate required environment variables.
# Abort with exit status 1 by raising SystemExit directly: the bare `exit()`
# helper is injected by the `site` module for interactive sessions and is not
# defined when the interpreter runs with `-S` or inside some frozen builds.
if not HF_USERNAME:
    print("ERROR: HF_USERNAME environment variable is not set")
    raise SystemExit(1)
if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set")
    raise SystemExit(1)

# The token itself is deliberately never printed.
print(f"Using Hugging Face credentials for user: {HF_USERNAME}")
print(f"Dataset repository: {DATASET_REPO}")
# Point the Hugging Face home and hub-cache environment variables at
# directories under the system temp dir, then make sure both exist on disk.
# NOTE(review): huggingface_hub is imported above this point; if the library
# reads these variables at import time they will not affect it - confirm.
os.environ.update(
    {
        "HF_HOME": os.path.join(tempfile.gettempdir(), "huggingface"),
        "HUGGINGFACE_HUB_CACHE": os.path.join(
            tempfile.gettempdir(), "huggingface", "hub"
        ),
    }
)
for _cache_var in ("HF_HOME", "HUGGINGFACE_HUB_CACHE"):
    os.makedirs(os.environ[_cache_var], exist_ok=True)
# Folder names used inside the dataset repository.
IMAGES_PATH = "images"
METADATA_PATH = "metadata"

# Local destinations, relative to the current working directory.
UPLOAD_DIR = Path("static") / "uploads"
METADATA_DIR = Path("static") / "metadata"
METADATA_FILE = METADATA_DIR.joinpath("image_metadata.json")

# Make sure both local directories exist before anything is written to them.
for _local_dir in (UPLOAD_DIR, METADATA_DIR):
    _local_dir.mkdir(parents=True, exist_ok=True)
def _ensure_metadata_file():
    """Create an empty JSON metadata file unless one is already present.

    Used by the error paths below. An existing file is left untouched so a
    failed sync never replaces usable metadata with an empty object.
    """
    if METADATA_FILE.exists():
        print(f"Keeping existing metadata file {METADATA_FILE}")
        return
    print("Creating empty metadata file")
    with open(METADATA_FILE, "w", encoding="utf-8") as f:
        json.dump({}, f)


# Initialize HfApi
hf_api = HfApi(token=HF_TOKEN)

# Values shared by every Hub call below, computed once.
repo_id = f"{HF_USERNAME}/{DATASET_REPO}"
download_dir = os.path.join(tempfile.gettempdir(), "hf_downloads")
# The cache environment variables are assigned after huggingface_hub has been
# imported, so the library may already have resolved its default cache path.
# Passing the directory explicitly guarantees the writable location is used.
hub_cache_dir = os.environ.get(
    "HUGGINGFACE_HUB_CACHE",
    os.path.join(tempfile.gettempdir(), "huggingface", "hub"),
)

try:
    # Check if repo exists; any failure here is handled by the outer except.
    print(f"Checking if repository {repo_id} exists")
    hf_api.repo_info(repo_id=repo_id, repo_type="dataset")
    print(f"Repository {repo_id} exists")

    # Download metadata first
    print(f"Downloading metadata from {repo_id}")
    try:
        metadata_file = hf_api.hf_hub_download(
            repo_id=repo_id,
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            token=HF_TOKEN,
            cache_dir=hub_cache_dir,
            local_dir=download_dir,
        )
        print(f"Metadata downloaded to {metadata_file}")
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = json.load(f)
        # Serialize before opening the destination so that a serialization
        # failure cannot leave a truncated metadata file behind.
        serialized_metadata = json.dumps(metadata)
        with open(METADATA_FILE, "w", encoding="utf-8") as f:
            f.write(serialized_metadata)
        print(f"Metadata saved to {METADATA_FILE}")
    except Exception as e:
        # Best effort: a missing or unreadable metadata file must not stop
        # the image download that follows.
        print(f"Error downloading metadata: {e}")
        metadata = {}
        _ensure_metadata_file()

    # List all files in the dataset
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )

    # Keep only the entries stored under the images folder.
    image_prefix = f"{IMAGES_PATH}/"
    image_files = [name for name in files if name.startswith(image_prefix)]
    total_images = len(image_files)
    print(f"Found {total_images} images")

    # Download each image; one failure is reported and the loop continues.
    success_count = 0
    for position, image_file in enumerate(image_files, start=1):
        try:
            filename = os.path.basename(image_file)
            print(f"[{position}/{total_images}] Downloading {filename}")

            download_path = hf_api.hf_hub_download(
                repo_id=repo_id,
                filename=image_file,
                repo_type="dataset",
                token=HF_TOKEN,
                cache_dir=hub_cache_dir,
                local_dir=download_dir,
            )

            # Copy to uploads directory. copyfile streams in chunks instead of
            # loading the whole image into memory.
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)
            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")

    print(
        f"Image download completed. Successfully downloaded {success_count}/{total_images} images."
    )
except Exception as e:
    # Top-level boundary: report the failure and make sure a metadata file
    # exists. The script does not exit with an error status here.
    print(f"Error: {e}")
    _ensure_metadata_file()
|