#!/usr/bin/env python3
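"""Startup sync script: downloads images and their metadata from a Hugging
Face dataset repository into the local static/ directories, falling back to
a home-directory metadata file when static/ is not writable."""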
import os
import json
import shutil
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi
print("Starting image download from Hugging Face dataset")
# Get environment variables
HF_USERNAME = os.environ.get("HF_USERNAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data")
# Validate required environment variables
if not HF_USERNAME:
    print("ERROR: HF_USERNAME environment variable is not set")
    sys.exit(1)
if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set")
    sys.exit(1)
print(f"Using Hugging Face credentials for user: {HF_USERNAME}")
print(f"Dataset repository: {DATASET_REPO}")
# Set the HF cache directory to a writable location; on hosted platforms
# such as Spaces, the default cache path may not be writable.
os.environ["HF_HOME"] = os.path.join(tempfile.gettempdir(), "huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(
    tempfile.gettempdir(), "huggingface", "hub"
)
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
os.makedirs(os.environ["HUGGINGFACE_HUB_CACHE"], exist_ok=True)
# Constants
IMAGES_PATH = "images"
METADATA_PATH = "metadata"
UPLOAD_DIR = Path("static/uploads")
METADATA_DIR = Path("static/metadata")
METADATA_FILE = METADATA_DIR / "image_metadata.json"
# Alternative metadata location with guaranteed write permissions
HOME_DIR = Path(os.environ.get("HOME", "/tmp"))
ALT_METADATA_DIR = HOME_DIR / ".image_uploader"
ALT_METADATA_DIR.mkdir(parents=True, exist_ok=True)
ALT_METADATA_FILE = ALT_METADATA_DIR / "image_metadata.json"
# Create directories if they don't exist
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
METADATA_DIR.mkdir(parents=True, exist_ok=True)
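# Note: on read-only deployments the mkdir calls above may succeed (the
# directories already exist) while file writes inside static/ still fail;
# get_metadata_file() below handles that case.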

def get_metadata_file():
    """Return the primary metadata path if it is writable, else the
    home-directory fallback."""
    try:
        # Create the primary file if it does not exist yet
        if not METADATA_FILE.exists():
            with open(METADATA_FILE, "w") as f:
                json.dump({}, f)
        # Test write permission
        if os.access(METADATA_FILE, os.W_OK):
            return METADATA_FILE
        raise PermissionError(f"No write permission for {METADATA_FILE}")
    except (PermissionError, OSError) as e:
        print(f"Warning: Cannot use {METADATA_FILE}: {e}")
        print(f"Using alternative location: {ALT_METADATA_FILE}")
        return ALT_METADATA_FILE

# Initialize HfApi
hf_api = HfApi(token=HF_TOKEN)
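# The token is also passed explicitly to some calls below; with
# HfApi(token=...) that should be redundant, but it is harmless to keep.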
try:
    # Check if the dataset repo exists (raises if it does not)
    print(f"Checking if repository {HF_USERNAME}/{DATASET_REPO} exists")
    hf_api.repo_info(repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset")
    print(f"Repository {HF_USERNAME}/{DATASET_REPO} exists")
    # Download metadata first
    print(f"Downloading metadata from {HF_USERNAME}/{DATASET_REPO}")
    try:
        metadata_file_path = hf_api.hf_hub_download(
            repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
        )
        print(f"Metadata downloaded to {metadata_file_path}")
        with open(metadata_file_path, "r") as f:
            metadata = json.load(f)
        # Save metadata locally to the appropriate file
        save_path = get_metadata_file()
        with open(save_path, "w") as f:
            json.dump(metadata, f)
        print(f"Metadata saved to {save_path}")
    except Exception as e:
        print(f"Error downloading metadata: {e}")
        print("Creating empty metadata file")
        # Fall back to an empty metadata mapping
        save_path = get_metadata_file()
        with open(save_path, "w") as f:
            json.dump({}, f)
        print(f"Created empty metadata file at {save_path}")
    # List all files in the dataset
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset", token=HF_TOKEN
    )
    # Keep only files under the images/ prefix
    image_files = [f for f in files if f.startswith(f"{IMAGES_PATH}/")]
    print(f"Found {len(image_files)} images")
    # Download each image and copy it into the uploads directory
    success_count = 0
    for i, image_file in enumerate(image_files):
        try:
            filename = os.path.basename(image_file)
            print(f"[{i + 1}/{len(image_files)}] Downloading {filename}")
            # Download the file into the temp download directory
            download_path = hf_api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
                filename=image_file,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
            )
            # Copy to the uploads directory (shutil streams the file instead
            # of reading it fully into memory)
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)
            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")
    print(
        f"Image download completed. Successfully downloaded "
        f"{success_count}/{len(image_files)} images."
    )
except Exception as e:
    print(f"Error: {e}")
    print("Creating empty metadata file")
    save_path = get_metadata_file()
    with open(save_path, "w") as f:
        json.dump({}, f)
    print(f"Created empty metadata file at {save_path}")