File size: 4,305 Bytes
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
 
 
 
 
 
 
 
 
 
 
 
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
57de00b
 
69b0c8e
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
 
57de00b
 
69b0c8e
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
57de00b
 
 
69b0c8e
 
 
57de00b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
import io
import json
import os
import shutil
import sys
import tempfile
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, hf_hub_download

print("Starting image download from Hugging Face dataset")

# Read configuration from the environment; DATASET_REPO has a sane default,
# the credentials do not.
HF_USERNAME = os.environ.get("HF_USERNAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data")

# Fail fast when required credentials are missing.
# sys.exit() instead of the builtin exit(): the latter is a site-module
# convenience and is absent under `python -S` / frozen interpreters.
if not HF_USERNAME:
    print("ERROR: HF_USERNAME environment variable is not set")
    sys.exit(1)

if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set")
    sys.exit(1)

print(f"Using Hugging Face credentials for user: {HF_USERNAME}")
print(f"Dataset repository: {DATASET_REPO}")

# Point the HF cache at a guaranteed-writable location (the default
# ~/.cache may be read-only in containerized deployments, e.g. HF Spaces).
os.environ["HF_HOME"] = os.path.join(tempfile.gettempdir(), "huggingface")
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(
    tempfile.gettempdir(), "huggingface", "hub"
)
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
os.makedirs(os.environ["HUGGINGFACE_HUB_CACHE"], exist_ok=True)

# Layout constants: remote paths inside the dataset repo, and the local
# directories the web app serves from.
IMAGES_PATH = "images"
METADATA_PATH = "metadata"
UPLOAD_DIR = Path("static/uploads")
METADATA_DIR = Path("static/metadata")
METADATA_FILE = METADATA_DIR / "image_metadata.json"

# Create local target directories if they don't exist yet.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
METADATA_DIR.mkdir(parents=True, exist_ok=True)

# Single authenticated client reused for every Hub call below.
hf_api = HfApi(token=HF_TOKEN)

try:
    # Verify the dataset repo exists up front; repo_info raises if it is
    # missing or the token lacks access, which drops us into the outer
    # except and leaves an empty metadata file behind.
    print(f"Checking if repository {HF_USERNAME}/{DATASET_REPO} exists")
    hf_api.repo_info(repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset")
    print(f"Repository {HF_USERNAME}/{DATASET_REPO} exists")

    # Fetch the metadata JSON first so the local copy is in place before
    # any images arrive.
    print(f"Downloading metadata from {HF_USERNAME}/{DATASET_REPO}")
    try:
        metadata_file = hf_api.hf_hub_download(
            repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
        )

        print(f"Metadata downloaded to {metadata_file}")
        # Round-trip through json.load to validate the payload before
        # persisting it locally for the app to serve.
        with open(metadata_file, "r") as f:
            metadata = json.load(f)

        with open(METADATA_FILE, "w") as f:
            json.dump(metadata, f)
            print(f"Metadata saved to {METADATA_FILE}")
    except Exception as e:
        # Best effort: a missing or corrupt remote metadata file should not
        # abort the image sync — fall back to an empty mapping.
        print(f"Error downloading metadata: {e}")
        print("Creating empty metadata file")
        metadata = {}
        with open(METADATA_FILE, "w") as f:
            json.dump({}, f)

    # Enumerate every file in the dataset and keep only entries under the
    # images/ prefix.
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=f"{HF_USERNAME}/{DATASET_REPO}", repo_type="dataset", token=HF_TOKEN
    )

    image_files = [f for f in files if f.startswith(f"{IMAGES_PATH}/")]
    print(f"Found {len(image_files)} images")

    # Download each image individually; per-file failures are logged and
    # skipped so one bad file cannot abort the whole sync.
    success_count = 0
    for i, image_file in enumerate(image_files):
        try:
            filename = os.path.basename(image_file)
            # Fix: the original printed the literal "(unknown)" here instead
            # of interpolating the file being downloaded.
            print(f"[{i+1}/{len(image_files)}] Downloading {filename}")

            download_path = hf_api.hf_hub_download(
                repo_id=f"{HF_USERNAME}/{DATASET_REPO}",
                filename=image_file,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=os.path.join(tempfile.gettempdir(), "hf_downloads"),
            )

            # Copy into the serving directory. shutil.copyfile streams in
            # chunks instead of loading the whole image into memory as the
            # previous read()/write() pair did.
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)

            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")

    print(
        f"Image download completed. Successfully downloaded {success_count}/{len(image_files)} images."
    )

except Exception as e:
    # Top-level fallback: guarantee the metadata file exists even when the
    # repo is unreachable, so the app can still start with an empty gallery.
    print(f"Error: {e}")
    print("Creating empty metadata file")
    with open(METADATA_FILE, "w") as f:
        json.dump({}, f)