File size: 5,461 Bytes
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
 
 
 
 
 
 
 
 
 
 
 
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387d45c
 
 
 
 
 
57de00b
 
 
 
387d45c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57de00b
 
 
 
 
 
 
69b0c8e
57de00b
 
 
 
387d45c
57de00b
 
 
 
 
 
 
387d45c
 
57de00b
 
387d45c
 
 
57de00b
387d45c
57de00b
 
69b0c8e
57de00b
387d45c
 
 
57de00b
387d45c
57de00b
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
 
57de00b
 
69b0c8e
57de00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69b0c8e
57de00b
 
 
69b0c8e
 
 
57de00b
 
 
 
387d45c
 
57de00b
387d45c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
import io
import json
import os
import shutil
import tempfile
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, hf_hub_download

print("Starting image download from Hugging Face dataset")

# Read configuration from the environment. The repository name falls back to
# "image-uploader-data"; the username and token default to an empty string,
# which the checks below treat as "not set".
HF_USERNAME = os.environ.get("HF_USERNAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
DATASET_REPO = os.environ.get("HF_DATASET_REPO", "image-uploader-data")

# Abort with exit status 1 when a required variable is missing or empty.
# SystemExit is raised directly: the `exit` builtin is installed by the `site`
# module for interactive sessions and is not available under `python -S`.
if not HF_USERNAME:
    print("ERROR: HF_USERNAME environment variable is not set")
    raise SystemExit(1)

if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set")
    raise SystemExit(1)

# Only the username and repository name are echoed; the token is never printed.
print(f"Using Hugging Face credentials for user: {HF_USERNAME}")
print(f"Dataset repository: {DATASET_REPO}")

# Point the Hugging Face cache variables at folders under the system temp
# directory and make sure both folders exist.
# NOTE(review): huggingface_hub is imported before these assignments run and
# appears to resolve its cache paths at import time -- confirm that the
# library actually honors values set this late.
_hf_home = os.path.join(tempfile.gettempdir(), "huggingface")
_hf_hub_cache = os.path.join(_hf_home, "hub")
os.environ.update({"HF_HOME": _hf_home, "HUGGINGFACE_HUB_CACHE": _hf_hub_cache})
for _cache_dir in (_hf_home, _hf_hub_cache):
    os.makedirs(_cache_dir, exist_ok=True)

# Constants
# Folder names inside the dataset repository.
IMAGES_PATH = "images"
METADATA_PATH = "metadata"
# Local destinations, relative to the current working directory.
UPLOAD_DIR = Path("static/uploads")
METADATA_DIR = Path("static/metadata")
METADATA_FILE = METADATA_DIR / "image_metadata.json"

# Alternative metadata location, used when the primary one is not writable.
# An unset or empty HOME falls back to the system temp directory.
HOME_DIR = Path(os.environ.get("HOME") or tempfile.gettempdir())
ALT_METADATA_DIR = HOME_DIR / ".image_uploader"
try:
    ALT_METADATA_DIR.mkdir(parents=True, exist_ok=True)
except OSError as e:
    # HOME may be read-only or not a directory at all; the fallback location
    # must not be the thing that crashes the script, so retry under the
    # system temp directory.
    print(f"Warning: Cannot create {ALT_METADATA_DIR}: {e}")
    ALT_METADATA_DIR = Path(tempfile.gettempdir()) / ".image_uploader"
    ALT_METADATA_DIR.mkdir(parents=True, exist_ok=True)
ALT_METADATA_FILE = ALT_METADATA_DIR / "image_metadata.json"

# Create directories if they don't exist
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
try:
    METADATA_DIR.mkdir(parents=True, exist_ok=True)
except OSError as e:
    # Not fatal: metadata can still be written to ALT_METADATA_FILE.
    print(f"Warning: Cannot create {METADATA_DIR}: {e}")


def get_metadata_file():
    """Return the path of the metadata file this script should write to.

    The primary file ``METADATA_FILE`` is created with an empty JSON object
    when it does not exist yet. If creating it fails, or the process has no
    write access to it, a warning is printed and ``ALT_METADATA_FILE`` is
    returned instead.
    """
    try:
        primary_missing = not METADATA_FILE.exists()
        if primary_missing:
            with open(METADATA_FILE, "w") as handle:
                handle.write(json.dumps({}))
        if not os.access(METADATA_FILE, os.W_OK):
            # Routed through the handler below so both failure modes produce
            # the same warning output.
            raise PermissionError(f"No write permission for {METADATA_FILE}")
    except OSError as err:  # PermissionError is a subclass of OSError
        print(f"Warning: Cannot use {METADATA_FILE}: {err}")
        print(f"Using alternative location: {ALT_METADATA_FILE}")
        return ALT_METADATA_FILE
    return METADATA_FILE


# Initialize HfApi
hf_api = HfApi(token=HF_TOKEN)

# Values reused by every Hub call below.
repo_id = f"{HF_USERNAME}/{DATASET_REPO}"
download_dir = os.path.join(tempfile.gettempdir(), "hf_downloads")

# Set once the repository's metadata has been written locally, so the outer
# error handler never replaces it with an empty file.
metadata_saved = False

try:
    # Check if repo exists; repo_info raises when the repository cannot be
    # reached, which sends control to the outer handler.
    print(f"Checking if repository {repo_id} exists")
    hf_api.repo_info(repo_id=repo_id, repo_type="dataset")
    print(f"Repository {repo_id} exists")

    # Download metadata first
    print(f"Downloading metadata from {repo_id}")
    try:
        metadata_file_path = hf_api.hf_hub_download(
            repo_id=repo_id,
            filename=f"{METADATA_PATH}/image_metadata.json",
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=download_dir,
        )

        print(f"Metadata downloaded to {metadata_file_path}")
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the locale's default encoding.
        with open(metadata_file_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        # Save metadata locally to the appropriate file
        save_path = get_metadata_file()
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f)
        print(f"Metadata saved to {save_path}")
        metadata_saved = True
    except Exception as e:
        # Best effort: a missing or unreadable metadata file must not stop
        # the image download, so start from an empty mapping instead.
        print(f"Error downloading metadata: {e}")
        print("Creating empty metadata file")
        metadata = {}
        save_path = get_metadata_file()
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({}, f)
        print(f"Created empty metadata file at {save_path}")

    # List all files in the dataset
    print("Listing files in the dataset")
    files = hf_api.list_repo_files(
        repo_id=repo_id, repo_type="dataset", token=HF_TOKEN
    )

    # Keep only the entries stored under the images folder
    image_files = [f for f in files if f.startswith(f"{IMAGES_PATH}/")]
    total_images = len(image_files)
    print(f"Found {total_images} images")

    # Download each image; a failure on one image is reported and skipped so
    # the remaining images are still fetched.
    success_count = 0
    for position, image_file in enumerate(image_files, start=1):
        try:
            filename = os.path.basename(image_file)
            print(f"[{position}/{total_images}] Downloading {filename}")

            # Download file
            download_path = hf_api.hf_hub_download(
                repo_id=repo_id,
                filename=image_file,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=download_dir,
            )

            # Copy to uploads directory without loading the whole image
            # into memory.
            destination = UPLOAD_DIR / filename
            shutil.copyfile(download_path, destination)

            print(f"Saved {filename} to {destination}")
            success_count += 1
        except Exception as e:
            print(f"Error downloading {image_file}: {e}")

    print(
        f"Image download completed. Successfully downloaded {success_count}/{total_images} images."
    )

except Exception as e:
    print(f"Error: {e}")
    if metadata_saved:
        # The failure happened after the metadata was written (for example
        # while listing files); overwriting it now would discard good data.
        print(f"Keeping metadata already saved at {save_path}")
    else:
        print("Creating empty metadata file")
        save_path = get_metadata_file()
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({}, f)
        print(f"Created empty metadata file at {save_path}")