"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import io | |
import os | |
import pathlib | |
import urllib | |
import tqdm | |
from concurrent.futures import ThreadPoolExecutor | |
from lavis.common.utils import get_abs_path, get_cache_path | |
from lavis.datasets.builders import load_dataset | |
from omegaconf import OmegaConf | |
from PIL import Image | |
# DATA_URL = {"train": "http://www.cs.rice.edu/~vo9/sbucaptions/sbu_images.tar"} | |
# Browser-style identification string; fetch_single_image sends it as the
# "user-agent" header of every image request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
)
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download one image and open it with PIL, retrying on failure.

    Args:
        image_url: URL of the image to download.
        timeout: Timeout value passed through to ``urllib.request.urlopen``.
        retries: Number of additional attempts made after the first one
            fails. A negative value results in no attempt at all.

    Returns:
        The image opened via ``Image.open``, or ``None`` when every attempt
        failed (or when no attempt was made).
    """
    # A bare ``import urllib`` does not load the ``request`` submodule. The
    # resulting AttributeError would be swallowed by the handler below and
    # every download would silently yield None, so import it explicitly.
    import urllib.request

    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                # Read the full payload into memory before the connection is
                # closed; the image is then decoded from that in-memory copy.
                return Image.open(io.BytesIO(response.read()))
        except Exception:
            # Deliberate best effort: any failure (bad URL, network error,
            # undecodable payload) simply triggers the next attempt.
            continue

    # All attempts failed, or ``retries`` was negative and none was made.
    return None
def download_and_save_image(ann, save_dir, timeout=None, retries=0):
    """Fetch the image referenced by an annotation and write it to disk.

    Args:
        ann: Mapping that provides the source address under ``"url"`` and the
            target file name under ``"image"``.
        save_dir: Directory the image file is written into.
        timeout: Forwarded unchanged to ``fetch_single_image``.
        retries: Forwarded unchanged to ``fetch_single_image``.

    Nothing is written (and nothing is printed) when the fetch yields
    ``None``.
    """
    fetched = fetch_single_image(ann["url"], timeout=timeout, retries=retries)
    if fetched is None:
        # Download failed on every attempt; skip this annotation.
        return

    destination = os.path.join(save_dir, ann["image"])
    print(destination)
    fetched.save(destination)
if __name__ == "__main__":
    # Only the script entry point needs these, so they are imported locally.
    import sys
    from concurrent.futures import as_completed

    # Resolve the image storage directory from the dataset's default config.
    config_path = get_abs_path("configs/datasets/sbu_caption/defaults.yaml")
    storage_dir = OmegaConf.load(
        config_path
    ).datasets.sbu_caption.build_info.images.storage
    storage_dir = pathlib.Path(get_cache_path(storage_dir))

    # An existing directory is treated as an already-downloaded dataset.
    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        # sys.exit instead of the ``site``-provided ``exit`` helper, which is
        # not guaranteed to exist (e.g. when run with ``python -S``).
        sys.exit(0)

    storage_dir.mkdir(parents=True, exist_ok=True)

    num_threads = 20
    dset = load_dataset("sbu_caption")["train"].annotation

    print("Downloading dataset...")
    # Downloads run on a thread pool (not separate processes).
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(
                download_and_save_image,
                ann,
                storage_dir,
                timeout=30,
                retries=10,
            )
            for ann in dset
        ]

        # Advance the progress bar as downloads actually finish rather than
        # as they are queued, and count tasks that raised so that failures
        # (e.g. an error while saving) are not silently dropped.
        num_failed = 0
        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            if future.exception() is not None:
                num_failed += 1

    if num_failed:
        print(f"{num_failed} download task(s) raised an error.")