John6666's picture
Upload 351 files
e84842d verified
raw
history blame
2.34 kB
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import io
import os
import pathlib
import urllib
import tqdm
from concurrent.futures import ThreadPoolExecutor
from lavis.common.utils import get_abs_path, get_cache_path
from lavis.datasets.builders import load_dataset
from omegaconf import OmegaConf
from PIL import Image
# DATA_URL = {"train": "http://www.cs.rice.edu/~vo9/sbucaptions/sbu_images.tar"}
USER_AGENT = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
)
def fetch_single_image(image_url, timeout=None, retries=0):
for _ in range(retries + 1):
try:
request = urllib.request.Request(
image_url,
data=None,
headers={"user-agent": USER_AGENT},
)
with urllib.request.urlopen(request, timeout=timeout) as req:
image = Image.open(io.BytesIO(req.read()))
break
except Exception:
image = None
return image
def download_and_save_image(ann, save_dir, timeout=None, retries=0):
image = fetch_single_image(ann["url"], timeout=timeout, retries=retries)
if image is not None:
image_path = os.path.join(save_dir, ann["image"])
print(image_path)
image.save(image_path)
if __name__ == "__main__":
config_path = get_abs_path("configs/datasets/sbu_caption/defaults.yaml")
storage_dir = OmegaConf.load(
config_path
).datasets.sbu_caption.build_info.images.storage
storage_dir = pathlib.Path(get_cache_path(storage_dir))
if storage_dir.exists():
print(f"Dataset already exists at {storage_dir}. Aborting.")
exit(0)
storage_dir.mkdir(parents=True, exist_ok=True)
num_threads = 20
dset = load_dataset("sbu_caption")["train"].annotation
print("Downloading dataset...")
# multiprocessing
with ThreadPoolExecutor(max_workers=num_threads) as executor:
for ann in tqdm.tqdm(dset):
executor.submit(
download_and_save_image,
ann,
storage_dir,
timeout=30,
retries=10,
)