"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys
from pathlib import Path

from omegaconf import OmegaConf

from lavis.common.utils import (
    cleanup_dir,
    download_and_extract_archive,
    get_abs_path,
    get_cache_path,
)

# TODO
# 1. Go to https://www.mediafire.com/file/czh8sezbo9s4692/test_videos.zip/file
#      and https://www.mediafire.com/file/x3rrbe4hwp04e6w/train_val_videos.zip/file
# 2. Right-click the Download button and copy the link address
#      e.g.
#    DATA_URL = {
#        "train": "https://download1602.mediafire.com/xxxxxxxxxxxx/x3rrbe4hwp04e6w/train_val_videos.zip",
#        "test": "https://download2390.mediafire.com/xxxxxxxxxxxx/czh8sezbo9s4692/test_videos.zip",
#    }
# 3. Paste the link address to DATA_URL

DATA_URL = {
    "train": "https://download2295.mediafire.com/4bb7p74xrbgg/x3rrbe4hwp04e6w/train_val_videos.zip",
    "test": "https://download2390.mediafire.com/79hfq3592lqg/czh8sezbo9s4692/test_videos.zip",
}


def download_datasets(root, url):
    """
    Download one MSR-VTT video archive and expand it in the given folder.

    Args:
        root: directory passed to ``download_and_extract_archive`` as
            ``download_root``.
        url: address of the archive to fetch (one of the ``DATA_URL`` values).
    """
    download_and_extract_archive(url=url, download_root=root)


def merge_datasets(download_path, storage_path):
    """
    Move every file of the extracted train/val and test folders into one folder.

    Args:
        download_path: directory that contains the extracted ``TrainValVideo``
            and ``TestVideo`` folders.
        storage_path: destination directory; created if it does not exist.

    Raises:
        OSError: if a source folder is missing or a file cannot be renamed
            (``os.rename`` does not work across filesystems).
    """
    print("Merging to {}".format(storage_path))
    os.makedirs(storage_path, exist_ok=True)

    # Train/val files are moved first, then test files, into the same folder.
    for folder_name in ("TrainValVideo", "TestVideo"):
        source_path = os.path.join(download_path, folder_name)
        for file_name in os.listdir(source_path):
            os.rename(
                os.path.join(source_path, file_name),
                os.path.join(storage_path, file_name),
            )


def main():
    """
    Download the archives listed in ``DATA_URL`` and merge them into the
    storage folder named by the MSR-VTT captioning dataset config.

    Exits with status 0 when the storage folder already exists or the run
    succeeds, and with status 1 when downloading/extracting or merging fails.
    """
    config_path = get_abs_path("configs/datasets/msrvtt/defaults_cap.yaml")

    storage_dir = OmegaConf.load(
        config_path
    ).datasets.msrvtt_cap.build_info.videos.storage

    # The temporary download folder is a sibling of the final storage folder.
    download_dir = Path(get_cache_path(storage_dir)).parent / "download"
    storage_dir = Path(get_cache_path(storage_dir))

    if storage_dir.exists():
        print(f"Dataset already exists at {storage_dir}. Aborting.")
        sys.exit(0)

    try:
        for k, v in DATA_URL.items():
            print("Downloading {} to {}".format(v, k))
            download_datasets(download_dir, v)
    except Exception as e:
        # remove download dir if failed
        cleanup_dir(download_dir)
        print("Failed to download or extracting datasets. Aborting.")
        print(f"Reason: {e}", file=sys.stderr)
        # Actually stop here: merging a deleted download folder can only fail.
        sys.exit(1)

    try:
        merge_datasets(download_dir, storage_dir)
    except Exception as e:
        # remove storage dir if failed, so a later run does not mistake a
        # partially merged folder for a complete dataset
        cleanup_dir(storage_dir)
        print("Failed to merging datasets. Aborting.")
        print(f"Reason: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        # The download folder is removed exactly once, on success and failure.
        cleanup_dir(download_dir)


if __name__ == "__main__":
    main()