realfake / realfake /bin /imagenet.py
devforfu
Init
ea847ad
"""
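Unpacks tar files from the ImageNet-1k dataset while keeping the original directory structure:
each `<name>.tar` archive found in `<imagenet_dir>/train` is extracted into `<unpacked_dir>/<name>`.
The script only unpacks the files from the training subset.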
"""
import tarfile
from pathlib import Path
from joblib import delayed, Parallel
from realfake.utils import inject_args, Args
class ImagenetArgs(Args):
    # Arguments read by ``main``.
    # NOTE(review): presumably populated by ``inject_args`` (e.g. from the
    # command line) — confirm against realfake.utils.

    # Dataset root; ``main`` looks for archives in its "train" sub-directory.
    imagenet_dir: Path
    # Destination root; ``main`` passes it to ``unpack_tar`` as the output directory.
    unpacked_dir: Path
@inject_args
def main(args: ImagenetArgs) -> None:
    """Unpack every training archive of the dataset in parallel.

    Collects the ``*.tar`` files located directly inside
    ``<imagenet_dir>/train`` and schedules one ``unpack_tar`` call per
    archive, each targeting ``unpacked_dir``.

    Args:
        args: Provides ``imagenet_dir`` (dataset root) and ``unpacked_dir``
            (destination root).

    Raises:
        FileNotFoundError: If ``<imagenet_dir>/train`` is not an existing
            directory.
    """
    train_dir = args.imagenet_dir / "train"
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, which would turn a wrong path into a silent no-op.
    if not train_dir.is_dir():
        raise FileNotFoundError(f"Directory {train_dir} does not exist")
    archives = train_dir.glob("*.tar")
    jobs = (
        delayed(unpack_tar)(tar_file, args.unpacked_dir)
        for tar_file in archives
    )
    # n_jobs=-1 asks joblib to use all available CPUs; verbose=100 makes it
    # report progress while the archives are being processed.
    Parallel(n_jobs=-1, verbose=100)(jobs)
def unpack_tar(tar_file: Path, output_dir: Path) -> None:
    """Extract one tar archive into a sub-directory named after the archive.

    The archive ``<name>.tar`` is unpacked into ``output_dir/<name>``.

    Args:
        tar_file: Path to the tar archive to unpack.
        output_dir: Directory under which the per-archive sub-directory is
            placed.

    Raises:
        tarfile.TarError: If the archive cannot be read, or if a member is
            rejected by the extraction filter (when filters are available).
        OSError: If the archive cannot be opened or files cannot be written.
    """
    output_subdir = output_dir / tar_file.stem
    with tarfile.open(tar_file) as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter refuses members that would land outside the
            # destination (absolute paths, ``..`` components, escaping links)
            # and avoids the missing-filter DeprecationWarning on 3.12/3.13.
            tar.extractall(output_subdir, filter="data")
        else:
            # Interpreter without extraction-filter support: keep the legacy
            # behaviour rather than failing.
            tar.extractall(output_subdir)
# Script entry point. ``main`` is called without arguments here; presumably the
# ``inject_args`` decorator supplies ``args`` (e.g. parsed from the command
# line) — TODO confirm against realfake.utils.
if __name__ == "__main__":
    main()