import json
import zipfile
from itertools import chain
from pathlib import Path

from joblib import Parallel, delayed

from realfake.utils import get_user_name, inject_args, Args


# Parameters consumed by `main` (populated through `inject_args`).
class UnpackParams(Args):
    # JSON file holding a sequence of entries; each entry is read for its
    # "path" (archive location) and "ok" (whether to unpack it) keys.
    # The file's stem also names the output sub-directory.
    meta_file: Path
    # Destination file; one JSON record per line is written to it.
    jsonl_file: Path
    # Passed to joblib as `n_jobs`.
    num_workers: int = 16


def unpack(zip_path: Path, output_dir: Path):
    """Extract a zip archive and report the PNG files it contains.

    Every member of the archive is extracted into ``output_dir``; only
    members whose name ends with ``.png`` are included in the result.

    Args:
        zip_path: Archive to read.
        output_dir: Directory the archive is extracted into.

    Returns:
        A list of strings, each ``output_dir`` joined with the name of a
        ``.png`` member of the archive.
    """
    print("extracting", zip_path)
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(output_dir)
        png_members = (name for name in archive.namelist() if name.endswith(".png"))
        extracted = [str(output_dir / name) for name in png_members]
    return extracted


@inject_args
def main(params: UnpackParams) -> None:
    """Unpack all archives marked ok in the meta file and index their PNGs.

    Archives are extracted in parallel into
    ``/fsx/<user>/data/fake_<meta file stem>``, then one JSON line per
    extracted PNG path is written to ``params.jsonl_file``.
    """
    target_dir = Path(f"/fsx/{get_user_name()}/data/fake_{params.meta_file.stem}")
    target_dir.mkdir(parents=True, exist_ok=True)

    entries = json.loads(params.meta_file.read_text())

    # Lazily describe one job per usable archive; entries with a falsy "ok"
    # are skipped.
    jobs = (
        delayed(unpack)(Path(entry["path"]), target_dir)
        for entry in entries
        if entry["ok"]
    )
    with Parallel(n_jobs=params.num_workers, verbose=100) as pool:
        per_archive_paths = pool(jobs)

    # Flatten the per-archive lists into one JSON line per image.
    lines = []
    for image_path in chain.from_iterable(per_archive_paths):
        record = {"path": str(image_path), "label": "fake", "class": None, "valid": None}
        lines.append(json.dumps(record) + "\n")

    with params.jsonl_file.open("w") as fp:
        fp.writelines(lines)


if __name__ == "__main__":
    main()