""" Check that files that are references from JSONL file are valid. """ import json from pathlib import Path import numpy as np import PIL.Image from joblib import delayed, Parallel from realfake.utils import inject_args, Args, read_jsonl class CheckFilesArgs(Args): jsonl_file: Path @inject_args def main(args: CheckFilesArgs) -> None: records = read_jsonl(args.jsonl_file) results = Parallel(n_jobs=-1, verbose=100)(delayed(check_file)(record) for record in records) failed = [result for result in results if result["error"] is not None] if not failed: print("All files are valid") else: saved_file = args.jsonl_file.with_suffix(".failed.jsonl") print(f"{len(failed)} files are invalid, saved errors to {saved_file}") with open(saved_file, "w") as f: for record in failed: f.write(json.dumps(record) + "\n") def check_file(record: dict) -> dict: path = Path(record["path"]) error = None if not path.exists(): error = "File does not exist" elif not path.is_file(): error = "Path is not a file" elif path.suffix.lower() not in (".jpg", ".jpeg", ".png"): error = "File is not an image file" else: try: np.asarray(PIL.Image.open(path)) except Exception as e: error = f"Image cannot be opened: {e}" return dict(record, error=error) if __name__ == '__main__': main()