# realfake / realfake /bin /check_files.py
# devforfu
# Init
# ea847ad
"""
Check that the files referenced from a JSONL file are valid.
"""
import json
from pathlib import Path
import numpy as np
import PIL.Image
from joblib import delayed, Parallel
from realfake.utils import inject_args, Args, read_jsonl
class CheckFilesArgs(Args):
    """Arguments accepted by the file-checking script."""

    # JSONL file whose records are validated; each record is read with
    # read_jsonl and passed to check_file, which looks up its "path" key.
    # NOTE(review): presumably populated from the command line by
    # inject_args — confirm in realfake.utils.
    jsonl_file: Path
@inject_args
def main(args: CheckFilesArgs) -> None:
    """Validate every record of the JSONL file and report the broken ones.

    Each record read from ``args.jsonl_file`` is passed to ``check_file``
    through a joblib ``Parallel`` pool. If no record comes back with an
    error, a confirmation is printed. Otherwise the failing records
    (including their ``"error"`` message) are written, one JSON object per
    line, to a sibling file whose suffix is replaced with ``.failed.jsonl``.
    """
    entries = read_jsonl(args.jsonl_file)
    pool = Parallel(n_jobs=-1, verbose=100)
    checked = pool(delayed(check_file)(entry) for entry in entries)

    # Keep only the records for which check_file reported a problem.
    broken = [item for item in checked if item["error"] is not None]
    if not broken:
        print("All files are valid")
        return

    report_path = args.jsonl_file.with_suffix(".failed.jsonl")
    print(f"{len(broken)} files are invalid, saved errors to {report_path}")
    with open(report_path, "w") as report:
        report.writelines(json.dumps(item) + "\n" for item in broken)
def check_file(record: dict) -> dict:
    """Validate the file referenced by a single record.

    The checks run in order and stop at the first failure: the path must
    exist, must be a regular file, must have a ``.jpg``/``.jpeg``/``.png``
    suffix (case-insensitive), and must be decodable as an image.

    Args:
        record: Mapping that contains at least a ``"path"`` key pointing to
            the file to check. The mapping itself is not modified.

    Returns:
        A copy of ``record`` with an additional ``"error"`` key: ``None``
        when every check passed, otherwise a message describing the first
        check that failed.

    Raises:
        KeyError: If ``record`` has no ``"path"`` key.
    """
    path = Path(record["path"])
    error = None
    if not path.exists():
        error = "File does not exist"
    elif not path.is_file():
        error = "Path is not a file"
    elif path.suffix.lower() not in (".jpg", ".jpeg", ".png"):
        error = "File is not an image file"
    else:
        try:
            # Converting to an array forces the pixel data to be decoded, so
            # truncated or corrupted files are detected here. The context
            # manager guarantees the underlying file handle is released even
            # when decoding fails partway through.
            with PIL.Image.open(path) as image:
                np.asarray(image)
        except Exception as e:
            # Any failure is reported in the result rather than raised, so a
            # single bad file does not abort the whole batch.
            error = f"Image cannot be opened: {e}"
    return dict(record, error=error)
# Run the check only when this module is executed as a script.
# main() is called without arguments here; the inject_args decorator is
# expected to supply `args` (presumably parsed from the command line —
# confirm in realfake.utils).
if __name__ == '__main__':
    main()