|
""" |
|
Check that files that are referenced from a JSONL file are valid.
|
""" |
|
import json |
|
from pathlib import Path |
|
|
|
import numpy as np |
|
import PIL.Image |
|
from joblib import delayed, Parallel |
|
|
|
from realfake.utils import inject_args, Args, read_jsonl |
|
|
|
|
|
class CheckFilesArgs(Args):
    """Arguments consumed by ``main``."""

    # JSONL file whose records are checked; each record is read for a
    # "path" key by ``check_file``.
    jsonl_file: Path
|
|
|
|
|
@inject_args
def main(args: CheckFilesArgs) -> None:
    """Check every record of ``args.jsonl_file`` and report the failures.

    Each record returned by ``read_jsonl`` is passed to ``check_file`` through
    a joblib ``Parallel`` pool (``n_jobs=-1``). Records whose ``"error"`` field
    is not ``None`` are written, one JSON document per line, to a sibling file
    obtained by replacing the input suffix with ``.failed.jsonl``. When nothing
    failed, only a confirmation message is printed and no file is created.
    """
    tasks = (delayed(check_file)(entry) for entry in read_jsonl(args.jsonl_file))
    outcomes = Parallel(n_jobs=-1, verbose=100)(tasks)
    broken = [outcome for outcome in outcomes if outcome["error"] is not None]

    # Nothing to report: skip creating the failure file altogether.
    if not broken:
        print("All files are valid")
        return

    report_path = args.jsonl_file.with_suffix(".failed.jsonl")
    print(f"{len(broken)} files are invalid, saved errors to {report_path}")
    with open(report_path, "w") as report:
        report.writelines(json.dumps(entry) + "\n" for entry in broken)
|
|
|
|
|
def check_file(record: dict) -> dict:
    """Validate the image file referenced by a single record.

    The checks run in order and stop at the first failure: the path must
    exist, must be a file, must carry a ``.jpg``/``.jpeg``/``.png`` suffix
    (compared case-insensitively) and must be decodable into an array.

    Args:
        record: Mapping that contains at least a ``"path"`` key.

    Returns:
        A new dict with the contents of ``record`` plus an ``"error"`` key,
        which is ``None`` when every check passed and a descriptive message
        otherwise.

    Raises:
        KeyError: If ``record`` has no ``"path"`` key.
    """
    image_suffixes = (".jpg", ".jpeg", ".png")
    path = Path(record["path"])
    error = None
    if not path.exists():
        error = "File does not exist"
    elif not path.is_file():
        error = "Path is not a file"
    elif path.suffix.lower() not in image_suffixes:
        error = "File is not an image file"
    else:
        try:
            # The context manager releases the underlying file handle even
            # when decoding fails, instead of leaving it to garbage collection.
            with PIL.Image.open(path) as image:
                # Converting to an array forces the pixel data to be read, so
                # files that open but cannot be decoded are caught here too.
                np.asarray(image)
        except Exception as e:
            # Any failure while opening/decoding marks the file as invalid;
            # the record is reported rather than aborting the whole run.
            error = f"Image cannot be opened: {e}"
    return dict(record, error=error)
|
|
|
|
|
# Script entry point: ``main`` is invoked without arguments here; the
# ``inject_args`` decorator is expected to supply them (TODO confirm).
if __name__ == "__main__":
    main()
|
|