NDLOCR / src /ndl_layout /tools /preprocess.py
3v324v23's picture
Add files
c9019cd
#!/usr/bin/env python
# Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/
from .utils import auto_run
from pathlib import Path
def iterfiles(root_dir: Path):
"""
Iterate all subfiles under root_dir.
This is not a pathlib implementation because pathlib does not support globs across symbols..
"""
from glob import glob
return [Path(p) for p in glob(f'{root_dir}/**', recursive=True)]
def main(root_dir: str, output_dir: str, use_link: bool = False, use_shrink: bool = False):
import os
import subprocess
from pathlib import Path
def call(*args):
print(*args)
subprocess.call(args, stderr=None)
scale = 1.0
if use_shrink:
scale = 0.5
os.makedirs(output_dir, exist_ok=True)
if use_link:
for path in iterfiles(root_dir):
if path.is_dir() and (path / "img").exists() and (path / "xml").exists():
for img_path in (path / "img").iterdir():
if img_path.is_file() and not str(img_path.name).startswith('._'):
if img_path.suffix == '.jpg':
call('ln', '-s', str(img_path.resolve()),
str(Path(output_dir) / (path.name + "_" + img_path.name)))
else:
import cv2
img = cv2.imread(str(img_path.resolve()))
dst_path = (
Path(output_dir) / (path.name + "_" + img_path.name)).with_suffix('.jpg')
cv2.imwrite(str(dst_path), img)
if use_shrink:
call('python', './tools/shrink2.py', output_dir, output_dir + "_half")
xml_paths = []
img_dirs = []
for path in iterfiles(root_dir):
if path.is_dir() and (path / "img").exists() and (path / "xml").exists():
xml_path = [p for p in (
path / "xml").iterdir() if p.suffix == ".xml" and not p.name.startswith('._')]
assert len(xml_path) == 1
xml_paths.append(str(xml_path[0]))
img_dirs.append(str(path / "img"))
# xml_paths = xml_paths[0:1]
# img_dirs = img_dirs[0:1]
dataset_name = Path(root_dir).name
train_json_path = f'generated/{dataset_name}_train.json'
test_json_path = f'generated/{dataset_name}_test.json'
if use_link:
if use_shrink:
train_json_path = str(Path(output_dir + "_half") / "train.json")
test_json_path = str(Path(output_dir + "_half") / "test.json")
else:
train_json_path = str(Path(output_dir) / "train.json")
test_json_path = str(Path(output_dir) / "test.json")
print("train_json_path :", train_json_path)
print("test_json_path :", test_json_path)
cmd = ['python', '-m', 'tools.ndl_parser', '--xml_paths', *xml_paths, '--img_dirs', *img_dirs, '--add_prefix', 'True',
'--fx', str(scale), '--fy', str(scale), '--train_json_path', train_json_path, '--test_json_path', test_json_path]
cmd_len = sum(len(line) for line in cmd)
if cmd_len > 2_000_000:
xml_list_filename = 'tmp_xml_paths.list'
img_list_filename = 'tmp_img_dirs.list'
print('xml_paths or img_dirs is long.')
print('export xml_paths and img_dirs to {} and {}'.format(
xml_list_filename, img_list_filename))
with open(xml_list_filename, 'w') as f:
for path in xml_paths:
f.write(path+'\n')
with open(img_list_filename, 'w') as f:
for path in img_dirs:
f.write(path+'\n')
call('python', './tools/ndl_parser.py', '--xml_list_path', xml_list_filename, '--img_list_path', img_list_filename, '--add_prefix',
'True', '--fx', str(scale), '--fy', str(scale), '--train_json_path', train_json_path, '--test_json_path', test_json_path)
else:
call('python', './tools/ndl_parser.py', '--xml_paths', *xml_paths, '--img_dirs', *img_dirs, '--add_prefix', 'True',
'--fx', str(scale), '--fy', str(scale), '--train_json_path', train_json_path, '--test_json_path', test_json_path)
auto_run(main)