Spaces:

tomofi
/

NDLOCR

Build error

NDLOCR / src /text_recognition /create_xmldataset.py

Your Name

Add files

c9019cd about 2 years ago

No virus

2.32 kB

	# Copyright (c) 2022, National Diet Library, Japan
	#
	# This software is released under the CC BY 4.0.
	# https://creativecommons.org/licenses/by/4.0/


	import pathlib
	import json
	import lmdb

	from xmlparser import XMLRawDataset, ListRawDataset

	import argparse


	class Env:
	    """Buffered writer around an LMDB environment.

	    Key/value pairs are staged in ``self.cache`` (a plain dict) by the
	    caller and committed to LMDB in a single write transaction whenever
	    ``writeCache`` runs. ``finish_line`` counts completed lines and
	    triggers a flush every ``interval_writeCache`` lines.

	    Args:
	        output_path: Location of the LMDB environment; passed to
	            ``lmdb.open`` as ``str(output_path)``.
	        interval_writeCache: Number of finished lines between automatic
	            flushes. Must be a positive integer.
	        map_size: Maximum size in bytes of the LMDB memory map
	            (default 1 TiB, i.e. ``2 ** 40``).

	    Raises:
	        ValueError: If ``interval_writeCache`` is not positive.
	    """

	    def __init__(self, output_path, interval_writeCache=1000,
	                 map_size=1099511627776):
	        if interval_writeCache <= 0:
	            raise ValueError(
	                f'interval_writeCache must be positive, got {interval_writeCache}')
	        self.output_path = output_path
	        self.env = lmdb.open(str(output_path), map_size=map_size)
	        self.cache = dict()  # pending key -> value pairs, both bytes
	        self.n = 0  # number of lines finished so far
	        self.interval = interval_writeCache

	    def finish_line(self):
	        """Mark one line as complete and flush the cache when due."""
	        self.n += 1
	        # Honour the configured interval rather than a fixed 1000.
	        if self.n % self.interval == 0:
	            self.writeCache()

	    def writeCache(self):
	        """Commit every cached pair in one transaction, then reset the cache."""
	        with self.env.begin(write=True) as txn:
	            for k, v in self.cache.items():
	                txn.put(k, v)
	        # Reached only if the transaction block exited without raising.
	        self.cache = {}
	        print(f'Written {self.n} lines @ {self.output_path}')


	def createDataset(input_path, output_path, db_type='xml', dry_run=False):
	    """Build an LMDB line dataset from XML or list-style input.

	    For every ``(image, line)`` pair yielded by the raw dataset, four
	    entries are written under a zero-padded 9-digit line index:
	    ``<idx>-direction``, ``<idx>-label``, ``<idx>-cattrs`` (JSON) and
	    ``<idx>-image``. Two metadata keys are also stored: ``dbtype`` and
	    ``n_line`` (the total number of lines written).

	    Args:
	        input_path: Source passed to ``XMLRawDataset.from_list`` or
	            ``ListRawDataset`` (the CLI supplies a list of paths).
	        output_path: Directory of the LMDB database; created if missing.
	        db_type: ``'xml'`` or ``'list'``; selects the raw dataset reader.
	        dry_run: If true, only construct the reader and return without
	            opening or writing the database.

	    Raises:
	        ValueError: If ``db_type`` is neither ``'xml'`` nor ``'list'``.
	    """
	    # Fail fast, before any directory is created, instead of hitting an
	    # unbound ``generator`` later on.
	    if db_type not in ('xml', 'list'):
	        raise ValueError(f"db_type must be 'xml' or 'list', got {db_type!r}")

	    p = pathlib.Path(output_path)
	    p.mkdir(parents=True, exist_ok=True)

	    if db_type == 'xml':
	        generator = XMLRawDataset.from_list(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
	    else:
	        generator = ListRawDataset(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
	    if dry_run:
	        return

	    # generate database
	    # Open the database at the directory created above; indexing the path
	    # (``output_path[0]``) would select only its first character.
	    env = Env(output_path)
	    try:
	        # NOTE(review): 'xml' is stored for both db_type values, as in the
	        # original code -- confirm whether readers rely on this.
	        env.cache['dbtype'.encode()] = 'xml'.encode()

	        # presumably ``line`` is dict-like and ``g`` is the encoded image
	        # bytes -- verify against xmlparser.
	        for g, line in generator:
	            prefix = f'{env.n:09d}'
	            env.cache[f'{prefix}-direction'.encode()] = line.get('direction').encode()
	            env.cache[f'{prefix}-label'.encode()] = line.get('label').encode()
	            env.cache[f'{prefix}-cattrs'.encode()] = json.dumps(line.get('cattrs')).encode()
	            env.cache[f'{prefix}-image'.encode()] = g
	            env.finish_line()

	        env.cache['n_line'.encode()] = str(env.n).encode()
	        # Final flush for lines left over since the last periodic flush.
	        env.writeCache()
	    finally:
	        # Release the LMDB environment even if reading or writing failed.
	        env.env.close()


	if __name__ == '__main__':
	    # Command-line entry point: parse the options and build the database.
	    cli = argparse.ArgumentParser()
	    # One or more input paths, handed to the dataset reader as a list.
	    cli.add_argument('--input_path', required=True, nargs='+')
	    # Directory in which the LMDB database is created.
	    cli.add_argument('--output_path', required=True)
	    # Which raw dataset reader to use.
	    cli.add_argument('--db_type', choices=['xml', 'list'], default='xml')
	    # Construct the reader only; write nothing to the database.
	    cli.add_argument('--dry-run', action='store_true')
	    args = cli.parse_args()
	    createDataset(
	        args.input_path,
	        args.output_path,
	        args.db_type,
	        dry_run=args.dry_run,
	    )