Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

document-similarity-matching-using-visual-layout-features-archive / model /layout-parser /utils /cocosplit.py

Charles Kabui

Add 'model/layout-parser/' from commit 'b9fad076596272e427612d5e848da1ba8ea06b97'

399308e over 1 year ago

3.15 kB

	# Modified based on https://github.com/akarazniewicz/cocosplit/blob/master/cocosplit.py

	import json
	import argparse
	import funcy
	from sklearn.model_selection import train_test_split

	# Command-line interface. Options are declared as (flag, add_argument kwargs)
	# pairs and registered in this order, which is also the order shown by --help.
	parser = argparse.ArgumentParser(
	    description="Splits COCO annotations file into training and test sets."
	)
	_CLI_OPTIONS = (
	    (
	        "--annotation-path",
	        {
	            "metavar": "coco_annotations",
	            "type": str,
	            "help": "Path to COCO annotations file.",
	        },
	    ),
	    (
	        "--train",
	        {"type": str, "help": "Where to store COCO training annotations"},
	    ),
	    (
	        "--test",
	        {"type": str, "help": "Where to store COCO test annotations"},
	    ),
	    (
	        "--split-ratio",
	        {
	            "dest": "split_ratio",
	            "type": float,
	            "required": True,
	            "help": "A percentage of a split; a number in (0, 1)",
	        },
	    ),
	    (
	        "--having-annotations",
	        {
	            "dest": "having_annotations",
	            "action": "store_true",
	            "help": "Ignore all images without annotations. Keep only these with at least one annotation",
	        },
	    ),
	)
	for _flag, _options in _CLI_OPTIONS:
	    parser.add_argument(_flag, **_options)


	def save_coco(file, tagged_data):
	    """Write ``tagged_data`` to ``file`` as pretty-printed JSON.

	    The file is opened for writing as UTF-8 text (replacing any existing
	    content) and the data is dumped with a two-space indent and sorted keys.

	    Args:
	        file: Destination path passed to ``open``.
	        tagged_data: JSON-serialisable object to write.
	    """
	    dump_options = {"indent": 2, "sort_keys": True}
	    with open(file, mode="w", encoding="UTF-8") as handle:
	        json.dump(tagged_data, handle, **dump_options)


	def filter_annotations(annotations, images):
	    """Return the annotations that belong to one of the given images.

	    Both the image ``"id"`` values and the annotation ``"image_id"`` values
	    are coerced with ``int`` before they are compared.

	    Args:
	        annotations: Iterable of annotation dicts, each with an ``"image_id"`` key.
	        images: Iterable of image dicts, each with an ``"id"`` key.

	    Returns:
	        A new list holding the matching annotation dicts in their original order.
	    """
	    # Use a set for O(1) membership tests; a list here would make the
	    # filtering quadratic in the size of the dataset.
	    image_ids = {int(image["id"]) for image in images}
	    return [ann for ann in annotations if int(ann["image_id"]) in image_ids]


	def _split_images(images, split_ratio, random_state):
	    """Split ``images`` into ``(train, test)`` lists; empty input gives two empty lists."""
	    if not images:
	        # Nothing to split; skip train_test_split, which is not expected to
	        # accept an empty input.
	        return [], []
	    train, test = train_test_split(
	        images, train_size=split_ratio, random_state=random_state
	    )
	    return train, test


	def main(
	    annotation_path,
	    split_ratio,
	    having_annotations,
	    train_save_path,
	    test_save_path,
	    random_state=None,
	):
	    """Split a COCO annotations file into a training and a test file.

	    Images that have at least one annotation and images that have none are
	    split separately with the same ratio and seed, then merged, so both kinds
	    are distributed across the two output files. Every other top-level key of
	    the input JSON is written unchanged to both outputs.

	    Args:
	        annotation_path: Path of the COCO JSON file to read. It must contain
	            ``"images"`` and ``"annotations"`` lists.
	        split_ratio: Value passed to ``train_test_split`` as ``train_size``.
	        having_annotations: When true, images without any annotation are
	            dropped from both outputs.
	        train_save_path: Where the training JSON is written.
	        test_save_path: Where the test JSON is written.
	        random_state: Seed passed to ``train_test_split``.

	    Raises:
	        ValueError: Propagated from ``train_test_split`` when a non-empty
	            group is too small to be split with ``split_ratio``.
	    """
	    with open(annotation_path, "rt", encoding="UTF-8") as annotation_file:
	        coco = json.load(annotation_file)

	    images = coco["images"]
	    annotations = coco["annotations"]

	    # Ids are coerced with int on both sides (as filter_annotations does) so
	    # that string and integer ids compare equal; a set keeps lookups O(1).
	    annotated_ids = {int(ann["image_id"]) for ann in annotations}

	    # Images with annotations
	    annotated = [img for img in images if int(img["id"]) in annotated_ids]
	    tr, ts = _split_images(annotated, split_ratio, random_state)

	    if not having_annotations:
	        # Images without annotations are only split when they are kept, so a
	        # group that is discarded anyway can never make the split fail.
	        unannotated = [img for img in images if int(img["id"]) not in annotated_ids]
	        tr_wo_ann, ts_wo_ann = _split_images(unannotated, split_ratio, random_state)

	        # Merging the 2 image lists (i.e. with and without annotation)
	        tr.extend(tr_wo_ann)
	        ts.extend(ts_wo_ann)

	    # Train Data
	    coco.update({"images": tr, "annotations": filter_annotations(annotations, tr)})
	    save_coco(train_save_path, coco)

	    # Test Data (the same dict is reused; only the two lists are replaced)
	    coco.update({"images": ts, "annotations": filter_annotations(annotations, ts)})
	    save_coco(test_save_path, coco)

	    print(
	        "Saved {} entries in {} and {} in {}".format(
	            len(tr), train_save_path, len(ts), test_save_path
	        )
	    )


	if __name__ == "__main__":
	    args = parser.parse_args()

	    # The three path options are not declared as required on the parser, so
	    # report missing ones as a usage error here instead of failing later
	    # with an opaque TypeError inside open().
	    missing = [
	        flag
	        for flag, value in (
	            ("--annotation-path", args.annotation_path),
	            ("--train", args.train),
	            ("--test", args.test),
	        )
	        if value is None
	    ]
	    if missing:
	        parser.error(
	            "the following arguments are required: " + ", ".join(missing)
	        )

	    main(
	        annotation_path=args.annotation_path,
	        split_ratio=args.split_ratio,
	        having_annotations=args.having_annotations,
	        train_save_path=args.train,
	        test_save_path=args.test,
	        random_state=24,  # fixed seed so repeated runs produce the same split
	    )