Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features

Sleeping

App Files Files Community

mckabue commited on Apr 5, 2024

Commit

7378eef

verified ·

1 Parent(s): 1c0c18d

RE_UPLOAD-REBUILD-RESTART

Browse files

Files changed (1) hide show

model/layout-model-training/tools/convert_prima_to_coco.py +225 -0

model/layout-model-training/tools/convert_prima_to_coco.py ADDED Viewed

	@@ -0,0 +1,225 @@

+import os, re, json
+import imagesize
+from glob import glob
+from bs4 import BeautifulSoup
+import numpy as np
+from PIL import Image
+import argparse
+from tqdm import tqdm
+import sys
+sys.path.append('..')
+from utils import cocosplit
+class NpEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.integer):
+            return int(obj)
+        elif isinstance(obj, np.floating):
+            return float(obj)
+        elif isinstance(obj, np.ndarray):
+            return obj.tolist()
+        else:
+            return super(NpEncoder, self).default(obj)
+def cvt_coords_to_array(obj):
+    return np.array(
+            [(float(pt['x']), float(pt['y']))
+                 for pt in obj.find_all("Point")]
+        )
+def cal_ployarea(points):
+    x = points[:,0]
+    y = points[:,1]
+    return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1)))
+def _create_category(schema=0):
+    if schema==0:
+        categories = \
+            [{"supercategory": "layout", "id": 0, "name": "Background"},
+             {"supercategory": "layout", "id": 1, "name": "TextRegion"},
+             {"supercategory": "layout", "id": 2, "name": "ImageRegion"},
+             {"supercategory": "layout", "id": 3, "name": "TableRegion"},
+             {"supercategory": "layout", "id": 4, "name": "MathsRegion"},
+             {"supercategory": "layout", "id": 5, "name": "SeparatorRegion"},
+             {"supercategory": "layout", "id": 6, "name": "OtherRegion"}]
+        find_categories = lambda name: \
+            [val["id"] for val in categories if val['name'] == name][0]
+        conversion = \
+            {
+                'TextRegion':       find_categories("TextRegion"),
+                'TableRegion':      find_categories("TableRegion"),
+                'MathsRegion':      find_categories("MathsRegion"),
+                'ChartRegion':      find_categories("ImageRegion"),
+                'GraphicRegion':    find_categories("ImageRegion"),
+                'ImageRegion':      find_categories("ImageRegion"),
+                'LineDrawingRegion':find_categories("OtherRegion"),
+                'SeparatorRegion':  find_categories("SeparatorRegion"),
+                'NoiseRegion':      find_categories("OtherRegion"),
+                'FrameRegion':      find_categories("OtherRegion"),
+            }
+        return categories, conversion
+_categories, _categories_conversion = _create_category(schema=0)
+_info = {
+    "description": "PRIMA Layout Analysis Dataset",
+    "url": "https://www.primaresearch.org/datasets/Layout_Analysis",
+    "version": "1.0",
+    "year": 2010,
+    "contributor": "PRIMA Research",
+    "date_created": "2020/09/01",
+}
+def _load_soup(filename):
+    with open(filename, "r") as fp:
+        soup = BeautifulSoup(fp.read(),'xml')
+    return soup
+def _image_template(image_id, image_path):
+    width, height = imagesize.get(image_path)
+    return {
+        "file_name": os.path.basename(image_path),
+        "height": height,
+        "width": width,
+        "id": int(image_id)
+    }
+def _anno_template(anno_id, image_id, pts, obj_tag):
+    x_1, x_2 = pts[:,0].min(), pts[:,0].max()
+    y_1, y_2 = pts[:,1].min(), pts[:,1].max()
+    height = y_2 - y_1
+    width  = x_2 - x_1
+    return {
+        "segmentation": [pts.flatten().tolist()],
+        "area": cal_ployarea(pts),
+        "iscrowd": 0,
+        "image_id": image_id,
+        "bbox": [x_1, y_1, width, height],
+        "category_id": _categories_conversion[obj_tag],
+        "id": anno_id
+    }
+class PRIMADataset():
+    def __init__(self, base_path, anno_path='XML',
+                                  image_path='Images'):
+        self.base_path = base_path
+        self.anno_path = os.path.join(base_path, anno_path)
+        self.image_path = os.path.join(base_path, image_path)
+        self._ids = self.find_all_image_ids()
+    def __len__(self):
+        return len(self.ids)
+    def __getitem__(self, idx):
+        return self.load_image_and_annotaiton(idx)
+    def find_all_annotation_files(self):
+        return glob(os.path.join(self.anno_path, '*.xml'))
+    def find_all_image_ids(self):
+        replacer = lambda s: os.path.basename(s).replace('pc-', '').replace('.xml', '')
+        return [replacer(s) for s in self.find_all_annotation_files()]
+    def load_image_and_annotaiton(self, idx):
+        image_id = self._ids[idx]
+        image_path = os.path.join(self.image_path, f'{image_id}.tif')
+        image = Image.open(image_path)
+        anno = self.load_annotation(idx)
+        return image, anno
+    def load_annotation(self, idx):
+        image_id = self._ids[idx]
+        anno_path  = os.path.join(self.anno_path,  f'pc-{image_id}.xml')
+        # A dirtly hack to load the files w/wo pc- simualtaneously
+        if not os.path.exists(anno_path):
+            anno_path = os.path.join(self.anno_path,  f'{image_id}.xml')
+            assert os.path.exists(anno_path), "Invalid path"
+        anno = _load_soup(anno_path)
+        return anno
+    def convert_to_COCO(self, save_path):
+        all_image_infos = []
+        all_anno_infos  = []
+        anno_id = 0
+        for idx, image_id in enumerate(tqdm(self._ids)):
+            # We use the idx as the image id
+            image_path = os.path.join(self.image_path, f'{image_id}.tif')
+            image_info = _image_template(idx, image_path)
+            all_image_infos.append(image_info)
+            anno = self.load_annotation(idx)
+            for item in anno.find_all(re.compile(".*Region")):
+                pts = cvt_coords_to_array(item.Coords)
+                if 0 not in pts.shape:
+                    # Sometimes there will be polygons with less
+                    # than 4 edges, and they could not be appropriately
+                    # handled by the COCO format. So we just drop them.
+                    if pts.shape[0] >= 4:
+                        anno_info = _anno_template(anno_id, idx, pts, item.name)
+                        all_anno_infos.append(anno_info)
+                        anno_id += 1
+        final_annotation = {
+            "info": _info,
+            "licenses": [],
+            "images": all_image_infos,
+            "annotations": all_anno_infos,
+            "categories": _categories}
+        with open(save_path, 'w') as fp:
+            json.dump(final_annotation, fp, cls=NpEncoder)
+        return final_annotation
+parser = argparse.ArgumentParser()
+parser.add_argument('--prima_datapath', type=str, default='./data/prima', help='the path to the prima data folders')
+parser.add_argument('--anno_savepath',  type=str, default='./annotations.json', help='the path to save the new annotations')
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print("Start running the conversion script")
+    print(f"Loading the information from the path {args.prima_datapath}")
+    dataset = PRIMADataset(args.prima_datapath)
+    print(f"Saving the annotation to {args.anno_savepath}")
+    res = dataset.convert_to_COCO(args.anno_savepath)
+    cocosplit.main(
+        args.anno_savepath,
+        split_ratio=0.8,
+        having_annotations=True,
+        train_save_path=args.anno_savepath.replace('.json', '-train.json'),
+        test_save_path=args.anno_savepath.replace('.json', '-val.json'),
+        random_state=24)