# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Convert table labels to HTML.
"""

import json
import argparse

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so fall back to plain iteration
    # instead of aborting the conversion when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        """Return ``iterable`` unchanged (stand-in for ``tqdm.tqdm``)."""
        return iterable


# Structure tokens after which a cell's text is inserted: a bare '<td>', or the
# '>' that closes an opening tag carrying attributes ('<td', ' colspan="2"', '>').
# NOTE(review): presumably PubTabNet-style structure tokens - confirm against
# the label format actually fed to this script.
_CELL_OPEN_TOKENS = ('<td>', '>')

# Substrings ignored when deciding whether a cell holds any visible text:
# bold/italic style tags, the U+2028 line separator and plain spaces.
_STYLE_TOKENS = ('<b>', '</b>', '\u2028', ' ', '<i>', '</i>')


def save_pred_txt(key, val, tmp_file_path):
    """
    append one record to a text file
    @param key: first column of the record
    @param val: second column of the record
    @param tmp_file_path: file to append to (created if it does not exist)
    @return: None
    """
    # Append-only: existing content of the file is kept.
    with open(tmp_file_path, 'a', encoding='utf-8') as f:
        f.write('{}\t{}\n'.format(key, val))


def skip_char(text, sp_char_list):
    """
    remove every occurrence of the given substrings from text
    @param text: text in cell
    @param sp_char_list: style char and special code
    @return: text with all entries of sp_char_list removed
    """
    for sp_char in sp_char_list:
        text = text.replace(sp_char, '')
    return text


def gen_html(img):
    """
    Formats HTML code from tokenized annotation of img
    @param img: dict with the structure token list at
        img['html']['structure']['tokens'] and the cell list at
        img['html']['cells'], each cell being a dict with a 'tokens' list
    @return: the structure tokens with cell texts inserted, joined into one
        string and wrapped in html/body/table tags
    """
    # Work on a copy so the caller's annotation is not modified.
    html_code = list(img['html']['structure']['tokens'])
    to_insert = [
        i for i, tag in enumerate(html_code) if tag in _CELL_OPEN_TOKENS
    ]
    # Insert from the last cell to the first so that earlier insertion
    # indexes stay valid while the list grows.
    for i, cell in zip(reversed(to_insert), reversed(img['html']['cells'])):
        if not cell['tokens']:
            continue
        text = ''.join(cell['tokens'])
        # skip cells that hold nothing but style tags / whitespace
        if not skip_char(text, _STYLE_TOKENS):
            continue
        html_code.insert(i + 1, text)
    return '<html><body><table>{}</table></body></html>'.format(
        ''.join(html_code))


def load_gt_data(gt_path):
    """
    load gt
    @param gt_path: path of a UTF-8 text file with one JSON object per line,
        each object having a 'filename' key
    @return: dict mapping filename to the parsed JSON object; a later line
        with the same filename replaces an earlier one
    """
    data_list = {}
    with open(gt_path, 'r', encoding='utf-8') as f:
        # Read everything first so the progress bar knows the total.
        lines = f.readlines()
    for line in tqdm(lines):
        data_line = line.strip()
        if not data_line:
            # Blank lines carry no record and would make json.loads fail.
            continue
        info = json.loads(data_line)
        data_list[info['filename']] = info
    return data_list


def convert(origin_gt_path, save_path):
    """
    gen html from label file
    @param origin_gt_path: label file read by load_gt_data
    @param save_path: output file; one 'filename<TAB>html' line is appended
        per image
    @return: None
    """
    data_dict = load_gt_data(origin_gt_path)
    for img_name, gt in tqdm(data_dict.items()):
        html = gen_html(gt)
        save_pred_txt(img_name, html, save_path)
    print('conver finish')


def parse_args():
    """
    parse command line arguments
    @return: argparse namespace with ori_gt_path and save_path
    """
    parser = argparse.ArgumentParser(description="args for paddleserving")
    parser.add_argument(
        "--ori_gt_path", type=str, required=True, help="label gt path")
    parser.add_argument(
        "--save_path", type=str, required=True, help="path to save file")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    convert(args.ori_gt_path, args.save_path)