|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
convert table label to html
|
""" |
|
|
|
import json |
|
import argparse |
|
from tqdm import tqdm |
|
|
|
|
|
def save_pred_txt(key, val, tmp_file_path):
    """
    Append one "key<TAB>val" record, terminated by a newline, to a file.

    @param key: value written in the first column
    @param val: value written in the second column
    @param tmp_file_path: file to append to, opened as UTF-8 in 'a+' mode
    @return: None
    """
    with open(tmp_file_path, 'a+', encoding='utf-8') as out_file:
        record = '{}\t{}\n'.format(key, val)
        out_file.write(record)
|
|
|
|
|
def skip_char(text, sp_char_list):
    """
    Remove every occurrence of the given substrings from a cell's text.

    The substrings are removed one after another, in list order, so the
    result of one removal is the input of the next.

    @param text: text in cell
    @param sp_char_list: substrings to drop (style tags and special chars)
    @return: text with all listed substrings removed
    """
    remaining = text
    for marker in sp_char_list:
        remaining = remaining.replace(marker, '')
    return remaining
|
|
|
|
|
def gen_html(img):
    '''
    Build the complete HTML string for one tokenized table annotation.

    Reads img['html']['structure']['tokens'] (the structure tags) and
    img['html']['cells'] (each holding a 'tokens' list). The joined tokens
    of a cell are placed right after the structure token that opens it
    ('<td>' or '>'), and the result is wrapped in
    '<html><body><table>...</table></body></html>'.

    A cell is left empty when it has no tokens, or when nothing remains of
    its text after dropping <b>/<i> tags, spaces and U+2028. The token list
    stored in img is copied first and is not modified.
    '''
    structure = img['html']['structure']['tokens'].copy()
    slots = [pos for pos, tag in enumerate(structure) if tag in ('<td>', '>')]
    cells = img['html']['cells']
    # Substrings that do not count as real cell content.
    ignorable = ['<b>', '</b>', '\u2028', ' ', '<i>', '</i>']

    # Walk from the last slot to the first so that the positions of the
    # slots still to be filled stay valid after each insert.
    for pos, cell in zip(reversed(slots), cells[::-1]):
        if not cell['tokens']:
            continue
        content = ''.join(cell['tokens'])
        if skip_char(content, ignorable):
            # The original (styled) text is inserted, not the stripped one.
            structure.insert(pos + 1, content)

    table_body = ''.join(structure)
    return '<html><body><table>{}</table></body></html>'.format(table_body)
|
|
|
|
|
def load_gt_data(gt_path):
    """
    Load a label file that holds one JSON record per line.

    Each non-blank line is decoded as UTF-8, parsed with json.loads and
    stored under its 'filename' value. Blank lines (for example a trailing
    empty line at the end of the file) are skipped instead of aborting the
    whole load with a JSON decode error.

    @param gt_path: path of the label file
    @return: dict mapping record['filename'] to the parsed record; when a
             filename occurs more than once, the last record wins
    """
    records = {}
    with open(gt_path, 'rb') as f:
        lines = f.readlines()
    # The file is fully read above, so it can be closed before parsing.
    for line in tqdm(lines):
        data_line = line.decode('utf-8').strip()
        if not data_line:
            continue
        info = json.loads(data_line)
        records[info['filename']] = info
    return records
|
|
|
|
|
def convert(origin_gt_path, save_path):
    """
    Generate one HTML line per image from a label file.

    Loads every record of the label file, builds its HTML with gen_html and
    appends "<image name><TAB><html>" to save_path via save_pred_txt, then
    prints a completion message.

    @param origin_gt_path: path of the label file to read
    @param save_path: path of the text file the results are appended to
    @return: None
    """
    labels = load_gt_data(origin_gt_path)
    for name, annotation in tqdm(labels.items()):
        save_pred_txt(name, gen_html(annotation), save_path)
    print('conver finish')
|
|
|
|
|
def parse_args(argv=None):
    """
    Parse the command-line options of the label-to-HTML converter.

    @param argv: optional list of argument strings; when None (the default)
                 argparse reads the arguments from sys.argv[1:]
    @return: argparse.Namespace with the required string attributes
             ori_gt_path and save_path
    """
    parser = argparse.ArgumentParser(
        description="convert table label file to html")
    parser.add_argument(
        "--ori_gt_path", type=str, required=True, help="label gt path")
    parser.add_argument(
        "--save_path", type=str, required=True, help="path to save file")
    return parser.parse_args(argv)
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: read the two required paths from the command line
    # and run the conversion.
    args = parse_args()
    convert(origin_gt_path=args.ori_gt_path, save_path=args.save_path)
|
|