|
import gradio as gr |
|
from PIL import Image, ImageDraw, ImageFont |
|
import random |
|
import pandas as pd |
|
import numpy as np |
|
from datasets import concatenate_datasets |
|
from operator import itemgetter |
|
import collections |
|
|
|
|
|
from datasets import load_dataset |
|
|
|
# Both DocLayNet variants are loaded once, at import time, and reused by every request.
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
dataset_base = load_dataset("pierreguillou/DocLayNet-base")

# Label lookups are built from the category names declared on the "train" split
# of the small dataset: id -> name, name -> id, and the plain list of names.
id2label = dict(enumerate(dataset_small["train"].features["categories"].feature.names))
label2id = {name: class_id for class_id, name in id2label.items()}
labels = list(id2label.values())
|
|
|
|
|
def convert_box(box):
    """Convert an ``(x, y, w, h)`` box into ``[x0, y0, x1, y1]`` corner form.

    Args:
        box: Iterable of exactly four numbers: x, y, width, height.

    Returns:
        A new list ``[x, y, x + w, y + h]``.
    """
    left, top, width, height = box
    return [left, top, left + width, top + height]
|
|
|
|
|
def original_box(box, original_width, original_height, coco_width, coco_height):
    """Rescale a corner-form box from the COCO-sized page to the original page size.

    Each coordinate is divided by the matching COCO dimension, multiplied by the
    matching original dimension and truncated with ``int``.

    Args:
        box: Indexable with at least four numbers ``[x0, y0, x1, y1]``.
        original_width: Width of the target page.
        original_height: Height of the target page.
        coco_width: Width of the coordinate space *box* is expressed in.
        coco_height: Height of the coordinate space *box* is expressed in.

    Returns:
        A list of four ints ``[x0, y0, x1, y1]`` in the original page space.
    """
    # Even positions are x coordinates (scaled by widths), odd ones are y (heights).
    target_sizes = (original_width, original_height, original_width, original_height)
    source_sizes = (coco_width, coco_height, coco_width, coco_height)
    return [
        int(target * (box[position] / source))
        for position, (target, source) in enumerate(zip(target_sizes, source_sizes))
    ]
|
|
|
|
|
def get_sorted_boxes(bboxes):
    """Return the boxes in reading order: top to bottom, then left to right.

    Boxes are ordered by their y coordinate (index 1); boxes sharing the same y
    are ordered by their x coordinate (index 0). The sort is stable, so boxes
    with identical ``(y, x)`` keep their relative input order.

    A single two-key sort replaces the previous per-duplicate rescans and NumPy
    round-trips, and the returned elements are always the input boxes themselves
    (no type conversion, no copy), whether or not any y values are duplicated.

    Args:
        bboxes: Iterable of boxes, each indexable at positions 0 (x) and 1 (y).

    Returns:
        A new list containing the same box objects in sorted order.
    """
    return sorted(bboxes, key=itemgetter(1, 0))
|
|
|
|
|
# Outline / caption colour used when drawing a bounding box of each category.
label2color = {
    "Caption": "brown",
    "Footnote": "orange",
    "Formula": "gray",
    "List-item": "yellow",
    "Page-footer": "red",
    "Page-header": "red",
    "Picture": "violet",
    "Section-header": "orange",
    "Table": "green",
    "Text": "blue",
    "Title": "pink",
}
|
|
|
|
|
# Placeholder image shown when no page matches the requested settings.
examples_dir = "samples/"
images_wo_content = f"{examples_dir}wo_content.png"

# One-row placeholder tables with the same four columns as the real outputs.
df_paragraphs_wo_content = pd.DataFrame(
    {
        "paragraphs": [0],
        "categories": ["no content"],
        "texts": ["no content"],
        "bounding boxes": ["no content"],
    }
)
df_lines_wo_content = pd.DataFrame(
    {
        "lines": [0],
        "categories": ["no content"],
        "texts": ["no content"],
        "bounding boxes": ["no content"],
    }
)
|
|
|
|
|
# Font used for the category captions drawn next to each bounding box.
font = ImageFont.load_default()

# Choices offered by the UI controls.
dataset_names = ["small", "base"]
splits = ["all", "train", "validation", "test"]
domains = [
    "all",
    "Financial Reports",
    "Manuals",
    "Scientific Articles",
    "Laws & Regulations",
    "Patents",
    "Government Tenders",
]
# Same order as `domains`, rewritten as lower-case identifiers
# (spaces -> "_", "&" -> "and"); these are compared against "doc_category".
domains_names = [
    display_name.lower().replace(" ", "_").replace("&", "and")
    for display_name in domains
]
# Every dataset label plus the "all" wildcard.
categories = [*labels, "all"]
|
|
|
|
|
# Categories whose lines are merged with a space; every other category keeps
# one line of text per source line (joined with "\n").
_SPACE_JOINED_LABELS = frozenset({"Text", "Caption", "Footnote"})


def _no_content_outputs(msg_error):
    """Write the placeholder files and return the 10 outputs used when nothing matches."""
    # The context manager closes the source file once the copy has been written.
    with Image.open(images_wo_content) as placeholder:
        placeholder.save("wo_content.png")

    df_paragraphs_wo_content.to_csv("paragraphs_wo_content.csv", encoding="utf-8", index=False)
    df_lines_wo_content.to_csv("lines_wo_content.csv", encoding="utf-8", index=False)

    return (
        msg_error,
        "wo_content.png",
        images_wo_content,
        images_wo_content,
        "wo_content.png",
        "wo_content.png",
        df_paragraphs_wo_content,
        df_lines_wo_content,
        gr.File.update(value="paragraphs_wo_content.csv", visible=False),
        gr.File.update(value="lines_wo_content.csv", visible=False),
    )


def _sorted_with_indexes(bboxes):
    """Return ``(sorted_boxes, indexes)``; ``indexes[k]`` locates ``sorted_boxes[k]`` in *bboxes*.

    Identical boxes are matched to distinct positions, in input order (the sort
    is stable), instead of all resolving to the first occurrence as
    ``list.index`` would.
    """
    sorted_boxes = get_sorted_boxes(bboxes)
    positions = collections.defaultdict(collections.deque)
    for position, bbox in enumerate(bboxes):
        positions[tuple(bbox)].append(position)
    indexes = [positions[tuple(bbox)].popleft() for bbox in sorted_boxes]
    return sorted_boxes, indexes


def _build_table(index_column, bboxes, category_ids, texts):
    """Build the output DataFrame (row number, category name, text, box as string)."""
    return pd.DataFrame(
        {
            index_column: list(range(len(bboxes))),
            "categories": [id2label[category_id] for category_id in category_ids],
            "texts": texts,
            "bounding boxes": [str(bbox) for bbox in bboxes],
        }
    )


def generate_annotated_image(dataset_name, split, domain, category):
    """Pick a random page matching the settings and render its labeled boxes.

    Args:
        dataset_name: ``"small"`` selects ``dataset_small``; anything else
            selects ``dataset_base``.
        split: ``"all"`` (train + validation + test concatenated) or a split name.
        domain: One of the display names in ``domains``.
        category: A label name, or ``"all"`` for no category filtering.

    Returns:
        A 10-tuple: message, page image path, paragraphs image, lines image,
        paragraphs image path, lines image path, paragraphs DataFrame, lines
        DataFrame, and two ``gr.File`` updates for the paragraphs / lines CSVs.
        When nothing matches the settings, the placeholder ("no content")
        outputs are returned with an explanatory message instead.

    Side effects:
        Saves the page image, both annotated images and both CSV files in the
        current working directory.
    """
    msg_error = ""

    # Dataset, then split.
    example = dataset_small if dataset_name == "small" else dataset_base
    if split == "all":
        example = concatenate_datasets([example["train"], example["validation"], example["test"]])
    else:
        example = example[split]

    # Domain filter.
    domain_name = domains_names[domains.index(domain)]
    if domain_name != "all":
        example = example.filter(lambda row: row["doc_category"] == domain_name)

    if len(example) == 0:
        msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
    elif category != "all":
        # Only reached when there is still data to filter; previously an empty
        # domain result was replaced by a dict and this lookup raised KeyError.
        category_id = int(label2id[category])
        idx_list = [
            idx
            for idx, categories_list in enumerate(example["categories"])
            if category_id in categories_list
        ]
        if idx_list:
            example = example.select(idx_list)
        else:
            msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'

    if msg_error:
        return _no_content_outputs(msg_error)

    # randrange excludes the upper bound; randint(0, len) could return len
    # itself and index one past the end.
    index = random.randrange(len(example))
    # Read the row once instead of re-fetching it for every field.
    page = example[index]

    coco_width, coco_height = page["coco_width"], page["coco_height"]
    original_width, original_height = page["original_width"], page["original_height"]
    original_filename = page["original_filename"]
    page_no = page["page_no"]

    # Resize the page image to the original PDF page size and save it.
    image = page["image"].resize((original_width, original_height))
    img_file = original_filename.replace(".pdf", ".png")
    image.save(img_file)

    # Per-line data: texts, category ids, the line box and the enclosing block box.
    texts = page["texts"]
    bboxes_block = page["bboxes_block"]
    bboxes_line = page["bboxes_line"]
    line_categories = page["categories"]
    domain = page["doc_category"]

    # Convert (x, y, w, h) to corners and rescale to the original page size.
    original_bboxes_block = [
        original_box(convert_box(box), original_width, original_height, coco_width, coco_height)
        for box in bboxes_block
    ]
    original_bboxes_line = [
        original_box(convert_box(box), original_width, original_height, coco_width, coco_height)
        for box in bboxes_line
    ]

    # ---- Paragraphs: group the lines that share the same block box ----
    lines_by_block = collections.defaultdict(list)
    for line_idx, bbox in enumerate(original_bboxes_block):
        lines_by_block[tuple(bbox)].append(line_idx)

    # One entry per run of consecutive identical block boxes.
    original_bboxes_block_list = []
    previous_bbox = None
    for bbox in original_bboxes_block:
        if bbox != previous_bbox:
            original_bboxes_block_list.append(bbox)
        previous_bbox = bbox

    category_block_list, text_block_list = [], []
    for bbox in original_bboxes_block_list:
        line_indexes = lines_by_block[tuple(bbox)]
        # A block takes the category of its first line.
        category_block = line_categories[line_indexes[0]]
        separator = " " if id2label[category_block] in _SPACE_JOINED_LABELS else "\n"
        category_block_list.append(category_block)
        text_block_list.append(separator.join(texts[i] for i in line_indexes))

    sorted_original_bboxes_block_list, block_order = _sorted_with_indexes(original_bboxes_block_list)
    sorted_category_block_list = [category_block_list[i] for i in block_order]
    sorted_text_block_list = [text_block_list[i] for i in block_order]

    # ---- Lines ----
    sorted_original_bboxes_line_list, line_order = _sorted_with_indexes(original_bboxes_line)
    sorted_category_line_list = [line_categories[i] for i in line_order]
    sorted_text_line_list = [texts[i] for i in line_order]

    # ---- Annotated images: one copy of the page per view ----
    imgs = dict()
    views = (("paragraphs", original_bboxes_block), ("lines", original_bboxes_line))
    for view_name, view_bboxes in views:
        img = image.copy()
        draw = ImageDraw.Draw(img)
        for box, label_idx in zip(view_bboxes, line_categories):
            label = id2label[label_idx]
            color = label2color[label]
            draw.rectangle(box, outline=color)
            draw.text((box[0] + 10, box[1] - 10), text=label, fill=color, font=font)
        imgs[view_name] = img

    img_paragraphs = "img_paragraphs_" + original_filename.replace(".pdf", ".png")
    imgs["paragraphs"].save(img_paragraphs)
    img_lines = "img_lines_" + original_filename.replace(".pdf", ".png")
    imgs["lines"].save(img_lines)

    # ---- Tables and CSV exports ----
    df_paragraphs = _build_table(
        "paragraphs", sorted_original_bboxes_block_list, sorted_category_block_list, sorted_text_block_list
    )
    csv_paragraphs = "csv_paragraphs_" + original_filename.replace(".pdf", ".csv")
    df_paragraphs.to_csv(csv_paragraphs, encoding="utf-8", index=False)

    df_lines = _build_table(
        "lines", sorted_original_bboxes_line_list, sorted_category_line_list, sorted_text_line_list
    )
    csv_lines = "csv_lines_" + original_filename.replace(".pdf", ".csv")
    df_lines.to_csv(csv_lines, encoding="utf-8", index=False)

    msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'

    return (
        msg,
        img_file,
        imgs["paragraphs"],
        imgs["lines"],
        img_paragraphs,
        img_lines,
        df_paragraphs,
        df_lines,
        gr.File.update(value=csv_paragraphs, visible=True),
        gr.File.update(value=csv_lines, visible=True),
    )
|
|
|
|
|
# Gradio UI: settings row, trigger button, then the outputs produced by
# generate_annotated_image (message, page image, annotated images, tables, CSVs).
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
    # Banner. Fixed markup: the "DocLayNet base" link carried its `style`
    # attribute twice, and the last paragraph was never closed.
    gr.HTML("""
    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
    <div style="margin-top: 40px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset and a data extraction tool.</p></div>
    <div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
    <div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
    <div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
    <div><p><b>WARNING</b>: if the app crashes or runs without providing a result, refresh the page (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/DocLayNet-image-viewer">DocLayNet image viewer</a>) and run a search again. If the same problem occurs again, prefer the DocLayNet small. Thanks.</p></div>
    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the following blog post: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></p></div>
    """)

    # Inputs: one column per setting.
    with gr.Row():
        with gr.Column():
            dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
        with gr.Column():
            split_gr = gr.Dropdown(splits, value="all", label="Split")
        with gr.Column():
            domain_gr = gr.Dropdown(domains, value="all", label="Domain")
        with gr.Column():
            category_gr = gr.Dropdown(categories, value="all", label="Category")

    btn = gr.Button("Display labeled PDF image & data")

    # Status message and the page image file.
    with gr.Row():
        with gr.Column():
            output_msg = gr.Textbox(label="Output message")
        with gr.Column():
            img_file = gr.File(visible=True, label="Image file of the PDF")

    # Annotated images: paragraphs on the left, lines on the right.
    with gr.Row():
        with gr.Column():
            img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
            img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
        with gr.Column():
            img_lines_file = gr.File(visible=True, label="Image file (labeled lines)")
            img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)

    # Extracted data: CSV download (hidden until the callback makes it visible) above each table.
    with gr.Row():
        with gr.Column():
            with gr.Row():
                csv_paragraphs = gr.File(visible=False, label="CSV file (paragraphs)")
            with gr.Row():
                df_paragraphs = gr.Dataframe(
                    headers=["paragraphs", "categories", "texts", "bounding boxes"],
                    datatype=["number", "str", "str", "str"],
                    col_count=(4, "fixed"),
                    visible=True,
                    label="Paragraphs data",
                    type="pandas",
                    wrap=True
                )
        with gr.Column():
            with gr.Row():
                csv_lines = gr.File(visible=False, label="CSV file (lines)")
            with gr.Row():
                df_lines = gr.Dataframe(
                    headers=["lines", "categories", "texts", "bounding boxes"],
                    datatype=["number", "str", "str", "str"],
                    col_count=(4, "fixed"),
                    visible=True,
                    label="Lines data",
                    type="pandas",
                    wrap=True
                )

    # The output order must match the tuple returned by generate_annotated_image.
    btn.click(
        generate_annotated_image,
        inputs=[dataset_name_gr, split_gr, domain_gr, category_gr],
        outputs=[output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
    )

    gr.Markdown("## Example")
    gr.Examples(
        [["small", "all", "all", "all"]],
        [dataset_name_gr, split_gr, domain_gr, category_gr],
        [output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
        fn=generate_annotated_image,
        cache_examples=True,
    )

demo.launch()