|
import os |
|
from pathlib import Path |
|
import pandas as pd |
|
import gradio as gr |
|
from collections import OrderedDict |
|
from PIL import Image, ImageDraw, ImageFont |
|
from io import BytesIO |
|
import PyPDF2 |
|
import pdf2image |
|
|
|
# Limits used when turning a PDF into an image grid.
# NOTE(review): MAX_PAGES is presumably the per-PDF page cap and MAX_PDF_SIZE a
# size cap in bytes -- confirm against the code that consumes them.
MAX_PAGES = 50
MAX_PDF_SIZE = 100_000_000

# Size thresholds (pixels) for embedded images extracted from a PDF.
MIN_WIDTH = 150
MIN_HEIGHT = 150
|
|
|
""" |
|
Load diagnostic dataset |
|
|
|
Have pointer to local PDF/grid files |
|
|
|
Visualize PDF/grid files based on slider values and a (randomly) sampled combination of sliders |
|
|
|
--> truly interactive visualization of diagnostic samples and their questions |
|
|
|
""" |
|
|
|
# Dataset locations. The defaults are the original developer-specific paths;
# set DUDE_PDF_PATH / DUDE_DIAGNOSTIC_PATH in the environment to point the app
# at another copy of the data without editing this file.
PDF_PATH = Path(
    os.environ.get(
        "DUDE_PDF_PATH",
        "/home/jordy/Downloads/DUDE_train-val-test_binaries/PDF",
    )
)
DIAGNOSTIC_PATH = os.environ.get(
    "DUDE_DIAGNOSTIC_PATH",
    "/home/jordy/code/DUchallenge/DUeval/diagnostic_test-updated.csv",
)
|
|
|
# Raw answer-type identifiers paired with their human-readable labels.
answer_types = dict(
    [
        ("abstractive", "Abstractive"),
        ("extractive", "Extractive"),
        ("not-answerable", "Not Answerable"),
        ("list/abstractive", "Abstractive List"),
        ("list/extractive", "Extractive List"),
    ]
)
|
|
|
# Diagnostic annotations as a DataFrame, or None when the CSV file is absent.
DIAGNOSTIC_TEST = pd.read_csv(DIAGNOSTIC_PATH) if os.path.exists(DIAGNOSTIC_PATH) else None
|
|
|
# Selectable values per diagnostic category, in display order. The trailing
# None in every list stands for "no value selected" for that category.
meta_cats = OrderedDict(
    {
        "complexity": ["meta", "multihop", "other_hard", "simple", None],
        "evidence": [
            "handwriting",
            "layout",
            "plain",
            "table_or_list",
            "visual_chart",
            "visual_checkbox",
            "visual_color",
            "visual_image",
            "visual_logo",
            "visual_map",
            "visual_other",
            "visual_signature",
            "visual_stamp",
            None,
        ],
        "form": ["date", "numeric", "other", "proper", None],
        "operation": ["arithmetic", "comparison", "counting", "normalization", None],
        "type": ["abstractive", "extractive", None],
    }
)

# Flattened "<category>_<value>" names (e.g. "evidence_layout"). Derived from
# meta_cats instead of being listed by hand so the two can never drift apart;
# the None placeholders are skipped.
diagnostic_cats = [
    f"{category}_{value}"
    for category, values in meta_cats.items()
    for value in values
    if value is not None
]
|
|
|
|
|
# One dropdown per category in meta_cats; each starts on the last entry of its
# choice list.
sliders = [
    gr.Dropdown(label=name, choices=options, value=options[-1])
    for name, options in meta_cats.items()
]

# NOTE(review): not referenced anywhere else in the visible code -- presumably
# intended as initial dropdown values; confirm before wiring it in.
slider_defaults = [None, "visual_checkbox", None, None, None]
|
|
|
|
|
def equal_image_grid(images):
    """Arrange images into one roughly square grid image.

    Zero-sized images are dropped. The remaining images are scaled, keeping
    their aspect ratio, to the width of the narrowest one and pasted row by
    row into cells as tall as the tallest scaled image.

    Args:
        images: PIL images to arrange (``None`` is treated as empty).

    Returns:
        A new RGB PIL image containing the grid, or ``None`` when there is no
        drawable image.
    """

    def compute_grid(n, max_cols=6):
        """Return (rows, cols) with rows * cols >= n and cols <= max_cols."""
        # Smallest square side that can hold n cells, i.e. ceil(sqrt(n)).
        side = int(n**0.5)
        if side * side < n:
            side += 1
        cols = min(side, max_cols)
        rows = -(-n // cols)  # ceiling division
        return rows, cols

    # Filter first so the layout is computed for the images actually pasted.
    images = [im for im in (images or ()) if (im.height > 0) and (im.width > 0)]
    if not images:
        return None

    rows, cols = compute_grid(len(images))

    min_width = min(im.width for im in images)
    images = [
        # max(1, ...) keeps extremely wide images from collapsing to height 0.
        im.resize((min_width, max(1, int(im.height * min_width / im.width))), resample=Image.BICUBIC)
        for im in images
    ]

    # Every image now has width min_width; cells are as tall as the tallest one.
    cell_w = min_width
    cell_h = max(im.height for im in images)

    grid = Image.new("RGB", size=(cols * cell_w, rows * cell_h))
    for i, img in enumerate(images):
        grid.paste(img, box=(i % cols * cell_w, i // cols * cell_h))
    return grid
|
|
|
|
|
def add_pagenumbers(im_list, height_scale=40):
    """Draw a 1-based index near the bottom-right corner of each image.

    The images are modified in place.

    Args:
        im_list: PIL images to annotate, numbered in list order.
        height_scale: Divisor applied to ``sqrt(width * height)`` to obtain
            the font size; larger values give smaller numbers.

    Returns:
        None.
    """

    def load_font(fontsize):
        # Arial is not installed on every system; fall back to Pillow's
        # built-in font instead of failing the whole visualization.
        try:
            return ImageFont.truetype("Arial.ttf", fontsize)
        except OSError:
            return ImageFont.load_default()

    def add_pagenumber(image, i):
        width, height = image.size
        draw = ImageDraw.Draw(image)
        # Clamp to 1: small images would otherwise yield a font size of 0.
        fontsize = max(1, int((width * height) ** 0.5 / height_scale))
        font = load_font(fontsize)
        margin = 2 * fontsize
        draw.text(
            (width - margin, height - margin),
            str(i + 1),
            fill="#D00917",
            font=font,
            spacing=4,
            align="right",
        )

    for i, image in enumerate(im_list):
        add_pagenumber(image, i)
|
|
|
|
|
def pdf_to_grid(pdf_path):
    """Build one grid image from the contents of a PDF.

    Embedded images are extracted with PyPDF2 from at most ``MAX_PAGES``
    pages; images smaller than both ``MIN_WIDTH`` and ``MIN_HEIGHT`` are
    skipped. If extraction raises, the pages are rendered with pdf2image
    instead (also capped at ``MAX_PAGES``).

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The grid produced by ``equal_image_grid``, or ``None`` when no image
        could be obtained.
    """
    images = []
    try:
        # Opening the reader can itself fail on a damaged file, so it belongs
        # inside the try block to reach the rendering fallback.
        reader = PyPDF2.PdfReader(pdf_path)
        for p, page in enumerate(reader.pages):
            if p >= MAX_PAGES:
                break
            for image in page.images:
                im = Image.open(BytesIO(image.data))
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                images.append(im)
    except Exception as e:
        print(f"{pdf_path} PyPDF get_images {e}")
        try:
            images = pdf2image.convert_from_path(pdf_path, last_page=MAX_PAGES)
        except Exception as render_error:
            # Both strategies failed; report it and let the caller skip this file.
            print(f"{pdf_path} pdf2image convert_from_path {render_error}")
            return None

    if len(images) == 0:
        return None
    add_pagenumbers(images)
    return equal_image_grid(images)
|
|
|
|
|
def main(complexity, evidence, form, operation, type):
    """Return a random diagnostic sample matching the selected categories.

    Each argument is the value chosen for that category, or a falsy value
    (``None``) when the category should not be filtered on. Matching rows are
    visited in random order and the first one whose PDF exists and can be
    turned into a grid is returned.

    Args:
        complexity: Selected ``complexity`` value or None.
        evidence: Selected ``evidence`` value or None.
        form: Selected ``form`` value or None.
        operation: Selected ``operation`` value or None.
        type: Selected ``type`` value or None (name kept for the interface).

    Returns:
        A 5-tuple ``(question, answer, diagnostics, grid, pdf)``. When nothing
        can be shown, the first element carries a message, the other text
        fields are empty and the image/file slots are ``None``.
    """
    if DIAGNOSTIC_TEST is None:
        return f"Diagnostic dataset not found at {DIAGNOSTIC_PATH}", "", "", None, None

    selected = [complexity, evidence, form, operation, type]
    query = " and ".join(
        [f"{cat}_{val} == {True}" for cat, val in zip(meta_cats.keys(), selected) if val]
    )
    # DataFrame.query rejects an empty expression, so "no filter selected"
    # means: consider every row.
    results = DIAGNOSTIC_TEST.query(query) if query else DIAGNOSTIC_TEST
    if len(results) == 0:
        return f"No results found for query {query}", "", "", None, None

    # Shuffle the matches and take the first sample that can be displayed.
    for _, sample in results.sample(frac=1).iterrows():
        print("Sampled: ", sample["nhash"])

        pdf_path = PDF_PATH / "test" / (sample["nhash"] + ".pdf")
        if not os.path.exists(pdf_path):
            continue
        grid = pdf_to_grid(pdf_path)
        if grid is None:
            continue

        question, answer = sample["question"], sample["answer"]
        diagnostics = ", ".join([cat for cat in diagnostic_cats if sample[cat]])
        # Hand the file component a plain string path.
        return question, answer, diagnostics, grid, str(pdf_path)

    # Every match was skipped (missing PDF or no extractable images).
    return f"No PDF could be visualized for query {query}", "", "", None, None
|
|
|
|
|
|
|
|
|
|
|
# Output components, in the same order as the tuple returned by main:
# three text fields, then the grid image, then the PDF file.
outputs = [gr.Textbox(label=name) for name in ("question", "answer", "diagnostics")] + [
    gr.Image(label="image grid of PDF"),
    gr.File(label="PDF"),
]

iface = gr.Interface(
    fn=main,
    inputs=sliders,
    outputs=outputs,
    description="Visualize diagnostic samples from DUDE",
)

# NOTE(review): share=True presumably requests a public Gradio link -- confirm
# that exposing the app this way is intended.
iface.launch(share=True)
|
|