# Spaces: Runtime error (page-status text captured along with the source; not part of the program)
import gradio as gr | |
import pdfplumber | |
import textwrap | |
import pprint | |
import json | |
import os | |
from pathlib import Path | |
def _is_blank(value):
    """Return True when a form value means "not provided" (None, blank string, empty list)."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    return value == []


def _coerce_setting(value):
    """Turn a numeric textbox string into an int or float; pass everything else through."""
    if not isinstance(value, str):
        # Already-parsed values (e.g. explicit line lists, numbers) need no conversion.
        return value
    text = value.strip()
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    # Non-numeric strings such as the strategy names are kept as text.
    return text


def _parse_lines(raw):
    """Decode a JSON list of explicit line positions, or return None when the field is blank."""
    if _is_blank(raw):
        return None
    if isinstance(raw, str):
        return json.loads(raw)
    return raw


def _parse_crop(value, default):
    """Return a crop coordinate as an int, falling back to ``default`` for empty or zero input."""
    if isinstance(value, str):
        value = value.strip()
    if not value:
        return default
    return int(float(value))


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Extract tables from one PDF page and describe how to reproduce the extraction.

    Args:
        file_obj: Path to the PDF (``str`` or ``os.PathLike``), or an object
            whose ``.name`` attribute holds that path.
        page_num: Index of the page to inspect.
        table_num: Index of the table to return from the tables found.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Empty
            values fall back to the page edges; negative ``crop_x1`` and
            ``crop_bottom`` are measured back from the page's right/bottom.
        vertical_strategy, horizontal_strategy: Strategy names forwarded to
            pdfplumber unchanged.
        explicit_vertical_lines, explicit_horizontal_lines: JSON-encoded
            lists (or already-parsed lists) of explicit line positions.
        snap_tolerance ... min_words_horizontal: Optional table settings.
            Blank values are omitted; numeric strings are converted to
            ``int`` or ``float`` before being forwarded.
        keep_blank_chars: Accepted so the UI can pass it, but not forwarded
            to pdfplumber.

    Returns:
        A three-item list: Markdown notes (page facts plus a Python snippet
        reproducing the call), the ``.annotated`` image of the table-finder
        debug rendering, and the table at ``table_num`` (``None`` when that
        index does not exist).

    Raises:
        ValueError: If ``file_obj`` is ``None``, or a crop value or JSON line
            list cannot be parsed.
    """
    if file_obj is None:
        raise ValueError("No PDF provided: upload a file before running the debugger.")

    # The upload may arrive as a plain path or as an object exposing ``.name``.
    if isinstance(file_obj, (str, os.PathLike)):
        pdf_path = os.fspath(file_obj)
    else:
        pdf_path = os.fspath(file_obj.name)

    raw_settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "explicit_vertical_lines": _parse_lines(explicit_vertical_lines),
        "explicit_horizontal_lines": _parse_lines(explicit_horizontal_lines),
        "snap_tolerance": snap_tolerance,
        "snap_x_tolerance": snap_x_tolerance,
        "snap_y_tolerance": snap_y_tolerance,
        "join_tolerance": join_tolerance,
        "join_x_tolerance": join_x_tolerance,
        "join_y_tolerance": join_y_tolerance,
        "text_tolerance": text_tolerance,
        "text_x_tolerance": text_x_tolerance,
        "text_y_tolerance": text_y_tolerance,
        "intersection_tolerance": intersection_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "intersection_y_tolerance": intersection_y_tolerance,
        "edge_min_length": edge_min_length,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
        # NOTE: keep_blank_chars is deliberately absent here; it is not passed on.
    }
    # Drop everything the user left empty so pdfplumber applies its own defaults.
    table_settings = {
        key: _coerce_setting(value)
        for key, value in raw_settings.items()
        if not _is_blank(value)
    }

    # A cleared number field yields None; treat that as index 0.
    table_num = int(table_num or 0)
    page_num = int(page_num or 0)

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        page_width = int(page.width)
        page_height = int(page.height)

        crop_x0 = _parse_crop(crop_x0, 0)
        crop_top = _parse_crop(crop_top, 0)
        crop_x1 = _parse_crop(crop_x1, page_width)
        crop_bottom = _parse_crop(crop_bottom, page_height)

        # Allow negative numbers: they count back from the far edge.
        if crop_bottom < 0:
            crop_bottom = page_height + crop_bottom
        if crop_x1 < 0:
            crop_x1 = page_width + crop_x1

        # Only crop if we need to!
        is_cropped = (crop_x0, crop_top, crop_x1, crop_bottom) != (
            0,
            0,
            page_width,
            page_height,
        )
        if is_cropped:
            page = page.crop((crop_x0, crop_top, crop_x1, crop_bottom))

        tables = page.extract_tables(table_settings)
        # Return the table the user asked for, matching the snippet in the notes.
        try:
            table = tables[table_num]
        except IndexError:
            table = None

        visual = page.to_image().debug_tablefinder(table_settings).annotated

        base_filename = os.path.basename(pdf_path)
        note_lines = [
            f"- **Filename:** {base_filename}",
            f"- **Pages:** {len(pdf.pages)}",
            f"- **Page num {page_num}:**",
            f"- **Full dimensions:** {page_width} x {page_height}",
            f"- **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}",
            f"- **Tables found:** {len(tables)}",
            "```python",
            "import pdfplumber",
            f'pdf = pdfplumber.open("{base_filename}")',
            f"page = pdf.pages[{page_num}]",
        ]
        if is_cropped:
            note_lines.append(
                f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
            )
        note_lines += [
            "",
            f"table_settings = {pprint.pformat(table_settings, indent=8).strip()}",
            "tables = page.extract_tables(table_settings)",
            f"table = tables[{table_num}]",
            "```",
        ]
        notes = "\n".join(note_lines)

    return [notes, visual, table]
def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Call ``table_debugger`` with only the handful of settings the examples use.

    Every argument is forwarded by keyword (``file_obj`` positionally); all
    other ``table_debugger`` parameters keep their defaults. The return value
    is whatever ``table_debugger`` returns.
    """
    forwarded = {
        "page_num": page_num,
        "table_num": table_num,
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "snap_y_tolerance": snap_y_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "crop_bottom": crop_bottom,
    }
    return table_debugger(file_obj, **forwarded)
# UI components are created up front with render=False and placed into the
# layout later through their .render() method.

# --- Outputs ---
notes = gr.Markdown(render=False)
output_image = gr.Image(render=False)
data_table = gr.Dataframe(height=250, render=False, type='array', label='Found data')

# --- Crop box ---
crop_top = gr.Text(label="Crop (top)", placeholder="top", container=False, render=False)
crop_x0 = gr.Text(label=" Crop (x0)", placeholder="left", container=False, render=False)
crop_x1 = gr.Text(
    label="Crop (x1)", placeholder="right (from page left)", container=False, render=False
)
crop_bottom = gr.Text(
    label="Crop (bottom)", placeholder="bottom (from page top)", container=False, render=False
)

# --- Table-finding strategies ---
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)

# --- Explicit lines (entered as JSON lists) ---
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)

# --- Tolerances ---
snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
# This is the y-axis field; its label must say (y), not (x).
join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)

# --- Miscellaneous settings ---
edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False, render=False)

# --- Input file and indices ---
# Extension filters need the leading dot (".pdf"); a bare "pdf" is read as a
# file-type category rather than an extension.
file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
page_num = gr.Number(
    label="Page number", value=0, info="It's an index: first is 0!", render=False
)
table_num = gr.Number(
    label="Table number", value=0, info="It's an index: first is 0!", render=False
)
# Folder holding the bundled sample PDFs, resolved relative to this script.
example_dir = Path(os.path.dirname(__file__)) / "examples"

# One row per example. Columns: PDF path, page index, table index,
# vertical strategy, horizontal strategy, snap tolerance (y),
# intersection tolerance (x), crop bottom.
examples = [
    [str(example_dir / "players.pdf"), 0, 0, "text", "text", None, None, None],
    [str(example_dir / "museums.pdf"), 2, 0, "lines", "lines", None, None, None],
    [str(example_dir / "background-checks.pdf"), 0, 0, "text", "text", 5, 15, 487],
]
# Controls wired to the "Run" button, listed in table_debugger's parameter order.
debugger_inputs = [
    file,
    page_num,
    table_num,
    crop_x0,
    crop_top,
    crop_x1,
    crop_bottom,
    vertical_strategy,
    horizontal_strategy,
    explicit_vertical_lines,
    explicit_horizontal_lines,
    snap_tolerance,
    snap_x_tolerance,
    snap_y_tolerance,
    join_tolerance,
    join_x_tolerance,
    join_y_tolerance,
    text_tolerance,
    text_x_tolerance,
    text_y_tolerance,
    intersection_tolerance,
    intersection_x_tolerance,
    intersection_y_tolerance,
    edge_min_length,
    min_words_vertical,
    min_words_horizontal,
    keep_blank_chars,
]

# Controls filled in by an example row, listed in demo_subset's parameter order.
example_inputs = [
    file,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
]

# Both the button and the examples write to the same three outputs.
result_outputs = [notes, output_image, data_table]

# NOTE(review): the container nesting below follows the apparent layout intent
# (left column: inputs and notes; right column: results) — confirm against the UI.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# pdfplumber table extraction playground
[pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**
YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
"""
    )

    with gr.Row():
        # Left column: file, core settings, crop box, run button, notes.
        with gr.Column(scale=2):
            file.render()

            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()
                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()

            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()

            btn = gr.Button(value="Run")
            btn.click(table_debugger, inputs=debugger_inputs, outputs=result_outputs)

            notes.render()

        # Right column: extracted data and the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()

    gr.Examples(
        examples=examples,
        inputs=example_inputs,
        outputs=result_outputs,
        fn=demo_subset,
        run_on_click=True,
    )

    gr.Markdown("## Additional options")

    with gr.Row():
        with gr.Column():
            with gr.Group():
                # Each tolerance has a combined field followed by an x/y pair.
                for combined, x_axis, y_axis in (
                    (snap_tolerance, snap_x_tolerance, snap_y_tolerance),
                    (join_tolerance, join_x_tolerance, join_y_tolerance),
                    (text_tolerance, text_x_tolerance, text_y_tolerance),
                    (
                        intersection_tolerance,
                        intersection_x_tolerance,
                        intersection_y_tolerance,
                    ),
                ):
                    combined.render()
                    with gr.Row():
                        x_axis.render()
                        y_axis.render()

        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()
# Launch the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    demo.launch()