# pdfplumber-demo / app.py
# wendys-llc: "change names" (5d265c2)
import gradio as gr
import pdfplumber
import textwrap
import pprint
import json
import os
from pathlib import Path
def _coerce_table_setting(value):
    """Turn numeric text from a form field into a number; pass anything else through."""
    if not isinstance(value, str):
        # Lists (explicit lines) and values that are already numbers need no work.
        return value
    text = value.strip()
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    # Not numeric (e.g. a strategy name such as "lines"): keep the original text.
    return value


def _clean_table_settings(raw_settings):
    """Drop unset entries (None, "" or []) and coerce numeric strings to numbers."""
    return {
        key: _coerce_table_setting(value)
        for key, value in raw_settings.items()
        if value is not None and value != "" and value != []
    }


def _parse_crop_value(value, default):
    """Return a crop coordinate as an int, or ``default`` when the field is empty."""
    if not value:
        return default
    # Go through float so that text such as "12.5" is accepted and truncated.
    return int(float(value))


def _resolve_pdf_path(file_obj):
    """Return the filesystem path for an upload given as a path or a file-like object."""
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    # Otherwise expect an object exposing the path as ``.name``.
    return file_obj.name


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Extract tables from one PDF page and describe how it was done.

    Args:
        file_obj: The uploaded PDF, either a path (``str`` / ``os.PathLike``)
            or an object whose ``.name`` attribute is the path.
        page_num: Zero-based index of the page to inspect.
        table_num: Zero-based index of the table to return.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Empty
            values fall back to the page edges; negative ``crop_x1`` /
            ``crop_bottom`` are measured back from the right / bottom edge.
        explicit_vertical_lines, explicit_horizontal_lines: JSON text that is
            decoded with ``json.loads`` before being passed on.
        keep_blank_chars: Accepted so the UI can pass it, but not forwarded to
            pdfplumber (the setting is left out of ``table_settings``).
        Remaining keyword arguments: pdfplumber table settings. Unset values
            (``None``, ``""``, ``[]``) are dropped and numeric strings are
            converted to ``int`` / ``float``.

    Returns:
        A list ``[notes, visual, table]``: a Markdown summary containing a
        reproducible code snippet, the ``.annotated`` image produced by
        ``debug_tablefinder``, and the selected table (``None`` when
        ``table_num`` does not match a table that was found).

    Raises:
        gr.Error: If no file was supplied.
        json.JSONDecodeError: If an explicit-lines field is not valid JSON.
    """
    if file_obj is None:
        raise gr.Error("Please upload a PDF first.")

    table_settings = _clean_table_settings(
        {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": (
                json.loads(explicit_vertical_lines) if explicit_vertical_lines else None
            ),
            "explicit_horizontal_lines": (
                json.loads(explicit_horizontal_lines)
                if explicit_horizontal_lines
                else None
            ),
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
        }
    )

    table_num = int(table_num)
    page_num = int(page_num)
    pdf_path = _resolve_pdf_path(file_obj)
    base_filename = Path(pdf_path).name

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num]
        page_width = int(page.width)
        page_height = int(page.height)

        crop_x0 = _parse_crop_value(crop_x0, 0)
        crop_top = _parse_crop_value(crop_top, 0)
        crop_x1 = _parse_crop_value(crop_x1, page_width)
        crop_bottom = _parse_crop_value(crop_bottom, page_height)

        # Negative right/bottom values count back from the far page edge.
        if crop_bottom < 0:
            crop_bottom = page_height + crop_bottom
        if crop_x1 < 0:
            crop_x1 = page_width + crop_x1

        # Only crop when the box differs from the full page.
        is_cropped = (crop_x0, crop_top, crop_x1, crop_bottom) != (
            0,
            0,
            page_width,
            page_height,
        )
        if is_cropped:
            page = page.crop((crop_x0, crop_top, crop_x1, crop_bottom))

        tables = page.extract_tables(table_settings)
        # Honour the requested table index, as the generated snippet does.
        try:
            table = tables[table_num]
        except IndexError:
            table = None

        visual = page.to_image().debug_tablefinder(table_settings).annotated

        note_lines = [
            f"- **Filename:** {base_filename}",
            f"- **Pages:** {len(pdf.pages)}",
            f"- **Page num {page_num}:**",
            f"    - **Full dimensions:** {page_width} x {page_height}",
            f"    - **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}",
            f"- **Tables found:** {len(tables)}",
            "",
            "```python",
            "import pdfplumber",
            "",
            f'pdf = pdfplumber.open("{base_filename}")',
            f"page = pdf.pages[{page_num}]",
        ]
        if is_cropped:
            note_lines.append(
                f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
            )
        note_lines += [
            "",
            f"table_settings = {pprint.pformat(table_settings, indent=8).strip()}",
            "tables = page.extract_tables(table_settings)",
            f"table = tables[{table_num}]",
            "```",
        ]
        notes = "\n".join(note_lines)

    return [notes, visual, table]
def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Call ``table_debugger`` with just this reduced set of settings.

    Every argument is forwarded by keyword; all other ``table_debugger``
    parameters keep their defaults. Returns whatever ``table_debugger``
    returns.
    """
    forwarded = dict(
        page_num=page_num,
        table_num=table_num,
        vertical_strategy=vertical_strategy,
        horizontal_strategy=horizontal_strategy,
        snap_y_tolerance=snap_y_tolerance,
        intersection_x_tolerance=intersection_x_tolerance,
        crop_bottom=crop_bottom,
    )
    return table_debugger(file_obj, **forwarded)
# UI components. Each one is created unrendered (render=False) so it can be
# referenced as an event input/output and placed in the layout later with
# .render().

# --- Outputs ---------------------------------------------------------------
notes = gr.Markdown(render=False)
output_image = gr.Image(render=False)
data_table = gr.Dataframe(height=250, render=False, type="array", label="Found data")

# --- Crop box --------------------------------------------------------------
crop_top = gr.Text(label="Crop (top)", placeholder="top", container=False, render=False)
crop_x0 = gr.Text(label=" Crop (x0)", placeholder="left", container=False, render=False)
crop_x1 = gr.Text(
    label="Crop (x1)",
    placeholder="right (from page left)",
    container=False,
    render=False,
)
crop_bottom = gr.Text(
    label="Crop (bottom)",
    placeholder="bottom (from page top)",
    container=False,
    render=False,
)

# --- Table-finding strategies ----------------------------------------------
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)

# --- Explicit lines (entered as JSON text) -----------------------------------
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)

# --- Tolerances --------------------------------------------------------------
snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
# Label corrected: this is the y tolerance (it previously read "(x)").
join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)

# --- Miscellaneous settings --------------------------------------------------
edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False, render=False)

# --- Primary inputs ----------------------------------------------------------
# File extensions need the leading dot; a bare "pdf" is not an extension.
file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
page_num = gr.Number(
    label="Page number", value=0, info="It's an index: first is 0!", render=False
)
table_num = gr.Number(
    label="Table number", value=0, info="It's an index: first is 0!", render=False
)
# Folder of sample PDFs, located next to this file.
example_dir = Path(os.path.dirname(__file__)) / "examples"

# One row per sample: absolute-or-relative PDF path first, then page index,
# table index, vertical strategy, horizontal strategy, snap_y_tolerance,
# intersection_x_tolerance and crop_bottom.
examples = [
    [str(example_dir / pdf_name), *settings]
    for pdf_name, *settings in (
        ("players.pdf", 0, 0, "text", "text", None, None, None),
        ("museums.pdf", 2, 0, "lines", "lines", None, None, None),
        ("background-checks.pdf", 0, 0, "text", "text", 5, 15, 487),
    )
]
# Page layout. The components defined earlier in the file are placed here by
# calling .render() on each one at the position where it should appear.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost — confirm it against the running app.
with gr.Blocks() as demo:
    # Intro text shown at the top of the page.
    gr.Markdown(
        """
# pdfplumber table extraction playground
[pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**
YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
"""
    )
    with gr.Row():
        # Left column: upload, main settings, crop box, Run button and notes.
        with gr.Column(scale=2):
            file.render()
            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()
                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()
            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()
            btn = gr.Button(value="Run")
            # The inputs are listed in the same order as table_debugger's
            # parameters; keep the two in sync when adding a setting.
            btn.click(
                table_debugger,
                inputs=[
                    file,
                    page_num,
                    table_num,
                    crop_x0,
                    crop_top,
                    crop_x1,
                    crop_bottom,
                    vertical_strategy,
                    horizontal_strategy,
                    explicit_vertical_lines,
                    explicit_horizontal_lines,
                    snap_tolerance,
                    snap_x_tolerance,
                    snap_y_tolerance,
                    join_tolerance,
                    join_x_tolerance,
                    join_y_tolerance,
                    text_tolerance,
                    text_x_tolerance,
                    text_y_tolerance,
                    intersection_tolerance,
                    intersection_x_tolerance,
                    intersection_y_tolerance,
                    edge_min_length,
                    min_words_vertical,
                    min_words_horizontal,
                    keep_blank_chars,
                ],
                outputs=[notes, output_image, data_table],
            )
            notes.render()
        # Right column: extracted table and the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()
    # Bundled examples. The inputs here match demo_subset's parameters and the
    # column order of each row in `examples`.
    gr.Examples(
        examples=examples,
        inputs=[
            file,
            page_num,
            table_num,
            vertical_strategy,
            horizontal_strategy,
            snap_y_tolerance,
            intersection_x_tolerance,
            crop_bottom,
        ],
        outputs=[notes, output_image, data_table],
        fn=demo_subset,
        run_on_click=True,
    )
    gr.Markdown("## Additional options")
    with gr.Row():
        # First column: snap / join / text / intersection tolerances.
        with gr.Column():
            with gr.Group():
                snap_tolerance.render()
                with gr.Row():
                    snap_x_tolerance.render()
                    snap_y_tolerance.render()
                join_tolerance.render()
                with gr.Row():
                    join_x_tolerance.render()
                    join_y_tolerance.render()
                text_tolerance.render()
                with gr.Row():
                    text_x_tolerance.render()
                    text_y_tolerance.render()
                intersection_tolerance.render()
                with gr.Row():
                    intersection_x_tolerance.render()
                    intersection_y_tolerance.render()
        # Second column: explicit lines and the remaining settings.
        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()
# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()