File size: 7,178 Bytes
c8a32e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox
from marker.schema.block import Line, Span, Block
from marker.schema.page import Page
from tabulate import tabulate
from typing import List
from marker.settings import settings
from marker.tables.cells import assign_cells_to_columns
from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines
def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
    """Group the spans of one table into rows, reading from ``page.blocks``.

    Blocks, and the lines inside each block, are visited in the order
    returned by ``sort_table_blocks``. A line contributes only when
    ``box_intersection_pct(line.bbox, table_box)`` is at least 0.5 and the
    line has at least one span; every span of such a line becomes a
    ``[bbox, text]`` cell.

    Row grouping compares x positions divided by ``page.width``: a line whose
    left edge is greater than the previous accepted line's right edge minus
    ``space_tol`` is appended to the current row, anything else opens a new
    row.

    Args:
        page: Object exposing ``blocks`` (each with ``lines``) and ``width``.
        table_box: Bounding box of the table; passed to
            ``box_intersection_pct`` and ``assign_cells_to_columns``.
        space_tol: Tolerance, as a fraction of the page width, used in the
            same-row comparison.

    Returns:
        The result of ``assign_cells_to_columns(page, table_box, rows)``.
    """
    rows = []
    current_row = []
    # Normalized right edge of the most recently accepted line.
    prev_x_end = None

    for block in sort_table_blocks(page.blocks):
        for line in sort_table_blocks(block.lines):
            bbox = line.bbox
            if box_intersection_pct(bbox, table_box) < .5:
                continue
            if len(line.spans) == 0:
                continue

            x_start = bbox[0] / page.width
            x_end = bbox[2] / page.width
            cells = [[span.bbox, span.text] for span in line.spans]

            starts_new_row = (
                prev_x_end is not None
                and not (x_start > prev_x_end - space_tol)
            )
            if starts_new_row:
                # Close the row collected so far and start over with this line.
                if len(current_row) > 0:
                    rows.append(current_row)
                current_row = cells
            else:
                current_row.extend(cells)
            prev_x_end = x_end

    # Flush the trailing row.
    if len(current_row) > 0:
        rows.append(current_row)

    return assign_cells_to_columns(page, table_box, rows)
def get_table_pdftext(page: Page, table_box, space_tol=.01, round_factor=4) -> List[List[str]]:
    """Rebuild the rows of one table from the per-character data in ``page.char_blocks``.

    Char blocks, and the lines inside each block, are visited in the order
    returned by ``sort_table_blocks``. Lines whose
    ``box_intersection_pct(line["bbox"], table_box)`` is below
    ``settings.BBOX_INTERSECTION_THRESH`` are ignored. Characters are then
    accumulated into cells using only their x coordinates (divided by
    ``page.width``):

    * a char starting within ``space_tol`` of the current cell's horizontal
      extent is appended to that cell;
    * a char starting left of the current cell's right edge AND inside the
      leftmost 20% of the table closes the current cell and the current row;
    * any other char closes the current cell and opens a new cell in the
      same row.

    Finished cell text is passed through ``replace_newlines`` and then
    ``replace_dots``. Each finished row is sorted by
    ``round(cell_x_start / round_factor)``.

    Args:
        page: Page exposing ``char_blocks`` and ``width``.
        table_box: Bounding box of the table, in the same coordinates as the
            char bounding boxes.
        space_tol: Tolerance, as a fraction of the page width, for the
            same-cell and new-row comparisons.
        round_factor: Divisor applied to a cell's left x before rounding for
            the row sort, so cells with nearly equal left edges keep their
            insertion order (``sorted`` is stable).

    Returns:
        The result of ``assign_cells_to_columns(page, table_box, rows)``.
    """
    page_width = page.width
    table_rows = []
    table_row = []
    table_cell = ""
    cell_bbox = None

    table_width = table_box[2] - table_box[0]
    # Char x positions below are normalized by the page width, so the
    # "leftmost 20% of the table" threshold has to be normalized as well.
    # Previously it stayed in page units, which made the comparison against
    # it almost always true.
    new_line_start_x = (table_box[0] + table_width * .2) / page_width

    def row_sort_key(cell):
        return round(cell[0][0] / round_factor)

    for block in sort_table_blocks(page.char_blocks):
        for line in sort_table_blocks(block["lines"]):
            if box_intersection_pct(line["bbox"], table_box) < settings.BBOX_INTERSECTION_THRESH:
                continue
            for span in line["spans"]:
                for char in span["chars"]:
                    char_bbox = char["bbox"]
                    x_start = char_bbox[0] / page_width

                    if cell_bbox is None:  # First char
                        table_cell = char["char"]
                        cell_bbox = char_bbox
                        continue

                    # Boundaries of the current cell before any merging
                    cell_x_start = cell_bbox[0] / page_width
                    cell_x_end = cell_bbox[2] / page_width

                    if cell_x_start - space_tol < x_start < cell_x_end + space_tol:
                        # Still inside the current cell
                        table_cell += char["char"]
                        cell_bbox = merge_boxes(cell_bbox, char_bbox)
                        continue

                    # The char starts a new cell: flush the finished one.
                    # The text is cleaned here, once per cell, rather than
                    # once per character.
                    if len(table_cell) > 0:
                        table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))

                    # New line and cell
                    # Use x_start < new_line_start_x to account for out-of-order cells in the pdf
                    if x_start < cell_x_end - space_tol and x_start < new_line_start_x:
                        if len(table_row) > 0:
                            table_rows.append(sorted(table_row, key=row_sort_key))
                        table_row = []

                    table_cell = char["char"]
                    cell_bbox = char_bbox

    # Flush whatever cell and row are still open after the last char.
    if len(table_cell) > 0:
        table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell))))
    if len(table_row) > 0:
        table_rows.append(sorted(table_row, key=row_sort_key))

    return assign_cells_to_columns(page, table_box, table_rows)
def _make_table_block(table_box, table_idx, pnum, table_text):
    # Wrap rendered table text in a "Table" block holding one line with one span.
    return Block(
        bbox=table_box,
        block_type="Table",
        pnum=pnum,
        lines=[Line(
            bbox=table_box,
            spans=[Span(
                bbox=table_box,
                span_id=f"{table_idx}_table",
                font="Table",
                font_size=0,
                font_weight=0,
                block_type="Table",
                text=table_text
            )]
        )]
    )


def format_tables(pages: List[Page]):
    """Replace each page's "Table" blocks with github-flavored markdown tables.

    For every layout box labelled "Table" on a page, the page blocks of type
    "Table" that overlap it by more than ``settings.BBOX_INTERSECTION_THRESH``
    are removed. The table content is extracted with ``get_table_surya`` when
    ``page.ocr_method == "surya"`` and with ``get_table_pdftext`` otherwise,
    rendered with ``tabulate`` (first row as header), and inserted as a single
    new block at the position of the first block that was removed for that
    table. Tables whose extraction yields no rows are not inserted, but their
    matching blocks are still removed.

    ``page.blocks`` is reassigned in place for every page.

    Args:
        pages: Pages to process.

    Returns:
        Number of table blocks inserted across all pages.
    """
    table_count = 0
    for page in pages:
        pnum = page.pnum
        page_table_boxes = [
            rescale_bbox(page.layout.image_bbox, page.bbox, b.bbox)
            for b in page.layout.bboxes
            if b.label == "Table"
        ]

        # Original block index -> indices of the tables whose first matching
        # block sits there. Anchoring on the original index (instead of a
        # precomputed list offset) keeps every table in the right place even
        # when another table is skipped or the tables are not in block order.
        table_anchors = {}
        anchored_tables = set()
        blocks_to_remove = set()
        for table_idx, table_box in enumerate(page_table_boxes):
            for block_idx, block in enumerate(page.blocks):
                intersect_pct = block.intersection_pct(table_box)
                if intersect_pct > settings.BBOX_INTERSECTION_THRESH and block.block_type == "Table":
                    if table_idx not in anchored_tables:
                        anchored_tables.add(table_idx)
                        table_anchors.setdefault(block_idx, []).append(table_idx)
                    blocks_to_remove.add(block_idx)

        # Extract and render the tables while page.blocks is still unmodified;
        # the extraction helpers receive the page itself.
        table_blocks = {}
        for table_idx, table_box in enumerate(page_table_boxes):
            if table_idx not in anchored_tables:
                continue

            if page.ocr_method == "surya":
                table_rows = get_table_surya(page, table_box)
            else:
                table_rows = get_table_pdftext(page, table_box)
            # Skip empty tables
            if len(table_rows) == 0:
                continue

            table_text = tabulate(table_rows, headers="firstrow", tablefmt="github", disable_numparse=True)
            table_blocks[table_idx] = _make_table_block(table_box, table_idx, pnum, table_text)
            table_count += 1

        # Rebuild the block list in one pass: emit each rendered table at its
        # anchor and drop the blocks it replaces.
        new_page_blocks = []
        for block_idx, block in enumerate(page.blocks):
            for table_idx in table_anchors.get(block_idx, ()):
                if table_idx in table_blocks:
                    new_page_blocks.append(table_blocks[table_idx])
            if block_idx not in blocks_to_remove:
                new_page_blocks.append(block)
        page.blocks = new_page_blocks
    return table_count