|
class RawBlockProcessor:
    """
    Turn the raw text blocks of each page into "para_blocks".

    For every block, consecutive raw lines whose top and bottom edges lie
    within ``y_tolerance`` of the line being built are merged into a single
    line, and every span gets a human-readable ``"decomposed_flags"`` dict.

    NOTE(review): the input keys used here ("number", "bbox", "lines",
    "spans", "dir", "flags") look like the PyMuPDF ``get_text("dict")``
    layout -- confirm against the producer of "preproc_blocks".
    """

    # Bit masks tested against the integer span "flags" value.
    _FLAG_SUPERSCRIPT = 1 << 0
    _FLAG_ITALIC = 1 << 1
    _FLAG_SERIFED = 1 << 2
    _FLAG_MONOSPACED = 1 << 3
    _FLAG_BOLD = 1 << 4

    def __init__(self) -> None:
        # Largest allowed difference (applied to both bbox[1] and bbox[3])
        # for a raw line to be merged into the line currently being built.
        self.y_tolerance = 2
        # Not read by any method of this class; kept so existing attribute
        # access from outside keeps working.
        self.pdf_dic = {}

    def __span_flags_decomposer(self, span_flags):
        """
        Make font flags human readable.

        Parameters
        ----------
        span_flags : int
            Bit field describing the font of a span.

        Returns
        -------
        flags : dict
            Boolean entries ``is_superscript``, ``is_italic``,
            ``is_serifed``, ``is_sans_serifed``, ``is_monospaced``,
            ``is_proportional`` and ``is_bold``. ``is_sans_serifed`` is
            always the negation of ``is_serifed`` and ``is_proportional``
            the negation of ``is_monospaced``.
        """
        is_serifed = bool(span_flags & self._FLAG_SERIFED)
        is_monospaced = bool(span_flags & self._FLAG_MONOSPACED)

        return {
            "is_superscript": bool(span_flags & self._FLAG_SUPERSCRIPT),
            "is_italic": bool(span_flags & self._FLAG_ITALIC),
            "is_serifed": is_serifed,
            "is_sans_serifed": not is_serifed,
            "is_monospaced": is_monospaced,
            "is_proportional": not is_monospaced,
            "is_bold": bool(span_flags & self._FLAG_BOLD),
        }

    def __make_new_lines(self, raw_lines):
        """
        Merge raw lines that sit on the same row into single lines.

        A raw line is merged into the line currently being built when both
        its top edge (``bbox[1]``) and its bottom edge (``bbox[3]``) differ
        from that line's by at most ``self.y_tolerance``; otherwise the
        current line is closed and a new one is started.

        Parameters
        ----------
        raw_lines : list
            Line dicts with keys ``"bbox"`` (4 numbers), ``"spans"`` (list
            of dicts with at least ``"text"`` and ``"flags"``) and
            optionally ``"dir"`` (2 numbers).

        Returns
        -------
        new_lines : list
            Dicts with keys ``"bbox"``, ``"text"``, ``"dir"`` and
            ``"spans"``. Empty when ``raw_lines`` is empty.

        Notes
        -----
        Each span dict is updated in place with a ``"decomposed_flags"``
        entry. The ``"spans"`` lists of the raw lines themselves are never
        modified: every output line owns its own list.
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join(span["text"] for span in raw_line_spans)
            # A missing or empty "dir" is treated as the zero vector, both
            # when starting a line and when merging into an existing one
            # (the merge used to raise TypeError on a missing "dir").
            raw_line_dir = raw_line.get("dir") or (0, 0)

            for span in raw_line_spans:
                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])

            same_row = (
                new_line is not None
                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
            )

            if same_row:
                # Horizontal extent is the union of both boxes; the top edge
                # stays that of the first line and the bottom edge follows
                # the most recently merged line.
                new_line["bbox"] = (
                    min(new_line["bbox"][0], raw_line_bbox[0]),
                    new_line["bbox"][1],
                    max(new_line["bbox"][2], raw_line_bbox[2]),
                    raw_line_bbox[3],
                )
                new_line["text"] += " " + raw_line_text
                new_line["spans"].extend(raw_line_spans)
                # Direction vectors of merged lines are summed component-wise.
                new_line["dir"] = (
                    new_line["dir"][0] + raw_line_dir[0],
                    new_line["dir"][1] + raw_line_dir[1],
                )
                continue

            if new_line is not None:
                new_lines.append(new_line)

            new_line = {
                "bbox": raw_line_bbox,
                "text": raw_line_text,
                "dir": raw_line_dir,
                # Copy the list so later merges extend this copy instead of
                # the caller's raw_line["spans"].
                "spans": list(raw_line_spans),
            }

        if new_line is not None:
            new_lines.append(new_line)

        return new_lines

    def __make_new_block(self, raw_block):
        """
        Build a new block from a raw block.

        Parameters
        ----------
        raw_block : dict
            Raw block with keys ``"number"``, ``"bbox"`` and ``"lines"``.

        Returns
        -------
        new_block : dict

            Schema of new_block:
            {
                "block_id": <raw_block["number"]>,
                "bbox": <raw_block["bbox"]>,
                "text": <all span texts of the block joined with " ">,
                "lines": [
                    {
                        "bbox": (x0, y0, x1, y1),
                        "text": "This is a line.",
                        "dir": (dx, dy),
                        "spans": [
                            {
                                ...original span entries...,
                                "decomposed_flags": {"is_bold": False, ...},
                            }
                        ],
                    }
                ],
            }
        """
        raw_lines = raw_block["lines"]

        return {
            "block_id": raw_block["number"],
            "bbox": raw_block["bbox"],
            # Block text separates spans with a space, whereas the text of
            # a line concatenates the spans of one raw line with no separator.
            "text": " ".join(
                span["text"] for line in raw_lines for span in line["spans"]
            ),
            "lines": self.__make_new_lines(raw_lines),
        }

    def batch_process_blocks(self, pdf_dic):
        """
        Add a "para_blocks" list to every page entry of ``pdf_dic``.

        Parameters
        ----------
        pdf_dic : dict
            Mapping whose entries with a string key starting with
            ``"page_"`` are page dicts. The raw blocks of a page are read
            from its ``"preproc_blocks"`` entry. All other entries are left
            untouched. A demo of the schema is
            app/pdf_toolbox/tests/preproc_2_parasplit_example.json.

        Returns
        -------
        pdf_dic : dict
            The same object that was passed in. Every page dict has gained
            a ``"para_blocks"`` list, which is empty when the page has no
            ``"preproc_blocks"`` entry.
        """
        for page_id, page in pdf_dic.items():
            # Skip non-page entries (and keys that are not strings at all).
            if not (isinstance(page_id, str) and page_id.startswith("page_")):
                continue

            page["para_blocks"] = [
                self.__make_new_block(raw_block)
                for raw_block in page.get("preproc_blocks", [])
            ]

        return pdf_dic
|
|
|
|