Spaces:
Running
Running
File size: 2,763 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from typing import Any

from adapters.infrastructure.toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures
from adapters.infrastructure.toc.data.TOCItem import TOCItem
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
class TOCExtractor:
    """Build a table of contents with one indentation level per title.

    The titles are taken from
    ``MergeTwoSegmentsTitles(pdf_segmentation).titles_merged`` and processed
    in that order; each one becomes a TOC item through
    ``TitleFeatures.to_toc_item(indentation)``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation) -> None:
        """Store *pdf_segmentation*, collect its merged titles and build the TOC."""
        self.pdf_segmentation = pdf_segmentation
        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
        # One entry per title, kept in the same order as titles_features_sorted.
        self.toc: list[TOCItem] = []
        self.set_toc()

    def set_toc(self) -> None:
        """(Re)build ``self.toc`` from ``self.titles_features_sorted``."""
        # Start from an empty list on every call: get_indentation() relies on
        # self.toc[i] being the item built for titles_features_sorted[i], so
        # leftovers from an earlier call would duplicate and misalign entries.
        self.toc = []
        for index, title_features in enumerate(self.titles_features_sorted):
            indentation = self.get_indentation(index, title_features)
            self.toc.append(title_features.to_toc_item(indentation))

    def __str__(self) -> str:
        """Return one ``" * label"`` line per item, prefixed by one space per indentation level."""
        return "\n".join(f'{" " * item.indentation} * {item.label}' for item in self.toc)

    def get_indentation(self, title_index: int, title_features: TitleFeatures) -> int:
        """Return the indentation level for the title at *title_index*.

        The first title gets level 0. For any other title, the earlier titles
        are scanned from the nearest to the farthest, skipping those whose TOC
        item has ``point_closed`` set. At the first one that
        ``same_indentation`` matches, every TOC item indented deeper than that
        match is closed and the match's indentation is returned. When nothing
        matches, the title is placed one level below the previous item.

        ``self.toc`` must already hold the items for all titles before
        *title_index*.
        """
        if title_index == 0:
            return 0

        for index in reversed(range(title_index)):
            candidate = self.toc[index]
            if candidate.point_closed:
                continue

            if self.same_indentation(self.titles_features_sorted[index], title_features):
                # Items deeper than the matched level are excluded from later scans.
                self.close_toc_items(candidate.indentation)
                return candidate.indentation

        return self.toc[title_index - 1].indentation + 1

    def close_toc_items(self, indentation: int) -> None:
        """Set ``point_closed`` on every TOC item indented deeper than *indentation*."""
        for toc_item in self.toc:
            if toc_item.indentation > indentation:
                toc_item.point_closed = True

    @staticmethod
    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures) -> bool:
        """Tell whether two titles belong to the same indentation level.

        They do when the earlier title's ``first_characters`` is among
        ``title_features.get_possible_previous_point()``, or when both titles
        return equal values from ``get_features_toc()``.
        """
        if previous_title_features.first_characters in title_features.get_possible_previous_point():
            return True

        if previous_title_features.get_features_toc() == title_features.get_features_toc():
            return True

        return False

    def to_dict(self) -> list[dict[str, Any]]:
        """Return the TOC as a list of plain dictionaries.

        Each entry has the keys ``"indentation"``, ``"label"`` and
        ``"bounding_box"``; the bounding box holds ``"left"``, ``"top"``,
        ``"width"`` and ``"height"`` converted with ``int()`` and ``"page"``
        converted with ``str()``.
        """
        return [
            {
                "indentation": toc_item.indentation,
                "label": toc_item.label,
                "bounding_box": self._rectangle_to_dict(toc_item.selection_rectangle),
            }
            for toc_item in self.toc
        ]

    @staticmethod
    def _rectangle_to_dict(rectangle) -> dict[str, Any]:
        """Convert a selection rectangle into the ``bounding_box`` dictionary."""
        return {
            "left": int(rectangle.left),
            "top": int(rectangle.top),
            "width": int(rectangle.width),
            "height": int(rectangle.height),
            "page": str(rectangle.page_number),
        }
|