Spaces:
Running
Running
from adapters.infrastructure.toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles | |
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures | |
from adapters.infrastructure.toc.data.TOCItem import TOCItem | |
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation | |
class TOCExtractor:
    """Derive a table of contents from the merged titles of a PDF segmentation.

    The titles come from ``MergeTwoSegmentsTitles(pdf_segmentation).titles_merged``
    and are processed in the order that attribute provides. Each title is
    assigned an indentation level and stored as a TOC item in ``self.toc``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        """Store the segmentation, collect its merged titles and build the TOC."""
        self.pdf_segmentation = pdf_segmentation
        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
        self.toc: list[TOCItem] = []
        self.set_toc()

    def set_toc(self) -> None:
        """Append one TOC item per title to ``self.toc``.

        Items must be appended one at a time: ``get_indentation`` inspects the
        items already present in ``self.toc`` to place the next title.
        """
        for index, title_features in enumerate(self.titles_features_sorted):
            indentation = self.get_indentation(index, title_features)
            self.toc.append(title_features.to_toc_item(indentation))

    def __str__(self) -> str:
        """Return one line per TOC item, prefixed by indentation and a bullet."""
        return "\n".join([f'{" " * x.indentation} * {x.label}' for x in self.toc])

    def get_indentation(self, title_index: int, title_features: TitleFeatures) -> int:
        """Return the indentation level for the title at ``title_index``.

        The first title gets level 0. For any other title, earlier TOC items
        are scanned from the most recent backwards, skipping items flagged as
        ``point_closed``. If an earlier title is at the same level according to
        ``same_indentation``, every item deeper than that level is closed and
        that level is reused. If no earlier title matches, the title is placed
        one level below the immediately preceding item.
        """
        if title_index == 0:
            return 0

        for index in reversed(range(title_index)):
            earlier_item = self.toc[index]
            if earlier_item.point_closed:
                continue

            if self.same_indentation(self.titles_features_sorted[index], title_features):
                # A sibling was found: deeper items can no longer receive siblings.
                self.close_toc_items(earlier_item.indentation)
                return earlier_item.indentation

        return self.toc[title_index - 1].indentation + 1

    def close_toc_items(self, indentation) -> None:
        """Flag every TOC item deeper than ``indentation`` as ``point_closed``."""
        for item in self.toc:
            if item.indentation > indentation:
                item.point_closed = True

    # Declared static because it takes no ``self`` yet is invoked through the
    # instance (``self.same_indentation(a, b)``); without the decorator that
    # call would pass three positional arguments and raise TypeError.
    @staticmethod
    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures) -> bool:
        """Return True when both titles belong at the same indentation level.

        Two titles match when the earlier title's ``first_characters`` is among
        the values returned by ``title_features.get_possible_previous_point()``,
        or when both titles return equal values from ``get_features_toc()``.
        """
        if previous_title_features.first_characters in title_features.get_possible_previous_point():
            return True

        return previous_title_features.get_features_toc() == title_features.get_features_toc()

    def to_dict(self):
        """Return the TOC as a list of plain dictionaries.

        Each entry has the keys ``indentation``, ``label`` and ``bounding_box``.
        The bounding box holds ``left``, ``top``, ``width`` and ``height``
        truncated to ``int``, plus ``page`` as a string.
        """
        toc: list[dict[str, object]] = []
        for toc_item in self.toc:
            box = toc_item.selection_rectangle
            toc.append(
                {
                    "indentation": toc_item.indentation,
                    "label": toc_item.label,
                    "bounding_box": {
                        "left": int(box.left),
                        "top": int(box.top),
                        "width": int(box.width),
                        "height": int(box.height),
                        "page": str(box.page_number),
                    },
                }
            )
        return toc