Spaces:
Running
Running
| from adapters.infrastructure.toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles | |
| from adapters.infrastructure.toc.TitleFeatures import TitleFeatures | |
| from adapters.infrastructure.toc.data.TOCItem import TOCItem | |
| from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation | |
class TOCExtractor:
    """Derive a flat table of contents (TOC) from the titles of a segmented PDF.

    On construction the titles are obtained from ``MergeTwoSegmentsTitles`` and
    each one is converted, in order, into a TOC item carrying an indentation
    level. The result is stored in ``self.toc``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        """Collect the merged titles of *pdf_segmentation* and build the TOC.

        Args:
            pdf_segmentation: Segmentation handed to ``MergeTwoSegmentsTitles``;
                its ``titles_merged`` result is the list of titles processed.
        """
        self.pdf_segmentation = pdf_segmentation
        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
        self.toc: list[TOCItem] = list()
        self.set_toc()

    def set_toc(self) -> None:
        """Append one TOC item per title to ``self.toc``.

        Titles must be handled in order: the indentation of a title is computed
        from the TOC items already appended for the titles before it.
        """
        for index, title_features in enumerate(self.titles_features_sorted):
            indentation = self.get_indentation(index, title_features)
            self.toc.append(title_features.to_toc_item(indentation))

    def __str__(self):
        """Return one line per TOC item, prefixed by its indentation and a bullet."""
        return "\n".join([f'{" " * item.indentation} * {item.label}' for item in self.toc])

    def get_indentation(self, title_index: int, title_features: TitleFeatures) -> int:
        """Return the indentation level for the title at *title_index*.

        The first title is always level 0. Otherwise the already-built TOC is
        scanned backwards, skipping closed items, for a title that
        ``same_indentation`` considers a sibling; its level is reused and every
        deeper item is closed. When no sibling is found the title is nested one
        level below the title immediately before it.

        Args:
            title_index: Position of the title in ``self.titles_features_sorted``;
                ``self.toc`` must already hold the items for all earlier titles.
            title_features: Features of the title being placed.

        Returns:
            The indentation level to assign to the title.
        """
        if title_index == 0:
            return 0

        for index in reversed(range(title_index)):
            candidate = self.toc[index]
            # Closed items belong to a sub-tree that a later sibling already ended.
            if candidate.point_closed:
                continue

            if self.same_indentation(self.titles_features_sorted[index], title_features):
                # Returning to this level ends every deeper item seen so far.
                self.close_toc_items(candidate.indentation)
                return candidate.indentation

        return self.toc[title_index - 1].indentation + 1

    def close_toc_items(self, indentation) -> None:
        """Mark every TOC item deeper than *indentation* as closed."""
        for toc in self.toc:
            if toc.indentation > indentation:
                toc.point_closed = True

    @staticmethod
    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures) -> bool:
        """Tell whether two titles should share an indentation level.

        Declared as a static method because it takes no ``self`` parameter but
        is invoked through the instance by ``get_indentation``.

        Args:
            previous_title_features: Features of an earlier title.
            title_features: Features of the title being placed.

        Returns:
            True when the earlier title's ``first_characters`` is among
            ``title_features.get_possible_previous_point()``, or when both
            titles report equal ``get_features_toc()`` values; False otherwise.
        """
        if previous_title_features.first_characters in title_features.get_possible_previous_point():
            return True

        return previous_title_features.get_features_toc() == title_features.get_features_toc()

    def to_dict(self) -> list:
        """Return the TOC as a list of plain dictionaries.

        Each entry has ``indentation``, ``label`` and ``bounding_box`` keys. The
        bounding box holds ``left``, ``top``, ``width`` and ``height`` truncated
        to ``int``, plus ``page`` as the page number converted to ``str``.
        """
        toc: list[dict[str, object]] = list()
        for toc_item in self.toc:
            rectangle = toc_item.selection_rectangle
            toc.append(
                {
                    "indentation": toc_item.indentation,
                    "label": toc_item.label,
                    "bounding_box": {
                        "left": int(rectangle.left),
                        "top": int(rectangle.top),
                        "width": int(rectangle.width),
                        "height": int(rectangle.height),
                        "page": str(rectangle.page_number),
                    },
                }
            )
        return toc