File size: 2,763 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from typing import Any

from adapters.infrastructure.toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures
from adapters.infrastructure.toc.data.TOCItem import TOCItem
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation


class TOCExtractor:
    """Build a table of contents (TOC) from the title segments of a PDF.

    The titles are taken from
    ``MergeTwoSegmentsTitles(pdf_segmentation).titles_merged``. Each title is
    converted to a TOC item through ``title_features.to_toc_item(indentation)``,
    where the indentation is derived by comparing the title against the
    titles processed before it (see ``get_indentation``).
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        """Store *pdf_segmentation*, collect its merged titles and build the TOC."""
        self.pdf_segmentation = pdf_segmentation
        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
        self.toc: list[TOCItem] = []
        self.set_toc()

    def set_toc(self) -> None:
        """Rebuild ``self.toc`` with one item per entry of ``self.titles_features_sorted``."""
        # Start from an empty list: get_indentation() indexes self.toc in
        # parallel with self.titles_features_sorted, so appending to the result
        # of an earlier call would duplicate entries and misalign the indices.
        self.toc = []
        for index, title_features in enumerate(self.titles_features_sorted):
            indentation = self.get_indentation(index, title_features)
            self.toc.append(title_features.to_toc_item(indentation))

    def __str__(self) -> str:
        """Return one line per TOC item: two spaces per indentation level, then `` * <label>``."""
        return "\n".join(f'{"  " * item.indentation} * {item.label}' for item in self.toc)

    def get_indentation(self, title_index: int, title_features: TitleFeatures) -> int:
        """Return the indentation level for the title at *title_index*.

        The first title gets level 0. For any other title the already built
        TOC items are scanned from the most recent one backwards, skipping
        items flagged ``point_closed``. On the first earlier title for which
        ``same_indentation`` is true, every item deeper than that title is
        closed and that title's level is returned. When no earlier title
        matches, the result is one level deeper than the immediately
        preceding item.

        ``self.toc`` must already hold the items for all indices below
        *title_index*.
        """
        if title_index == 0:
            return 0

        for index in reversed(range(title_index)):
            if self.toc[index].point_closed:
                continue

            if self.same_indentation(self.titles_features_sorted[index], title_features):
                matched_indentation = self.toc[index].indentation
                self.close_toc_items(matched_indentation)
                return matched_indentation

        return self.toc[title_index - 1].indentation + 1

    def close_toc_items(self, indentation: int) -> None:
        """Set ``point_closed`` on every TOC item whose indentation is greater than *indentation*."""
        for toc_item in self.toc:
            if toc_item.indentation > indentation:
                toc_item.point_closed = True

    @staticmethod
    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures) -> bool:
        """Tell whether two titles belong to the same indentation level.

        Returns True when ``previous_title_features.first_characters`` is among
        ``title_features.get_possible_previous_point()``, or when both titles
        yield equal ``get_features_toc()`` values; False otherwise.
        """
        if previous_title_features.first_characters in title_features.get_possible_previous_point():
            return True

        if previous_title_features.get_features_toc() == title_features.get_features_toc():
            return True

        return False

    def to_dict(self) -> list[dict[str, Any]]:
        """Return the TOC as a list of plain dictionaries.

        Each entry has the keys ``"indentation"``, ``"label"`` and
        ``"bounding_box"``. The bounding box holds ``left``, ``top``, ``width``
        and ``height`` converted with ``int()`` and ``page`` converted with
        ``str()``, all read from the item's ``selection_rectangle``.
        """
        toc: list[dict[str, Any]] = []

        for toc_item in self.toc:
            rectangle = toc_item.selection_rectangle
            toc.append(
                {
                    "indentation": toc_item.indentation,
                    "label": toc_item.label,
                    "bounding_box": {
                        "left": int(rectangle.left),
                        "top": int(rectangle.top),
                        "width": int(rectangle.width),
                        "height": int(rectangle.height),
                        "page": str(rectangle.page_number),
                    },
                }
            )

        return toc