File size: 4,693 Bytes
4cf88e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class Doc:
    """A text document split into parsed lines and wrapped in a root container.

    Attributes:
        params: parsing configuration shared with every ``Line`` and with the
            root ``Container``.
        lines: the non-blank lines of ``fulltext`` as ``Line`` objects, minus
            the title line when one was extracted.
        title: the document title (see ``_get_title``).
        container: root ``Container`` built from ``lines``.
        fulltext: the raw text the document was built from.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        """Parse ``fulltext`` line by line and build the container tree.

        Args:
            fulltext: raw document text; split on newlines, blank lines dropped.
            title: title to use unless ``params['type']`` is ``'input_text'``.
            params: parsing configuration. Defaults to a fresh empty dict.
        """
        # A literal `{}` default would be one dict shared by every instance
        # created without params; build a new one per call instead.
        self.params = params if params is not None else {}
        self.lines = [Line(text.strip(), self.params) for text in fulltext.split("\n") if text.strip()]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title, father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For documents whose ``params['type']`` is ``'input_text'`` the title is
        taken from the first line when that line has type ``'title'`` (and the
        line is removed from the returned list); otherwise the placeholder
        ``'the title is missing'`` is used. For every other document type,
        including params without a ``'type'`` key, the supplied ``title`` and
        the current lines are returned unchanged.
        """
        # .get(): params without a 'type' key simply means "not input_text".
        if self.params.get('type') != 'input_text':
            return title, self.lines

        first = self.lines[0] if self.lines else None
        if first and first.type == 'title':
            return first.text, self.lines[1:]
        return 'the title is missing', self.lines


class WikiPage(Doc):
    """A ``Doc`` preconfigured for wiki-style ``== Heading ==`` section markup."""

    def __init__(self, fulltext='', title=''):
        """Build the wiki parsing configuration and parse ``fulltext``.

        Args:
            fulltext: raw page text.
            title: page title, passed through to ``Doc``.
        """
        self.params = {
            'type': 'wiki',
            # Heading opener -> heading level (as a digit string).
            'startswith_':
                {'== ': '1', '=== ': '2', '==== ': '3', '===== ': '4', '====== ': '5', '======= ': '6'},
            # Closing markers, positionally paired with the openers above.
            # The sixth entry mirrors the seven-'=' opener (it previously
            # repeated the fifth entry).
            'endswith_':
                [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],

            # Section titles that Container filters out of the page tree.
            'discarded': ["See also", "Notes", "References", "Sources", "External links", "Bibliography",
                          "Cinematic adaptations", "Further reading", "Maps"]
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page's paragraphs by delegating to the root container.

        Args:
            chunk: length threshold forwarded to ``Container.get_paragraphs``.
        """
        return self.container.get_paragraphs(chunk)


class Container:
    """A node in the section tree of a document.

    A container owns the plain lines that appear before its first sub-heading
    (``lines`` / ``root_text``) and one child container per sub-heading.

    Attributes:
        children: direct sub-sections, in document order.
        level: depth of this container (0 for the root).
        title: heading text of this section.
        father: the parent object passed in by the creator.
        lines: line objects belonging directly to this container.
        containers: this container followed by all of its descendants.
        root_text: text of ``lines`` joined with single spaces.
        text: ``root_text`` followed by the text of every kept child.
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the subtree for ``lines``.

        Args:
            lines: line objects exposing ``text``, ``level`` and
                ``is_structure``. Defaults to no lines.
            level: depth of this container.
            title: heading text of this container.
            father: parent object.
            params: optional configuration; when it has a ``'discarded'`` entry,
                direct children whose title is listed there are removed.
                Params are not forwarded to child containers, so the filter
                only applies to this container's direct children.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand(lines if lines is not None else [])
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [child for child in self.children if child.title not in discarded]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # Own text first, then the children's, so `text` is the full section
        # content and `root_text` is what get_paragraphs emits for this node.
        self.root_text = ' '.join(line.text for line in self.lines)
        parts = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(part for part in parts if part)

    def _add_child(self, title, lines):
        """Append a child container one level below this one."""
        self.children.append(Container(lines=lines,
                                       level=self.level + 1,
                                       title=title,
                                       father=self))

    def _expand(self, lines):
        """Distribute ``lines`` between this container and new child containers.

        Lines before the first heading stay in ``self.lines``. The first heading
        opens a child and has its ``level`` rewritten to ``self.level + 1``.
        Afterwards, a heading at that level or shallower closes the current
        child and opens the next one; everything else is handed to the
        current child.
        """
        child_level = self.level + 1
        in_child = False
        child_lines = []
        child_title = ''
        for line in lines:
            if not in_child:
                if line.is_structure:
                    in_child = True
                    child_lines = []
                    child_title = line.text
                    # Normalise the first heading so siblings compare against it.
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif line.is_structure and line.level <= child_level:
                # `<=` rather than `==`: a shallower heading also ends the
                # current section instead of being silently dropped.
                self._add_child(child_title, child_lines)
                child_lines = []
                child_title = line.text
            else:
                child_lines.append(line)
        if in_child:
            self._add_child(child_title, child_lines)

    def get_paragraphs(self, chunk=500):
        """Return the section's text as a list of paragraphs.

        If the whole section is shorter than ``chunk`` characters it is returned
        as a single paragraph; otherwise the container's own text is one
        paragraph and each child contributes its own paragraphs recursively.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs


class Line:
    """One line of a document, classified by its leading marker.

    Attributes:
        text: the line content with any recognised markers removed.
        params: the parsing configuration the line was built with.
        type: the value mapped to the matching start marker in
            ``params['startswith_']``, or ``'normal'`` when none matches.
        level: ``int(type)`` when ``type`` is all digits, otherwise ``-1``.
        is_structure: ``True`` when ``level`` is positive (a heading).
    """

    def __init__(self, text, params):
        """Classify ``text`` using the markers found in ``params``."""
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self):
        """Return ``(type, stripped_text)`` for this line.

        ``params['startswith_']`` maps start markers to types;
        ``params['endswith_']`` optionally lists the closing marker for each
        start marker, paired by position. The first start marker that prefixes
        the line wins. A missing ``'startswith_'`` means no markers, and a
        missing or too-short ``'endswith_'`` means no closing marker.
        """
        params = self.params or {}
        starters = params.get('startswith_', {})
        enders = params.get('endswith_', [])

        for index, (starter, type_) in enumerate(starters.items()):
            if not self.text.startswith(starter):
                continue
            # Slice the prefix off instead of splitting on it, so a title that
            # contains the marker again (e.g. "== a == b ==") is kept whole.
            body = self.text[len(starter):]
            ender = enders[index] if index < len(enders) else ""
            if ender:
                # Cut at the last closing marker only, for the same reason.
                body = body.rsplit(ender, 1)[0]
            return type_, body.strip()

        return 'normal', self.text.strip()