class Doc:
    """A text document split into lines and organised as a tree of containers.

    Attributes:
        params: Parsing parameters. Keys read by this module are ``'type'``,
            ``'startswith_'``, ``'endswith_'`` and ``'discarded'``.
        lines: Non-blank, stripped lines of the text as ``Line`` objects
            (minus the title line when it was taken from the text).
        title: Title of the document.
        container: Root ``Container`` built from ``lines``.
        fulltext: The raw text the document was built from.
    """

    def __init__(self, fulltext: str = '', title: str = '',
                 params: "dict | None" = None):
        """Parse *fulltext* and build the container tree.

        Args:
            fulltext: Raw text; it is split on newlines and blank lines are
                ignored.
            title: Title to use, unless ``params['type'] == 'input_text'``
                (then the title is looked up in the first line).
            params: Parsing parameters. ``None`` means "no parameters"; a
                fresh dict is created so instances never share one.
        """
        # A ``{}`` default would be one dict shared by every Doc instance.
        self.params = {} if params is None else params
        self.lines = [
            Line(stripped, self.params)
            for stripped in (raw.strip() for raw in fulltext.split("\n"))
            if stripped
        ]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title,
                                   father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For ``'input_text'`` documents the title is the text of the first
        line when that line has type ``'title'`` (the line is then removed
        from the returned lines); otherwise a placeholder title is used.
        For every other document type *title* is returned unchanged.
        """
        lines = self.lines
        # ``.get`` so that a Doc built without parameters does not crash.
        if self.params.get('type') == 'input_text':
            if lines and lines[0].type == 'title':
                title = lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines


class WikiPage(Doc):
    """A ``Doc`` configured for wiki-style ``== Heading ==`` markup."""

    def __init__(self, fulltext='', title=''):
        """Build the page with the wiki heading markers and discarded sections."""
        self.params = {
            'type': 'wiki',
            # Opening marker -> heading level (as a digit string).
            'startswith_': {
                '== ': '1',
                '=== ': '2',
                '==== ': '3',
                '===== ': '4',
                '====== ': '5',
                '======= ': '6',
            },
            # Closing markers, in the same order as 'startswith_'.
            'endswith_': [
                ' ==',
                ' ===',
                ' ====',
                ' =====',
                ' ======',
                ' =======',
            ],
            # Titles of top-level sections removed from the tree.
            'discarded': [
                "See also", "Notes", "References", "Sources",
                "External links", "Bibliography", "Cinematic adaptations",
                "Further reading", "Maps",
            ],
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the paragraphs of the root container (see ``Container``)."""
        return self.container.get_paragraphs(chunk)


class Container:
    """A section of a document: its own lines plus nested sub-sections.

    Attributes:
        children: Direct sub-sections, as ``Container`` objects.
        level: Depth of this container (0 for the root).
        title: Heading text of the section.
        father: Object that owns this container (a ``Doc`` or ``Container``).
        lines: Lines that belong to this section before its first heading.
        containers: This container followed by all its descendants,
            depth first.
        root_text: Text of ``lines`` only, joined with single spaces.
        text: ``root_text`` followed by the text of every child. Section
            titles are not part of it.
    """

    def __init__(self, lines=None, level=0, title='', father=None,
                 params=None):
        """Build the container and, recursively, its children.

        Args:
            lines: ``Line`` objects of the section (``None`` means none).
            level: Depth of this container.
            title: Heading text of the section.
            father: Owner of this container.
            params: Optional parameters; when it has a ``'discarded'`` entry,
                direct children whose title is listed there are dropped.
                Children are built without parameters, so the filter only
                applies to the container that received it.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand([] if lines is None else lines)

        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [child for child in self.children
                             if child.title not in discarded]

        self.containers = [self]
        for child in self.children:
            self.containers += child.containers

        # ``root_text`` is read by get_paragraphs(); it used to be undefined.
        self.root_text = ' '.join(line.text for line in self.lines)
        # The container's own lines are part of its text, not only the
        # children's; empty pieces are skipped to avoid stray spaces.
        pieces = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(piece for piece in pieces if piece)

    def _expand(self, lines):
        """Distribute *lines* between ``self.lines`` and new children.

        Lines before the first heading stay in this container. The first
        heading opens a child one level below this container (its ``level``
        is overwritten accordingly). After that, deeper headings and plain
        lines are handed to the current child, while a heading at the child
        level or above closes the current child and opens the next one.
        """
        child_level = self.level + 1
        in_child = False
        child_title = ''
        child_lines = []

        for line in lines:
            if not in_child:
                if line.is_structure:
                    in_child = True
                    child_title = line.text
                    child_lines = []
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif line.is_structure and line.level <= child_level:
                # A heading above the child level used to be dropped
                # silently; it now opens a sibling like any other heading.
                self._add_child(child_lines, child_title)
                child_title = line.text
                child_lines = []
                line.level = child_level
            else:
                child_lines.append(line)

        if in_child:
            self._add_child(child_lines, child_title)

    def _add_child(self, lines, title):
        """Append a child container one level below this one."""
        self.children.append(Container(lines=lines, level=self.level + 1,
                                       title=title, father=self))

    def get_paragraphs(self, chunk=500):
        """Split the container into paragraphs shorter than *chunk*.

        Returns ``[self.text]`` when the whole text is shorter than *chunk*
        characters. Otherwise returns the container's own text followed by
        the paragraphs of each child, computed the same way. A container
        without children cannot be split further, so its own text may still
        be longer than *chunk*.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs


class Line:
    """One line of text, classified by the markers it starts with.

    Attributes:
        text: The line with its heading markers removed.
        params: Parsing parameters (``'startswith_'``, ``'endswith_'``).
        type: Value mapped to the matching opening marker, or ``'normal'``.
        level: ``int(type)`` when ``type`` is a digit string, else ``-1``.
        is_structure: ``True`` when ``level`` is positive (a heading).
    """

    def __init__(self, text, params):
        """Classify *text* using the markers found in *params*."""
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    @staticmethod
    def _strip_markers(text, start, end):
        """Remove the opening marker and everything from the closing one."""
        # Slice instead of ``split(start)[1]`` so that a heading repeating
        # its opening marker is not truncated (and '' is a valid marker).
        text = text[len(start):]
        if end != "":
            text = text.split(end, 1)[0]
        return text.strip()

    def _parse_text(self):
        """Return ``(type, text)`` for the line.

        The first opening marker of ``params['startswith_']`` that the line
        starts with decides the type; the closing marker at the same
        position in ``params['endswith_']`` (if any) is removed as well.
        Lines matching no marker are ``'normal'``.
        """
        startswith_ = self.params.get('startswith_', {})
        endswith_ = self.params.get('endswith_', [""] * len(startswith_))

        for index, (starter, type_) in enumerate(startswith_.items()):
            if self.text.startswith(starter):
                # Tolerate an 'endswith_' list shorter than 'startswith_'.
                end = endswith_[index] if index < len(endswith_) else ""
                return type_, self._strip_markers(self.text, starter, end)
        return 'normal', self.text.strip()