|
class Doc:
    """A text document split into parsed lines and a tree of containers.

    Attributes:
        params: parsing configuration shared with every ``Line`` and with the
            root ``Container``.
        lines: the non-empty, stripped lines of ``fulltext`` wrapped as
            ``Line`` objects (minus the title line when it was consumed).
        title: the document title (see ``_get_title``).
        container: root ``Container`` built from ``lines``.
        fulltext: the raw text the document was built from.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: dict = None):
        """Parse ``fulltext`` into lines and build the container tree.

        Args:
            fulltext: raw document text; it is split on newlines and blank
                lines are dropped.
            title: title to use unless ``params['type'] == 'input_text'``.
            params: parsing configuration; ``None`` means an empty dict.
        """
        # A fresh dict per instance: a literal `{}` default would be shared
        # between every Doc created without explicit params.
        self.params = {} if params is None else params
        stripped_lines = (raw.strip() for raw in fulltext.split("\n"))
        self.lines = [Line(text, self.params) for text in stripped_lines if text]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title,
                                   father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` for this document.

        For documents whose ``params['type']`` is ``'input_text'`` the title is
        taken from the first line when that line has type ``'title'`` (the line
        is then removed from the returned list); otherwise a placeholder title
        is returned. For every other document type, including params with no
        ``'type'`` key, the ``title`` argument and the lines are returned as is.
        """
        lines = self.lines
        # .get() keeps documents built with params lacking 'type' working.
        if self.params.get('type') == 'input_text':
            if lines and lines[0].type == 'title':
                title = lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines
|
|
|
|
|
class WikiPage(Doc):
    """A ``Doc`` configured for wiki-style ``== Heading ==`` markup."""

    def __init__(self, fulltext='', title=''):
        """Build the wiki parsing parameters and delegate to ``Doc``.

        Args:
            fulltext: raw page text.
            title: page title, passed through to ``Doc``.
        """
        self.params = {
            'type': 'wiki',
            # Opening heading marker -> heading level (as a digit string).
            'startswith_': {
                '== ': '1',
                '=== ': '2',
                '==== ': '3',
                '===== ': '4',
                '====== ': '5',
                '======= ': '6',
            },
            # Closing markers, matched to 'startswith_' by position. The last
            # entry used to repeat ' ======'; it now mirrors the seven-'='
            # opening marker like every other pair.
            'endswith_': [' ==', ' ===', ' ====', ' =====', ' ======', ' ======='],
            # Titles of child sections removed from the root container.
            'discarded': [
                "See also", "Notes", "References", "Sources", "External links",
                "Bibliography", "Cinematic adaptations", "Further reading", "Maps",
            ],
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the page's paragraphs as produced by the root container.

        Args:
            chunk: length threshold forwarded to ``Container.get_paragraphs``.
        """
        return self.container.get_paragraphs(chunk)
|
|
|
|
|
class Container:
    """A node of the document tree: its own lines plus nested sub-sections.

    Attributes:
        children: child ``Container`` objects, one per heading found at
            ``level + 1``.
        level: depth of this container (the root is 0).
        title: heading text of this section.
        father: parent object given by the caller (not used internally).
        lines: the lines that belong directly to this container, i.e. those
            seen before its first sub-heading.
        containers: this container followed by all of its descendants.
        root_text: text of ``lines`` only, joined with single spaces.
        text: ``root_text`` followed by the text of every child.
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the subtree described by ``lines``.

        Args:
            lines: line objects exposing ``text``, ``level`` and
                ``is_structure``; ``None`` means no lines.
            level: depth of this container.
            title: heading text of this container.
            father: parent object, stored as is.
            params: optional configuration; when it has a ``'discarded'``
                entry, direct children whose title is listed there are
                dropped. Params are not forwarded to child containers.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand([] if lines is None else lines)
        if params and 'discarded' in params:
            discarded = params['discarded']
            self.children = [child for child in self.children
                             if child.title not in discarded]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # The container's own text. It was previously never set although
        # get_paragraphs reads it, and `text` only aggregated children,
        # so no line content ever reached it.
        self.root_text = ' '.join(line.text for line in self.lines)
        self.text = self.root_text + ''.join(' ' + child.text
                                             for child in self.children)

    def _expand(self, lines):
        """Distribute ``lines`` between this container and new children.

        Lines before the first heading are kept in ``self.lines``. The first
        heading opens a child section and has its ``level`` forced to
        ``self.level + 1``. After that, a heading at exactly that level closes
        the current child and opens the next one, while plain lines and deeper
        headings are collected for the current child.
        """
        child_level = self.level + 1
        in_child = False
        child_lines = []
        child_title = ''
        for line in lines:
            if not in_child:
                if line.is_structure:
                    in_child = True
                    child_lines = []
                    child_title = line.text
                    # Normalise the first heading so siblings compare equal.
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif not line.is_structure or child_level < line.level:
                child_lines.append(line)
            elif line.level == child_level:
                self._add_child(child_lines, child_title)
                child_lines = []
                child_title = line.text
            # A heading shallower than child_level matches neither branch and
            # is skipped, as before.
        if in_child:
            self._add_child(child_lines, child_title)

    def _add_child(self, lines, title):
        """Append a child container one level below this one."""
        self.children.append(Container(lines=lines,
                                       level=self.level + 1,
                                       title=title,
                                       father=self))

    def get_paragraphs(self, chunk=500):
        """Return the subtree's text split into paragraphs.

        If the whole text of this container is shorter than ``chunk`` it is
        returned as a single paragraph. Otherwise the container's own text is
        the first paragraph, followed by the paragraphs of each child.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs
|
|
|
|
|
class Line:
    """One line of a document, classified from its opening marker.

    Attributes:
        text: the line text with heading markers removed and whitespace
            stripped.
        params: parsing configuration the line was built with.
        type: the value mapped to the matching ``'startswith_'`` marker, or
            ``'normal'`` when no marker matches.
        level: ``int(type)`` when ``type`` is a digit string, otherwise -1.
        is_structure: True when ``level`` is positive (the line is a heading).
    """

    def __init__(self, text, params):
        """Classify ``text`` according to ``params``.

        Args:
            text: raw line text.
            params: dict that may hold ``'startswith_'`` (opening marker ->
                type string) and ``'endswith_'`` (closing markers, matched to
                the opening markers by position).
        """
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self):
        """Return ``(type, text)`` for this line.

        The first ``'startswith_'`` marker (in dict order) that prefixes the
        text decides the type. The marker is removed from the start and, when
        a non-empty closing marker is configured for it, everything from the
        last occurrence of that closing marker onwards is removed too.
        """
        # Params without 'startswith_' simply define no markers.
        starters = self.params.get('startswith_', {})
        closers = self.params.get('endswith_')
        if closers is None:
            closers = [""] * len(starters)

        for index, starter in enumerate(starters):
            if not self.text.startswith(starter):
                continue
            # Slice the prefix off instead of splitting on it, so a marker
            # that reappears inside the text does not truncate the text.
            content = self.text[len(starter):]
            closer = closers[index]
            if closer != "":
                # Cut at the last closing marker; a marker that is only a
                # prefix of the real closing run still removes it.
                content = content.rsplit(closer, 1)[0]
            return starters[starter], content.strip()

        return 'normal', self.text.strip()
|
|