File size: 4,693 Bytes
4cf88e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
class Doc:
    """A text document parsed into lines and arranged as a tree of sections.

    The constructor splits ``fulltext`` on newlines, wraps every non-blank
    (stripped) line in a ``Line``, resolves the document title and builds the
    root ``Container`` from the remaining lines.

    Attributes:
        params: parsing options shared with every ``Line`` and the root
            ``Container``. The ``'type'`` key, when equal to ``'input_text'``,
            makes the title come from the first line (see ``_get_title``).
        lines: parsed lines, without the title line when one was consumed.
        title: resolved document title.
        container: root ``Container`` built from ``lines``.
        fulltext: the raw text passed in, unchanged.
    """

    def __init__(self, fulltext: str = '', title: str = '', params: "dict | None" = None):
        """Parse ``fulltext``.

        Args:
            fulltext: raw document text; blank lines are skipped.
            title: title to use unless ``params['type'] == 'input_text'``.
            params: parsing options; ``None`` means "no options".
        """
        # Build a fresh dict per instance: a literal ``{}`` default would be a
        # single object shared (and stored) by every Doc created without params.
        self.params = {} if params is None else params
        stripped_lines = (raw.strip() for raw in fulltext.split("\n"))
        self.lines = [Line(text, self.params) for text in stripped_lines if text]
        self.title, self.lines = self._get_title(title)
        self.container = Container(lines=self.lines, title=self.title,
                                   father=self, params=self.params)
        self.fulltext = fulltext

    def _get_title(self, title):
        """Return ``(title, lines)`` after applying the title rule.

        For ``'input_text'`` documents the first line supplies the title when
        its type is ``'title'`` (and is removed from the returned lines);
        otherwise a placeholder title is used. For every other document type,
        including a missing ``'type'`` key, ``title`` and ``self.lines`` are
        returned unchanged.
        """
        lines = self.lines
        # .get(): params without a 'type' key must not raise KeyError.
        if self.params.get('type') == 'input_text':
            if lines and lines[0].type == 'title':
                title = lines[0].text
                lines = lines[1:]
            else:
                title = 'the title is missing'
        return title, lines
class WikiPage(Doc):
    """``Doc`` preconfigured with the parameter set for ``'wiki'`` text."""

    def __init__(self, fulltext='', title=''):
        """Assemble the wiki parameters and delegate parsing to ``Doc``.

        Args:
            fulltext: raw page text.
            title: page title, forwarded to ``Doc``.
        """
        # Line prefix -> heading type; the type is a digit string.
        heading_openers = {
            '== ': '1', '=== ': '2', '==== ': '3',
            '===== ': '4', '====== ': '5', '======= ': '6',
        }
        # Closing markers, listed in the same order as the openers above.
        # NOTE(review): the last entry repeats ' ======' instead of using
        # seven '='; kept exactly as in the original - confirm the intent.
        heading_closers = [' ==', ' ===', ' ====', ' =====', ' ======', ' ======']
        # Titles listed under the 'discarded' key.
        skipped_titles = ["See also", "Notes", "References", "Sources", "External links",
                          "Bibliography", "Cinematic adaptations", "Further reading", "Maps"]
        self.params = {
            'type': 'wiki',
            'startswith_': heading_openers,
            'endswith_': heading_closers,
            'discarded': skipped_titles,
        }
        super().__init__(fulltext=fulltext, title=title, params=self.params)

    def get_paragraphs(self, chunk=500):
        """Return the paragraphs produced by the root container for ``chunk``."""
        root = self.container
        return root.get_paragraphs(chunk)
class Container:
    """Node of the section tree built from a flat list of parsed lines.

    Lines that come before the first heading stay in this container; every
    heading at the next level opens a child container holding the lines
    that follow it.

    Attributes:
        children: direct child containers (after the ``discarded`` filter).
        level: depth of this node, as passed in (0 by default).
        title: title of this section.
        father: the object passed as ``father``.
        lines: lines owned directly by this container.
        containers: this node followed by the ``containers`` of each child.
        root_text: text of ``lines`` only, joined with single spaces.
        text: ``root_text`` followed by the ``text`` of every kept child,
            joined with single spaces (empty parts are skipped).
    """

    def __init__(self, lines=None, level=0, title='', father=None, params=None):
        """Build the subtree for ``lines``.

        Args:
            lines: objects exposing ``text``, ``level`` and ``is_structure``.
            level: depth of this container.
            title: title of this section.
            father: owner of this container.
            params: optional dict; when it has a ``'discarded'`` key, direct
                children whose title is listed there are dropped.
        """
        self.children = []
        self.level = level
        self.title = title
        self.father = father
        self.lines = []
        self._expand([] if lines is None else lines)
        if params and 'discarded' in params:
            # Children are created without params, so this filter only acts
            # on the direct children of the container that received params.
            discarded = params['discarded']
            self.children = [child for child in self.children
                             if child.title not in discarded]
        self.containers = [self]
        for child in self.children:
            self.containers += child.containers
        # root_text must exist because get_paragraphs() reads it, and the
        # container's own lines have to be part of text (previously text was
        # assembled from children only, so line content never reached it).
        self.root_text = ' '.join(line.text for line in self.lines)
        parts = [self.root_text] + [child.text for child in self.children]
        self.text = ' '.join(part for part in parts if part)

    def _add_child(self, lines, title):
        """Append a child container one level below this one."""
        self.children.append(Container(lines=lines,
                                       level=self.level + 1,
                                       title=title,
                                       father=self))

    def _expand(self, lines):
        """Distribute ``lines`` between ``self.lines`` and child containers.

        The first structure line opens the first child and has its ``level``
        overwritten with ``self.level + 1``. While a child is open, deeper
        headings and ordinary lines are collected for it; a heading at
        exactly ``self.level + 1`` closes it and opens the next child.
        """
        child_level = self.level + 1
        in_child = False
        pending_lines = []
        pending_title = ''
        for line in lines:
            if not in_child:
                if line.is_structure:
                    in_child = True
                    pending_lines = []
                    pending_title = line.text
                    # The first heading found defines the child level.
                    line.level = child_level
                else:
                    self.lines.append(line)
            elif not line.is_structure or child_level < line.level:
                pending_lines.append(line)
            elif child_level == line.level:
                self._add_child(pending_lines, pending_title)
                pending_lines = []
                pending_title = line.text
            # A structure line shallower than child_level is ignored, as before.
        if in_child:
            self._add_child(pending_lines, pending_title)

    def get_paragraphs(self, chunk=500):
        """Split the subtree text into paragraphs.

        Returns ``[self.text]`` when the whole subtree text is shorter than
        ``chunk`` characters; otherwise ``self.root_text`` followed by the
        paragraphs of each child, in order.
        """
        if len(self.text) < chunk:
            return [self.text]
        paragraphs = [self.root_text]
        for child in self.children:
            paragraphs += child.get_paragraphs(chunk)
        return paragraphs
class Line:
    """One line of text, classified by the marker it starts with.

    Attributes:
        text: the line with its opening marker (and closing marker, when one
            is configured) removed and surrounding whitespace stripped.
        params: the parameter dict the line was parsed with.
        type: value mapped to the matching prefix in
            ``params['startswith_']``, or ``'normal'`` when none matches.
        level: ``int(type)`` when ``type`` is a digit string, otherwise -1.
        is_structure: True when ``level`` is greater than 0.
    """

    def __init__(self, text, params):
        """Classify ``text`` according to ``params``.

        Args:
            text: the raw line.
            params: dict that may hold ``'startswith_'`` (prefix -> type) and
                ``'endswith_'`` (closing markers, in the same order as the
                prefixes).
        """
        self.text = text
        self.params = params
        self.type, self.text = self._parse_text()
        self.level = int(self.type) if self.type.isdigit() else -1
        self.is_structure = 0 < self.level

    def _parse_text(self):
        """Return ``(type, cleaned_text)`` for ``self.text``.

        The first prefix of ``startswith_`` that the text starts with decides
        the type. Missing ``startswith_`` / ``endswith_`` keys, or a closer
        list shorter than the prefix table, mean "no marker" instead of
        raising.
        """
        starters = self.params.get('startswith_', {})
        closers = self.params.get('endswith_', [])
        for position, (starter, type_) in enumerate(starters.items()):
            if not self.text.startswith(starter):
                continue
            # Slice the prefix off: splitting on it would cut the line at the
            # next occurrence of the marker inside the text.
            body = self.text[len(starter):]
            closer = closers[position] if position < len(closers) else ""
            if closer:
                # Keep what precedes the first occurrence of the closer.
                body = body.partition(closer)[0]
            return type_, body.strip()
        return 'normal', self.text.strip()
|