# Lena Merkli
# Upload 3 files
# 1295a89
def split_list(array: list[str], separator: str) -> list[str]:
    """Split every string in *array* after each occurrence of *separator*.

    Unlike ``str.split``, the separator is kept: it stays attached to the end
    of the piece that precedes it. Occurrences are matched left to right and
    do not overlap. A string that ends with the separator therefore yields a
    trailing empty piece, and a string without the separator is passed
    through unchanged.

    Args:
        array: Strings to split; the list itself is not modified.
        separator: Text after which a cut is made. If empty, there is nothing
            to cut on and the strings are returned unsplit.

    Returns:
        A new flat list with the pieces of all input strings, in order.
    """
    if not separator:
        # An empty separator matches everywhere; treat it as "no split".
        return list(array)

    pieces = []
    width = len(separator)
    for s in array:
        start = 0
        # Cut by index instead of inserting a sentinel character, so no
        # character that may already occur in the input is treated specially.
        hit = s.find(separator, start)
        while hit != -1:
            end = hit + width
            pieces.append(s[start:end])
            start = end
            hit = s.find(separator, start)
        # Remainder after the last separator (possibly the empty string).
        pieces.append(s[start:])
    return pieces
def split(text: str) -> list[str]:
    """Split *text* into sentence-like segments.

    Segments end at a line break, at ``". "`` or at ``"? "``. A dot is not
    treated as a boundary when it belongs to one of the listed abbreviations
    or when, within the scanned range, any of the following holds:

    * the character two positions before the dot is a space (the dot follows
      a single character),
    * the character two positions after the dot is not uppercase,
    * the character directly before the dot is a digit.

    Args:
        text: The text to split.

    Returns:
        The segments in order, each stripped of surrounding whitespace.
        Empty segments are not filtered out.
    """
    # Collapse spaces around line breaks and runs of line breaks so that each
    # remaining '\n' is a single boundary.
    for pattern in (' \n', '\n ', '\n\n'):
        while pattern in text:
            text = text.replace(pattern, '\n')

    # Stand-in for every '.' that must not end a segment. It is turned back
    # into '.' once splitting is done.
    placeholder = "\uE001"

    # Mask the dots of abbreviations. They use the same placeholder as the
    # heuristic below so that they are restored at the end; masking them with
    # a character the splitting helper handles specially would cut the text
    # at the abbreviation and lose the dot.
    protections = (
        'd. h.', 'Abs.', 'Art.', 'Bem.', 'Bst.', ' ff.', ' f.',
        '(ff.', '(f.', 'insbes.', 'S.', 'V.',
    )
    for protection in protections:
        text = text.replace(protection, protection.replace('.', placeholder))

    # Mask dots that the heuristics say are not sentence ends. The first and
    # last three characters are skipped, which keeps the look-behind and
    # look-ahead indices valid; dots in those margins are left untouched.
    # The checks never look at a dot itself, so they can read the unmodified
    # text while the replacements are collected in a character list (one
    # join instead of re-slicing the whole string per dot).
    chars = list(text)
    for i in range(3, len(text) - 3):
        if text[i] != '.':
            continue
        if (
            text[i - 2] == ' '
            or not text[i + 2].isupper()
            or text[i - 1].isdigit()
        ):
            chars[i] = placeholder
    text = ''.join(chars)

    segments = [text]
    for separator in ('\n', '. ', '? '):
        segments = split_list(segments, separator)

    # Restore the masked dots and trim the separators' whitespace.
    return [segment.replace(placeholder, '.').strip() for segment in segments]