File size: 981 Bytes
98591ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
#!/usr/bin/env python3
#
# Created by lemswasabi on 24/05/2022.
# Copyright © 2022 letzspeak. All rights reserved.
#
import glob
import re
import textract
# Regex character class listing the punctuation and typographic symbols that
# ignore_chars() deletes from the text.
chars_to_ignore_regex = '[,?.!;:"“%‘„”�—’…–]'
def replace_chars(text, char, replace_char):
    """Lowercase *text*, then swap every match of *char* for *replace_char*.

    *char* is interpreted as a regular-expression pattern rather than a
    literal string, so regex metacharacters and escapes in it are honoured.
    """
    lowered = text.lower()
    pattern = re.compile(char)
    return pattern.sub(replace_char, lowered)
def ignore_chars(sentence):
    """Return *sentence* lowercased with all ignorable characters removed.

    Every character matched by the module-level ``chars_to_ignore_regex``
    pattern is deleted (replaced by the empty string).
    """
    # Work on the argument itself: reading a global named ``text`` here would
    # only succeed when the caller happens to define one, and would ignore
    # whatever string was actually passed in.
    return re.sub(chars_to_ignore_regex, "", sentence.lower())
# Build the corpus: extract the text of every .doc file found recursively
# under the chamber corpus directory, normalise it, and keep one cleaned
# string per document.
corpus = []
for text_file in glob.glob(
    "/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc", recursive=True
):
    try:
        # Only the extraction call is guarded; it is the step that raises
        # ShellError.
        raw = textract.process(text_file)
    except textract.exceptions.ShellError:
        # Best effort: skip documents the extractor could not process.
        continue

    text = raw.decode("utf-8")
    # Map typographic apostrophes to the ASCII one before punctuation is
    # stripped, so they are not deleted by ignore_chars().
    text = replace_chars(text, "’", "'")
    text = replace_chars(text, "‘", "'")
    # Hyphens and line breaks become spaces.
    text = replace_chars(text, "-", " ")
    text = replace_chars(text, "\\n", " ")
    text = ignore_chars(text)
    corpus.append(text.strip())

# Write all documents as one space-separated line. The encoding is pinned to
# UTF-8 (matching the decode above) so the output does not depend on the
# platform's locale default, which may be unable to encode non-ASCII letters.
with open("chamber_text.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(corpus))
|