|
|
|
|
|
|
|
|
|
|
|
|
|
import glob
import re
import sys

import textract
|
|
|
|
|
# Regex character class listing the characters removed from every sentence.
# NOTE(review): the non-ASCII characters below look like mojibake (UTF-8
# typographic quotes, dashes and an ellipsis decoded with the wrong codec).
# They are kept exactly as found -- confirm the intended characters and
# replace them before relying on this pattern for real input.
chars_to_ignore_regex = '[,?.!;:"β%βββοΏ½βββ¦β]'
|
|
|
def replace_chars(text: str, char: str, replace_char: str) -> str:
    """Lowercase *text* and replace every match of *char* with *replace_char*.

    Args:
        text: Input string; it is lowercased before the substitution.
        char: Pattern to replace. It is handed to ``re.sub`` unchanged, so it
            is interpreted as a regular expression rather than a literal.
        replace_char: Replacement string passed to ``re.sub``.

    Returns:
        The lowercased text with all matches of *char* replaced.
    """
    return re.sub(char, replace_char, text.lower())
|
|
|
def ignore_chars(sentence: str) -> str:
    """Lowercase *sentence* and remove every character to be ignored.

    The characters to drop are given by the module-level pattern
    ``chars_to_ignore_regex``.

    Args:
        sentence: Input string.

    Returns:
        The lowercased sentence with all matches of ``chars_to_ignore_regex``
        removed.
    """
    # Operate on the argument itself rather than on a module-level `text`
    # variable, so the function works for any caller and any input.
    return re.sub(chars_to_ignore_regex, "", sentence.lower())
|
|
|
# Source documents (searched recursively) and the file the corpus is written to.
CORPUS_GLOB = "/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc"
OUTPUT_PATH = "chamber_text.txt"

# Cleaned text of every document that could be extracted.
corpus = []

# Sorted so the corpus is assembled in the same order on every run; glob
# returns matches in arbitrary order.
for text_file in sorted(glob.glob(CORPUS_GLOB, recursive=True)):
    try:
        text = textract.process(text_file).decode("utf-8")
    except textract.exceptions.ShellError as err:
        # Best effort: a document that cannot be extracted is skipped, but
        # reported so that missing input does not go unnoticed.
        print("Skipping", text_file, "-", err, file=sys.stderr)
        continue

    # NOTE(review): the two patterns below look like mojibake, presumably two
    # different typographic apostrophes that were decoded with the wrong
    # codec. They are kept as found -- confirm the intended characters.
    text = replace_chars(text, "β", "'")
    text = replace_chars(text, "β", "'")
    text = replace_chars(text, "-", " ")
    # "\\n" reaches re.sub as the regex escape \n, i.e. it matches newlines.
    text = replace_chars(text, "\\n", " ")
    text = ignore_chars(text)
    corpus.append(text.strip())

# Explicit UTF-8 so non-ASCII text is written identically on every platform
# instead of depending on the locale's default encoding.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(" ".join(corpus))
|
|