#!/usr/bin/env python3
#
# Created by lemswasabi on 24/05/2022.
# Copyright © 2022 letzspeak. All rights reserved.
#
"""Build a plain-text corpus from a tree of ``.doc`` files.

Every ``.doc`` file matched by ``CORPUS_GLOB`` is converted to text with
textract, normalised (lower-cased, apostrophes unified, hyphens and
newlines turned into spaces, punctuation removed) and the results are
joined with single spaces into ``OUTPUT_PATH``.
"""

import glob
import re
import sys

# Character class of the punctuation that is stripped from the corpus.
chars_to_ignore_regex = '[,?.!;:"“%‘„”�—’…–]'

# Recursive glob selecting the input documents.
CORPUS_GLOB = "/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc"

# File the concatenated corpus is written to (relative to the CWD).
OUTPUT_PATH = "chamber_text.txt"


def replace_chars(text, char, replace_char):
    """Return ``text`` lower-cased with every match of ``char`` replaced.

    Args:
        text: The string to process.
        char: A regular-expression pattern (it is passed to ``re.sub``
            unescaped, so regex metacharacters keep their meaning).
        replace_char: The replacement string.

    Returns:
        The lower-cased, substituted string.
    """
    return re.sub(char, replace_char, text.lower())


def ignore_chars(sentence):
    """Return ``sentence`` lower-cased with ignorable punctuation removed.

    The characters removed are those in ``chars_to_ignore_regex``.
    """
    # The original read a global named ``text`` here instead of its own
    # parameter, so it only worked when such a global happened to exist.
    return re.sub(chars_to_ignore_regex, "", sentence.lower())


def normalize_text(text):
    """Apply the full corpus normalisation to one document's text.

    Curly apostrophes become ``'`` *before* punctuation is stripped, so
    that apostrophes survive even though the curly forms are listed in
    ``chars_to_ignore_regex``.

    Returns:
        The normalised text with leading/trailing whitespace removed.
    """
    text = replace_chars(text, "’", "'")
    text = replace_chars(text, "‘", "'")
    text = replace_chars(text, "-", " ")
    text = replace_chars(text, r"\n", " ")
    return ignore_chars(text).strip()


def build_corpus(pattern=CORPUS_GLOB):
    """Extract and normalise the text of every document matching ``pattern``.

    Documents for which textract raises ``ShellError`` are reported on
    stderr and skipped.

    Args:
        pattern: Glob pattern (``**`` is expanded recursively).

    Returns:
        A list with one normalised string per successfully read document.
    """
    # Imported here so the pure-text helpers above stay importable (and
    # testable) on machines without the third-party textract package.
    import textract

    corpus = []
    # glob order is filesystem-dependent; sort so the output is reproducible.
    for text_file in sorted(glob.glob(pattern, recursive=True)):
        # Keep the try body to the one call that can raise ShellError.
        try:
            raw = textract.process(text_file)
        except textract.exceptions.ShellError as err:
            print(f"Skipping {text_file}: {err}", file=sys.stderr)
            continue
        corpus.append(normalize_text(raw.decode("utf-8")))
    return corpus


def main():
    """Build the corpus and write it to ``OUTPUT_PATH`` as UTF-8."""
    corpus = build_corpus()
    # Explicit encoding: the corpus contains non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write(" ".join(corpus))


if __name__ == "__main__":
    main()