Lemswasabi's picture
add create lm scripts
98591ec
raw
history blame
981 Bytes
#!/usr/bin/env python3
#
# Created by lemswasabi on 24/05/2022.
# Copyright © 2022 letzspeak. All rights reserved.
#
import glob
import re
import textract
# Character class of the punctuation that ignore_chars() deletes from the text.
# The typographic characters are spelled as \u escapes so the set cannot be
# corrupted when this file is re-encoded (the former literal had turned into
# mojibake and therefore matched the wrong characters):
#   \u201c left double quote     \u2018 left single quote
#   \u201e low double quote      \u201d right double quote
#   \ufffd replacement character \u2014 em dash
#   \u2019 right single quote    \u2026 horizontal ellipsis
#   \u2013 en dash
chars_to_ignore_regex = '[,?.!;:"\u201c%\u2018\u201e\u201d\ufffd\u2014\u2019\u2026\u2013]'
def replace_chars(text, char, replace_char):
    """Lower-case *text* and substitute every match of *char* in it.

    Args:
        text: Input string; it is lower-cased before the substitution.
        char: Regular-expression pattern to search for (not a literal string).
        replace_char: Replacement handed to the regex substitution.

    Returns:
        The lower-cased string with all matches replaced.
    """
    lowered = text.lower()
    pattern = re.compile(char)
    return pattern.sub(replace_char, lowered)
def ignore_chars(sentence):
    """Return *sentence* lower-cased with the ignored characters removed.

    Every match of the module-level ``chars_to_ignore_regex`` pattern is
    deleted from the lower-cased input.

    Args:
        sentence: Text to clean.

    Returns:
        The lower-cased text without the ignored characters.
    """
    # Operate on the argument itself. The former body read the global ``text``
    # and only worked because the caller's variable happened to have that name.
    return re.sub(chars_to_ignore_regex, "", sentence.lower())
# Build one normalised text corpus from every .doc file below the corpus
# directory and write it to chamber_text.txt as a single space-joined string.
corpus = []
for text_file in glob.glob("/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc", recursive=True):
    # Guard only the extraction call, which is where the caught exception is
    # raised; documents that cannot be extracted are skipped (best effort).
    try:
        raw = textract.process(text_file)
    except textract.exceptions.ShellError:
        continue

    text = raw.decode("utf-8")
    # Map typographic apostrophes to the ASCII one. The characters are written
    # as escapes (\u2019 right, \u2018 left single quotation mark) because the
    # former literals had been corrupted into mojibake and never matched.
    text = replace_chars(text, "\u2019", "'")
    text = replace_chars(text, "\u2018", "'")
    text = replace_chars(text, "-", " ")
    # "\\n" is the regex escape for a newline: line breaks become spaces.
    text = replace_chars(text, "\\n", " ")
    text = ignore_chars(text)
    corpus.append(text.strip())

# Explicit encoding: the text contains non-ASCII characters, and the platform
# default encoding is not guaranteed to be able to represent them.
with open("chamber_text.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(corpus))