File size: 981 Bytes
98591ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python3
#
#  Created by lemswasabi on 24/05/2022.
#  Copyright © 2022 letzspeak. All rights reserved.
#

import glob
import re
import textract


# Regex character class listing the punctuation and quotation marks that
# ignore_chars() deletes from the text.
chars_to_ignore_regex = '[,?.!;:"“%‘„”�—’…–]'

def replace_chars(text, char, replace_char):
    """Lowercase ``text`` and swap every match of ``char`` for ``replace_char``.

    ``char`` is interpreted as a regular-expression pattern (it is passed
    straight to the ``re`` module), so metacharacters keep their regex
    meaning and escape sequences such as ``"\\n"`` match a newline.
    """
    lowered = text.lower()
    pattern = re.compile(char)
    return pattern.sub(replace_char, lowered)

def ignore_chars(sentence):
    """Return ``sentence`` lowercased with ignorable characters removed.

    Every character matched by the module-level ``chars_to_ignore_regex``
    character class is deleted.

    Args:
        sentence: The string to clean.

    Returns:
        The lowercased string without the ignored characters.
    """
    # Use the argument itself: reading a global named `text` here only
    # worked when the caller happened to keep its data under that name.
    return re.sub(chars_to_ignore_regex, "", sentence.lower())

# Cleaned text of every document that could be extracted, in glob order.
corpus = []

for text_file in glob.glob("/home/lemswasabi/corpus/chamber_text_corpus/**/*.doc", recursive=True):
    try:
        # Only the extraction call is guarded; the clean-up steps below do
        # not raise ShellError.
        text = textract.process(text_file).decode("utf-8")
    except textract.exceptions.ShellError:
        # Best effort: skip documents textract fails to extract.
        continue

    # Normalise typographic apostrophes to the ASCII one before punctuation
    # is stripped, so they survive ignore_chars().
    text = replace_chars(text, "’", "'")
    text = replace_chars(text, "‘", "'")
    # Hyphens and newlines become plain spaces.
    text = replace_chars(text, "-", " ")
    text = replace_chars(text, "\\n", " ")
    text = ignore_chars(text)
    corpus.append(text.strip())

# Write explicitly as UTF-8: the text contains non-ASCII characters and the
# default file encoding depends on the locale of the machine running this.
with open("chamber_text.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(corpus))