# NOTE: extracted from a web file viewer; the original header read
# "File size: 1,162 Bytes" and "3ddfd5c" (presumably the revision id).
import re
import argparse
#chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'
# Character class of everything extract_text replaces with a space on every
# run: punctuation, quotes, brackets, digits and assorted symbols.
# Written as a raw string so the regex escapes (\*, \/, \-, \[ ...) are not
# parsed as invalid Python string escapes; the pattern text is unchanged.
chars_to_ignore_regex = r'''['=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'''
# Matches any character outside a-z, å/æ/ø and the space character; only
# applied when radical cleaning is requested.
radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'
def extract_text(text: str, is_radical: bool = False) -> str:
    """Lowercase *text*, blank out ignored characters and collapse whitespace.

    Args:
        text: Raw input text.
        is_radical: When true, additionally replace every character matched
            by ``radical_regex`` with a space.

    Returns:
        The cleaned text. Because a space is appended before whitespace is
        collapsed, the result always ends with exactly one space; leading
        whitespace is collapsed to a single space but not stripped.
    """
    # Lowercase first, then replace each ignored character with a space.
    # The appended space guarantees a trailing separator.
    text = re.sub(chars_to_ignore_regex, " ", text.lower()) + " "
    if is_radical:
        text = re.sub(radical_regex, " ", text)
    # NOTE(review): the source lost its indentation; whitespace collapsing is
    # assumed to run unconditionally (not only in radical mode) - confirm.
    return re.sub(r"\s+", " ", text)
def main(args):
    """Read the input file, clean its text and write the result.

    Args:
        args: Object exposing ``input_file`` and ``output_file`` paths and a
            ``radical`` flag (as produced by ``parse_args``).
    """
    # Explicit UTF-8 so non-ASCII letters (e.g. å/æ/ø) are read and written
    # the same way regardless of the platform's default encoding.
    with open(args.input_file, "r", encoding="utf-8") as file:
        data = file.read()

    data = extract_text(data, bool(args.radical))

    with open(args.output_file, "w", encoding="utf-8") as outputfile:
        outputfile.write(data)
def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``input_file``, ``output_file`` and
        the boolean ``radical``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', required=True, help='Path to input file.')
    parser.add_argument('--output_file', required=True, help='Path to output file.')
    # store_true: the flag defaults to False and becomes True when present.
    parser.add_argument('--radical', action='store_true', help='Delete any non vocab character.')
    return parser.parse_args(argv)
# Script entry point: parse the command line, then run the cleaning pipeline.
if __name__ == "__main__":
    args = parse_args()
    main(args)