"""Normalise a text file: lowercase it, blank out unwanted characters and
collapse all whitespace, then write the result to a second file."""

import re
import argparse

# Earlier, narrower ignore set, kept for reference:
# chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'

# Character class of punctuation, digits and symbols that are replaced by a
# space.  Written as a raw string so the backslash escapes reach the regex
# engine verbatim instead of being (invalid) string-literal escapes.
chars_to_ignore_regex = r'[\'=&()\*\/,?.!\-\;\:"“%‘”�—’…–<>#@0123456789°′»«\[\]]'

# Anything that is not one of the letters a-z, å, æ, ø or a space.
radical_regex = r'[^abcdefghijklmnopqrstuvwxyzåæø ]'


def extract_text(text: str, is_radical: bool = False) -> str:
    """Return a lowercased, cleaned-up copy of *text*.

    Every character matched by ``chars_to_ignore_regex`` is replaced by a
    space.  When *is_radical* is true, every character outside the vocabulary
    described by ``radical_regex`` is replaced by a space as well.  Finally
    each run of whitespace (newlines included) is collapsed to one space.

    Args:
        text: The raw input text.
        is_radical: Also blank out every character that is not in the
            vocabulary (a-z, å, æ, ø, space).

    Returns:
        The normalised text.  It always ends with exactly one space, because
        a trailing space is appended before whitespace is collapsed; leading
        whitespace likewise collapses to a single leading space.
    """
    # The trailing space is part of the original output format; keep it.
    text = re.sub(chars_to_ignore_regex, " ", text.lower()) + " "
    if is_radical:
        text = re.sub(radical_regex, " ", text)
    return re.sub(r"\s+", " ", text)


def main(args: argparse.Namespace) -> None:
    """Read ``args.input_file``, normalise it and write ``args.output_file``.

    Args:
        args: Namespace providing ``input_file``, ``output_file`` and the
            boolean ``radical`` flag (as produced by :func:`parse_args`).
    """
    # Explicit UTF-8: the vocabulary contains non-ASCII letters, so relying
    # on the platform-default encoding is not safe.
    with open(args.input_file, "r", encoding="utf-8") as infile:
        data = infile.read()

    data = extract_text(data, args.radical)

    with open(args.output_file, "w", encoding="utf-8") as outfile:
        outfile.write(data)


def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        Namespace with ``input_file``, ``output_file`` and ``radical``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', required=True,
                        help='Path to input file.')
    parser.add_argument('--output_file', required=True,
                        help='Path to output file.')
    parser.add_argument('--radical', dest='radical', action='store_true',
                        help='Delete any non vocab character.')
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_args()
    main(args)