Spaces:
Runtime error
Runtime error
#!/usr/bin/env python | |
# -*- coding: utf-8 -*-
""" | |
@author: quincy qiang
@license: Apache Licence
@file: clean_corpus.py
@time: 2023/04/19
@contact: yanqiangmiffy@gamil.com
@software: PyCharm
@description: Remove every non-Chinese character from each line of a corpus file.
""" | |
""" | |
FILE : clean_corpus.py
FUNCTION : Keep only the Chinese characters (U+4E00..U+9FA5) of each line of a text corpus
""" | |
import sys | |
import os | |
from optparse import OptionParser | |
class Clean(object):
    """Strip every non-Chinese character from a text corpus.

    Instantiating the class runs the whole pipeline: the input file is read
    line by line, each line is reduced to the characters that fall in the
    range U+4E00..U+9FA5, and the reduced lines are written to the output
    file, one per input line.
    """

    def __init__(self, infile, outfile):
        """Run read -> remove -> write for the given files.

        Args:
            infile: Path of the UTF-8 text file to clean.
            outfile: Path the cleaned lines are written to; an existing
                file at this path is replaced.

        Raises:
            SystemExit: If ``infile`` is not an existing regular file.
        """
        self.infile = infile
        self.outfile = outfile
        # Input lines with newline and tab characters stripped.
        self.corpus = []
        # One entry per input line, holding only its Chinese characters.
        self.remove_corpus = []
        self.read(self.infile)
        self.remove(self.corpus)
        self.write(self.remove_corpus, self.outfile)

    def read(self, path):
        """Append every line of ``path`` to ``self.corpus``.

        Newline and tab characters are removed from each line before it is
        stored.

        Args:
            path: Path of the UTF-8 encoded file to read.

        Raises:
            SystemExit: If ``path`` is not an existing regular file.
        """
        print("reading now......")
        if not os.path.isfile(path):
            print("path is not a file")
            # Raise SystemExit directly instead of calling the site-provided
            # ``exit()`` helper (absent under ``python -S``), and use a
            # non-zero status so the failure is visible to the caller.
            raise SystemExit(1)
        with open(path, encoding="UTF-8") as f:
            self.corpus.extend(
                line.replace("\n", "").replace("\t", "") for line in f
            )
        print("read finished.")

    def remove(self, list):
        """Append the Chinese-only version of each line to ``self.remove_corpus``.

        A line without any Chinese character contributes an empty string, so
        the output keeps one entry per input line.

        Args:
            list: Iterable of strings to filter. The name shadows the
                builtin but is kept so keyword callers keep working.
        """
        print("removing now......")
        for line in list:
            self.remove_corpus.append(
                "".join(char for char in line if self.is_chinese(char))
            )
        print("remove finished.")

    def write(self, list, path):
        """Write each string in ``list`` to ``path`` followed by a newline.

        Args:
            list: Iterable of strings to write. The name shadows the
                builtin but is kept so keyword callers keep working.
            path: Destination file path, written as UTF-8.
        """
        print("writing now......")
        # Remove any existing file first so a fresh file is created rather
        # than an existing one being truncated in place.
        if os.path.exists(path):
            os.remove(path)
        # The context manager closes the file even if a write fails.
        with open(path, encoding="UTF-8", mode="w") as out:
            out.writelines(line + "\n" for line in list)
        print("writing finished")

    def is_chinese(self, uchar):
        """Return True if ``uchar`` lies in the range U+4E00..U+9FA5.

        Args:
            uchar: The string (normally a single character) to test.

        Returns:
            bool: True when ``uchar`` compares within the range, else False.
        """
        return u'\u4e00' <= uchar <= u'\u9fa5'
if __name__ == "__main__":
    # Command-line entry point: clean the file given by --input and write the
    # result to the file given by --output.
    print("clean corpus")
    parser = OptionParser()
    parser.add_option("--input", dest="input", default="", help="input file")
    parser.add_option("--output", dest="output", default="", help="output file")
    options, _ = parser.parse_args()
    # Pass the option values straight through instead of binding them to
    # module-level names that shadow the builtin ``input``.
    try:
        Clean(infile=options.input, outfile=options.output)
    except Exception as err:
        # Report the failure on stderr and exit non-zero so a calling shell
        # or pipeline can tell the run did not succeed.
        print(err, file=sys.stderr)
        sys.exit(1)
    else:
        print("All Finished.")