|
|
|
|
|
import argparse |
|
import re |
|
import time |
|
|
|
|
|
def get_args(argv=None):
    """Parse the command-line options of the sentence-splitting demo.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line.  ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the original
            zero-argument call did.

    Returns:
        argparse.Namespace with two string attributes:
        ``text`` (the passage to split) and ``language`` (defaults to
        ``"chinese"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        default="讹言:“苍天已死,黄天当立;岁在甲子,天下大吉。”令人各以白土书“甲子”二字于家中大门上。",
        help="text to split into sentences",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="chinese",
        help="language of the text (default: chinese)",
    )
    # Passing argv through lets callers and tests supply their own argument
    # list; parse_args(None) falls back to sys.argv[1:].
    return parser.parse_args(argv)
|
|
|
|
|
def chinese_sent_tokenize(text: str):
    """Split Chinese text into a list of sentences.

    A sentence boundary is placed after:

    * a single terminator: the ideographic full stop, or an exclamation or
      question mark in either ASCII or full-width form;
    * a six-dot ASCII ellipsis (``......``);
    * a doubled horizontal-ellipsis character.

    A closing quote (right double or right single quotation mark) that
    directly follows a terminator stays attached to the sentence it closes,
    and no boundary is placed after such a quote when the next character is
    a comma or another terminator.

    Boundaries are marked with a newline and the text is then split on
    newlines, so newline characters already present in ``text`` also act as
    boundaries.  Trailing whitespace is stripped before splitting.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings in their original order, each keeping
        its terminator.  Text without any boundary comes back as a
        one-element list; an empty string yields ``[""]``.
    """
    # "!" and "?" are accepted in ASCII and in full-width form
    # (U+FF01, U+FF1F).  The \uXXXX escapes are resolved by the regex
    # engine, which keeps the pattern unambiguous in any editor or encoding.
    enders = r"。!?\uff01\uff1f"
    # Closing quotes that belong to the sentence they terminate.
    closers = "”’"
    # Characters that continue the sentence after a closing quote: a comma
    # (ASCII or full-width U+FF0C) or a further terminator.
    continuers = r",\uff0c" + enders

    # 1. Single terminator, unless a closing quote follows it.
    text = re.sub(rf"([{enders}])([^{closers}])", r"\1\n\2", text)
    # 2. Six-dot ASCII ellipsis.
    text = re.sub(rf"(\.{{6}})([^{closers}])", r"\1\n\2", text)
    # 3. Doubled ellipsis character.
    text = re.sub(rf"(…{{2}})([^{closers}])", r"\1\n\2", text)
    # 4. Terminator plus closing quote: break after the quote, unless the
    #    sentence visibly continues.
    text = re.sub(
        rf"([{enders}][{closers}])([^{continuers}])", r"\1\n\2", text
    )

    # Drop trailing whitespace so no empty trailing piece is produced.
    return text.rstrip().split("\n")
|
|
|
|
|
def main():
    """Run the sentence splitter on the command-line text and report timing.

    Reads the options via ``get_args()``, splits ``args.text`` with
    ``chinese_sent_tokenize`` and prints two lines: the elapsed time in
    seconds, then the resulting list of sentences.

    Note: ``args.language`` is parsed but not consulted here; the Chinese
    splitter is always used.
    """
    args = get_args()

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() follows the wall clock and can
    # jump if the system time is adjusted.
    begin_time = time.perf_counter()
    result = chinese_sent_tokenize(args.text)
    cost = time.perf_counter() - begin_time

    print(f"time cost: {cost}")
    print(result)
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|