HoneyTian's picture
update
974e1e6
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import time
from project_settings import project_path
os.environ["NLTK_DATA"] = (project_path / "data/nltk_data").as_posix()
import nltk
def get_args():
    """
    Parse this script's command-line options.

    Options:
        --text: the string to split into sentences
            (default: "Mr. Honey Tian. How are you.").
        --language: the language name passed to the tokenizer
            (default: "english").

    :return: the ``argparse.Namespace`` produced by parsing ``sys.argv``.
    """
    arg_parser = argparse.ArgumentParser()

    # Both options are plain strings; register them from one table.
    string_options = (
        ("--text", "Mr. Honey Tian. How are you."),
        ("--language", "english"),
    )
    for flag, default_value in string_options:
        arg_parser.add_argument(flag, type=str, default=default_value)

    return arg_parser.parse_args()
# Language names kept for reference; going by the variable name these are
# values usable as the `language` argument of nltk.sent_tokenize
# (NOTE(review): confirm against the punkt models actually installed).
# Nothing in the visible part of this file reads this list.
nltk_sent_tokenize_languages = [
    "czech",
    "danish",
    "dutch",
    "english",
    "estonian",
    "finnish",
    "french",
    "german",
    "greek",
    "italian",
    "norwegian",
    "polish",
    "portuguese",
    "russian",
    "slovene",
    "spanish",
    "swedish",
    "turkish",
]
def main():
    """
    Split the ``--text`` argument into sentences and print the result.

    Reads the options via ``get_args()``, calls ``nltk.sent_tokenize`` with
    the text and language, then prints the elapsed time of that call
    followed by the resulting list of sentences.

    :return: None.
    """
    args = get_args()

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # difference of time.time() readings can.
    begin_time = time.perf_counter()
    sent_list = nltk.sent_tokenize(args.text, language=args.language)
    cost = time.perf_counter() - begin_time

    print(f"time cost: {cost}")
    print(sent_list)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()