File size: 1,037 Bytes
974e1e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import time
from project_settings import project_path
os.environ["NLTK_DATA"] = (project_path / "data/nltk_data").as_posix()
import nltk
def get_args():
    """Parse the command-line options of this script.

    Options:
        --text: input string (default ``"Mr. Honey Tian. How are you."``).
        --language: language name (default ``"english"``).

    Returns:
        argparse.Namespace: namespace exposing ``text`` and ``language``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text", type=str, default="Mr. Honey Tian. How are you.")
    arg_parser.add_argument("--language", type=str, default="english")
    return arg_parser.parse_args()
# Language names for use with ``nltk.sent_tokenize``. The list is not
# referenced by the code visible in this file; it serves as a reference table.
# NOTE(review): presumably mirrors the Punkt models available under
# NLTK_DATA - confirm against the installed nltk_data.
nltk_sent_tokenize_languages = [
    "czech",
    "danish",
    "dutch",
    "english",
    "estonian",
    "finnish",
    "french",
    "german",
    "greek",
    "italian",
    "norwegian",
    "polish",
    "portuguese",
    "russian",
    "slovene",
    "spanish",
    "swedish",
    "turkish",
]
def main():
    """Split the ``--text`` argument into sentences and print the result.

    Reads the command-line options through ``get_args()``, calls
    ``nltk.sent_tokenize`` with the given text and language, then prints the
    elapsed time of that call (in seconds) followed by the tokenizer output.

    Returns:
        None
    """
    args = get_args()

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot go negative on a system clock adjustment or collapse to
    # 0.0 on platforms where time.time() has a coarse tick.
    begin_time = time.perf_counter()
    sent_list = nltk.sent_tokenize(args.text, args.language)
    cost = time.perf_counter() - begin_time

    print(f"time cost: {cost}")
    print(sent_list)
# Run the command-line entry point only when this file is executed as a
# script, so importing the module does not trigger tokenization.
if __name__ == "__main__":
    main()
|