|
|
|
|
|
import argparse |
|
import os |
|
import time |
|
|
|
from project_settings import project_path |
|
|
|
# Point NLTK at the project-local data directory. This assignment is placed
# above `import nltk` so the variable is already set when NLTK is imported.
os.environ["NLTK_DATA"] = project_path.joinpath("data/nltk_data").as_posix()
|
|
|
import nltk |
|
|
|
|
|
def get_args(argv=None):
    """Parse the command-line options of the sentence-tokenization demo.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            function did before this parameter existed.

    Returns:
        argparse.Namespace with two string attributes:
        ``text`` (default ``"Mr. Honey Tian. How are you."``) and
        ``language`` (default ``"english"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        default="Mr. Honey Tian. How are you.",
        help="text to split into sentences",
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help="language name handed to the sentence tokenizer",
    )
    return parser.parse_args(argv)
|
|
|
|
|
# Language names kept for reference -- presumably the values accepted by
# nltk.sent_tokenize (TODO confirm against the installed NLTK data).
# NOTE(review): nothing in the visible code reads this list.
nltk_sent_tokenize_languages = [
    "czech",
    "danish",
    "dutch",
    "english",
    "estonian",
    "finnish",
    "french",
    "german",
    "greek",
    "italian",
    "norwegian",
    "polish",
    "portuguese",
    "russian",
    "slovene",
    "spanish",
    "swedish",
    "turkish",
]
|
|
|
|
|
def main():
    """Split the ``--text`` argument into sentences and print the outcome.

    Reads the options via ``get_args()``, calls ``nltk.sent_tokenize`` with
    the text and language, then prints the elapsed time of that call
    followed by the tokenizer's result.
    """
    args = get_args()

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    begin_time = time.perf_counter()
    sent_list = nltk.sent_tokenize(args.text, args.language)
    cost = time.perf_counter() - begin_time

    print(f"time cost: {cost}")
    print(sent_list)
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|