Spaces:

intelli-zen
/

sentence_boundary_detection

Sleeping

File size: 1,667 Bytes

974e1e6

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
import warnings
import time

# Silence FutureWarning messages for everything executed after this line,
# including the imports below.
warnings.simplefilter(action='ignore', category=FutureWarning)

from project_settings import project_path

# Point STANZA_RESOURCES_DIR at <project_path>/data/stanza_resources.
# NOTE(review): this is done before `import stanza`, presumably because the
# library reads the variable at import time -- keep this ordering.
os.environ["STANZA_RESOURCES_DIR"] = (project_path / "data/stanza_resources").as_posix()

import stanza


def get_args():
    """
    Parse the command-line options of this script.

    Options (both are plain strings):
        --text: defaults to "Mr. Honey Tian. How are you."
        --language: defaults to "en"

    :return: argparse.Namespace exposing ``text`` and ``language``.
    """
    # (flag, default) pairs; every option is registered as a string option.
    option_defaults = (
        ("--text", "Mr. Honey Tian. How are you."),
        ("--language", "en"),
    )

    cli = argparse.ArgumentParser()
    for flag, default_value in option_defaults:
        cli.add_argument(flag, type=str, default=default_value)

    return cli.parse_args()


# Language codes, apparently mirroring the Stanza models published under
# https://huggingface.co/stanfordnlp (TODO confirm the list is still current).
# NOTE(review): this list is not referenced anywhere in the visible part of
# this file; `--language` is not validated against it.
languages = [
    "ru", "pl", "cs", "hi", "fr", "es", "en", "de", "ca", "zh-hant", "zh-hans",
    "xcl", "wo", "vi", "ur", "uk", "ug", "tr", "th", "te", "ta", "sv", "sr", "sme",
    "sl", "sk", "si", "sd", "sa", "ro", "qtd", "qpm", "qaf", "pt", "pcm", "orv",
    "nn", "nl", "nb", "myv", "my", "multilingual", "mt", "mr", "ml", "lzh", "lv",
    "lt", "lij", "la", "ky", "ko", "kmr", "kk", "ja", "it", "is", "id", "hyw", "hy",
    "hu", "hsb", "hr", "he", "hbo", "gv", "grc", "got", "gl", "fro", "fo", "fi", "fa",
    "eu", "et", "el", "da", "cy", "cu", "cop", "bxr", "hn", "bg", "be", "ar", "ang",
    "af", "swl", "no"
]


def main():
    """
    Split the ``--text`` argument into sentences with Stanza and print them.

    Steps:
        1. Parse the command-line arguments (``--text``, ``--language``).
        2. Download the Stanza resources for the requested language.
        3. Build a pipeline for that language and run it on the text.
        4. Print the elapsed processing time, then the list of sentence strings.

    Only the pipeline call and the sentence extraction are timed; the
    download and the pipeline construction are excluded from the measurement.

    :return: None. The results are written to stdout.
    """
    args = get_args()

    stanza.download(args.language)

    # Only sentence boundaries are needed here, and Stanza performs sentence
    # segmentation in its tokenizer. Restricting the pipeline to that
    # processor avoids loading the taggers/parsers of the default stack.
    stanza_nlp = stanza.Pipeline(args.language, processors="tokenize")

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    begin_time = time.perf_counter()
    doc = stanza_nlp(args.text)
    sentences = [sentence.text for sentence in doc.sentences]
    cost = time.perf_counter() - begin_time

    print(f"time cost: {cost}")
    print(sentences)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()