File size: 1,348 Bytes
9d434bb
 
 
 
39af5eb
9d434bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0d010f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import nltk
import jieba
import sudachipy
import langid
# Fetch the NLTK 'punkt' resource when this module is imported; presumably this
# is the model behind nltk.tokenize.sent_tokenize, which the English branch of
# split_text_into_sentences calls.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the NLTK version this project pins.
nltk.download('punkt')
# Limit langid's candidate languages to English, Chinese and Japanese -- the
# three languages split_text_into_sentences has a branch for.
langid.set_languages(['en', 'zh', 'ja'])

def split_text_into_sentences(text: str) -> list:
    """Split *text* into sentences using a language-specific strategy.

    The language is detected once with ``langid`` and the text is then split
    by the matching backend:

    * ``"en"`` -- ``nltk.tokenize.sent_tokenize``.
    * ``"zh"`` -- ``jieba`` tokens, cut after sentence-ending punctuation.
    * ``"ja"`` -- ``sudachipy`` tokens, cut after every token tagged
      ``補助記号`` / ``句点``.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings in their original order. Each sentence
        keeps its terminating punctuation; any trailing text without a
        terminator is returned as the final item.

    Raises:
        RuntimeError: If the detected language is none of ``en``, ``zh``,
            ``ja``.
    """
    # Classify once instead of once per branch: the result cannot differ
    # between the checks, and classification is not free.
    lang = langid.classify(text)[0]

    if lang == "en":
        return nltk.tokenize.sent_tokenize(text)

    if lang == "zh":
        # Both ASCII and full-width marks end a sentence. The single "…" is
        # listed as well as "……" because the tokenizer may hand the ellipsis
        # over either as one token or one character at a time.
        terminators = {"。", "!", "?", "！", "？", "…", "……"}
        segs = list(jieba.cut(text, cut_all=False))
        last = len(segs) - 1
        sentences = []
        start = 0
        for i, seg in enumerate(segs):
            if seg not in terminators:
                continue
            # Keep a run of terminators ("……", "？！") attached to the
            # sentence it closes instead of emitting punctuation-only items.
            if i < last and segs[i + 1] in terminators:
                continue
            sentences.append("".join(segs[start:i + 1]))
            start = i + 1
        # Trailing text that was never closed by a terminator.
        if start < len(segs):
            sentences.append("".join(segs[start:]))
        return sentences

    if lang == "ja":
        # NOTE: a fresh Sudachi tokenizer is built on every call.
        tokenizer = sudachipy.Dictionary().create()
        sentences = []
        parts = []
        for token in tokenizer.tokenize(text):
            parts.append(token.surface())
            pos = token.part_of_speech()
            if pos[0] == "補助記号" and pos[1] == "句点":
                sentences.append("".join(parts))
                parts = []
        # Trailing text that was never closed by a sentence-final mark.
        tail = "".join(parts)
        if tail:
            sentences.append(tail)
        return sentences

    raise RuntimeError("It is impossible to reach here.")