# Sentence-tokenization helpers (NLTK backend; spaCy and segtok variants are kept commented out below).
### NLTK ###
import nltk

# Fetch the tokenizer resources when the module is imported. These are
# presumably the Punkt models that nltk.sent_tokenize relies on -- confirm
# against the installed NLTK version.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

def nltk_sent_tokenize(texts: list[str]):
    """Lazily yield the sentences of every text, in input order.

    Each text is split with ``nltk.sent_tokenize`` and the per-text results
    are flattened into one stream.

    Args:
        texts: Texts to split into sentences.

    Returns:
        A generator over the sentences of all texts.
    """
    def _iter_sentences(documents):
        for document in documents:
            yield from nltk.sent_tokenize(document)

    # Obtain the iterator up front so a non-iterable argument fails at call
    # time rather than on first consumption of the generator.
    return _iter_sentences(iter(texts))


# ### Spacy ###
# import spacy
# try:
#     spacy_nlp = spacy.load('en_core_web_sm')
# except OSError:
#     spacy.cli.download("en_core_web_sm")
#     spacy_nlp = spacy.load('en_core_web_sm')

# def spacy_sent_tokenize(texts: list[str]):
#     # nlp = spacy.load('en_core_web_sm')
#     return (sent.text for text in texts for sent in spacy_nlp(text).sents)


# ### Segtok ###
# from segtok.segmenter import split_single, split_multi

# def segtok_sent_tokenize(texts: list[str]):
#     return (sent for text in texts for sent in split_single(text))


### Sentence Tokenization ###

def sent_tokenize(text, method: str = 'nltk', initial_split_sep: str = None) -> list[str]:
    """Split one text, or a list of texts, into cleaned sentences.

    Args:
        text: A single string or a list of strings.
        method: ``'nltk'`` to split with ``nltk_sent_tokenize``; ``'none'``
            to treat every (optionally pre-split) piece as one sentence.
        initial_split_sep: If non-empty, each text is first split on this
            separator and blank pieces are dropped before tokenization.

    Returns:
        The stripped sentences that contain at least one alphanumeric
        character, in input order.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor a ``list``.
        ValueError: If ``method`` is not a supported value.
    """
    def has_info(candidate: str) -> bool:
        # Rejects fragments that hold no letter or digit at all
        # (e.g. pure punctuation).
        return any(char.isalnum() for char in candidate)

    texts = [text] if isinstance(text, str) else text
    # Explicit check rather than ``assert`` so the validation is not stripped
    # when Python runs with -O.
    if not isinstance(texts, list):
        raise TypeError(
            f"text must be a str or a list of str, got {type(text).__name__}"
        )

    if initial_split_sep:
        # Pre-split every text on the separator, keeping only non-blank pieces.
        texts = [
            piece
            for chunk in texts
            for raw_piece in chunk.split(initial_split_sep)
            if (piece := raw_piece.strip())
        ]

    if method == 'nltk':
        sents = nltk_sent_tokenize(texts)
    # elif method == 'spacy':
    #     sents = spacy_sent_tokenize(texts)
    # elif method == 'segtok':
    #     sents = segtok_sent_tokenize(texts)
    elif method == 'none':
        sents = texts
    else:
        raise ValueError(f"Invalid method: {method}")

    return [cleaned for sent in sents if (cleaned := sent.strip()) and has_info(cleaned)]