# Sentence tokenization utilities. NLTK is the active backend; spaCy and
# segtok alternatives are kept below as commented-out reference code.
### NLTK ###
import nltk

# Fetch the Punkt tokenizer models only when they are not already installed:
# an unconditional nltk.download() performs a network round-trip on every
# import of this module.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)
def nltk_sent_tokenize(texts: list[str]):
    """Lazily yield Punkt-tokenized sentences from each text in *texts*."""
    for document in texts:
        yield from nltk.sent_tokenize(document)
# ### Spacy ###
# import spacy
# try:
#     spacy_nlp = spacy.load('en_core_web_sm')
# except OSError:
#     spacy.cli.download("en_core_web_sm")
#     spacy_nlp = spacy.load('en_core_web_sm')
# def spacy_sent_tokenize(texts: list[str]):
#     # nlp = spacy.load('en_core_web_sm')
#     return (sent.text for text in texts for sent in spacy_nlp(text).sents)
# ### Segtok ###
# from segtok.segmenter import split_single, split_multi
# def segtok_sent_tokenize(texts: list[str]):
#     return (sent for text in texts for sent in split_single(text))
### Sentence Tokenization ###
def sent_tokenize(text, method: str = 'nltk', initial_split_sep: str = None) -> list[str]:
    """Split text into non-empty, informative sentences.

    Args:
        text: A single string or a list of strings to tokenize.
        method: Tokenizer backend — 'nltk' for NLTK's Punkt model, or
            'none' to treat each (pre-split) input text as one sentence.
        initial_split_sep: Optional separator applied to every text before
            sentence tokenization (e.g. '\\n' to split on lines first);
            None or '' disables the pre-split.

    Returns:
        A list of stripped sentences, each containing at least one
        alphanumeric character (punctuation-only fragments are dropped).

    Raises:
        TypeError: If *text* is neither a string nor a list.
        ValueError: If *method* is not a supported backend.
    """
    def has_info(text: str) -> bool:
        # Keep only sentences carrying at least one letter or digit.
        return any(char.isalnum() for char in text)
    texts = [text] if isinstance(text, str) else text
    # Explicit validation instead of `assert`: asserts are stripped under -O
    # and AssertionError is the wrong type for bad caller input.
    if not isinstance(texts, list):
        raise TypeError(f"Expected str or list of str, got {type(text).__name__}")
    if initial_split_sep:
        # Pre-split each text and drop blank segments before tokenizing.
        texts = [sline
                 for text in texts
                 for line in text.split(initial_split_sep)
                 if (sline := line.strip())]
    if method == 'nltk':
        sents = nltk_sent_tokenize(texts)
    # elif method == 'spacy':
    #     sents = spacy_sent_tokenize(texts)
    # elif method == 'segtok':
    #     sents = segtok_sent_tokenize(texts)
    elif method == 'none':
        sents = texts
    else:
        raise ValueError(f"Invalid method: {method}")
    # Final pass: strip whitespace and discard empty or punctuation-only sentences.
    return [ssent for sent in sents if (ssent := sent.strip()) and has_info(ssent)]