saeedabc's picture
Add punkt_tab
57944b5
### NLTK ###
import nltk
# Fetch the NLTK resources at import time (presumably the Punkt models that
# nltk.sent_tokenize needs -- confirm against the installed NLTK version).
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
def nltk_sent_tokenize(texts: list[str]):
    """Lazily yield every sentence NLTK finds in each string of ``texts``.

    Sentences are produced in input order: all sentences of the first text,
    then all sentences of the second, and so on. Nothing is tokenized until
    the returned iterator is consumed.
    """
    # map() is lazy, so each text is only segmented when the consumer
    # reaches it, just like a nested generator expression would do.
    per_text = map(lambda doc: nltk.sent_tokenize(doc), texts)
    return (sentence for sentences in per_text for sentence in sentences)
# ### Spacy ###
# import spacy
# try:
# spacy_nlp = spacy.load('en_core_web_sm')
# except OSError:
# spacy.cli.download("en_core_web_sm")
# spacy_nlp = spacy.load('en_core_web_sm')
# def spacy_sent_tokenize(texts: list[str]):
# # nlp = spacy.load('en_core_web_sm')
# return (sent.text for text in texts for sent in spacy_nlp(text).sents)
# ### Segtok ###
# from segtok.segmenter import split_single, split_multi
# def segtok_sent_tokenize(texts: list[str]):
# return (sent for text in texts for sent in split_single(text))
### Sentence Tokenization ###
def sent_tokenize(text, method: str = 'nltk', initial_split_sep: str = None) -> list[str]:
    """Split text into stripped sentences that contain real content.

    Args:
        text: A single string, or a list of strings, to segment.
        method: ``'nltk'`` to segment with ``nltk_sent_tokenize``; ``'none'``
            to treat every input piece as one sentence without segmenting.
        initial_split_sep: Optional separator. When truthy, each input string
            is first split on it and whitespace-only pieces are discarded
            before the chosen method runs. ``None`` or ``''`` disables it.

    Returns:
        The sentences in input order, each stripped of surrounding
        whitespace. Sentences that are empty or contain no alphanumeric
        character (e.g. ``'---'``) are dropped.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor a ``list``.
        ValueError: If ``method`` is not a supported method name.
    """
    def has_info(candidate: str) -> bool:
        # A sentence is informative if at least one character is a letter or digit.
        return any(char.isalnum() for char in candidate)

    if isinstance(text, str):
        texts = [text]
    elif isinstance(text, list):
        texts = text
    else:
        # Raise explicitly instead of using ``assert``: assertions are removed
        # under ``python -O`` and bad input would then fail far from the cause.
        raise TypeError(
            f"text must be a str or a list of str, got {type(text).__name__}"
        )

    if initial_split_sep:
        # Pre-split every input on the separator, dropping blank pieces.
        texts = [
            piece
            for chunk in texts
            for line in chunk.split(initial_split_sep)
            if (piece := line.strip())
        ]

    if method == 'nltk':
        sents = nltk_sent_tokenize(texts)
    # elif method == 'spacy':
    #     sents = spacy_sent_tokenize(texts)
    # elif method == 'segtok':
    #     sents = segtok_sent_tokenize(texts)
    elif method == 'none':
        sents = texts
    else:
        raise ValueError(f"Invalid method: {method}")

    return [ssent for sent in sents if (ssent := sent.strip()) and has_info(ssent)]