File size: 698 Bytes
a2ff264 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
from .IChunkGenerator import IChunkGenerator
import nltk
from nltk.tokenize import sent_tokenize
class ChunkGenerator(IChunkGenerator):
    """Split text into sentence-aligned chunks of roughly ``max_words`` words."""

    def chunk_text(self, text: str, max_words: int = 100) -> list:
        """Split *text* into chunks without breaking sentences apart.

        A chunk grows one whole sentence at a time until it holds at
        least ``max_words`` words, so a chunk may overshoot the budget
        by up to one sentence; the final chunk may fall short of it.

        Args:
            text: The input text to split.
            max_words: Soft word budget per chunk (default 100).

        Returns:
            A list of chunk strings (sentences joined by single spaces).
            Empty input yields an empty list.
        """
        # NOTE(review): sent_tokenize needs the NLTK "punkt" tokenizer
        # data to be downloaded beforehand — confirm at deploy time.
        sentences = sent_tokenize(text)
        chunks: list = []
        chunk: list = []
        word_count = 0
        for sentence in sentences:
            word_count += len(sentence.split())
            chunk.append(sentence)
            # Close out the chunk once the word budget is reached.
            if word_count >= max_words:
                chunks.append(" ".join(chunk))
                chunk = []
                word_count = 0
        # Flush trailing sentences that never reached max_words.
        if chunk:
            chunks.append(" ".join(chunk))
        return chunks