# Source: japan/src/core/chunking.py (2 kB)
# Last commit: e71c4e6 "restarting" (Adrian Cowham)
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from .parsing import File
def chunk_sentences(sentences, chunk_size=512):
    """Greedily pack sentences into space-separated chunks.

    Sentences are appended to the current chunk, joined by a single space,
    for as long as the resulting chunk stays within ``chunk_size``
    characters (the joining space is counted). When the next sentence does
    not fit, the current chunk is emitted and the sentence starts a new one.

    Args:
        sentences: Iterable of sentence strings, in order.
        chunk_size: Maximum length, in characters, of a chunk built from
            more than one sentence.

    Returns:
        A list of chunk strings. A single sentence longer than
        ``chunk_size`` is not split; it becomes a chunk of its own.
        No chunk is empty and no chunk starts with a separator space.
    """
    chunks = []
    current = ""
    for sentence in sentences:
        if not current:
            # Start a fresh chunk without a leading separator, and never
            # emit the empty buffer as a chunk when the sentence is oversized.
            current = sentence
        elif len(current) + 1 + len(sentence) <= chunk_size:
            # The "+ 1" accounts for the space inserted between sentences.
            current = f"{current} {sentence}"
        else:
            # The sentence does not fit: close the current chunk and start
            # a new one with this sentence.
            chunks.append(current)
            current = sentence

    # Flush whatever is left in the buffer after the last sentence.
    if current:
        chunks.append(current)
    return chunks
def chunk_file(
    file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
) -> File:
    """Split every document of a file into smaller, token-sized documents.

    Each document in ``file.docs`` is split with a
    ``RecursiveCharacterTextSplitter`` created through
    ``from_tiktoken_encoder``, so ``chunk_size`` and ``chunk_overlap`` are
    measured in tokens of ``model_name``'s encoding.

    Args:
        file: The file whose ``docs`` are to be chunked.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Chunk overlap passed to the splitter.
        model_name: Model name used to select the tiktoken encoding.

    Returns:
        The result of ``file.copy()`` with its ``docs`` replaced by the
        chunked documents. Each chunk carries the metadata keys ``page``
        (the source document's page, defaulting to 1 when absent),
        ``chunk`` (1-based index of the chunk within its source document)
        and ``source`` (``"<page>-<chunk>"``).
    """
    # The splitter depends only on the arguments, so build it once rather
    # than once per document.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunked_docs = []
    for doc in file.docs:
        # Read the page from the source document up front; the loop variable
        # is never rebound, so every chunk sees the original metadata.
        page = doc.metadata.get("page", 1)
        for chunk_number, chunk in enumerate(
            text_splitter.split_text(doc.page_content), start=1
        ):
            chunked_docs.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page,
                        "chunk": chunk_number,
                        "source": f"{page}-{chunk_number}",
                    },
                )
            )

    chunked_file = file.copy()
    chunked_file.docs = chunked_docs
    return chunked_file