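"""Utilities for chunking text and parsed files into smaller documents."""
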
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .parsing import File


def chunk_sentences(sentences, chunk_size=512):
    """Greedily groups sentences into chunks of at most chunk_size characters."""
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        # If adding the next sentence keeps the chunk within chunk_size,
        # extend the current chunk with it.
        if len(candidate) <= chunk_size:
            current_chunk = candidate
        else:
            # Otherwise, close off the current chunk and start a new one
            # with this sentence.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    # The last chunk is never appended inside the loop; add it now.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def chunk_file(
    file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
) -> File:
    """Chunks each document in a file into smaller documents
    according to the specified chunk size and overlap,
    where the size is measured in tokens for the specified model.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Split each document into chunks.
    chunked_docs = []
    for doc in file.docs:
        chunks = text_splitter.split_text(doc.page_content)

        page = doc.metadata.get("page", 1)
        for i, chunk in enumerate(chunks):
            chunked_doc = Document(
                page_content=chunk,
                metadata={
                    "page": page,
                    "chunk": i + 1,
                    # "source" identifies the chunk as "<page>-<chunk number>".
                    "source": f"{page}-{i + 1}",
                },
            )
            chunked_docs.append(chunked_doc)

    chunked_file = file.copy()
    chunked_file.docs = chunked_docs
    return chunked_file
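

if __name__ == "__main__":
    # Minimal usage sketch for chunk_sentences (illustrative only; the sample
    # sentences below are made up and not part of any real document).
    sample_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Chunking keeps each piece of text under a size limit.",
        "Anything that no longer fits starts a new chunk.",
    ]
    for chunk in chunk_sentences(sample_sentences, chunk_size=60):
        print(repr(chunk))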