import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib

def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """

    Chunk a list of page contents into smaller segments with document ID metadata.

    

    Args:

        page_list (list): List of strings, each string being the content of a page.

        doc_id (str): Unique identifier for the document.

        chunk_size (int): Maximum size of each chunk (default: 1000 characters).

        chunk_overlap (int): Overlap between chunks (default: 200 characters).

    

    Returns:

        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.

    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
        if not page_content or not isinstance(page_content, str):
            continue  # Skip empty or invalid pages

        # Split the page content into chunks
        chunks = text_splitter.split_text(page_content)
        
        for i, chunk in enumerate(chunks):
            # Generate a unique hash for the chunk
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
            
            # Skip if the chunk is a duplicate
            if chunk_hash in seen_hashes:
                continue
            
            # Create source identifier (e.g., "doc_123_page_1_chunk_0")
            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"
            
            # Add the chunk with doc_id as metadata
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)
            
    logging.info(f"Chunking of documents is done. Chunked the document to {len(documents)} numbers of chunks")
    return documents
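

# --- Illustrative usage sketch (not part of the original module) ---
# Shows one way chunk_documents() might be called. The sample pages and
# doc_id below are hypothetical and chosen only for demonstration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Two made-up "pages", repeated so they exceed chunk_size and get split.
    sample_pages = [
        "Introduction to the report. " * 50,
        "Second page of the report. " * 50,
    ]

    chunks = chunk_documents(sample_pages, doc_id="123", chunk_size=500, chunk_overlap=100)

    # Print the source identifier and length of the first few chunks.
    for entry in chunks[:3]:
        print(entry["source"], len(entry["text"]))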