File size: 1,375 Bytes
ffdd872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

def load_and_index_documents(
    data_dir: str = "data",
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> "Chroma":
    """Load documents from `data_dir`, split into chunks, and create a Chroma vector store.

    Only regular files directly inside `data_dir` whose names end in
    ``.pdf`` or ``.txt`` (case-insensitive) are loaded; every other entry,
    including sub-directories, is skipped.

    Args:
        data_dir: Directory to scan (not recursive).
        chunk_size: Chunk size handed to ``CharacterTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``CharacterTextSplitter``.

    Returns:
        The Chroma vector store built from the chunks using
        ``OpenAIEmbeddings``.

    Raises:
        FileNotFoundError: If `data_dir` does not exist (raised by
            ``os.listdir``).
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # indexing order reproducible across runs and platforms.
    for filename in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, filename)
        # Skip directories (or other non-files) that merely have a matching
        # name; the loaders expect real files.
        if not os.path.isfile(path):
            continue
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            # Use load() rather than load_and_split(): the latter pre-splits
            # with the loader's default splitter, and splitting again below
            # would chunk the text twice. load() is expected to yield one
            # Document per page, keeping the 'page' metadata.
            docs.extend(PyPDFLoader(path).load())
        elif lowered.endswith(".txt"):
            # TextLoader is expected to return the file as a single Document.
            docs.extend(TextLoader(path, encoding="utf-8").load())
        # (Add other formats if needed)

    # Split documents into chunks with overlap.
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)

    # Create embeddings and vector store (Chroma).
    # NOTE(review): OpenAIEmbeddings() is built with defaults, so credentials
    # presumably come from the environment; confirm for your deployment.
    embeddings = OpenAIEmbeddings()
    return Chroma.from_documents(chunks, embeddings)

# Example usage (called at app startup)
# vectordb = load_and_index_documents()