RustSa's picture
Initial commit: Linux Documentation Support Chatbot
ffdd872
Raw
History Blame Contribute Delete
1.38 kB
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
def load_and_index_documents(data_dir: str = "data") -> "Chroma":
    """Load documents from `data_dir`, split them into chunks, and index them in Chroma.

    Every regular file directly inside `data_dir` whose name ends in ``.pdf``
    or ``.txt`` (case-insensitive) is loaded; all other entries are ignored.
    Sub-directories are not searched.

    Args:
        data_dir: Directory that holds the source documents.

    Returns:
        A Chroma vector store built from the chunked documents, embedded
        with ``OpenAIEmbeddings``.

    Raises:
        FileNotFoundError: If `data_dir` does not exist.
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sort so documents are
    # always indexed in the same order from run to run.
    for filename in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, filename)
        if not os.path.isfile(path):
            # A directory named e.g. "notes.txt" must not reach a file loader.
            continue
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            # load() yields one Document per PDF page. The previous
            # load_and_split() call pre-chunked pages with a default splitter,
            # so PDF text was split twice with two different configurations.
            docs.extend(PyPDFLoader(path).load())
        elif lowered.endswith(".txt"):
            # TextLoader returns the whole file as a single Document.
            docs.extend(TextLoader(path, encoding="utf-8").load())
        # (Add other formats if needed)

    # Single chunking pass applied uniformly to PDF pages and text files.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)

    # Embed the chunks and build the Chroma vector store.
    embeddings = OpenAIEmbeddings()
    return Chroma.from_documents(chunks, embeddings)
# Example usage (called at app startup)
# vectordb = load_and_index_documents()