Spaces:
Sleeping
Sleeping
File size: 1,154 Bytes
999f447 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
"""
embeddings.py
Module for processing and storing document embeddings using ChromaDB.
"""
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Directory handed to Chroma as ``persist_directory``; process_documents_with_chroma
# checks this path to decide between loading a stored index and building a new one.
PERSIST_DIRECTORY = "./chroma_db/courses"
def process_documents_with_chroma(documents):
    """Load the persisted Chroma vector store, or build it from ``documents``.

    If ``PERSIST_DIRECTORY`` is a non-empty directory, the store persisted
    there is opened and ``documents`` is ignored. Otherwise ``documents`` is
    split into chunks (``chunk_size=2000``, ``chunk_overlap=100``), embedded
    with ``OpenAIEmbeddings`` and written to ``PERSIST_DIRECTORY``.

    Args:
        documents (list): Documents to split and embed. Only used when no
            persisted store is found.

    Returns:
        Chroma: Vector store backed by ``PERSIST_DIRECTORY``.
    """
    # A bare existence check also matches an empty directory (pre-created, or
    # left behind by an aborted run) and a regular file at that path. Either
    # would send us down the "load" path with nothing to load, silently
    # discarding ``documents``. Require a directory that actually has content.
    has_persisted_store = os.path.isdir(PERSIST_DIRECTORY) and bool(
        os.listdir(PERSIST_DIRECTORY)
    )

    if has_persisted_store:
        print("Loading existing embeddings from ChromaDB...")
        return Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=OpenAIEmbeddings(),
        )

    print("Creating new embeddings and saving to ChromaDB...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings()
    return Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
|