# Spaces:
# Runtime error
# Runtime error
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader | |
from langchain_chroma import Chroma | |
from langchain.schema import Document | |
from langchain_openai import OpenAIEmbeddings | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from dotenv import load_dotenv | |
import os | |
import shutil | |
# Pull variables from a local .env file into the process environment before
# any of them are read below.
load_dotenv()

# API key handed to the embedding client; None when the variable is unset.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

CHROMA_PATH = "chroma"  # persist directory for the Chroma vector store
DATA_PATH = "data/"  # directory handed to the PDF directory loader
# Path of a single sample PDF; not used by any live code visible in this file.
TEST_PATH = "data/theory_of_computation.pdf"

# Embedding client passed to the Chroma store as its embedding function.
embed = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPEN_AI_KEY)
def main():
    """Script entry point: build the vector store via generate_data_store()."""
    generate_data_store()
def generate_data_store():
    """Run the ingestion pipeline: load documents, split them, persist them."""
    # Loading and splitting are chained; the resulting chunks are what get
    # written to the Chroma store.
    pieces = split_text(load_documents())
    save_to_chroma(pieces)
def load_documents():
    """Load the PDF documents found under ``DATA_PATH``.

    Returns:
        The list returned by ``PyPDFDirectoryLoader(DATA_PATH).load()``.
        The list is returned unchanged, including when it is empty.
    """
    docs = PyPDFDirectoryLoader(DATA_PATH).load()
    if docs:
        # Debug aid: show the metadata attached to the first loaded document.
        print(docs[0].metadata)
    else:
        # Indexing docs[0] on an empty result would raise IndexError, so
        # report the situation instead of crashing here.
        print(f"No documents were loaded from {DATA_PATH}.")
    return docs
def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping chunks and print a short preview.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` configured
        with ``chunk_size=1100`` and ``chunk_overlap=100``, with length
        measured by ``len``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1100,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    if chunks:
        # Preview the chunk at index 10 when it exists; with fewer chunks fall
        # back to the last one so small inputs do not raise IndexError.
        preview_index = min(10, len(chunks) - 1)
        document = chunks[preview_index]
        print(document.page_content)
        print(document.metadata)
    return chunks
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from the given chunks.

    Any existing directory at ``CHROMA_PATH`` is deleted first, then a new
    ``linux_funds`` collection is created and every chunk's text and metadata
    are added to it.

    Args:
        chunks: Chunks whose ``page_content`` and ``metadata`` are stored.
            When empty, nothing is deleted or written.
    """
    if not chunks:
        # Return before clearing the directory: wiping an existing store and
        # then writing nothing would leave it empty.
        print(f"No chunks to save; leaving {CHROMA_PATH} untouched.")
        return

    if os.path.exists(CHROMA_PATH):  # clear out the DB first
        shutil.rmtree(CHROMA_PATH)

    db = Chroma(
        collection_name="linux_funds",
        embedding_function=embed,
        persist_directory=CHROMA_PATH,
    )

    # Break each chunk down into parallel lists of text and metadata, the
    # form that add_texts() accepts.
    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    db.add_texts(texts=texts, metadatas=metadatas)
    print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()