# LinuxGPT / init_db.py
# (Hugging Face page residue from the original upload:
#  author jaynopponep, commit "adding non large files", fd5f784)
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os
import shutil
load_dotenv()
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY')
CHROMA_PATH = "chroma"
DATA_PATH = "data/"
TEST_PATH = "data/theory_of_computation.pdf"
embed = OpenAIEmbeddings(
api_key=OPEN_AI_KEY,
model="text-embedding-3-large"
)
def main():
generate_data_store()
# print(load_documents())
def generate_data_store():
documents = load_documents()
chunks = split_text(documents)
save_to_chroma(chunks)
def load_documents():
loader = PyPDFDirectoryLoader(DATA_PATH)
docs = loader.load()
print(docs[0].metadata)
return docs
# loader = PyPDFLoader(TEST_PATH)
# docs = []
# docs_lazy = loader.load()
# for doc in docs_lazy:
# docs.append(doc)
# return docs_lazy
def split_text(documents: list[Document]):
# chunk_size = 1000,
# chunk_overlap = 200,
# length_function = len,
# add_start_index = True,
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1100,
chunk_overlap=100,
length_function=len,
)
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
document = chunks[10]
print(document.page_content)
print(document.metadata)
return chunks
def save_to_chroma(chunks: list[Document]):
if os.path.exists(CHROMA_PATH): # clear out the DB first
shutil.rmtree(CHROMA_PATH)
db = Chroma(
collection_name="linux_funds",
embedding_function=embed,
persist_directory=CHROMA_PATH
)
# below breaks text & metadata down to Chroma vector store
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
db.add_texts(texts=texts, metadatas=metadatas)
print(f"Saved {len(chunks)} chunks to CHROMA PATH {CHROMA_PATH}.")
if __name__ == "__main__":
main()