import pysqlite3 import sys, os sys.modules["sqlite3"] = sys.modules.pop("pysqlite3") from langchain_community.document_loaders import PyPDFLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings from langchain_community.vectorstores import Chroma import streamlit as st HF_TOKEN = st.secrets["HF_TOKEN"] def persist_dir(file_path): data = PyPDFLoader(file_path) print("Loading data...") content = data.load() print("Splitting data...") splitter = RecursiveCharacterTextSplitter(chunk_size=1024,chunk_overlap=150) chunks = splitter.split_documents(content) embeddings = HuggingFaceInferenceAPIEmbeddings( api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5" ) print("Save to db...") vectorstore = Chroma.from_documents(chunks, embeddings,persist_directory="./db") if __name__ == "__main__": #will change, if you add file upload on streamlit #data = "./data/Sungwon_Kim_ML_DL.pdf" data = "./data/Sungwon_Kim_ML_DL_Intro_together.pdf" persist_dir(data)