import os
import time

import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def _report_elapsed(label, started):
    """Write ``label`` followed by the seconds elapsed since ``started``.

    ``started`` must be a value previously returned by ``time.perf_counter()``.
    The elapsed time is rounded to one decimal place and shown via ``st.text``.
    """
    st.text(f"{label}{round(time.perf_counter() - started, 1)}")


def embed_doc(filename):
    """Load a PDF, split it into chunks, and index the chunks in a Chroma store.

    The duration of each stage is written to the Streamlit page with
    ``st.text``.

    Args:
        filename: Path of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        The vector store returned by ``Chroma.from_documents``, or ``None``
        when the current working directory contains no entries.
    """
    # NOTE(review): this guard only looks at whether the working directory has
    # any entries; it does not check that ``filename`` itself exists.
    # Presumably it was meant to detect "no file uploaded yet" - confirm
    # against the caller.
    if not os.listdir("."):
        return None

    # perf_counter() is used for the intervals because, unlike time.time(),
    # it is not affected by system clock adjustments.
    started = time.perf_counter()
    raw_documents = PyPDFLoader(filename).load()

    # Split the text into chunks of at most 1000 characters (measured with
    # len) and no overlap between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    _report_elapsed("Load and split text: ", started)

    # Despite the label, this interval covers only the construction of the
    # embeddings object; the chunks are handed to it in the next stage.
    started = time.perf_counter()
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-base")
    _report_elapsed("Embedding time: ", started)

    started = time.perf_counter()
    vectorstore = Chroma.from_documents(documents, embeddings)
    _report_elapsed("Vectorizing time: ", started)

    return vectorstore