import os
import time

import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import TensorflowHubEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def _report_elapsed(label, started):
    """Write ``label`` plus the seconds elapsed since ``started`` via ``st.text``.

    ``started`` must be a value previously obtained from
    ``time.perf_counter()``. The elapsed time is rounded to one decimal.
    """
    st.text(label + str(round(time.perf_counter() - started, 1)))


def embed_doc(filename):
    """Load a PDF, split it into chunks, and index the chunks in Chroma.

    The duration of each stage is written to the Streamlit page with
    ``st.text``.

    Args:
        filename: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        The ``Chroma`` vector store built from the document chunks, or
        ``None`` when the current working directory is empty and the
        pipeline is therefore skipped.
    """
    # NOTE(review): this guard inspects the current working directory, not
    # ``filename`` itself -- presumably it stands in for "a file has been
    # uploaded"; confirm against the caller before tightening it.
    if not os.listdir("."):
        # Make the skipped path explicit instead of relying on an implicit
        # None / an unbound ``vectorstore`` at the final return.
        return None

    loader = PyPDFLoader(filename)

    # perf_counter() is monotonic, so the measured intervals cannot be
    # distorted by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    raw_documents = loader.load()
    # Split text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    _report_elapsed("Load and split text: ", started)

    # This interval covers only the construction of the embeddings object;
    # the documents themselves are handed to Chroma in the next stage.
    started = time.perf_counter()
    embeddings = TensorflowHubEmbeddings(
        model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
    )
    _report_elapsed("Embedding time: ", started)

    started = time.perf_counter()
    vectorstore = Chroma.from_documents(documents, embeddings)
    _report_elapsed("Vectorizing time: ", started)

    return vectorstore