insight / pages /DocIndex.py
DeepVen's picture
Upload 7 files
6872416
raw
history blame
No virus
2.1 kB
import streamlit as st
#from langchain.retrievers import KNNRetriever
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain.vectorstores import FAISS
#from streamapp import *
from PIL import Image
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Show the logo in the sidebar on every run of this page.
st.sidebar.image(Image.open("./test-logo.png"), use_column_width=True)
print("Loading Index Page!!")

# This page reads the vector store from Streamlit session state and never
# creates it. The previous fallback (`initialize_vectorstore()` from
# `streamapp`) is disabled, so if the key is missing, report it and halt this
# script run instead of failing with a KeyError.
# NOTE(review): presumably another page of the app populates the key - confirm.
if 'vectorstore' not in st.session_state:
    st.error(
        "The vector store has not been initialized in this session. "
        "Open the page that initializes it, then return here."
    )
    st.stop()
vectorstore = st.session_state['vectorstore']
def _text_splitter(doc):
    """Split ``doc`` into overlapping chunks for indexing.

    A ``RecursiveCharacterTextSplitter`` is configured with a chunk size of
    600 and an overlap of 50, both measured with ``len``.

    Args:
        doc: The documents to split; handed to ``transform_documents`` as-is.

    Returns:
        Whatever the splitter's ``transform_documents`` returns for ``doc``.
    """
    splitter_options = {
        "chunk_size": 600,
        "chunk_overlap": 50,
        "length_function": len,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.transform_documents(doc)
def _load_docs(path: str):
    """Load the web page at ``path`` and return it split into chunks.

    The page is fetched with ``WebBaseLoader`` and the loaded documents are
    passed through ``_text_splitter``; that result is returned unchanged.

    Args:
        path: The web link given to ``WebBaseLoader``.

    Returns:
        The chunked documents produced by ``_text_splitter``.
    """
    return _text_splitter(WebBaseLoader(path).load())
# Form that takes a web link, loads and chunks the page, and adds the chunks
# to the vector store held in the module-level `vectorstore`.
with st.form("Index documents to Vector Store"):
    file_path = st.text_input(label="Enter the web link", value="", placeholder="", label_visibility="visible", disabled=False)
    print("file_path " ,file_path)
    submitted = st.form_submit_button("Submit")
    if submitted:
        web_link = file_path.strip()
        if not web_link:
            # Nothing to fetch: tell the user instead of passing an empty
            # link to the loader.
            st.warning("Please enter a web link before submitting.")
        else:
            st.write("Submitted web link: " + web_link)
            # _load_docs already returns the page split into chunks, so the
            # chunks are not run through _text_splitter a second time here.
            webpage_chunks = _load_docs(web_link)
            # store embeddings in vector store
            print("vectorstore length before addition, ", len(vectorstore.serialize_to_bytes()))
            vectorstore.add_documents(webpage_chunks)
            print("vectorstore length after addition, ", len(vectorstore.serialize_to_bytes()))
            # Write the updated store and a retriever built from it back to
            # session state, and set the 'docadd' flag to 1.
            st.session_state['vectorstore'] = vectorstore
            retriever = vectorstore.as_retriever()
            st.session_state['retriever'] = retriever
            st.session_state['docadd'] = 1
            st.markdown('<h2 style="color:#100170;font-size:24px;">Document loaded to vector store successfully!!</h2>', unsafe_allow_html=True)