# GPT-Docker/app/utils.py
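"""Streamlit helpers for managing Chroma document collections and rendering query results."""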
import streamlit as st
import latex2markdown
from langchain.docstore.document import Document
import chromadb
from chromadb.config import Settings
import load_model
from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
persist_directory = load_model.persist_directory


def format_document(document: Document):
    """TODO: Implement a nice style"""
    return document.dict()


def format_result_set(result):
    """Render the answer (LaTeX converted to Markdown) and, optionally, its source documents."""
    st.write(latex2markdown.LaTeX2Markdown(result["result"]).to_markdown())
    agree = st.checkbox('Show source documents')
    source_documents = result["source_documents"]
    if agree:
        st.write('Source Documents:')
        for document in source_documents:
            st.write(format_document(document))


@st.cache_resource
def get_chroma_client():
    # Persistent Chroma client using the legacy duckdb+parquet backend.
    return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                    persist_directory=persist_directory))


@st.cache_data
def retrieve_collections():
    client = get_chroma_client()
    collections = tuple(collection.name for collection in client.list_collections())
    return collections
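
# Illustrative sketch (an assumption, not used by this module itself): the cached client
# can also inspect a collection directly, e.g. to show how many chunks it holds:
#
#   client = get_chroma_client()
#   for name in retrieve_collections():
#       st.write(name, client.get_collection(name).count())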


def load_files():
    """Streamlit UI for adding documents to an existing collection or starting a new one."""
    client = get_chroma_client()
    option = st.radio(
        "What would you like to do?",
        options=["Add Documents", "Start new collection"],
    )
    collections = retrieve_collections()
    if option == "Add Documents":
        selected_collection = st.selectbox(
            'Add to existing collection or create a new one',
            collections,
        )
        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
            client.delete_collection(name=selected_collection)
            retrieve_collections.clear()  # invalidate the cached collection list
            collections = retrieve_collections()
        option = st.radio(
            "Where should the documents come from?",
            options=["Upload Files", "Download Files"],
        )
        if option == "Upload Files":
            st.write('Source Documents:')
            uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)
            chunk_size = st.text_area('Chunk size:', '1000')
            if st.button('Upload'):
                docs = load_from_file(uploaded_files)
                sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
                uploaded_files = None
        else:
            st.write('Source Documents (comma separated):')
            urls = st.text_area('Urls:', '')
            chunk_size = st.text_area('Chunk size:', '1000')
            # Split the comma-separated input and strip quotes/whitespace from each URL.
            urls = [url.strip().strip('"') for url in urls.split(',') if url.strip()]
            if st.button('Upload'):
                docs = load_from_web(urls)
                sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
                create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
    else:
        collection = st.text_area('Name of your new collection:', '')
        if st.button('Create'):
            if len(collection) > 3:
                # The model name used to be appended to the collection name; better to keep it in metadata.
                client.create_collection(collection)
                retrieve_collections.clear()  # invalidate the cached collection list
                st.write(f"Collection {collection} successfully created.")