import os

import streamlit as st
from langchain.chat_models import AzureChatOpenAI

from knowledge_gpt.components.sidebar import sidebar
from knowledge_gpt.core.caching import bootstrap_caching
from knowledge_gpt.core.chunking import chunk_file
from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.parsing import read_file
from knowledge_gpt.core.qa import query_folder
from knowledge_gpt.ui import display_file_read_error
from knowledge_gpt.ui import is_file_valid
from knowledge_gpt.ui import is_query_valid
from knowledge_gpt.ui import wrap_doc_in_html

st.set_page_config(page_title="ReferenceBot", page_icon="📖", layout="wide")

# Copy all secrets into environment variables so the Azure/OpenAI clients
# below can read them via os.environ. The existence check avoids the
# redundant "no secrets file" message Streamlit prints when st.secrets is
# accessed without a secrets.toml present.
if os.path.exists(
    os.path.dirname(os.path.abspath(__file__)) + "/../.streamlit/secrets.toml"
):
    for key, value in st.secrets.items():
        # os.environ only accepts strings; TOML values may parse to
        # non-str types (int/bool), so coerce explicitly.
        os.environ[key] = str(value)


def main() -> None:
    """Render the ReferenceBot page.

    Flow: upload a pdf/docx/txt file, chunk and embed it into a FAISS
    index, then answer free-form questions about it with AzureChatOpenAI,
    showing the answer alongside its source chunks.
    """
    EMBEDDING = "openai"
    VECTOR_STORE = "faiss"
    MODEL_LIST = ["gpt-3.5-turbo", "gpt-4"]
    # Uncomment to enable debug mode
    # MODEL_LIST.insert(0, "debug")

    st.header("📖ReferenceBot")

    # Enable caching for expensive functions
    bootstrap_caching()

    sidebar()

    uploaded_file = st.file_uploader(
        "Upload a pdf, docx, or txt file",
        type=["pdf", "docx", "txt"],
        help="Scanned documents are not supported yet!",
    )

    model: str = st.selectbox("Model", options=MODEL_LIST)  # type: ignore

    with st.expander("Advanced Options"):
        return_all_chunks = st.checkbox("Show all chunks retrieved from vector search")
        show_full_doc = st.checkbox("Show parsed contents of the document")

    if not uploaded_file:
        st.stop()

    try:
        file = read_file(uploaded_file)
    except Exception as e:
        display_file_read_error(e, file_name=uploaded_file.name)
        # Fix: without stopping here, `file` is unbound below and a
        # NameError would be raised on top of the reported read error.
        st.stop()

    # Fix: validate before chunking so invalid files don't pay the
    # chunking cost only to be rejected afterwards.
    if not is_file_valid(file):
        st.stop()

    chunked_file = chunk_file(file, chunk_size=300, chunk_overlap=0)

    with st.spinner("Indexing document... This may take a while⏳"):
        folder_index = embed_files(
            files=[chunked_file],
            embedding=EMBEDDING if model != "debug" else "debug",
            vector_store=VECTOR_STORE if model != "debug" else "debug",
            deployment=os.environ["ENGINE_EMBEDDING"],
            model=os.environ["ENGINE"],
            openai_api_key=os.environ["OPENAI_API_KEY"],
            openai_api_base=os.environ["OPENAI_API_BASE"],
            openai_api_type="azure",
            chunk_size=1,
        )

    with st.form(key="qa_form"):
        query = st.text_area("Ask a question about the document")
        submit = st.form_submit_button("Submit")

    if show_full_doc:
        with st.expander("Document"):
            # Hack to get around st.markdown rendering LaTeX: wrap the
            # document in a plain HTML <p> so Markdown/LaTeX processing
            # is bypassed (requires unsafe_allow_html).
            st.markdown(
                f"<p>{wrap_doc_in_html(file.docs)}</p>",
                unsafe_allow_html=True,
            )

    if submit:
        if not is_query_valid(query):
            st.stop()

        # Output Columns
        answer_col, sources_col = st.columns(2)

        with st.spinner("Setting up AzureChatOpenAI bot..."):
            llm = AzureChatOpenAI(
                openai_api_base=os.environ["OPENAI_API_BASE"],
                openai_api_version=os.environ["OPENAI_API_VERSION"],
                deployment_name=os.environ["ENGINE"],
                openai_api_key=os.environ["OPENAI_API_KEY"],
                openai_api_type="azure",
                temperature=0,
            )

        with st.spinner("Querying folder to get result..."):
            result = query_folder(
                folder_index=folder_index,
                query=query,
                return_all=return_all_chunks,
                llm=llm,
            )

        with answer_col:
            st.markdown("#### Answer")
            st.markdown(result.answer)

        with sources_col:
            st.markdown("#### Sources")
            for source in result.sources:
                st.markdown(source.page_content)
                st.markdown(source.metadata["source"])
                st.markdown("---")


if __name__ == "__main__":
    main()