Update app.py
app.py
CHANGED
@@ -172,7 +172,8 @@ def get_llm():
     )
     return llm
 
-@st.cache_data # only going to get this once instead of all the time when page
+@st.cache_data # only going to get this once instead of all the time when page refreshes
+# for chromadb vector store
 def get_embeddings():
     with st.spinner(f'Getting HuggingFaceEmbeddings'):
         # We use HuggingFaceEmbeddings() as it is open source and free to use.
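For reference, the cached helper presumably does little more than construct the embedding model once per session; the sketch below illustrates the idea. The function body, the import path, and the switch to st.cache_resource are assumptions for illustration, not part of this commit (the commit itself keeps st.cache_data, which pickles return values rather than caching the object).

import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings  # older LangChain releases: langchain.embeddings

@st.cache_resource  # Streamlit's cache for global resources such as models and DB handles
def get_embeddings():
    # Runs once per session; subsequent script reruns reuse the cached embedding object.
    with st.spinner('Getting HuggingFaceEmbeddings'):
        return HuggingFaceEmbeddings()  # free, open-source sentence-transformers embeddings

hf_embeddings = get_embeddings()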
@@ -209,28 +210,33 @@ if not os.path.exists("chromadb/"):
     with st.spinner(f'Unzipping chromadb retrievers for all chunk sizes and overlaps, will take some time'):
         os.system("unzip chromadb.zip")
 
-
-
-chroma_db = Chroma(persist_directory=persist_directory, embedding_function=hf_embeddings)
-
-# Initialize BM25 Retriever
-# Unlike Chroma (semantic), BM25 is a keyword-based algorithm that performs well on queries containing keywords without capturing the semantic meaning of the query terms,
-# hence there is no need to embed the text with HuggingFaceEmbeddings and it is relatively faster to set up.
-# The retrievers with different chunking sizes and overlaps and countries were created in advance and saved as pickle files and pulled using !wget.
-# Need to initialize one BM25Retriever for each country so the search results later in the main app can be limited to just a particular country.
-# (Chroma DB gives an option to filter metadata for just a particular country during the retrieval process, but BM25 does not because it relies on an external ranking library.)
-# A separate retriever was hence pre-built for each unique country and each unique chunk size and overlap.
-bm25_retrievers = {} # to store retrievers for different countries
-with st.spinner(f'Setting up pre-built bm25 retrievers'):
-    for country in countries:
-        bm25_filename = f"bm25/bm25_esg_countries_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}.pickle"
-        with open(bm25_filename, 'rb') as handle:
-            bm25_retriever = pickle.load(handle)
-        bm25_retrievers[country] = bm25_retriever
-
-# One retriever above is semantic based and the other is keyword based
+
+# One retriever below is semantic based (chromadb) and the other is keyword based (bm25)
 # Both retrievers will be used
 # Then Langchain's EnsembleRetriever will be used to rerank both their results to give final output to RetrievalQA chain below
+def get_retrievers():
+    persist_directory = f"chromadb/chromadb_esg_countries_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}"
+    with st.spinner(f'Setting up pre-built chroma vector store'):
+        chroma_db = Chroma(persist_directory=persist_directory, embedding_function=hf_embeddings)
+
+    # Initialize BM25 Retriever
+    # Unlike Chroma (semantic), BM25 is a keyword-based algorithm that performs well on queries containing keywords without capturing the semantic meaning of the query terms,
+    # hence there is no need to embed the text with HuggingFaceEmbeddings and it is relatively faster to set up.
+    # The retrievers with different chunking sizes and overlaps and countries were created in advance and saved as pickle files and pulled using !wget.
+    # Need to initialize one BM25Retriever for each country so the search results later in the main app can be limited to just a particular country.
+    # (Chroma DB gives an option to filter metadata for just a particular country during the retrieval process, but BM25 does not because it relies on an external ranking library.)
+    # A separate retriever was hence pre-built for each unique country and each unique chunk size and overlap.
+    bm25_retrievers = {} # to store retrievers for different countries
+    with st.spinner(f'Setting up pre-built bm25 retrievers'):
+        for country in countries:
+            bm25_filename = f"bm25/bm25_esg_countries_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}.pickle"
+            with open(bm25_filename, 'rb') as handle:
+                bm25_retriever = pickle.load(handle)
+            bm25_retrievers[country] = bm25_retriever
+
+    return chroma_db, bm25_retrievers
+
+chroma_db, bm25_retrievers = get_retrievers()
 
 ################################ Tools for Agent to Use ################################
 
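The per-country BM25 pickles loaded in get_retrievers() would have been produced offline along these lines. This is a rough sketch under stated assumptions: chunked LangChain Document objects grouped by country and the same filename convention as above; the helper name and input shape are illustrative and not taken from the repository.

import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever  # older releases: langchain.retrievers

def build_bm25_pickles(docs_by_country, chunk_size, chunk_overlap):
    # docs_by_country: dict of country name -> list of Document objects (assumed input shape)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    for country, docs in docs_by_country.items():
        chunks = splitter.split_documents(docs)
        retriever = BM25Retriever.from_documents(chunks)  # keyword index; no embedding model involved
        filename = f"bm25/bm25_esg_countries_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"
        with open(filename, 'wb') as handle:
            pickle.dump(retriever, handle)

Because the retriever is a plain Python object wrapping a rank_bm25 index, pickling one per country is far cheaper than rebuilding a Chroma store for every chunk size and overlap.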
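The comments in this hunk say the two retrievers are later combined with LangChain's EnsembleRetriever and fed into a RetrievalQA chain. A minimal sketch of that wiring, assuming a per-country lookup at query time; the weights, k value, and the "country" metadata key are illustrative assumptions, not values taken from this diff:

from langchain.retrievers import EnsembleRetriever
from langchain.chains import RetrievalQA

def make_qa_chain(llm, chroma_db, bm25_retrievers, country):
    # Chroma side: semantic search, restricted to one country via a metadata filter (assumed key name).
    chroma_retriever = chroma_db.as_retriever(
        search_kwargs={"k": 4, "filter": {"country": country}})
    # BM25 side: keyword search, already restricted because one retriever was pickled per country.
    bm25_retriever = bm25_retrievers[country]
    # EnsembleRetriever merges and re-ranks both result lists using reciprocal rank fusion.
    ensemble = EnsembleRetriever(
        retrievers=[bm25_retriever, chroma_retriever],
        weights=[0.5, 0.5])
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble,
        return_source_documents=True)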
@@ -450,6 +456,14 @@ if page == "Chat Config":
 # to override existing data on new scraped data or new pdf uploaded
 if page == "Document, Retriever, Web Scraping Config":
     st.header(page)
+    st.session_state['chunk_size'] = st.selectbox(
+        "Chunk Size",
+        options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
+        key="chunk_size").on_change(get_retrievers)
+    st.session_state['chunk_overlap'] = st.selectbox(
+        "Chunk Overlap",
+        options=[50, 100, 150, 200],
+        key="chunk_overlap").on_change(get_retrievers)
 
 
 ################################ Main Chatbot Page ################################
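One caveat on the lines added in this hunk: in Streamlit's API, on_change is a keyword argument of st.selectbox rather than a method on its return value, and a widget created with key= already writes its selection into st.session_state, so the explicit assignment is redundant. A sketch of the documented pattern (not the author's code) would look roughly like this:

st.selectbox(
    "Chunk Size",
    options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
    key="chunk_size",          # selection is stored in st.session_state["chunk_size"]
    on_change=get_retrievers,  # callback runs when the user picks a new value
)
st.selectbox(
    "Chunk Overlap",
    options=[50, 100, 150, 200],
    key="chunk_overlap",
    on_change=get_retrievers,
)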