Update app.py
app.py
CHANGED
@@ -256,6 +256,50 @@ def update_retrievers():
     global bm25_retrievers
     chroma_db, bm25_retrievers = get_retrievers()
 
+chroma_db_new = None
+bm25_new_retrievers = {} # to store retrievers for different countries
+
+# get retrievers for country which we override
+if len(st.session_state['countries_override']) > 0:
+    for country in st.session_state['countries_override']:
+        chroma_db_new = Chroma(persist_directory=f"chroma_db/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_",embedding_function=hf_embeddings)
+        bm25_filename = f"bm25/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_.pickle"
+        with open(bm25_filename, 'rb') as handle:
+            bm25_retriever = pickle.load(handle)
+        bm25_new_retrievers[country] = bm25_retriever
+
+
+# check if there are any new retrievers where user uploaded PDF or scraped new links and return list of countries for them
+def check_for_new_retrievers():
+
+    # see if retrievers/vector stores created by user's own uploaded PDF or newly scraped data is found
+    new_documents_chroma = glob.glob("chromadb/new*")
+    new_documents_bm25 = glob.glob("bm25/new*")
+    new_countries = []
+    print(new_documents_bm25)
+    # loop through new docs in chroma retrievers created by user scraping/pdf (if any)
+    try:
+        for doc in new_documents_chroma:
+            if (f"bm25\\{doc.split('\\')[1]}.pickle" in new_documents_bm25): # check that the doc also exists for bm25 retriever
+
+                new_doc_country = doc.split('_')[1]
+                new_doc_chunk_size = doc.split('_')[3]
+                new_doc_chunk_overlap = doc.split('_')[5]
+
+                # check that the retrievers are created for the current selected chunk sizes
+                if ((new_doc_chunk_overlap == str(st.session_state['chunk_overlap'])) & (new_doc_chunk_size == str(st.session_state['chunk_size']))):
+                    new_countries.append(new_doc_country)
+    except Exception as e:
+        print(e)
+
+    if len(new_countries) == 0:
+        info = ' (Own documents are :red[NOT FOUND]. Must first scrape or upload own PDF (in menu above) before you can select any countries to override.)'
+    else:
+        info = ' (⚠️Own documents for the following countries :green[FOUND], select them in the list below to override.)'
+
+    return new_countries, info
+
+
 ################################ Tools for Agent to Use ################################
 
 # The most important tool is the first one, which uses a RetrievalQA chain to answer a question about a specific country's ESG policies,
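For reference, the override block and check_for_new_retrievers() above both rely on the naming convention new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_ for the Chroma directories and BM25 pickles, recovering the pieces purely by position after splitting on underscores. A minimal standalone sketch of that parsing (the path below is a made-up example, not a file shipped with the app):

# hypothetical example path following the naming convention used in the diff
example_path = "chromadb/new_Singapore_chunk_1000_overlap_100_"

parts = example_path.split('_')
new_doc_country = parts[1]        # 'Singapore'
new_doc_chunk_size = parts[3]     # '1000'
new_doc_chunk_overlap = parts[5]  # '100'

print(new_doc_country, new_doc_chunk_size, new_doc_chunk_overlap)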
@@ -280,19 +324,18 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
         query_and_country_list = ast.literal_eval(query_and_country)
         query = query_and_country_list[0]
         country = query_and_country_list[1].capitalize() # in case LLM did not capitalize first letter as filtering for metadata is case sensitive
-        if not country in countries:
+        if not country in (countries + st.session_state['countries_override']):
             return """The country that you input into the tool cannot be found.
             If you did not make a mistake and the country that you input is indeed what the user asked,
             then there is no record for the country and no answer can be obtained."""
 
-        #
+        # if there are countries we want to override
         if country in st.session_state['countries_override']:
-            #
-
-
-            #
-
-            pass
+            # keyword
+            bm = bm25_new_retrievers[country]
+            bm.k = st.session_state['bm25_n_similar_documents']
+            # semantic
+            chroma = chroma_db_new.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
         else:
             # keyword
             bm = bm25_retrievers[country]
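The branch above only selects which keyword (bm) and semantic (chroma) retrievers to use; the RetrievalQA chain mentioned in the surrounding comments, which produces result['result'] and result['source_documents'], is built outside this hunk. A hedged sketch of one common way to wire the two retrievers into such a chain with LangChain's EnsembleRetriever (assumes llm, bm, chroma, and query as in the code above; the app's actual chain construction may differ):

from langchain.chains import RetrievalQA
from langchain.retrievers import EnsembleRetriever

# combine keyword (BM25) and semantic (Chroma) retrieval with equal weights
ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[0.5, 0.5])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,                       # assumption: the app's LLM object
    retriever=ensemble,
    return_source_documents=True,  # needed for result['source_documents']
)
result = qa_chain.invoke({"query": query})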
@@ -315,7 +358,7 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
         # all source documents linked to answer any query (or part of it) are visible
         st.session_state['source_documents'].append(f"Documents retrieved for agent query '{query}' for country '{country}'.")
         st.session_state['source_documents'].append(result['source_documents'])
-        return f"{query.capitalize()} for {country}: " + result['result']
+        return f"'{query.capitalize()}' for '{country}': " + result['result']
 
     except Exception as e:
         return f"""There is an error using this tool: {e}. Check if you have input anything wrongly and try again.
@@ -410,29 +453,11 @@ with st.sidebar:
 
     st.write("")
 
-
-    new_documents_chroma = glob.glob("chromadb/new*")
-    new_documents_bm25 = glob.glob("bm25/new*")
-    new_countries = []
-
-    # loop through new docs in chroma retrievers created by user scraping/pdf
-    for i, doc in enumerate(new_documents_chroma):
-        if (doc.split('/')[1] == new_documents_bm25[i].split('/')[1]): # check that the doc also exists for bm25 retriever
-            new_doc_country = doc.split('_')[1]
-            new_doc_chunk_size = doc.split('_')[3]
-            new_doc_chunk_overlap = doc.split('_')[5]
-
-            # check that the retrievers are created for the current selected chunk sizes
-            if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
-                new_countries.append(new_doc_country)
+    new_countries, info = check_for_new_retrievers()
 
     # if new retrievers that pass the above criteria are found, let the user know their countries
     # the user can select from these countries to override existing retrievers
     # otherwise prompt user to scrape or upload own PDF to create the new retrievers
-    if len(new_countries) == 0:
-        info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
-    else:
-        info = '(⚠️Own documents for the following countries found, select them in the list below to override)'
 
     with st.expander("Document Config", expanded = True):
         st.multiselect(
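A hedged sketch of how the returned new_countries and info are presumably used inside the expander (the real st.multiselect arguments sit outside this hunk; the label text here is an assumption, while the countries_override key is the one referenced elsewhere in the diff):

with st.expander("Document Config", expanded = True):
    st.multiselect(
        "Countries to Override" + info,  # assumed label; info explains whether own documents were found
        options=new_countries,
        key="countries_override",
    )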
@@ -498,13 +523,13 @@ with st.sidebar:
        )
 
        st.number_input(
-           "Number of Relevant Documents Returned by Keyword Retriever",
+           "Number of Relevant Documents Returned by Keyword Retriever (BM25)",
            0, 20,
            key="bm25_n_similar_documents"
        )
 
        st.number_input(
-           "Number of Relevant Documents Returned by Semantic Retriever",
+           "Number of Relevant Documents Returned by Semantic Retriever (ChromaDB)",
            0, 20,
            key="chroma_n_similar_documents"
        )
@@ -594,6 +619,7 @@ if page == "View Source Docs for Last Query":
 if page == "Scrape or Upload Own Docs":
     st.header("Scrape or Upload Own PDF")
     st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
+    st.write(":blue[NOTE: Certain countries were not present in the original default vector stores, you can scrape data for these countries too so you can ask about them in the chat.]")
     st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
 
     country_scrape_upload = st.selectbox(
@@ -622,7 +648,7 @@ if page == "Scrape or Upload Own Docs":
         "Automatically Scrape Web Data using DuckDuckGo (Will take 5 mins or more)"
     ]
 
-    option = st.
+    option = st.radio(
         "How Do You Wish To Create New Documents",
         options=options
     )
@@ -631,17 +657,23 @@ if page == "Scrape or Upload Own Docs":
     submit_scrape_web = False
     submit_scrape_vector_store = False
 
-
-
-
+    # save new retrievers in local directory
+    def save_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
+        with st.spinner('Setting up new bm25 retrievers with documents, can take 5 mins and above...'):
+            # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
             # can be used to override existing vectorstore for this country in sidebar document configuration
-
+            setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
 
-
-    # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
+        with st.spinner('Setting up new chromadb vector stores with documents, can take 5 mins and above...'):
+            # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
             # can be used to override existing vectorstore for this country in sidebar document configuration
-
+            setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country_scrape_upload)
+
+        st.toast(":blue[SUCCESS!] New retrievers set up with your new data. To override data for this country, you can :blue[Select the Countries to Override in the 'Document Config'] section of the left sidebar.")
+        st.rerun()
+
 
+
     # form for user to configure pdf loading options
     if option == options[0]:
         with st.form(key='upload_pdf_form'):
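setup_bm25_retriever() and setup_chromadb_vectorstore() are helpers defined elsewhere in app.py, so their bodies are not part of this diff. For the pickle that the override code at the top of this diff later loads to exist, the BM25 helper needs to do something along these lines (a hedged sketch only, with an invented name so it is not confused with the real helper; requires the rank_bm25 package):

import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

def sketch_setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
    # split the scraped/uploaded documents with the user-selected chunking
    splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
    splits = splitter.split_documents(all_documents)

    # build the keyword retriever and persist it under the naming convention the loader expects
    retriever = BM25Retriever.from_documents(splits)
    with open(f"bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle", "wb") as handle:
        pickle.dump(retriever, handle)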
@@ -651,13 +683,20 @@ if page == "Scrape or Upload Own Docs":
             temp_file = "./temp.pdf"
             with open(temp_file, "wb") as file:
                 file.write(uploaded_pdf.getvalue())
-            pdf_filename
+            pdf_filename = uploaded_pdf.name
             submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store')
+            st.markdown(":blue[NOTE:] After you are done creating the vectore store, the country will appear under :blue[Countries to Override in the 'Document Config'] section of the left sidebar. Select the country to override it.")
+
 
         if submit_upload_pdf:
-
-
-
+            try:
+                with st.spinner('Generating documents from PDF...'):
+                    all_documents = pdf_loader_local(temp_file, country_scrape_upload)
+                    #st.write(all_documents)
+                    save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
+
+            except Exception as e:
+                st.write(f"Error! Did you remember to upload the PDF file? Error Message: {e}")
 
 
     # form for user to configure web scraping for duckduckgo
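pdf_loader_local() is likewise an app helper whose implementation is outside this diff. Whatever it does internally, each returned document needs a country metadata field, because the Chroma retriever earlier in this diff filters on {'country': country}. A hedged sketch under that assumption:

from langchain_community.document_loaders import PyPDFLoader

def sketch_pdf_loader_local(pdf_path, country):
    # load one document per page, then tag every page with the selected country
    docs = PyPDFLoader(pdf_path).load()
    for doc in docs:
        doc.metadata["country"] = country
    return docs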
@@ -665,7 +704,7 @@ if page == "Scrape or Upload Own Docs":
         with st.form(key='scrape_web_form'):
             st.subheader(f"Selected Option: {option}")
             n_search_results = st.number_input(
-                "How many DuckDuckGo search results would you like to scrape?",
+                "How many DuckDuckGo search results would you like to scrape? In the default vector stores, the number is 10 but it will take a very long time!",
                 0, 20,
                 value = 5
             )
@@ -673,21 +712,21 @@ if page == "Scrape or Upload Own Docs":
                 "Search Term",
                 value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
             )
-            submit_scrape_web = st.form_submit_button(label='Scrape Web for Results (Scroll down after clicking)')
+            submit_scrape_web = st.form_submit_button(label='Scrape Web for Results and Create Vector Store (Scroll down after clicking)')
 
         if submit_scrape_web:
             with st.spinner('Scraping web using Duck Duck Go search...'):
-
-                with st.form(key='scrape_web_form2'):
-
-
-
-
-
-
-
-
-
+                all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
+            # with st.form(key='scrape_web_form2'):
+            st.write(f"Results from Web Scrape")
+            try:
+                st.write(df_links)
+            except:
+                st.write("Waiting for web scraping results.")
+            # submit_scrape_vector_store = st.form_submit_button(label='Create Vector Store from Search Results')
+
+            # if submit_scrape_vector_store:
+
+            with st.spinner('Generating documents from web search results...'):
                 all_documents = process_links_load_documents(all_links)
-
-                st.write("Done.")
+                save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
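duckduckgo_scrape() is also defined elsewhere in app.py. Based on how it is called above, it likely wraps the duckduckgo_search package and returns the result links plus a DataFrame for display; a hedged sketch with an invented name (the exact columns and return shape are assumptions):

import pandas as pd
from duckduckgo_search import DDGS

def sketch_duckduckgo_scrape(country, search_term, n_search_results):
    # country is already embedded in the default search_term; kept only to mirror the call signature above
    # text search; result dicts typically contain 'title', 'href' and 'body'
    results = DDGS().text(search_term, max_results=int(n_search_results))
    all_links = [r["href"] for r in results]
    df_links = pd.DataFrame(results)
    return all_links, df_links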