bohmian committed on
Commit
c4776d0
1 Parent(s): c2768d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -57
app.py CHANGED
@@ -256,6 +256,50 @@ def update_retrievers():
256
  global bm25_retrievers
257
  chroma_db, bm25_retrievers = get_retrievers()
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  ################################ Tools for Agent to Use ################################
260
 
261
  # The most important tool is the first one, which uses a RetrievalQA chain to answer a question about a specific country's ESG policies,
@@ -280,19 +324,18 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
280
  query_and_country_list = ast.literal_eval(query_and_country)
281
  query = query_and_country_list[0]
282
  country = query_and_country_list[1].capitalize() # in case LLM did not capitalize first letter as filtering for metadata is case sensitive
283
- if not country in countries:
284
  return """The country that you input into the tool cannot be found.
285
  If you did not make a mistake and the country that you input is indeed what the user asked,
286
  then there is no record for the country and no answer can be obtained."""
287
 
288
- # different retrievers
289
  if country in st.session_state['countries_override']:
290
- # # keyword
291
- # bm = new_bm25_retrievers[country]
292
- # bm.k = st.session_state['bm25_n_similar_documents']
293
- # # semantic
294
- # chroma = new_chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
295
- pass
296
  else:
297
  # keyword
298
  bm = bm25_retrievers[country]
@@ -315,7 +358,7 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
315
  # all source documents linked to answer any query (or part of it) are visible
316
  st.session_state['source_documents'].append(f"Documents retrieved for agent query '{query}' for country '{country}'.")
317
  st.session_state['source_documents'].append(result['source_documents'])
318
- return f"{query.capitalize()} for {country}: " + result['result']
319
 
320
  except Exception as e:
321
  return f"""There is an error using this tool: {e}. Check if you have input anything wrongly and try again.
@@ -410,29 +453,11 @@ with st.sidebar:
410
 
411
  st.write("")
412
 
413
- # see if retrievers/vector stores created by user's own uploaded PDF or newly scraped data is found
414
- new_documents_chroma = glob.glob("chromadb/new*")
415
- new_documents_bm25 = glob.glob("bm25/new*")
416
- new_countries = []
417
-
418
- # loop through new docs in chroma retrievers created by user scraping/pdf
419
- for i, doc in enumerate(new_documents_chroma):
420
- if (doc.split('/')[1] == new_documents_bm25[i].split('/')[1]): # check that the doc also exists for bm25 retriever
421
- new_doc_country = doc.split('_')[1]
422
- new_doc_chunk_size = doc.split('_')[3]
423
- new_doc_chunk_overlap = doc.split('_')[5]
424
-
425
- # check that the retrievers are created for the current selected chunk sizes
426
- if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
427
- new_countries.append(new_doc_country)
428
 
429
  # if new retrievers that pass the above criteria are found, let the user know their countries
430
  # the user can select from these countries to override existing retrievers
431
  # otherwise prompt user to scrape or upload own PDF to create the new retrievers
432
- if len(new_countries) == 0:
433
- info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
434
- else:
435
- info = '(⚠️Own documents for the following countries found, select them in the list below to override)'
436
 
437
  with st.expander("Document Config", expanded = True):
438
  st.multiselect(
@@ -498,13 +523,13 @@ with st.sidebar:
498
  )
499
 
500
  st.number_input(
501
- "Number of Relevant Documents Returned by Keyword Retriever",
502
  0, 20,
503
  key="bm25_n_similar_documents"
504
  )
505
 
506
  st.number_input(
507
- "Number of Relevant Documents Returned by Semantic Retriever",
508
  0, 20,
509
  key="chroma_n_similar_documents"
510
  )
@@ -594,6 +619,7 @@ if page == "View Source Docs for Last Query":
594
  if page == "Scrape or Upload Own Docs":
595
  st.header("Scrape or Upload Own PDF")
596
  st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
 
597
  st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
598
 
599
  country_scrape_upload = st.selectbox(
@@ -622,7 +648,7 @@ if page == "Scrape or Upload Own Docs":
622
  "Automatically Scrape Web Data using DuckDuckGo (Will take 5 mins or more)"
623
  ]
624
 
625
- option = st.selectbox(
626
  "How Do You Wish To Create New Documents",
627
  options=options
628
  )
@@ -631,17 +657,23 @@ if page == "Scrape or Upload Own Docs":
631
  submit_scrape_web = False
632
  submit_scrape_vector_store = False
633
 
634
- def get_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
635
- #with st.spinner('Setting up new bm25 retrievers with documents, can take very long...'):
636
- # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
 
637
  # can be used to override existing vectorstore for this country in sidebar document configuration
638
- setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
639
 
640
- #with st.spinner('Setting up new chromadb vectores with documents, can take 5 mins and above...'):
641
- # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
642
  # can be used to override existing vectorstore for this country in sidebar document configuration
643
- setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
 
 
 
 
644
 
 
645
  # form for user to configure pdf loading options
646
  if option == options[0]:
647
  with st.form(key='upload_pdf_form'):
@@ -651,13 +683,20 @@ if page == "Scrape or Upload Own Docs":
651
  temp_file = "./temp.pdf"
652
  with open(temp_file, "wb") as file:
653
  file.write(uploaded_pdf.getvalue())
654
- pdf_filename, = uploaded_pdf.name
655
  submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store')
 
 
656
 
657
  if submit_upload_pdf:
658
- #with st.spinner('Generating documents from PDF...'):
659
- all_documents = pdf_loader_local(pdf_filename, country_scrape_upload)
660
- get_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
 
 
 
 
 
661
 
662
 
663
  # form for user to configure web scraping for duckduckgo
@@ -665,7 +704,7 @@ if page == "Scrape or Upload Own Docs":
665
  with st.form(key='scrape_web_form'):
666
  st.subheader(f"Selected Option: {option}")
667
  n_search_results = st.number_input(
668
- "How many DuckDuckGo search results would you like to scrape?",
669
  0, 20,
670
  value = 5
671
  )
@@ -673,21 +712,21 @@ if page == "Scrape or Upload Own Docs":
673
  "Search Term",
674
  value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
675
  )
676
- submit_scrape_web = st.form_submit_button(label='Scrape Web for Results (Scroll down after clicking)')
677
 
678
  if submit_scrape_web:
679
  with st.spinner('Scraping web using Duck Duck Go search...'):
680
- all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
681
- with st.form(key='scrape_web_form2'):
682
- st.write(f"Results from Web Scrape")
683
- try:
684
- st.write(df_links)
685
- except:
686
- st.write("Waiting for web scraping results.")
687
- submit_scrape_vector_store = st.form_submit_button(label='Create Vector Store from Search Results')
688
-
689
- if submit_scrape_vector_store:
690
- #with st.spinner('Generating documents from web search results...'):
 
691
  all_documents = process_links_load_documents(all_links)
692
- get_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
693
- st.write("Done.")
 
256
  global bm25_retrievers
257
  chroma_db, bm25_retrievers = get_retrievers()
258
 
259
+ chroma_db_new = None
260
+ bm25_new_retrievers = {} # to store retrievers for different countries
261
+
262
+ # get retrievers for country which we override
263
+ if len(st.session_state['countries_override']) > 0:
264
+ for country in st.session_state['countries_override']:
265
+ chroma_db_new = Chroma(persist_directory=f"chroma_db/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_",embedding_function=hf_embeddings)
266
+ bm25_filename = f"bm25/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_.pickle"
267
+ with open(bm25_filename, 'rb') as handle:
268
+ bm25_retriever = pickle.load(handle)
269
+ bm25_new_retrievers[country] = bm25_retriever
270
+
271
+
272
# check if there are any new retrievers where user uploaded PDF or scraped new links and return list of countries for them
def check_for_new_retrievers():
    """Scan local storage for user-created retrievers and report their countries.

    Looks for Chroma vector stores ("chromadb/new*") and BM25 pickles
    ("bm25/new*") created by the user's own PDF upload or web scrape.
    A country is reported only when BOTH retrievers exist for it AND they
    were built with the currently selected chunk size / overlap.

    Returns:
        tuple: (new_countries, info)
            new_countries (list[str]): countries with matching new retrievers.
            info (str): markdown message for the sidebar telling the user
                whether any own documents were found.
    """
    # see if retrievers/vector stores created by user's own uploaded PDF or newly scraped data is found
    # NOTE(review): glob pattern is "chromadb/new*" but the stores are loaded from
    # "chroma_db/new_..." elsewhere — confirm the directory name is consistent.
    new_documents_chroma = glob.glob("chromadb/new*")
    new_documents_bm25 = glob.glob("bm25/new*")
    new_countries = []

    # Pre-compute bm25 basenames once for O(1) membership tests.
    # Normalising '\\' to '/' makes this work on both Windows and POSIX,
    # where glob returns different path separators. (The previous version
    # split on '\\' only, and embedded the backslash split inside an
    # f-string expression — a SyntaxError on Python < 3.12.)
    bm25_basenames = {p.replace('\\', '/').split('/')[-1] for p in new_documents_bm25}

    # loop through new docs in chroma retrievers created by user scraping/pdf (if any)
    try:
        for doc in new_documents_chroma:
            doc_name = doc.replace('\\', '/').split('/')[-1]
            # check that the doc also exists for the bm25 retriever
            if f"{doc_name}.pickle" in bm25_basenames:
                # basenames look like "new_{country}_chunk_{size}_overlap_{overlap}_",
                # so after split('_'): [1]=country, [3]=chunk size, [5]=chunk overlap
                parts = doc_name.split('_')
                new_doc_country = parts[1]
                new_doc_chunk_size = parts[3]
                new_doc_chunk_overlap = parts[5]

                # check that the retrievers are created for the current selected chunk sizes
                if (new_doc_chunk_overlap == str(st.session_state['chunk_overlap'])
                        and new_doc_chunk_size == str(st.session_state['chunk_size'])):
                    new_countries.append(new_doc_country)
    except (IndexError, KeyError) as e:
        # a malformed filename (or missing session key) should not crash the sidebar
        print(e)

    if len(new_countries) == 0:
        info = ' (Own documents are :red[NOT FOUND]. Must first scrape or upload own PDF (in menu above) before you can select any countries to override.)'
    else:
        info = ' (⚠️Own documents for the following countries :green[FOUND], select them in the list below to override.)'

    return new_countries, info
301
+
302
+
303
  ################################ Tools for Agent to Use ################################
304
 
305
  # The most important tool is the first one, which uses a RetrievalQA chain to answer a question about a specific country's ESG policies,
 
324
  query_and_country_list = ast.literal_eval(query_and_country)
325
  query = query_and_country_list[0]
326
  country = query_and_country_list[1].capitalize() # in case LLM did not capitalize first letter as filtering for metadata is case sensitive
327
+ if not country in (countries + st.session_state['countries_override']):
328
  return """The country that you input into the tool cannot be found.
329
  If you did not make a mistake and the country that you input is indeed what the user asked,
330
  then there is no record for the country and no answer can be obtained."""
331
 
332
+ # if there are countries we want to override
333
  if country in st.session_state['countries_override']:
334
+ # keyword
335
+ bm = bm25_new_retrievers [country]
336
+ bm.k = st.session_state['bm25_n_similar_documents']
337
+ # semantic
338
+ chroma = chroma_db_new.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
 
339
  else:
340
  # keyword
341
  bm = bm25_retrievers[country]
 
358
  # all source documents linked to answer any query (or part of it) are visible
359
  st.session_state['source_documents'].append(f"Documents retrieved for agent query '{query}' for country '{country}'.")
360
  st.session_state['source_documents'].append(result['source_documents'])
361
+ return f"'{query.capitalize()}' for '{country}': " + result['result']
362
 
363
  except Exception as e:
364
  return f"""There is an error using this tool: {e}. Check if you have input anything wrongly and try again.
 
453
 
454
  st.write("")
455
 
456
+ new_countries, info = check_for_new_retrievers()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
  # if new retrievers that pass the above criteria are found, let the user know their countries
459
  # the user can select from these countries to override existing retrievers
460
  # otherwise prompt user to scrape or upload own PDF to create the new retrievers
 
 
 
 
461
 
462
  with st.expander("Document Config", expanded = True):
463
  st.multiselect(
 
523
  )
524
 
525
  st.number_input(
526
+ "Number of Relevant Documents Returned by Keyword Retriever (BM25)",
527
  0, 20,
528
  key="bm25_n_similar_documents"
529
  )
530
 
531
  st.number_input(
532
+ "Number of Relevant Documents Returned by Semantic Retriever (ChromaDB)",
533
  0, 20,
534
  key="chroma_n_similar_documents"
535
  )
 
619
  if page == "Scrape or Upload Own Docs":
620
  st.header("Scrape or Upload Own PDF")
621
  st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
622
+ st.write(":blue[NOTE: Certain countries were not present in the original default vector stores, you can scrape data for these countries too so you can ask about them in the chat.]")
623
  st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
624
 
625
  country_scrape_upload = st.selectbox(
 
648
  "Automatically Scrape Web Data using DuckDuckGo (Will take 5 mins or more)"
649
  ]
650
 
651
+ option = st.radio(
652
  "How Do You Wish To Create New Documents",
653
  options=options
654
  )
 
657
  submit_scrape_web = False
658
  submit_scrape_vector_store = False
659
 
660
# save new retrievers in local directory
def save_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
    """Build and persist BM25 and Chroma retrievers from freshly loaded documents.

    Args:
        all_documents: documents produced from the user's uploaded PDF or web scrape.
        chunk_size: chunk size used when the documents were split.
        chunk_overlap: chunk overlap used when the documents were split.
        country_scrape_upload: country these documents belong to; becomes part of
            the on-disk store names so it can later override the default retriever.

    Side effects: writes retriever files to disk, shows Streamlit spinners/toast,
    and reruns the app so the sidebar picks up the new stores.
    """
    with st.spinner('Setting up new bm25 retrievers with documents, can take 5 mins and above...'):
        # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
        # can be used to override existing vectorstore for this country in sidebar document configuration
        setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)

    with st.spinner('Setting up new chromadb vector stores with documents, can take 5 mins and above...'):
        # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
        # can be used to override existing vectorstore for this country in sidebar document configuration
        setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country_scrape_upload)

    # tell the user where to enable the new data, then rerun so the sidebar refreshes
    st.toast(":blue[SUCCESS!] New retrievers set up with your new data. To override data for this country, you can :blue[Select the Countries to Override in the 'Document Config'] section of the left sidebar.")
    st.rerun()
674
+
675
 
676
+
677
  # form for user to configure pdf loading options
678
  if option == options[0]:
679
  with st.form(key='upload_pdf_form'):
 
683
  temp_file = "./temp.pdf"
684
  with open(temp_file, "wb") as file:
685
  file.write(uploaded_pdf.getvalue())
686
+ pdf_filename = uploaded_pdf.name
687
  submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store')
688
+ st.markdown(":blue[NOTE:] After you are done creating the vectore store, the country will appear under :blue[Countries to Override in the 'Document Config'] section of the left sidebar. Select the country to override it.")
689
+
690
 
691
  if submit_upload_pdf:
692
+ try:
693
+ with st.spinner('Generating documents from PDF...'):
694
+ all_documents = pdf_loader_local(temp_file, country_scrape_upload)
695
+ #st.write(all_documents)
696
+ save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
697
+
698
+ except Exception as e:
699
+ st.write(f"Error! Did you remember to upload the PDF file? Error Message: {e}")
700
 
701
 
702
  # form for user to configure web scraping for duckduckgo
 
704
  with st.form(key='scrape_web_form'):
705
  st.subheader(f"Selected Option: {option}")
706
  n_search_results = st.number_input(
707
+ "How many DuckDuckGo search results would you like to scrape? In the default vector stores, the number is 10 but it will take a very long time!",
708
  0, 20,
709
  value = 5
710
  )
 
712
  "Search Term",
713
  value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
714
  )
715
+ submit_scrape_web = st.form_submit_button(label='Scrape Web for Results and Create Vector Store (Scroll down after clicking)')
716
 
717
  if submit_scrape_web:
718
  with st.spinner('Scraping web using Duck Duck Go search...'):
719
+ all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
720
+ # with st.form(key='scrape_web_form2'):
721
+ st.write(f"Results from Web Scrape")
722
+ try:
723
+ st.write(df_links)
724
+ except:
725
+ st.write("Waiting for web scraping results.")
726
+ # submit_scrape_vector_store = st.form_submit_button(label='Create Vector Store from Search Results')
727
+
728
+ # if submit_scrape_vector_store:
729
+
730
+ with st.spinner('Generating documents from web search results...'):
731
  all_documents = process_links_load_documents(all_links)
732
+ save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)