Update app.py
app.py
CHANGED
@@ -256,6 +256,50 @@ def update_retrievers():
     global bm25_retrievers
     chroma_db, bm25_retrievers = get_retrievers()
 
+chroma_db_new = None
+bm25_new_retrievers = {} # to store retrievers for different countries
+
+# get retrievers for country which we override
+if len(st.session_state['countries_override']) > 0:
+    for country in st.session_state['countries_override']:
+        chroma_db_new = Chroma(persist_directory=f"chroma_db/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_",embedding_function=hf_embeddings)
+        bm25_filename = f"bm25/new_{country}_chunk_{st.session_state['chunk_size']}_overlap_{st.session_state['chunk_overlap']}_.pickle"
+        with open(bm25_filename, 'rb') as handle:
+            bm25_retriever = pickle.load(handle)
+        bm25_new_retrievers[country] = bm25_retriever
+
+
+# check if there are any new retrievers where user uploaded PDF or scraped new links and return list of countries for them
+def check_for_new_retrievers():
+
+    # see if retrievers/vector stores created by user's own uploaded PDF or newly scraped data is found
+    new_documents_chroma = glob.glob("chromadb/new*")
+    new_documents_bm25 = glob.glob("bm25/new*")
+    new_countries = []
+    print(new_documents_bm25)
+    # loop through new docs in chroma retrievers created by user scraping/pdf (if any)
+    try:
+        for doc in new_documents_chroma:
+            if (f"bm25\\{doc.split('\\')[1]}.pickle" in new_documents_bm25): # check that the doc also exists for bm25 retriever
+
+                new_doc_country = doc.split('_')[1]
+                new_doc_chunk_size = doc.split('_')[3]
+                new_doc_chunk_overlap = doc.split('_')[5]
+
+                # check that the retrievers are created for the current selected chunk sizes
+                if ((new_doc_chunk_overlap == str(st.session_state['chunk_overlap'])) & (new_doc_chunk_size == str(st.session_state['chunk_size']))):
+                    new_countries.append(new_doc_country)
+    except Exception as e:
+        print(e)
+
+    if len(new_countries) == 0:
+        info = ' (Own documents are :red[NOT FOUND]. Must first scrape or upload own PDF (in menu above) before you can select any countries to override.)'
+    else:
+        info = ' (⚠️Own documents for the following countries :green[FOUND], select them in the list below to override.)'
+
+    return new_countries, info
+
+
 ################################ Tools for Agent to Use ################################
 
 # The most important tool is the first one, which uses a RetrievalQA chain to answer a question about a specific country's ESG policies,
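For reference, the override block and check_for_new_retrievers() above both rely on the naming convention new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_ for the Chroma directories and BM25 pickles, recovering the pieces purely by position after splitting on underscores. A minimal standalone sketch of that parsing (the path below is a made-up example, not a file shipped with the app):

# hypothetical example path following the naming convention used in the diff
example_path = "chromadb/new_Singapore_chunk_1000_overlap_100_"

parts = example_path.split('_')
new_doc_country = parts[1]        # 'Singapore'
new_doc_chunk_size = parts[3]     # '1000'
new_doc_chunk_overlap = parts[5]  # '100'

print(new_doc_country, new_doc_chunk_size, new_doc_chunk_overlap)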
@@ -280,19 +324,18 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
         query_and_country_list = ast.literal_eval(query_and_country)
         query = query_and_country_list[0]
         country = query_and_country_list[1].capitalize() # in case LLM did not capitalize first letter as filtering for metadata is case sensitive
-        if not country in countries:
+        if not country in (countries + st.session_state['countries_override']):
             return """The country that you input into the tool cannot be found.
             If you did not make a mistake and the country that you input is indeed what the user asked,
             then there is no record for the country and no answer can be obtained."""
 
-        #
+        # if there are countries we want to override
         if country in st.session_state['countries_override']:
-            #
-
-
-            #
-
-            pass
+            # keyword
+            bm = bm25_new_retrievers[country]
+            bm.k = st.session_state['bm25_n_similar_documents']
+            # semantic
+            chroma = chroma_db_new.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
         else:
             # keyword
             bm = bm25_retrievers[country]
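The branch above only selects which keyword (bm) and semantic (chroma) retrievers to use; the RetrievalQA chain mentioned in the surrounding comments, which produces result['result'] and result['source_documents'], is built outside this hunk. A hedged sketch of one common way to wire the two retrievers into such a chain with LangChain's EnsembleRetriever (assumes llm, bm, chroma, and query as in the code above; the app's actual chain construction may differ):

from langchain.chains import RetrievalQA
from langchain.retrievers import EnsembleRetriever

# combine keyword (BM25) and semantic (Chroma) retrieval with equal weights
ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[0.5, 0.5])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,                       # assumption: the app's LLM object
    retriever=ensemble,
    return_source_documents=True,  # needed for result['source_documents']
)
result = qa_chain.invoke({"query": query})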
@@ -315,7 +358,7 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
         # all source documents linked to answer any query (or part of it) are visible
         st.session_state['source_documents'].append(f"Documents retrieved for agent query '{query}' for country '{country}'.")
         st.session_state['source_documents'].append(result['source_documents'])
-        return f"{query.capitalize()} for {country}: " + result['result']
+        return f"'{query.capitalize()}' for '{country}': " + result['result']
 
     except Exception as e:
         return f"""There is an error using this tool: {e}. Check if you have input anything wrongly and try again.
@@ -410,29 +453,11 @@ with st.sidebar:
 
     st.write("")
 
-
-    new_documents_chroma = glob.glob("chromadb/new*")
-    new_documents_bm25 = glob.glob("bm25/new*")
-    new_countries = []
-
-    # loop through new docs in chroma retrievers created by user scraping/pdf
-    for i, doc in enumerate(new_documents_chroma):
-        if (doc.split('/')[1] == new_documents_bm25[i].split('/')[1]): # check that the doc also exists for bm25 retriever
-            new_doc_country = doc.split('_')[1]
-            new_doc_chunk_size = doc.split('_')[3]
-            new_doc_chunk_overlap = doc.split('_')[5]
-
-            # check that the retrievers are created for the current selected chunk sizes
-            if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
-                new_countries.append(new_doc_country)
+    new_countries, info = check_for_new_retrievers()
 
     # if new retrievers that pass the above criteria are found, let the user know their countries
     # the user can select from these countries to override existing retrievers
     # otherwise prompt user to scrape or upload own PDF to create the new retrievers
-    if len(new_countries) == 0:
-        info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
-    else:
-        info = '(⚠️Own documents for the following countries found, select them in the list below to override)'
 
     with st.expander("Document Config", expanded = True):
         st.multiselect(
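A hedged sketch of how the returned new_countries and info are presumably used inside the expander (the real st.multiselect arguments sit outside this hunk; the label text here is an assumption, while the countries_override key is the one referenced elsewhere in the diff):

with st.expander("Document Config", expanded = True):
    st.multiselect(
        "Countries to Override" + info,  # assumed label; info explains whether own documents were found
        options=new_countries,
        key="countries_override",
    )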
@@ -498,13 +523,13 @@ with st.sidebar:
        )
 
        st.number_input(
-           "Number of Relevant Documents Returned by Keyword Retriever",
+           "Number of Relevant Documents Returned by Keyword Retriever (BM25)",
            0, 20,
            key="bm25_n_similar_documents"
        )
 
        st.number_input(
-           "Number of Relevant Documents Returned by Semantic Retriever",
+           "Number of Relevant Documents Returned by Semantic Retriever (ChromaDB)",
            0, 20,
            key="chroma_n_similar_documents"
        )
@@ -594,6 +619,7 @@ if page == "View Source Docs for Last Query":
 if page == "Scrape or Upload Own Docs":
     st.header("Scrape or Upload Own PDF")
     st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
+    st.write(":blue[NOTE: Certain countries were not present in the original default vector stores, you can scrape data for these countries too so you can ask about them in the chat.]")
     st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
 
     country_scrape_upload = st.selectbox(
@@ -622,7 +648,7 @@ if page == "Scrape or Upload Own Docs":
         "Automatically Scrape Web Data using DuckDuckGo (Will take 5 mins or more)"
     ]
 
-    option = st.
+    option = st.radio(
         "How Do You Wish To Create New Documents",
         options=options
     )
@@ -631,17 +657,23 @@ if page == "Scrape or Upload Own Docs":
     submit_scrape_web = False
     submit_scrape_vector_store = False
 
-
-
-
+    # save new retrievers in local directory
+    def save_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
+        with st.spinner('Setting up new bm25 retrievers with documents, can take 5 mins and above...'):
+            # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
             # can be used to override existing vectorstore for this country in sidebar document configuration
-
+            setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
 
-
-    # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
+        with st.spinner('Setting up new chromadb vector stores with documents, can take 5 mins and above...'):
+            # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
             # can be used to override existing vectorstore for this country in sidebar document configuration
-
+            setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country_scrape_upload)
+
+        st.toast(":blue[SUCCESS!] New retrievers set up with your new data. To override data for this country, you can :blue[Select the Countries to Override in the 'Document Config'] section of the left sidebar.")
+        st.rerun()
+
 
+
     # form for user to configure pdf loading options
     if option == options[0]:
         with st.form(key='upload_pdf_form'):
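setup_bm25_retriever() and setup_chromadb_vectorstore() are helpers defined elsewhere in app.py, so their bodies are not part of this diff. For the pickle that the override code at the top of this diff later loads to exist, the BM25 helper needs to do something along these lines (a hedged sketch only, with an invented name so it is not confused with the real helper; requires the rank_bm25 package):

import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

def sketch_setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
    # split the scraped/uploaded documents with the user-selected chunking
    splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
    splits = splitter.split_documents(all_documents)

    # build the keyword retriever and persist it under the naming convention the loader expects
    retriever = BM25Retriever.from_documents(splits)
    with open(f"bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle", "wb") as handle:
        pickle.dump(retriever, handle)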
@@ -651,13 +683,20 @@ if page == "Scrape or Upload Own Docs":
             temp_file = "./temp.pdf"
             with open(temp_file, "wb") as file:
                 file.write(uploaded_pdf.getvalue())
-            pdf_filename
+            pdf_filename = uploaded_pdf.name
             submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store')
+            st.markdown(":blue[NOTE:] After you are done creating the vectore store, the country will appear under :blue[Countries to Override in the 'Document Config'] section of the left sidebar. Select the country to override it.")
+
 
         if submit_upload_pdf:
-
-
-
+            try:
+                with st.spinner('Generating documents from PDF...'):
+                    all_documents = pdf_loader_local(temp_file, country_scrape_upload)
+                    #st.write(all_documents)
+                    save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
+
+            except Exception as e:
+                st.write(f"Error! Did you remember to upload the PDF file? Error Message: {e}")
 
 
     # form for user to configure web scraping for duckduckgo
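pdf_loader_local() is likewise an app helper whose implementation is outside this diff. Whatever it does internally, each returned document needs a country metadata field, because the Chroma retriever earlier in this diff filters on {'country': country}. A hedged sketch under that assumption:

from langchain_community.document_loaders import PyPDFLoader

def sketch_pdf_loader_local(pdf_path, country):
    # load one document per page, then tag every page with the selected country
    docs = PyPDFLoader(pdf_path).load()
    for doc in docs:
        doc.metadata["country"] = country
    return docs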
@@ -665,7 +704,7 @@ if page == "Scrape or Upload Own Docs":
         with st.form(key='scrape_web_form'):
             st.subheader(f"Selected Option: {option}")
             n_search_results = st.number_input(
-                "How many DuckDuckGo search results would you like to scrape?",
+                "How many DuckDuckGo search results would you like to scrape? In the default vector stores, the number is 10 but it will take a very long time!",
                 0, 20,
                 value = 5
             )
@@ -673,21 +712,21 @@ if page == "Scrape or Upload Own Docs":
                 "Search Term",
                 value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
             )
-            submit_scrape_web = st.form_submit_button(label='Scrape Web for Results (Scroll down after clicking)')
+            submit_scrape_web = st.form_submit_button(label='Scrape Web for Results and Create Vector Store (Scroll down after clicking)')
 
         if submit_scrape_web:
             with st.spinner('Scraping web using Duck Duck Go search...'):
-
-                with st.form(key='scrape_web_form2'):
-
-
-
-
-
-
-
-
-
+                all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
+            # with st.form(key='scrape_web_form2'):
+            st.write(f"Results from Web Scrape")
+            try:
+                st.write(df_links)
+            except:
+                st.write("Waiting for web scraping results.")
+            # submit_scrape_vector_store = st.form_submit_button(label='Create Vector Store from Search Results')
+
+            # if submit_scrape_vector_store:
+
+            with st.spinner('Generating documents from web search results...'):
                 all_documents = process_links_load_documents(all_links)
-
-                st.write("Done.")
+                save_new_retrievers(all_documents, st.session_state['chunk_size'], st.session_state['chunk_overlap'], country_scrape_upload)
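duckduckgo_scrape() is also defined elsewhere in app.py. Based on how it is called above, it likely wraps the duckduckgo_search package and returns the result links plus a DataFrame for display; a hedged sketch with an invented name (the exact columns and return shape are assumptions):

import pandas as pd
from duckduckgo_search import DDGS

def sketch_duckduckgo_scrape(country, search_term, n_search_results):
    # country is already embedded in the default search_term; kept only to mirror the call signature above
    # text search; result dicts typically contain 'title', 'href' and 'body'
    results = DDGS().text(search_term, max_results=int(n_search_results))
    all_links = [r["href"] for r in results]
    df_links = pd.DataFrame(results)
    return all_links, df_links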