Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -27,6 +27,8 @@ import warnings
|
|
27 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
28 |
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
29 |
|
|
|
|
|
30 |
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
|
31 |
|
32 |
|
@@ -97,21 +99,16 @@ st.set_page_config(
|
|
97 |
)
|
98 |
|
99 |
# Document Config
|
|
|
|
|
|
|
|
|
|
|
100 |
if 'chunk_size' not in st.session_state:
|
101 |
st.session_state['chunk_size'] = 1000 # choose one of [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
|
102 |
|
103 |
if 'chunk_overlap' not in st.session_state:
|
104 |
-
st.session_state['chunk_overlap'] = 100 # choose one of [50, 100, 150, 200]
|
105 |
-
|
106 |
-
# scraping results using DuckDuckGo
|
107 |
-
if 'top_n_results' not in st.session_state:
|
108 |
-
st.session_state['top_n_results'] = 10 # this is for returning top n search results using DuckDuckGo
|
109 |
-
|
110 |
-
if 'countries_to_scrape' not in st.session_state:
|
111 |
-
st.session_state['countries_to_scrape'] = [] # countries to scrape using DuckDuckGo (empty by default)
|
112 |
-
|
113 |
-
# in main app, add configuration for user to scrape new data from DuckDuckGo
|
114 |
-
# in main app, add configuration for user to upload PDF to override country's existing policies in vectorstore
|
115 |
|
116 |
# Retriever Config
|
117 |
if 'chroma_n_similar_documents' not in st.session_state:
|
@@ -379,15 +376,49 @@ with st.sidebar:
|
|
379 |
[
|
380 |
"Main Chatbot",
|
381 |
"View Source Docs for Last Query",
|
382 |
-
"Scrape or Upload Docs",
|
383 |
],
|
384 |
icons=['house', 'gear', 'gear', 'gear'],
|
385 |
menu_icon="", default_index=0)
|
386 |
|
387 |
-
with st.container(border = True):
|
388 |
-
st.write("DO NOT NAVIGATE between pages or change when agent is still generating messages in the chat. Wait for query to complete first.")
|
389 |
-
st.write("")
|
390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
with st.expander("LLM Config", expanded = True):
|
392 |
|
393 |
st.selectbox(
|
@@ -410,22 +441,9 @@ with st.sidebar:
|
|
410 |
200, 1000,
|
411 |
on_change=update_llm,
|
412 |
key="max_new_tokens"
|
413 |
-
)
|
414 |
|
415 |
-
|
416 |
-
st.selectbox(
|
417 |
-
"Chunk Size",
|
418 |
-
options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
|
419 |
-
on_change=update_retrievers,
|
420 |
-
key="chunk_size"
|
421 |
-
)
|
422 |
-
|
423 |
-
st.selectbox(
|
424 |
-
"Chunk Overlap",
|
425 |
-
options=[50, 100, 150, 200],
|
426 |
-
on_change=update_retrievers,
|
427 |
-
key="chunk_overlap"
|
428 |
-
)
|
429 |
|
430 |
with st.expander("Retriever Config", expanded = True):
|
431 |
|
@@ -517,18 +535,9 @@ if page == "Main Chatbot":
|
|
517 |
st.markdown(response)
|
518 |
|
519 |
|
520 |
-
|
521 |
-
################################ Document Page ################################
|
522 |
-
# to scrape new documents from DuckDuckGo
|
523 |
-
# to change parameters like chunk size
|
524 |
-
# to upload own PDF
|
525 |
-
# to override existing data on new scraped data or new pdf uploaded
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
################################ Source Documents Page ################################
|
530 |
if page == "View Source Docs for Last Query":
|
531 |
-
st.
|
532 |
try:
|
533 |
st.subheader(st.session_state['source_documents'][0])
|
534 |
for doc in st.session_state['source_documents'][1:]:
|
@@ -539,6 +548,45 @@ if page == "View Source Docs for Last Query":
|
|
539 |
|
540 |
|
541 |
|
542 |
-
|
543 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
544 |
|
|
|
|
27 |
warnings.filterwarnings("ignore", category=FutureWarning)
|
28 |
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
29 |
|
30 |
+
import glob
|
31 |
+
|
32 |
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
|
33 |
|
34 |
|
|
|
99 |
)
|
100 |
|
101 |
# Document Config
|
102 |
+
if 'countries_override' not in st.session_state:
|
103 |
+
# countries to override with own documents from uploaded pdf or updated scraped search results
|
104 |
+
# must first scrape or upload own documents to use this
|
105 |
+
st.session_state['countries_override'] = []
|
106 |
+
|
107 |
if 'chunk_size' not in st.session_state:
|
108 |
st.session_state['chunk_size'] = 1000 # choose one of [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
|
109 |
|
110 |
if 'chunk_overlap' not in st.session_state:
|
111 |
+
st.session_state['chunk_overlap'] = 100 # choose one of [50, 100, 150, 200]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
# Retriever Config
|
114 |
if 'chroma_n_similar_documents' not in st.session_state:
|
|
|
376 |
[
|
377 |
"Main Chatbot",
|
378 |
"View Source Docs for Last Query",
|
379 |
+
"Scrape or Upload Own Docs",
|
380 |
],
|
381 |
icons=['house', 'gear', 'gear', 'gear'],
|
382 |
menu_icon="", default_index=0)
|
383 |
|
|
|
|
|
|
|
384 |
|
385 |
+
with st.expander("Warning", expanded = True):
|
386 |
+
st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
|
387 |
+
|
388 |
+
st.container()
|
389 |
+
|
390 |
+
# check whether retrievers created from the user's own uploaded PDF or newly scraped data exist
|
391 |
+
new_documents_chroma = glob.glob("chromadb/new*")
|
392 |
+
new_documents_bm25 = glob.glob("chromadb/new*")
|
393 |
+
new_countries = [doc.split('_')[1] for doc in new_documents_chroma]
|
394 |
+
if len(new_countries) == 0:
|
395 |
+
info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
|
396 |
+
else:
|
397 |
+
info = '(⚠️Own documents for the following countries found, select them in the list below to override)'
|
398 |
+
|
399 |
+
with st.expander("Document Config", expanded = True):
|
400 |
+
st.multiselect(
|
401 |
+
'Countries to Override with Own Docs:' + info,
|
402 |
+
new_countries,
|
403 |
+
key="countries_override"
|
404 |
+
)
|
405 |
+
|
406 |
+
st.selectbox(
|
407 |
+
"Chunk Size",
|
408 |
+
options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
|
409 |
+
on_change=update_retrievers,
|
410 |
+
key="chunk_size"
|
411 |
+
)
|
412 |
+
|
413 |
+
st.selectbox(
|
414 |
+
"Chunk Overlap",
|
415 |
+
options=[50, 100, 150, 200],
|
416 |
+
on_change=update_retrievers,
|
417 |
+
key="chunk_overlap"
|
418 |
+
)
|
419 |
+
|
420 |
+
st.write("")
|
421 |
+
|
422 |
with st.expander("LLM Config", expanded = True):
|
423 |
|
424 |
st.selectbox(
|
|
|
441 |
200, 1000,
|
442 |
on_change=update_llm,
|
443 |
key="max_new_tokens"
|
444 |
+
)
|
445 |
|
446 |
+
st.write("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
|
448 |
with st.expander("Retriever Config", expanded = True):
|
449 |
|
|
|
535 |
st.markdown(response)
|
536 |
|
537 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
################################ Source Documents Page ################################
|
539 |
if page == "View Source Docs for Last Query":
|
540 |
+
st.subheader("Source Documents for Last Query")
|
541 |
try:
|
542 |
st.subheader(st.session_state['source_documents'][0])
|
543 |
for doc in st.session_state['source_documents'][1:]:
|
|
|
548 |
|
549 |
|
550 |
|
551 |
+
################################ Scrape or Upload Documents Page ################################
|
552 |
+
# to scrape new documents from DuckDuckGo
|
553 |
+
# to upload own PDF
|
554 |
+
# to override existing data with newly scraped data or a newly uploaded PDF
|
555 |
+
if page == "Scrape or Upload Own Docs":
|
556 |
+
st.header("Scrape or Upload Own PDF")
|
557 |
+
st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
|
558 |
+
st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
|
559 |
+
|
560 |
+
country_scrape_upload = st.selectbox(
|
561 |
+
"Select Country",
|
562 |
+
options=[
|
563 |
+
"Australia", "Bangladesh", "Brunei", "Cambodia", "China", "India", "Indonesia", "Japan", "Laos", "Macau", "Malaysia", "Myanmar",
|
564 |
+
"Nepal", "Philippines", "Singapore", "South Korea", "Sri Lanka", "Thailand", "Vietnam", "France", "Germany", "Israel", "Poland",
|
565 |
+
"Sweden", "Turkey", "United Kingdom", "United States"
|
566 |
+
],
|
567 |
+
)
|
568 |
+
options = [
|
569 |
+
"Upload Own PDF",
|
570 |
+
"Automatically Scrape Web Data using DuckDuckGo (Will take a long time, 10 mins or more)"
|
571 |
+
]
|
572 |
+
|
573 |
+
option = st.selectbox(
|
574 |
+
"How Do You Wish To Create New Documents",
|
575 |
+
options=options
|
576 |
+
)
|
577 |
+
|
578 |
+
col1, col2 = st.columns(2)
|
579 |
+
with col1:
|
580 |
+
with st.container(border = True):
|
581 |
+
st.write("New Documents Chunk Size: (Can change in sidebar)" )
|
582 |
+
st.text(f"{st.session_state['chunk_size']}" )
|
583 |
+
with col2:
|
584 |
+
with st.container(border = True):
|
585 |
+
st.write("New Documents Chunk Overlap: (Can change in sidebar)" )
|
586 |
+
st.text(f"{st.session_state['chunk_overlap']}")
|
587 |
+
|
588 |
+
st.subheader(f"Selected Option: {option}")
|
589 |
+
|
590 |
+
|
591 |
|
592 |
+
# spinner
|