bohmian committed on
Commit
4bb91cf
1 Parent(s): eabbef9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -42
app.py CHANGED
@@ -27,6 +27,8 @@ import warnings
27
  warnings.filterwarnings("ignore", category=FutureWarning)
28
  warnings.filterwarnings("ignore", category=DeprecationWarning)
29
 
 
 
30
  # os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
31
 
32
 
@@ -97,21 +99,16 @@ st.set_page_config(
97
  )
98
 
99
  # Document Config
 
 
 
 
 
100
  if 'chunk_size' not in st.session_state:
101
  st.session_state['chunk_size'] = 1000 # choose one of [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
102
 
103
  if 'chunk_overlap' not in st.session_state:
104
- st.session_state['chunk_overlap'] = 100 # choose one of [50, 100, 150, 200]
105
-
106
- # scraping results using DuckDuckGo
107
- if 'top_n_results' not in st.session_state:
108
- st.session_state['top_n_results'] = 10 # this is for returning top n search results using DuckDuckGo
109
-
110
- if 'countries_to_scrape' not in st.session_state:
111
- st.session_state['countries_to_scrape'] = [] # this is for returning top n search results using DuckDuckGo
112
-
113
- # in main app, add configuration for user to scrape new data from DuckDuckGo
114
- # in main app, add configuration for user to upload PDF to override country's existing policies in vectorstore
115
 
116
  # Retriever Config
117
  if 'chroma_n_similar_documents' not in st.session_state:
@@ -379,15 +376,49 @@ with st.sidebar:
379
  [
380
  "Main Chatbot",
381
  "View Source Docs for Last Query",
382
- "Scrape or Upload Docs",
383
  ],
384
  icons=['house', 'gear', 'gear', 'gear'],
385
  menu_icon="", default_index=0)
386
 
387
- with st.container(border = True):
388
- st.write("DO NOT NAVIGATE between pages or change when agent is still generating messages in the chat. Wait for query to complete first.")
389
- st.write("")
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  with st.expander("LLM Config", expanded = True):
392
 
393
  st.selectbox(
@@ -410,22 +441,9 @@ with st.sidebar:
410
  200, 1000,
411
  on_change=update_llm,
412
  key="max_new_tokens"
413
- )
414
 
415
- with st.expander("Document Config", expanded = True):
416
- st.selectbox(
417
- "Chunk Size",
418
- options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
419
- on_change=update_retrievers,
420
- key="chunk_size"
421
- )
422
-
423
- st.selectbox(
424
- "Chunk Overlap",
425
- options=[50, 100, 150, 200],
426
- on_change=update_retrievers,
427
- key="chunk_overlap"
428
- )
429
 
430
  with st.expander("Retriever Config", expanded = True):
431
 
@@ -517,18 +535,9 @@ if page == "Main Chatbot":
517
  st.markdown(response)
518
 
519
 
520
-
521
- ################################ Document Page ################################
522
- # to scrape new documents from DuckDuckGo
523
- # to chnange paramters like chunk size
524
- # to upload own PDF
525
- # to override existing data on new scraped data or new pdf uploaded
526
-
527
-
528
-
529
  ################################ Source Documents Page ################################
530
  if page == "View Source Docs for Last Query":
531
- st.header("Source Documents for Last Query")
532
  try:
533
  st.subheader(st.session_state['source_documents'][0])
534
  for doc in st.session_state['source_documents'][1:]:
@@ -539,6 +548,45 @@ if page == "View Source Docs for Last Query":
539
 
540
 
541
 
542
- # in main app, add configuration for user to scrape new data from DuckDuckGo
543
- # in main app, add configuration for user to upload PDF to override country's existing policies in vectorstore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
 
 
 
27
  warnings.filterwarnings("ignore", category=FutureWarning)
28
  warnings.filterwarnings("ignore", category=DeprecationWarning)
29
 
30
+ import glob
31
+
32
  # os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
33
 
34
 
 
99
  )
100
 
101
  # Document Config
102
+ if 'countries_override' not in st.session_state:
103
+ # countries to override with own documents from uploaded pdf or updated scraped search results
104
+ # must first scrape or upload own documents to use this
105
+ st.session_state['countries_override'] = []
106
+
107
  if 'chunk_size' not in st.session_state:
108
  st.session_state['chunk_size'] = 1000 # choose one of [500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
109
 
110
  if 'chunk_overlap' not in st.session_state:
111
+ st.session_state['chunk_overlap'] = 100 # choose one of [50, 100, 150, 200]
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Retriever Config
114
  if 'chroma_n_similar_documents' not in st.session_state:
 
376
  [
377
  "Main Chatbot",
378
  "View Source Docs for Last Query",
379
+ "Scrape or Upload Own Docs",
380
  ],
381
  icons=['house', 'gear', 'gear', 'gear'],
382
  menu_icon="", default_index=0)
383
 
 
 
 
384
 
385
+ with st.expander("Warning", expanded = True):
386
+ st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
387
+
388
+ st.container()
389
+
390
+ # check whether retrievers built from the user's own uploaded PDF or newly scraped data exist
391
+ new_documents_chroma = glob.glob("chromadb/new*")
392
+ new_documents_bm25 = glob.glob("chromadb/new*")
393
+ new_countries = [doc.split('_')[1] for doc in new_documents_chroma]
394
+ if len(new_countries) == 0:
395
+ info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
396
+ else:
397
+ info = '(⚠️Own documents for the following countries found, select them in the list below to override)'
398
+
399
+ with st.expander("Document Config", expanded = True):
400
+ st.multiselect(
401
+ 'Countries to Override with Own Docs:' + info,
402
+ new_countries,
403
+ key="countries_override"
404
+ )
405
+
406
+ st.selectbox(
407
+ "Chunk Size",
408
+ options=[500, 600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000],
409
+ on_change=update_retrievers,
410
+ key="chunk_size"
411
+ )
412
+
413
+ st.selectbox(
414
+ "Chunk Overlap",
415
+ options=[50, 100, 150, 200],
416
+ on_change=update_retrievers,
417
+ key="chunk_overlap"
418
+ )
419
+
420
+ st.write("")
421
+
422
  with st.expander("LLM Config", expanded = True):
423
 
424
  st.selectbox(
 
441
  200, 1000,
442
  on_change=update_llm,
443
  key="max_new_tokens"
444
+ )
445
 
446
+ st.write("")
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
  with st.expander("Retriever Config", expanded = True):
449
 
 
535
  st.markdown(response)
536
 
537
 
 
 
 
 
 
 
 
 
 
538
  ################################ Source Documents Page ################################
539
  if page == "View Source Docs for Last Query":
540
+ st.subheader("Source Documents for Last Query")
541
  try:
542
  st.subheader(st.session_state['source_documents'][0])
543
  for doc in st.session_state['source_documents'][1:]:
 
548
 
549
 
550
 
551
+ ################################ Scrape or Upload Documents Page ################################
552
+ # to scrape new documents from DuckDuckGo
553
+ # to upload own PDF
554
+ # to override existing data with newly scraped data or a newly uploaded PDF
555
+ if page == "Scrape or Upload Own Docs":
556
+ st.header("Scrape or Upload Own PDF")
557
+ st.write("Here you can choose to upload your own PDF or scrape more recent data via DuckDuckGo search for a selected country below.")
558
+ st.write("You will create new BM2.5 (keyword) and Chroma (semantic) retrievers for it. Note that this can take a very long time.")
559
+
560
+ country_scrape_upload = st.selectbox(
561
+ "Select Country",
562
+ options=[
563
+ "Australia", "Bangladesh", "Brunei", "Cambodia", "China", "India", "Indonesia", "Japan", "Laos", "Macau", "Malaysia", "Myanmar",
564
+ "Nepal", "Philippines", "Singapore", "South Korea", "Sri Lanka", "Thailand", "Vietnam", "France", "Germany", "Israel", "Poland",
565
+ "Sweden", "Turkey", "United Kingdom", "United States"
566
+ ],
567
+ )
568
+ options = [
569
+ "Upload Own PDF",
570
+ "Automatically Scrape Web Data using DuckDuckGo (Will take a long time, 10 mins or more)"
571
+ ]
572
+
573
+ option = st.selectbox(
574
+ "How Do You Wish To Create New Documents",
575
+ options=options
576
+ )
577
+
578
+ col1, col2 = st.columns(2)
579
+ with col1:
580
+ with st.container(border = True):
581
+ st.write("New Documents Chunk Size: (Can change in sidebar)" )
582
+ st.text(f"{st.session_state['chunk_size']}" )
583
+ with col2:
584
+ with st.container(border = True):
585
+ st.write("New Documents Chunk Overlap: (Can change in sidebar)" )
586
+ st.text(f"{st.session_state['chunk_overlap']}")
587
+
588
+ st.subheader(f"Selected Option: {option}")
589
+
590
+
591
 
592
+ # spinner