Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -36,6 +36,7 @@ from web_scrape_and_pdf_loader import (
|
|
36 |
pdf_loader_local
|
37 |
)
|
38 |
|
|
|
39 |
import glob
|
40 |
|
41 |
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
|
@@ -604,28 +605,21 @@ if page == "Scrape or Upload Own Docs":
|
|
604 |
options=options
|
605 |
)
|
606 |
|
607 |
-
|
608 |
st.subheader(f"Selected Option: {option}")
|
609 |
-
|
610 |
-
|
611 |
uploaded_pdf = st.file_uploader("Upload a PDF")
|
612 |
if uploaded_pdf:
|
613 |
temp_file = "./temp.pdf"
|
614 |
with open(temp_file, "wb") as file:
|
615 |
file.write(uploaded_pdf.getvalue())
|
616 |
pdf_filename, = uploaded_pdf.name
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
|
623 |
-
|
624 |
-
# BM25 retriever for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
625 |
-
# can be used to override the existing BM25 retriever for this country in the sidebar document configuration
|
626 |
-
setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country)
|
627 |
-
|
628 |
-
if option == options[1]:
|
629 |
n_search_results = st.number_input(
|
630 |
"How many DuckDuckGo search results would you like to scrape?",
|
631 |
0, 20,
|
@@ -635,15 +629,21 @@ if page == "Scrape or Upload Own Docs":
|
|
635 |
"Search Term",
|
636 |
value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
|
637 |
)
|
|
|
638 |
|
639 |
-
|
640 |
-
all_documents = process_links_load_documents(all_links)
|
641 |
|
642 |
-
|
643 |
-
|
644 |
-
|
|
|
|
|
|
|
645 |
|
646 |
-
|
647 |
-
|
648 |
-
|
649 |
|
|
|
|
|
|
|
|
36 |
pdf_loader_local
|
37 |
)
|
38 |
|
39 |
+
# look for new retrievers that the user created (to override existing ones if the user chooses)
|
40 |
import glob
|
41 |
|
42 |
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'your_api_key' # for using HuggingFace Inference API
|
|
|
605 |
options=options
|
606 |
)
|
607 |
|
608 |
+
if option == options[0]:
|
609 |
st.subheader(f"Selected Option: {option}")
|
610 |
+
with st.form(key='upload_pdf_form'):
|
611 |
+
|
612 |
uploaded_pdf = st.file_uploader("Upload a PDF")
|
613 |
if uploaded_pdf:
|
614 |
temp_file = "./temp.pdf"
|
615 |
with open(temp_file, "wb") as file:
|
616 |
file.write(uploaded_pdf.getvalue())
|
617 |
pdf_filename, = uploaded_pdf.name
|
618 |
+
submit_upload_pdf = st.form_submit_button(label='Submit')
|
619 |
+
|
620 |
+
if option == options[1]:
|
621 |
+
st.subheader(f"Selected Option: {option}")
|
622 |
+
with st.form(key='upload_pdf_form'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
623 |
n_search_results = st.number_input(
|
624 |
"How many DuckDuckGo search results would you like to scrape?",
|
625 |
0, 20,
|
|
|
629 |
"Search Term",
|
630 |
value = f"{country_scrape_upload} sustainability esg newest updated public policy document government",
|
631 |
)
|
632 |
+
submit_scrape_web = st.form_submit_button(label='Submit')
|
633 |
|
634 |
+
if submit_upload_pdf | submit_scrape_web:
|
|
|
635 |
|
636 |
+
if submit_upload_pdf:
|
637 |
+
all_documents = pdf_loader_local(pdf_filename, country)
|
638 |
+
|
639 |
+
if submit_scrape_web:
|
640 |
+
all_links, df_links = duckduckgo_scrape(country, search_term, n_search_results)
|
641 |
+
all_documents = process_links_load_documents(all_links)
|
642 |
|
643 |
+
# vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
644 |
+
# can be used to override existing vectorstore for this country in sidebar document configuration
|
645 |
+
setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
|
646 |
|
647 |
+
# BM25 retriever for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
648 |
+
# can be used to override the existing BM25 retriever for this country in the sidebar document configuration
|
649 |
+
setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country)
|