bohmian committed on
Commit
558b359
·
verified ·
1 Parent(s): 8ee023c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -6
app.py CHANGED
@@ -273,12 +273,15 @@ def check_for_new_retrievers():
273
  # see if retrievers/vector stores created from the user's own uploaded PDF or newly scraped data are found
274
  new_documents_chroma = glob.glob("chromadb/new*")
275
  new_documents_bm25 = glob.glob("bm25/new*")
 
 
276
  new_countries = []
277
- print(new_documents_bm25)
278
  # loop through new docs in chroma retrievers created by user scraping/pdf (if any)
279
  try:
280
  for doc in new_documents_chroma:
281
- if ((("bm25\\" + doc.split('\\')[1] + ".pickle") in new_documents_bm25) | (("bm25/" + doc.split('/')[1] + ".pickle") in new_documents_bm25)): # check that the doc also exists for bm25 retriever
 
282
 
283
  new_doc_country = doc.split('_')[1]
284
  new_doc_chunk_size = doc.split('_')[3]
@@ -643,7 +646,7 @@ if page == "Scrape or Upload Own Docs":
643
  # how user wishes to populate documents
644
  options = [
645
  "Upload Own PDF",
646
- "Automatically Scrape Web Data using DuckDuckGo (Will take 5 mins or more)"
647
  ]
648
 
649
  option = st.radio(
@@ -657,12 +660,12 @@ if page == "Scrape or Upload Own Docs":
657
 
658
  # save new retrievers in local directory
659
  def save_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
660
- with st.spinner('Setting up new bm25 retrievers with documents, can take 5 mins and above...'):
661
  # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
662
  # can be used to override existing vectorstore for this country in sidebar document configuration
663
  setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
664
 
665
- with st.spinner('Setting up new chromadb vector stores with documents, can take 5 mins and above...'):
666
  # vectorstore for this country will be stored in "chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
667
  # can be used to override existing vectorstore for this country in sidebar document configuration
668
  setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country_scrape_upload)
@@ -682,7 +685,7 @@ if page == "Scrape or Upload Own Docs":
682
  with open(temp_file, "wb") as file:
683
  file.write(uploaded_pdf.getvalue())
684
  pdf_filename = uploaded_pdf.name
685
- submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store')
686
  st.markdown(":blue[NOTE:] After you are done creating the vectore store, the country will appear under :blue[Countries to Override in the 'Document Config'] section of the left sidebar. Select the country to override it.")
687
 
688
 
 
273
  # see if retrievers/vector stores created from the user's own uploaded PDF or newly scraped data are found
274
  new_documents_chroma = glob.glob("chromadb/new*")
275
  new_documents_bm25 = glob.glob("bm25/new*")
276
+ new_documents_chroma = [os.path.split(doc)[-1] for doc in new_documents_chroma]
277
+ new_documents_bm25 = [os.path.split(doc)[-1] for doc in new_documents_bm25]
278
  new_countries = []
279
+
280
  # loop through new docs in chroma retrievers created by user scraping/pdf (if any)
281
  try:
282
  for doc in new_documents_chroma:
283
+ #print(doc)
284
+ if ((doc + ".pickle") in new_documents_bm25): # check that the doc also exists for bm25 retriever
285
 
286
  new_doc_country = doc.split('_')[1]
287
  new_doc_chunk_size = doc.split('_')[3]
 
646
  # how user wishes to populate documents
647
  options = [
648
  "Upload Own PDF",
649
+ "Automatically Scrape Web Data using DuckDuckGo (may take more than 5 mins)"
650
  ]
651
 
652
  option = st.radio(
 
660
 
661
  # save new retrievers in local directory
662
  def save_new_retrievers(all_documents, chunk_size, chunk_overlap, country_scrape_upload):
663
+ with st.spinner('Setting up new bm25 retrievers with documents, may take more than 5 mins...'):
664
  # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
665
  # can be used to override existing vectorstore for this country in sidebar document configuration
666
  setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
667
 
668
+ with st.spinner('Setting up new chromadb vector stores with documents, may take more than 5 mins...'):
669
  # vectorstore for this country will be stored in "chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
670
  # can be used to override existing vectorstore for this country in sidebar document configuration
671
  setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country_scrape_upload)
 
685
  with open(temp_file, "wb") as file:
686
  file.write(uploaded_pdf.getvalue())
687
  pdf_filename = uploaded_pdf.name
688
+ submit_upload_pdf = st.form_submit_button(label='Upload and Create Vector Store (Scroll down after clicking)')
689
  st.markdown(":blue[NOTE:] After you are done creating the vectore store, the country will appear under :blue[Countries to Override in the 'Document Config'] section of the left sidebar. Select the country to override it.")
690
 
691