seanpedrickcase committed
Commit 2754a2b • Parent(s): 1dc162b

Some package updates and minor changes

Dockerfile CHANGED
@@ -17,7 +17,7 @@ COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
 
  # Gradio needs to be installed after due to conflict with spacy in requirements
- RUN pip install --no-cache-dir gradio==4.32.2
+ RUN pip install --no-cache-dir gradio==4.36.1
 
  # Download the BGE embedding model during the build process
  RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
  colorFrom: purple
  colorTo: green
  sdk: gradio
- sdk_version: 4.32.2
+ sdk_version: 4.36.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -183,12 +183,12 @@ depends on factors such as the type of documents or queries. Information taken f
  in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
  # Load in BM25 data
- load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
- then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+ load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column], api_name="load_keyword").\
+ then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, in_clean_data, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file], api_name="prepare_keyword")#.\
 
 
  # BM25 search functions on click or enter
- keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+ keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword_search")
  keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, orig_keyword_data_state, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
  # Fuzzy search functions on click
@@ -209,15 +209,15 @@ depends on factors such as the type of documents or queries. Information taken f
  # Simple run for HF spaces or local on your computer
  #block.queue().launch(debug=True)
 
- #def get_params(request: gr.Request):
+ # def get_params(request: gr.Request):
  # if request:
  # print("Request headers dictionary:", request.headers)
  # print("IP address:", request.client.host)
  # print("Query parameters:", dict(request.query_params))
  # return request.query_params
 
- #request_params = get_params()
- #print(request_params)
+ # request_params = get_params()
+ # print(request_params)
 
  # Running on server (e.g. AWS) without specifying port
  block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
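With api_name set on these handlers, Gradio also exposes them as named endpoints of the Space's API, so the keyword search can be scripted as well as used through the UI. A minimal sketch using gradio_client; the Space URL and argument values below are placeholders, and the real endpoint expects one value per non-state input of bm25_search:

# pip install gradio_client
from gradio_client import Client

# Hypothetical Space URL; substitute the real deployment.
client = Client("https://username-data-text-search.hf.space")

# Calls the handler registered above with api_name="keyword_search".
# Arguments map in order onto its inputs (query, number of results, ...);
# check client.view_api() for the exact signature of the deployed app.
result = client.predict("my search term", 5, api_name="/keyword_search")
print(result)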
how_to_create_exe_dist.txt CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
  9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.4 app.py
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.5 app.py
 
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -28,7 +28,7 @@ a = Analysis(
  }
  )
 
- c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.4.spec
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.5.spec
 
 
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
requirements.txt CHANGED
@@ -2,10 +2,11 @@ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.2
- torch==2.1.2
+ torch==2.3.1
+ transformers==4.41.2
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio
- sentence_transformers==2.3.1
+ sentence_transformers==3.0.1
  lxml==5.1.0
  boto3==1.34.103
requirements_gpu.txt ADDED
@@ -0,0 +1,11 @@
+ pandas==2.2.2
+ polars==0.20.3
+ pyarrow==14.0.2
+ openpyxl==3.1.2
+ torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
+ spacy
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ gradio
+ sentence_transformers==2.3.1
+ lxml==5.1.0
+ boto3==1.34.103
search_funcs/helper_functions.py CHANGED
@@ -37,6 +37,18 @@ default_value = 'output/'
  output_folder = get_or_create_env_var(env_var_name, default_value)
  print(f'The value of {env_var_name} is {output_folder}')
 
+ def ensure_output_folder_exists(output_folder):
+     """Checks if the output folder exists, creates it if not."""
+
+     folder_name = output_folder
+
+     if not os.path.exists(folder_name):
+         # Create the folder if it doesn't exist
+         os.makedirs(folder_name)
+         print(f"Created the output folder:", folder_name)
+     else:
+         print(f"The output folder already exists:", folder_name)
+
  # Attempt to delete content of gradio temp folder
  def get_temp_folder_path():
      username = getpass.getuser()
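As an aside, the new ensure_output_folder_exists helper could be collapsed into a single os.makedirs call with exist_ok=True, which also avoids the race between the existence check and the create. A minimal alternative sketch, not part of this commit:

import os

def ensure_output_folder_exists(output_folder: str):
    """Create the output folder if it does not already exist."""
    # exist_ok=True makes this a no-op when the folder is already present,
    # so no separate os.path.exists check is needed.
    os.makedirs(output_folder, exist_ok=True)
    print("Output folder ready:", output_folder)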
search_funcs/semantic_functions.py CHANGED
@@ -206,6 +206,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
 
      results_df_out = results_df_out.drop(["page_section", "row", "source", "id"], axis=1, errors="ignore")
      results_df_out['distances'] = round(results_df_out['distances'].astype(float), 3)
+
 
      # Join back to original df
      # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")