seanpedrickcase committed on
Commit e0fe055 • 1 Parent(s): 7e9dd76

Gradio 4.21. Limits on file size for data loading and embedding creation. Added AWS integration.

.gitignore CHANGED
@@ -16,6 +16,7 @@
 *.pkl
 *.pkl.gz
 *.pem
+*.json.out
 docs/*
 build/*
 dist/*
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
 colorFrom: purple
 colorTo: green
 sdk: gradio
-sdk_version: 4.20.1
+sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -11,9 +11,10 @@ from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
 from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
 from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
+from search_funcs.aws_functions import load_data_from_aws
 
-from fastapi import FastAPI
-app = FastAPI()
+#from fastapi import FastAPI
+#app = FastAPI()
 
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!)
 temp_folder_path = get_temp_folder_path()
@@ -155,19 +156,34 @@ depends on factors such as the type of documents or queries. Information taken f
 in_join_message = gr.Textbox(label="Join file load progress")
 in_join_column = gr.Dropdown(label="Column to join in new data frame")
 search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+with gr.Accordion(label = "AWS data access", open = False):
+    with gr.Row():
+        in_aws_keyword_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - keyword search"])
+        load_aws_keyword_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
+    with gr.Row():
+        in_aws_semantic_file = gr.Dropdown(label="Choose semantic file to load from AWS (only valid for API Gateway app)", choices=["None", "Bioasq - Biomedical example data - semantic search"])
+        load_aws_semantic_data_button = gr.Button(value="Load semantic data from AWS", variant="secondary")
+
+    out_aws_data_message = gr.Textbox(label="AWS data load progress")
 
-in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])
-
+# Changing search parameters button
+in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+
 # ---
 in_k1_button.click(display_info, inputs=in_k1_info)
 in_b_button.click(display_info, inputs=in_b_info)
 in_alpha_button.click(display_info, inputs=in_alpha_info)
 in_no_search_results_button.click(display_info, inputs=in_no_search_info)
+
+### Loading AWS data ###
+load_aws_keyword_data_button.click(fn=load_data_from_aws, inputs=[in_aws_keyword_file], outputs=[in_bm25_file, out_aws_data_message])
+load_aws_semantic_data_button.click(fn=load_data_from_aws, inputs=[in_aws_semantic_file], outputs=[in_semantic_file, out_aws_data_message])
+
 
 ### BM25 SEARCH ###
 # Update dropdowns upon initial file load
-in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
-in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
+in_bm25_file.change(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, orig_keyword_data_state, search_index_state, embeddings_state, tokenised_state, load_finished_message, current_source])
+in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
 
 # Load in BM25 data
 load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, tokenised_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, keyword_data_list_state, in_bm25_column]).\
@@ -184,7 +200,7 @@ depends on factors such as the type of documents or queries. Information taken f
 ### SEMANTIC SEARCH ###
 
 # Load in a csv/excel file for semantic search
-in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
+in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, search_index_state, embeddings_state, tokenised_state, semantic_load_progress, current_source_semantic])
 load_semantic_data_button.click(
     csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress, output_file_state]).\
     then(docs_to_bge_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file, output_file_state])
@@ -196,6 +212,16 @@ depends on factors such as the type of documents or queries. Information taken f
 # Simple run for HF spaces or local on your computer
 #block.queue().launch(debug=True)
 
+#def get_params(request: gr.Request):
+#    if request:
+#        print("Request headers dictionary:", request.headers)
+#        print("IP address:", request.client.host)
+#        print("Query parameters:", dict(request.query_params))
+#    return request.query_params
+
+#request_params = get_params()
+#print(request_params)
+
 # Running on server (e.g. AWS) without specifying port
 block.queue().launch(ssl_verify=False) # root_path="/data-text-search" # server_name="0.0.0.0",
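Note: the switch from `.upload` to `.change` on the file components matters for the AWS integration. `.upload` fires only on a user-initiated upload, while `.change` also fires when another event writes file paths into the component — here, `load_data_from_aws` populating `in_bm25_file` and `in_semantic_file`. A minimal sketch of that difference, with hypothetical component and function names:

import os
import tempfile

import gradio as gr

# Self-contained placeholder file so the sketch runs without external data
tmp_file = os.path.join(tempfile.mkdtemp(), "example.csv")
with open(tmp_file, "w") as f:
    f.write("id,text\n1,hello\n")

def set_file_programmatically():
    # Stands in for load_data_from_aws returning local file paths
    return [tmp_file]

def on_change(files):
    return f"change event fired with: {files}"

with gr.Blocks() as demo:
    files = gr.File(file_count="multiple")
    message = gr.Textbox()
    load_button = gr.Button("Simulate AWS load")
    # Writing to `files` from another event triggers .change but not .upload
    load_button.click(set_file_programmatically, outputs=[files])
    files.change(on_change, inputs=[files], outputs=[message])

#demo.queue().launch()
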
227
 
requirements.txt CHANGED
@@ -7,6 +7,7 @@ openpyxl==3.1.2
 torch==2.1.2
 spacy==3.7.2
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio==4.20.1
+gradio==4.21.0
 sentence_transformers==2.3.1
-lxml==5.1.0
+lxml==5.1.0
+boto3==1.34.63
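The new boto3 pin supports the AWS module added below. A quick sanity check of the installed environment (this assumes AWS credentials are already configured locally, e.g. under the default profile):

import boto3

print(boto3.__version__)  # expect 1.34.63 per the pin above
# Confirms credentials resolve to an identity, mirroring get_assumed_role_info below
print(boto3.client("sts").get_caller_identity()["Arn"])
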
search_funcs/aws_functions.py ADDED
@@ -0,0 +1,164 @@
+from typing import Type
+import pandas as pd
+import boto3
+import tempfile
+import os
+
+PandasDataFrame = Type[pd.DataFrame]
+
+bucket_name = 'data-text-search-data'
+
+# Create a Session with the IAM role ARN
+aws_role = 'arn:aws:iam::460501890304:role/ecsTaskExecutionRole'
+
+try:
+    session = boto3.Session(profile_name="default")
+except Exception as e:
+    print(e)
+
+#sts = session.client("sts")
+# response = sts.assume_role(
+#     RoleArn=aws_role,
+#     RoleSessionName="ecs-test-session"
+# )
+# print(response)
+
+
+def get_assumed_role_info():
+    sts = boto3.client('sts')
+    response = sts.get_caller_identity()
+
+    # Extract ARN of the assumed role
+    assumed_role_arn = response['Arn']
+
+    # Extract the name of the assumed role from the ARN
+    assumed_role_name = assumed_role_arn.split('/')[-1]
+
+    return assumed_role_arn, assumed_role_name
+
+assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+print("Assumed Role ARN:", assumed_role_arn)
+print("Assumed Role Name:", assumed_role_name)
+
+
+# Download direct from S3 - requires login credentials
+def download_file_from_s3(bucket_name, key, local_file_path):
+
+    s3 = boto3.client('s3')
+    s3.download_file(bucket_name, key, local_file_path)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
+
+#download_file_from_s3(bucket_name, object_key, local_file_loc)
+
+def download_folder_from_s3(bucket_name, s3_folder, local_folder):
+    """
+    Download all files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    # List objects in the specified S3 folder
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+
+    # Download each object
+    for obj in response.get('Contents', []):
+        # Extract object key and construct local file path
+        object_key = obj['Key']
+        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
+    """
+    Download specific files from an S3 folder to a local folder.
+    """
+    s3 = boto3.client('s3')
+
+    if filenames == '*':
+        # List all objects in the S3 folder
+        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
+        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
+
+    for filename in filenames:
+        object_key = os.path.join(s3_folder, filename)
+        local_file_path = os.path.join(local_folder, filename)
+
+        # Create directories if necessary
+        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
+
+        # Download the object
+        try:
+            s3.download_file(bucket_name, object_key, local_file_path)
+            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
+        except Exception as e:
+            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
+
+
+
+def load_data_from_aws(in_aws_keyword_file, bucket_name=bucket_name):
+
+    temp_dir = tempfile.mkdtemp()
+    local_keyword_stub = temp_dir + '/keyword/'
+    local_semantic_stub = temp_dir + '/semantic/'
+
+    files = []
+
+    if "Bioasq - Biomedical example data" in in_aws_keyword_file:
+
+        s3_folder_stub = 'example_data/bioasq/latest/'
+
+        if 'keyword' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'keyword/'
+            local_folder_path = local_keyword_stub
+
+        if 'semantic' in in_aws_keyword_file:
+            s3_folder_stub = s3_folder_stub + 'semantic/'
+            local_folder_path = local_semantic_stub
+
+
+        # Check if folder exists
+        if not os.path.exists(local_folder_path):
+            print(f"Folder {local_folder_path} does not exist! Making folder.")
+
+            os.mkdir(local_folder_path)
+
+        # Check if folder is empty
+        if len(os.listdir(local_folder_path)) == 0:
+            print(f"Folder {local_folder_path} is empty")
+
+            if 'keyword' in in_aws_keyword_file:
+                # Download keyword folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
+
+            if 'semantic' in in_aws_keyword_file:
+                # Download semantic folder
+                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames=['mini-bioasq-0000_cleaned_bge_embedding_compress.npz', 'mini-bioasq-0000_cleaned_prepared_docs.pkl.gz'])
+
+            print("AWS data downloaded")
+
+        else:
+            print(f"Folder {local_folder_path} is not empty")
+
+        #files = os.listdir(local_folder_stub)
+        #print(files)
+
+        files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
+
+        out_message = "Data successfully loaded from AWS"
+        print(out_message)
+
+    else:
+        out_message = "Data not loaded from AWS"
+        print(out_message)
+
+    return files, out_message
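For reference, a minimal sketch of how the new module is driven from the UI handlers in app.py; it assumes valid credentials under the "default" profile and read access to the data-text-search-data bucket:

from search_funcs.aws_functions import load_data_from_aws

# The dropdown value doubles as the routing key ('keyword' vs 'semantic')
files, message = load_data_from_aws("Bioasq - Biomedical example data - keyword search")
print(message)  # "Data successfully loaded from AWS" on success
print(files)    # local temp-folder paths, passed straight into in_bm25_file
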
search_funcs/helper_functions.py CHANGED
@@ -15,6 +15,10 @@ from openpyxl.cell.text import InlineFont
 from openpyxl.cell.rich_text import TextBlock, CellRichText
 from openpyxl.styles import Font, Alignment
 
+megabyte = 1024 * 1024 # Bytes in a megabyte
+file_size_mb = 500 # Size in megabytes
+file_size_bytes_500mb = megabyte * file_size_mb
+
 # Attempt to delete content of gradio temp folder
 def get_temp_folder_path():
     username = getpass.getuser()
@@ -115,7 +119,7 @@ def initial_data_load(in_file):
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, out_message
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
 
     # This is if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
@@ -129,6 +133,15 @@ def initial_data_load(in_file):
 
         current_source = current_source + get_file_path_end_with_ext(file) + " "
 
+        # Get the size of the file
+        print("Checking file size")
+        file_size = os.path.getsize(file)
+        if file_size > file_size_bytes_500mb:
+            out_message = "Data file greater than 500mb in size. Please use smaller sizes."
+            print(out_message)
+            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
+
+
         df_new = read_file(file)
 
         df = pd.concat([df, df_new], ignore_index = True)
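The new 500 MB guard can be exercised in isolation. A small sketch of the same arithmetic (the function name here is illustrative, not part of the module):

import os

megabyte = 1024 * 1024
file_size_bytes_500mb = 500 * megabyte  # same constant as in helper_functions.py

def within_size_limit(path: str) -> bool:
    # Mirrors the check in initial_data_load: reject files over 500 MB
    return os.path.getsize(path) <= file_size_bytes_500mb

# Example: a tiny CSV passes the check
with open("tiny.csv", "w") as f:
    f.write("a,b\n1,2\n")
print(within_size_limit("tiny.csv"))  # True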