seanpedrickcase committed on
Commit
759001a
1 Parent(s): f9e3451

Cognito authorisation option added to app, some other minor changes.

Browse files
app.py CHANGED
@@ -2,19 +2,20 @@ from typing import Type
2
  import gradio as gr
3
  import pandas as pd
4
  import numpy as np
5
-
6
  PandasDataFrame = Type[pd.DataFrame]
7
 
8
  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
9
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
10
  from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
11
- from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
12
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
13
  from search_funcs.aws_functions import load_data_from_aws
 
14
 
15
- # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows
16
- temp_folder_path = get_temp_folder_path()
17
- empty_folder(temp_folder_path)
18
 
19
  ## Gradio app - BM25 search
20
  app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"
@@ -24,9 +25,7 @@ with app:
24
 
25
  # BM25 state objects
26
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
27
- #orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
28
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
29
- #tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
30
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
31
  bm25_search_index_state = gr.State()
32
 
@@ -211,9 +210,15 @@ depends on factors such as the type of documents or queries. Information taken f
211
 
212
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
213
 
214
- # Launch the Gradio app
 
 
215
  if __name__ == "__main__":
216
- app.queue().launch(show_error=True) # root_path="/data-text-search" # server_name="0.0.0.0",
 
 
 
 
217
 
218
  # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d # Need to download OpenSSL and create own keys
219
  # app.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
 
2
  import gradio as gr
3
  import pandas as pd
4
  import numpy as np
5
+ import os
6
  PandasDataFrame = Type[pd.DataFrame]
7
 
8
  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
9
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
10
  from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
11
+ from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
12
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
13
  from search_funcs.aws_functions import load_data_from_aws
14
+ from search_funcs.auth import authenticate_user
15
 
16
+ # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only set up to work for local runs in Windows (not used at the moment).
17
+ # temp_folder_path = get_temp_folder_path()
18
+ # empty_folder(temp_folder_path)
19
 
20
  ## Gradio app - BM25 search
21
  app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"
 
25
 
26
  # BM25 state objects
27
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
 
28
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
 
29
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
30
  bm25_search_index_state = gr.State()
31
 
 
210
 
211
  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
212
 
213
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
214
+ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
215
+
216
  if __name__ == "__main__":
217
+
218
+ if os.environ['COGNITO_AUTH'] == "1":
219
+ app.queue().launch(show_error=True, auth=authenticate_user)
220
+ else:
221
+ app.queue().launch(show_error=True, inbrowser=True)
222
 
223
  # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d # Need to download OpenSSL and create own keys
224
  # app.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
how_to_create_exe_dist.txt CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
14
 
15
  9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.5 app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
@@ -28,7 +28,7 @@ a = Analysis(
28
  }
29
  )
30
 
31
- c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.5.spec
32
 
33
 
34
  10. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\data_text_search').
 
14
 
15
  9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.6_kword app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
 
28
  }
29
  )
30
 
31
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.6_kword.spec
32
 
33
 
34
  10. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\data_text_search').
requirements.txt CHANGED
@@ -8,4 +8,4 @@ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_
8
  gradio
9
  sentence_transformers==3.0.1
10
  lxml==5.2.2
11
- boto3==1.34.103
 
8
  gradio
9
  sentence_transformers==3.0.1
10
  lxml==5.2.2
11
+ boto3==1.34.142
requirements_keyword_only.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.2
2
+ polars==0.20.3
3
+ pyarrow==14.0.2
4
+ openpyxl==3.1.3
5
+ #torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
6
+ spacy
7
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
8
+ gradio
9
+ #sentence_transformers==3.0.1
10
+ lxml==5.2.2
11
+ #boto3==1.34.103
search_funcs/auth.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import boto3
3
+ from search_funcs.helper_functions import get_or_create_env_var
4
+
5
+ client_id = get_or_create_env_var('AWS_CLIENT_ID', 'l762du1rg94e1r2q0ii7ls0ef') # This client id is borrowed from async gradio app client
6
+ print(f'The value of AWS_CLIENT_ID is {client_id}')
7
+
8
+ user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'eu-west-2_8fCzl8qej')
9
+ print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
10
+
11
def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
    """Check a username/password pair against an AWS Cognito user pool.

    Issues a ``USER_PASSWORD_AUTH`` ``initiate_auth`` request through the
    boto3 ``cognito-idp`` client and reports whether the response carries
    an ``AuthenticationResult``.

    Args:
        username (str): The username of the user.
        password (str): The password of the user.
        user_pool_id (str): The ID of the Cognito user pool. Part of the
            signature, but not referenced anywhere in this function's body.
        client_id (str): The ID of the Cognito user pool client.

    Returns:
        bool: True if the response holds a truthy ``AuthenticationResult``;
        False otherwise, including whenever the request raises.
    """
    cognito = boto3.client('cognito-idp')  # Cognito Identity Provider client

    auth_request = {
        'AuthFlow': 'USER_PASSWORD_AUTH',
        'AuthParameters': {
            'USERNAME': username,
            'PASSWORD': password,
        },
        'ClientId': client_id,
    }

    try:
        auth_response = cognito.initiate_auth(**auth_request)
        # A response without an AuthenticationResult is treated as a failed login.
        return bool(auth_response.get('AuthenticationResult'))
    except (cognito.exceptions.NotAuthorizedException,
            cognito.exceptions.UserNotFoundException):
        # These two rejections are expected outcomes, so nothing is printed.
        return False
    except Exception as e:
        # Any other failure is reported on stdout and also treated as a failed login.
        print(f"An error occurred: {e}")
        return False
search_funcs/helper_functions.py CHANGED
@@ -9,7 +9,6 @@ import gzip
9
  import zipfile
10
  import pickle
11
  import numpy as np
12
-
13
  from typing import List
14
 
15
  # Openpyxl functions for output
@@ -18,7 +17,7 @@ from openpyxl.cell.text import InlineFont
18
  from openpyxl.cell.rich_text import TextBlock, CellRichText
19
  from openpyxl.styles import Font, Alignment
20
 
21
- from search_funcs.aws_functions import bucket_name
22
 
23
  megabyte = 1024 * 1024 # Bytes in a megabyte
24
  file_size_mb = 500 # Size in megabytes
@@ -39,9 +38,9 @@ def get_or_create_env_var(var_name, default_value):
39
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
40
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
41
 
42
- # Retrieving or setting RUNNING_ON_APP_RUNNER
43
- running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
44
- print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')
45
 
46
 
47
 
@@ -57,21 +56,21 @@ def ensure_output_folder_exists(output_folder):
57
  else:
58
  print(f"The output folder already exists:", folder_name)
59
 
60
- def get_connection_params(request: gr.Request):
61
  if request:
62
- #request_data = request.json() # Parse JSON body
63
- #print("All request data:", request_data)
64
  #context_value = request_data.get('context')
65
  #if 'context' in request_data:
66
  # print("Request context dictionary:", request_data['context'])
67
 
68
- #print("Request headers dictionary:", request.headers)
69
- #print("All host elements", request.client)
70
- #print("IP address:", request.client.host)
71
  #print("Query parameters:", dict(request.query_params))
72
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
73
  #print("Request dictionary to object:", request.request.body())
74
- #print("Session hash:", request.session_hash)
75
 
76
  # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
77
  CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
@@ -97,7 +96,7 @@ def get_connection_params(request: gr.Request):
97
  else:
98
  out_session_hash = request.session_hash
99
  base_folder = "temp-files/"
100
- #print("Cognito ID not found. Using session hash as save folder.")
101
 
102
  output_folder = base_folder + out_session_hash + "/"
103
  #if bucket_name:
@@ -109,9 +108,9 @@ def get_connection_params(request: gr.Request):
109
  return "", ""
110
 
111
  # Attempt to delete content of gradio temp folder
112
- def get_temp_folder_path():
113
- username = getpass.getuser()
114
- return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
115
 
116
  def empty_folder(directory_path):
117
  if not os.path.exists(directory_path):
 
9
  import zipfile
10
  import pickle
11
  import numpy as np
 
12
  from typing import List
13
 
14
  # Openpyxl functions for output
 
17
  from openpyxl.cell.rich_text import TextBlock, CellRichText
18
  from openpyxl.styles import Font, Alignment
19
 
20
+ #from search_funcs.aws_functions import bucket_name
21
 
22
  megabyte = 1024 * 1024 # Bytes in a megabyte
23
  file_size_mb = 500 # Size in megabytes
 
38
  output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
39
  print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
40
 
41
+ # Retrieving or setting RUNNING_ON_APP_RUNNER (not used at the moment)
42
+ # running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
43
+ # print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')
44
 
45
 
46
 
 
56
  else:
57
  print(f"The output folder already exists:", folder_name)
58
 
59
+ async def get_connection_params(request: gr.Request):
60
  if request:
61
+ # request_data = await request.json() # Parse JSON body
62
+ # print("All request data:", request_data)
63
  #context_value = request_data.get('context')
64
  #if 'context' in request_data:
65
  # print("Request context dictionary:", request_data['context'])
66
 
67
+ # print("Request headers dictionary:", request.headers)
68
+ # print("All host elements", request.client)
69
+ # print("IP address:", request.client.host)
70
  #print("Query parameters:", dict(request.query_params))
71
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
72
  #print("Request dictionary to object:", request.request.body())
73
+ print("Session hash:", request.session_hash)
74
 
75
  # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
76
  CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
 
96
  else:
97
  out_session_hash = request.session_hash
98
  base_folder = "temp-files/"
99
+ # print("Cognito ID not found. Using session hash as save folder.")
100
 
101
  output_folder = base_folder + out_session_hash + "/"
102
  #if bucket_name:
 
108
  return "", ""
109
 
110
  # Attempt to delete content of gradio temp folder
111
+ # def get_temp_folder_path():
112
+ # username = getpass.getuser()
113
+ # return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
114
 
115
  def empty_folder(directory_path):
116
  if not os.path.exists(directory_path):