Spaces:
Sleeping
Sleeping
seanpedrickcase
committed on
Commit
•
759001a
1
Parent(s):
f9e3451
Cognito authorisation option added to app, some other minor changes.
Browse files- app.py +14 -9
- how_to_create_exe_dist.txt +2 -2
- requirements.txt +1 -1
- requirements_keyword_only.txt +11 -0
- search_funcs/auth.py +48 -0
- search_funcs/helper_functions.py +15 -16
app.py
CHANGED
@@ -2,19 +2,20 @@ from typing import Type
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
-
|
6 |
PandasDataFrame = Type[pd.DataFrame]
|
7 |
|
8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
10 |
from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
|
11 |
-
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder,
|
12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
13 |
from search_funcs.aws_functions import load_data_from_aws
|
|
|
14 |
|
15 |
-
# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only setup to work for local runs in Windows
|
16 |
-
temp_folder_path = get_temp_folder_path()
|
17 |
-
empty_folder(temp_folder_path)
|
18 |
|
19 |
## Gradio app - BM25 search
|
20 |
app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"
|
@@ -24,9 +25,7 @@ with app:
|
|
24 |
|
25 |
# BM25 state objects
|
26 |
orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
27 |
-
#orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
28 |
prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
29 |
-
#tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
|
30 |
tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
|
31 |
bm25_search_index_state = gr.State()
|
32 |
|
@@ -211,9 +210,15 @@ depends on factors such as the type of documents or queries. Information taken f
|
|
211 |
|
212 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
213 |
|
214 |
-
|
|
|
|
|
215 |
if __name__ == "__main__":
|
216 |
-
|
|
|
|
|
|
|
|
|
217 |
|
218 |
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d # Need to download OpenSSL and create own keys
|
219 |
# app.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
|
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
+
import os
|
6 |
PandasDataFrame = Type[pd.DataFrame]
|
7 |
|
8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
10 |
from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
|
11 |
+
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
|
12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
13 |
from search_funcs.aws_functions import load_data_from_aws
|
14 |
+
from search_funcs.auth import authenticate_user
|
15 |
|
16 |
+
# Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only set up to work for local runs in Windows (not used at the moment).
|
17 |
+
# temp_folder_path = get_temp_folder_path()
|
18 |
+
# empty_folder(temp_folder_path)
|
19 |
|
20 |
## Gradio app - BM25 search
|
21 |
app = gr.Blocks(theme = gr.themes.Base()) # , css="theme.css"
|
|
|
25 |
|
26 |
# BM25 state objects
|
27 |
orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
|
|
28 |
prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
|
|
|
29 |
tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
|
30 |
bm25_search_index_state = gr.State()
|
31 |
|
|
|
210 |
|
211 |
app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
|
212 |
|
213 |
+
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
214 |
+
print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
|
215 |
+
|
216 |
if __name__ == "__main__":
|
217 |
+
|
218 |
+
if os.environ['COGNITO_AUTH'] == "1":
|
219 |
+
app.queue().launch(show_error=True, auth=authenticate_user)
|
220 |
+
else:
|
221 |
+
app.queue().launch(show_error=True, inbrowser=True)
|
222 |
|
223 |
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d # Need to download OpenSSL and create own keys
|
224 |
# app.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
|
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
14 |
|
15 |
9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,7 +28,7 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
|
|
14 |
|
15 |
9. Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --name DataSearchApp_0.6_kword app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.6_kword.spec
|
32 |
|
33 |
|
34 |
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
requirements.txt
CHANGED
@@ -8,4 +8,4 @@ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_
|
|
8 |
gradio
|
9 |
sentence_transformers==3.0.1
|
10 |
lxml==5.2.2
|
11 |
-
boto3==1.34.
|
|
|
8 |
gradio
|
9 |
sentence_transformers==3.0.1
|
10 |
lxml==5.2.2
|
11 |
+
boto3==1.34.142
|
requirements_keyword_only.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==2.2.2
|
2 |
+
polars==0.20.3
|
3 |
+
pyarrow==14.0.2
|
4 |
+
openpyxl==3.1.3
|
5 |
+
#torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
|
6 |
+
spacy
|
7 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
+
gradio
|
9 |
+
#sentence_transformers==3.0.1
|
10 |
+
lxml==5.2.2
|
11 |
+
#boto3==1.34.103
|
search_funcs/auth.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import boto3
|
3 |
+
from search_funcs.helper_functions import get_or_create_env_var
|
4 |
+
|
5 |
+
client_id = get_or_create_env_var('AWS_CLIENT_ID', 'l762du1rg94e1r2q0ii7ls0ef') # This client id is borrowed from async gradio app client
|
6 |
+
print(f'The value of AWS_CLIENT_ID is {client_id}')
|
7 |
+
|
8 |
+
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'eu-west-2_8fCzl8qej')
|
9 |
+
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
10 |
+
|
11 |
+
def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
    """Authenticate a user against an AWS Cognito user pool.

    Sends the credentials to Cognito using the USER_PASSWORD_AUTH flow and
    reports whether the response carries an AuthenticationResult. Failures
    are reported as False rather than raised, so the function can be used
    directly as a username/password auth callback.

    Args:
        username (str): The username of the user.
        password (str): The password of the user.
        user_pool_id (str): The ID of the Cognito user pool. Accepted for
            interface compatibility; it is not passed to initiate_auth.
        client_id (str): The ID of the Cognito user pool client.

    Returns:
        bool: True if the user is authenticated, False otherwise.
    """
    # Blank credentials can never authenticate; skip the service call.
    if not username or not password:
        return False

    client = boto3.client('cognito-idp')  # Cognito Identity Provider client

    try:
        response = client.initiate_auth(
            AuthFlow='USER_PASSWORD_AUTH',
            AuthParameters={
                'USERNAME': username,
                'PASSWORD': password,
            },
            ClientId=client_id
        )
    except (client.exceptions.NotAuthorizedException,
            client.exceptions.UserNotFoundException):
        # Rejected credentials or unknown user: an ordinary failed login.
        return False
    except Exception as e:
        # Anything else (configuration, network, service errors) is logged
        # and treated as a failed login instead of crashing the app.
        print(f"An error occurred: {e}")
        return False

    # Only a response carrying an AuthenticationResult counts as a login.
    return bool(response.get('AuthenticationResult'))
|
search_funcs/helper_functions.py
CHANGED
@@ -9,7 +9,6 @@ import gzip
|
|
9 |
import zipfile
|
10 |
import pickle
|
11 |
import numpy as np
|
12 |
-
|
13 |
from typing import List
|
14 |
|
15 |
# Openpyxl functions for output
|
@@ -18,7 +17,7 @@ from openpyxl.cell.text import InlineFont
|
|
18 |
from openpyxl.cell.rich_text import TextBlock, CellRichText
|
19 |
from openpyxl.styles import Font, Alignment
|
20 |
|
21 |
-
from search_funcs.aws_functions import bucket_name
|
22 |
|
23 |
megabyte = 1024 * 1024 # Bytes in a megabyte
|
24 |
file_size_mb = 500 # Size in megabytes
|
@@ -39,9 +38,9 @@ def get_or_create_env_var(var_name, default_value):
|
|
39 |
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
40 |
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
41 |
|
42 |
-
# Retrieving or setting RUNNING_ON_APP_RUNNER
|
43 |
-
running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
|
44 |
-
print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')
|
45 |
|
46 |
|
47 |
|
@@ -57,21 +56,21 @@ def ensure_output_folder_exists(output_folder):
|
|
57 |
else:
|
58 |
print(f"The output folder already exists:", folder_name)
|
59 |
|
60 |
-
def get_connection_params(request: gr.Request):
|
61 |
if request:
|
62 |
-
#request_data = request.json() # Parse JSON body
|
63 |
-
#print("All request data:", request_data)
|
64 |
#context_value = request_data.get('context')
|
65 |
#if 'context' in request_data:
|
66 |
# print("Request context dictionary:", request_data['context'])
|
67 |
|
68 |
-
#print("Request headers dictionary:", request.headers)
|
69 |
-
#print("All host elements", request.client)
|
70 |
-
#print("IP address:", request.client.host)
|
71 |
#print("Query parameters:", dict(request.query_params))
|
72 |
# To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
|
73 |
#print("Request dictionary to object:", request.request.body())
|
74 |
-
|
75 |
|
76 |
# Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
|
77 |
CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
|
@@ -97,7 +96,7 @@ def get_connection_params(request: gr.Request):
|
|
97 |
else:
|
98 |
out_session_hash = request.session_hash
|
99 |
base_folder = "temp-files/"
|
100 |
-
#print("Cognito ID not found. Using session hash as save folder.")
|
101 |
|
102 |
output_folder = base_folder + out_session_hash + "/"
|
103 |
#if bucket_name:
|
@@ -109,9 +108,9 @@ def get_connection_params(request: gr.Request):
|
|
109 |
return "", ""
|
110 |
|
111 |
# Attempt to delete content of gradio temp folder
|
112 |
-
def get_temp_folder_path():
|
113 |
-
|
114 |
-
|
115 |
|
116 |
def empty_folder(directory_path):
|
117 |
if not os.path.exists(directory_path):
|
|
|
9 |
import zipfile
|
10 |
import pickle
|
11 |
import numpy as np
|
|
|
12 |
from typing import List
|
13 |
|
14 |
# Openpyxl functions for output
|
|
|
17 |
from openpyxl.cell.rich_text import TextBlock, CellRichText
|
18 |
from openpyxl.styles import Font, Alignment
|
19 |
|
20 |
+
#from search_funcs.aws_functions import bucket_name
|
21 |
|
22 |
megabyte = 1024 * 1024 # Bytes in a megabyte
|
23 |
file_size_mb = 500 # Size in megabytes
|
|
|
38 |
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
39 |
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
40 |
|
41 |
+
# Retrieving or setting RUNNING_ON_APP_RUNNER (not used at the moment)
|
42 |
+
# running_on_app_runner_var = get_or_create_env_var('RUNNING_ON_APP_RUNNER', '0')
|
43 |
+
# print(f'The value of RUNNING_ON_APP_RUNNER is {running_on_app_runner_var}')
|
44 |
|
45 |
|
46 |
|
|
|
56 |
else:
|
57 |
print(f"The output folder already exists:", folder_name)
|
58 |
|
59 |
+
async def get_connection_params(request: gr.Request):
|
60 |
if request:
|
61 |
+
# request_data = await request.json() # Parse JSON body
|
62 |
+
# print("All request data:", request_data)
|
63 |
#context_value = request_data.get('context')
|
64 |
#if 'context' in request_data:
|
65 |
# print("Request context dictionary:", request_data['context'])
|
66 |
|
67 |
+
# print("Request headers dictionary:", request.headers)
|
68 |
+
# print("All host elements", request.client)
|
69 |
+
# print("IP address:", request.client.host)
|
70 |
#print("Query parameters:", dict(request.query_params))
|
71 |
# To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
|
72 |
#print("Request dictionary to object:", request.request.body())
|
73 |
+
print("Session hash:", request.session_hash)
|
74 |
|
75 |
# Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
|
76 |
CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
|
|
|
96 |
else:
|
97 |
out_session_hash = request.session_hash
|
98 |
base_folder = "temp-files/"
|
99 |
+
# print("Cognito ID not found. Using session hash as save folder.")
|
100 |
|
101 |
output_folder = base_folder + out_session_hash + "/"
|
102 |
#if bucket_name:
|
|
|
108 |
return "", ""
|
109 |
|
110 |
# Attempt to delete content of gradio temp folder
|
111 |
+
# def get_temp_folder_path():
|
112 |
+
# username = getpass.getuser()
|
113 |
+
# return os.path.join('C:\\Users', username, 'AppData\\Local\\Temp\\gradio')
|
114 |
|
115 |
def empty_folder(directory_path):
|
116 |
if not os.path.exists(directory_path):
|