seanpedrickcase committed on
Commit
1e2bb3e
1 Parent(s): 55f0ce3

Aggregate only topics that are not 'other'; allow a minimum sentence length when splitting; the default max_topics now merges topics automatically. Added Cognito auth functionality (boto3 with AWS).

app.py CHANGED
@@ -1,14 +1,12 @@
1
- # Dendrograms will not work with the latest version of scipy (1.12.0), so run the following here/in your environment if you come across issues
2
- # import os
3
- # os.system("pip install scipy==1.11.4")
4
-
5
  import gradio as gr
6
  import pandas as pd
7
  import numpy as np
8
 
9
  from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
10
- from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params
11
  from sklearn.feature_extraction.text import CountVectorizer
 
12
 
13
  min_word_occurence_slider_default = 0.01
14
  max_word_occurence_slider_default = 0.95
@@ -34,6 +32,7 @@ with block:
34
  vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
35
 
36
  session_hash_state = gr.State("")
 
37
 
38
  gr.Markdown(
39
  """
@@ -55,10 +54,14 @@ with block:
55
 
56
  with gr.Accordion("Clean data", open = False):
57
  with gr.Row():
58
- clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, > 1 digit nums, emails, postcodes (UK).")
59
  drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
60
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
 
61
  split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
 
 
 
62
  with gr.Row():
63
  custom_regex = gr.UploadButton(label="Import custom regex removal file", file_count="multiple")
64
  gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Strings matching this pattern will be removed. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
@@ -76,8 +79,8 @@ with block:
76
  with gr.Accordion("Topic modelling settings - change documents per topic, max topics, frequency of terms", open = False):
77
 
78
  with gr.Row():
79
- min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 3, step = 1, label = "Minimum number of similar documents needed to make a topic.")
80
- max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 100, step = 1, label = "Maximum number of topics")
81
  with gr.Row():
82
  min_word_occurence_slider = gr.Slider(minimum = 0.001, maximum = 0.9, value = min_word_occurence_slider_default, step = 0.001, label = "Keep terms that appear in this minimum proportion of documents. Avoids creating topics with very uncommon words.")
83
  max_word_occurence_slider = gr.Slider(minimum = 0.1, maximum = 1.0, value =max_word_occurence_slider_default, step = 0.01, label = "Keep terms that appear in less than this maximum proportion of documents. Avoids very common words in topic names.")
@@ -131,7 +134,7 @@ with block:
131
 
132
  # Clean data
133
  custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
134
- clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
135
 
136
  # Optimise for keeping only zero-shot topics
137
  zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
@@ -152,8 +155,14 @@ with block:
152
  plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
153
 
154
  # Get session hash from connection parameters
155
- block.load(get_connection_params, inputs=None, outputs=[session_hash_state])
 
 
 
 
156
 
157
- # Launch the Gradio app
158
  if __name__ == "__main__":
159
- block.queue().launch(show_error=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
 
 
 
 
1
+ import os
 
 
 
2
  import gradio as gr
3
  import pandas as pd
4
  import numpy as np
5
 
6
  from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
7
+ from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
8
  from sklearn.feature_extraction.text import CountVectorizer
9
+ from funcs.auth import authenticate_user, download_file_from_s3
10
 
11
  min_word_occurence_slider_default = 0.01
12
  max_word_occurence_slider_default = 0.95
 
32
  vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
33
 
34
  session_hash_state = gr.State("")
35
+ s3_output_folder_state = gr.State("")
36
 
37
  gr.Markdown(
38
  """
 
54
 
55
  with gr.Accordion("Clean data", open = False):
56
  with gr.Row():
57
+ clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
58
  drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
59
  anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
60
+ #with gr.Row():
61
  split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
62
+ #additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
63
+ min_sentence_length_num = gr.Number(value=5, label="Min char length of split sentences")
64
+
65
  with gr.Row():
66
  custom_regex = gr.UploadButton(label="Import custom regex removal file", file_count="multiple")
67
  gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Strings matching this pattern will be removed. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
 
79
  with gr.Accordion("Topic modelling settings - change documents per topic, max topics, frequency of terms", open = False):
80
 
81
  with gr.Row():
82
+ min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 5, step = 1, label = "Minimum number of similar documents needed to make a topic.")
83
+ max_topics_slider = gr.Slider(minimum = 0, maximum = 500, value = 0, step = 1, label = "Maximum number of topics. If set to 0, then will choose topics to merge automatically.")
84
  with gr.Row():
85
  min_word_occurence_slider = gr.Slider(minimum = 0.001, maximum = 0.9, value = min_word_occurence_slider_default, step = 0.001, label = "Keep terms that appear in this minimum proportion of documents. Avoids creating topics with very uncommon words.")
86
  max_word_occurence_slider = gr.Slider(minimum = 0.1, maximum = 1.0, value =max_word_occurence_slider_default, step = 0.01, label = "Keep terms that appear in less than this maximum proportion of documents. Avoids very common words in topic names.")
 
134
 
135
  # Clean data
136
  custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
137
+ clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop, min_sentence_length_num, embeddings_state], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
138
 
139
  # Optimise for keeping only zero-shot topics
140
  zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
 
155
  plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
156
 
157
  # Get session hash from connection parameters
158
+ block.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
159
+
160
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
161
+ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
162
+
163
 
 
164
  if __name__ == "__main__":
165
+ if os.environ['COGNITO_AUTH'] == "1":
166
+ block.queue().launch(show_error=True, auth=authenticate_user)
167
+ else:
168
+ block.queue().launch(show_error=True, inbrowser=True)
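
A minimal sketch of how the new Cognito-gated launch might be switched on; the environment variable names come from the diff above, while the placeholder values are assumptions, not real ids:

import os

os.environ["COGNITO_AUTH"] = "1"                       # "1" routes launch() through authenticate_user
os.environ["AWS_CLIENT_ID"] = "example-app-client-id"  # placeholder Cognito app client id
os.environ["AWS_USER_POOL_ID"] = "eu-west-2_Example1"  # placeholder Cognito user pool id

# With COGNITO_AUTH set to "1", app.py calls block.queue().launch(show_error=True,
# auth=authenticate_user), so Gradio shows a login form and passes the supplied
# username/password to the Cognito check in funcs/auth.py.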
funcs/anonymiser.py CHANGED
@@ -1,5 +1,6 @@
1
  from spacy.cli import download
2
  import spacy
 
3
  from funcs.presidio_analyzer_custom import analyze_dict
4
  spacy.prefer_gpu()
5
 
@@ -24,11 +25,6 @@ def spacy_model_installed(model_name):
24
  model_name = "en_core_web_sm"
25
  nlp = spacy_model_installed(model_name)
26
 
27
- #spacy.load(model_name)
28
- # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
29
- #os.system("pip uninstall -y gradio")
30
- #os.system("pip install gradio==3.50.0")
31
- #os.system("python -m spacy download en_core_web_lg")
32
 
33
  import re
34
  import secrets
@@ -43,16 +39,63 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecogn
43
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
44
  from presidio_anonymizer.entities import OperatorConfig
45
 
 
 
46
  # Function to Split Text and Create DataFrame using SpaCy
47
- def expand_sentences_spacy(df, colname, nlp=nlp):
48
  expanded_data = []
 
 
 
 
49
  df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
50
  for index, row in df.iterrows():
51
  doc = nlp(row[colname])
52
  for sent in doc.sents:
53
  expanded_data.append({'document_index': row['index'], colname: sent.text})
54
  return pd.DataFrame(expanded_data)
55
 
56
  def anon_consistent_names(df):
57
  # ## Pick out common names and replace them with the same person value
58
  df_dict = df.to_dict(orient="list")
 
1
  from spacy.cli import download
2
  import spacy
3
+ from spacy.pipeline import Sentencizer
4
  from funcs.presidio_analyzer_custom import analyze_dict
5
  spacy.prefer_gpu()
6
 
 
25
  model_name = "en_core_web_sm"
26
  nlp = spacy_model_installed(model_name)
27
 
 
29
  import re
30
  import secrets
 
39
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
40
  from presidio_anonymizer.entities import OperatorConfig
41
 
42
+ from typing import List
43
+
44
  # Function to Split Text and Create DataFrame using SpaCy
45
+ def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
46
  expanded_data = []
47
+
48
+ # if not custom_delimiters:
49
+ # custom_delimiters = []
50
+
51
  df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
52
+
53
+ # sentencizer = Sentencizer()
54
+
55
+ # new_punct_chars = sentencizer.default_punct_chars
56
+ # new_punct_chars.extend(custom_delimiters)
57
+
58
+ # config = {"punct_chars": new_punct_chars}
59
+ # nlp.add_pipe("sentencizer", config=config)
60
+
61
  for index, row in df.iterrows():
62
  doc = nlp(row[colname])
63
  for sent in doc.sents:
64
  expanded_data.append({'document_index': row['index'], colname: sent.text})
65
  return pd.DataFrame(expanded_data)
66
 
67
+ # def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
68
+
69
+ # #print("Custom delimiters:", custom_delimiters)
70
+
71
+ # expanded_data = []
72
+ # df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
73
+
74
+ # sentencizer = Sentencizer()
75
+
76
+ # new_punct_chars = sentencizer.default_punct_chars
77
+ # if custom_delimiters:
78
+ # new_punct_chars.extend(custom_delimiters)
79
+
80
+ # pattern = "(" + "|".join(re.escape(punct) for punct in new_punct_chars) + ")"
81
+ # #print("Patterns:", pattern)
82
+ # split_list = []
83
+
84
+ # for idx, string in enumerate(df[colname]):
85
+ # new_split = re.split(pattern, string)
86
+ # for n, sentence in enumerate(new_split):
87
+ # if sentence:
88
+ # # If there is a split delimiter in the 'sentence' after, add it to the previous sentence as it will be removed at a later step
89
+ # if n + 1 < len(new_split):
90
+ # if new_split[n + 1]:
91
+ # # If the next split is in the list of split characters, then add it to this current sentence
92
+ # if new_split[n + 1] in new_punct_chars:
93
+ # split_list.append({'document_index': idx, colname: sentence + new_split[n + 1]})
94
+ # else:
95
+ # split_list.append({'document_index': idx, colname: sentence})
96
+
97
+ # return pd.DataFrame(split_list)
98
+
99
  def anon_consistent_names(df):
100
  # ## Pick out common names and replace them with the same person value
101
  df_dict = df.to_dict(orient="list")
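
An illustrative call to the sentence splitter after this change (the sample data is invented): one output row per spaCy sentence, keyed back to its source row.

import pandas as pd
from funcs.anonymiser import expand_sentences_spacy

df = pd.DataFrame({"response": ["I like the park. The bins overflow at weekends, though."]})
sentences = expand_sentences_spacy(df, "response")
print(sentences)
# Expected shape: two rows, both with document_index 0, holding
# "I like the park." and "The bins overflow at weekends, though."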
funcs/auth.py ADDED
@@ -0,0 +1,54 @@
1
+ import boto3
2
+ from funcs.helper_functions import get_or_create_env_var
3
+
4
+ client_id = get_or_create_env_var('AWS_CLIENT_ID', 'aws_client_placeholder') # This client id is borrowed from async gradio app client
5
+ print(f'The value of AWS_CLIENT_ID is {client_id}')
6
+
7
+ user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'aws_user_pool_placeholder')
8
+ print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
9
+
10
+ def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
11
+ """Authenticates a user against an AWS Cognito user pool.
12
+
13
+ Args:
14
+ user_pool_id (str): The ID of the Cognito user pool.
15
+ client_id (str): The ID of the Cognito user pool client.
16
+ username (str): The username of the user.
17
+ password (str): The password of the user.
18
+
19
+ Returns:
20
+ bool: True if the user is authenticated, False otherwise.
21
+ """
22
+
23
+ client = boto3.client('cognito-idp') # Cognito Identity Provider client
24
+
25
+ try:
26
+ response = client.initiate_auth(
27
+ AuthFlow='USER_PASSWORD_AUTH',
28
+ AuthParameters={
29
+ 'USERNAME': username,
30
+ 'PASSWORD': password,
31
+ },
32
+ ClientId=client_id
33
+ )
34
+
35
+ # If successful, you'll receive an AuthenticationResult in the response
36
+ if response.get('AuthenticationResult'):
37
+ return True
38
+ else:
39
+ return False
40
+
41
+ except client.exceptions.NotAuthorizedException:
42
+ return False
43
+ except client.exceptions.UserNotFoundException:
44
+ return False
45
+ except Exception as e:
46
+ print(f"An error occurred: {e}")
47
+ return False
48
+
49
+
50
+ def download_file_from_s3(bucket_name, key, local_file_path):
51
+
52
+ s3 = boto3.client('s3')
53
+ s3.download_file(bucket_name, key, local_file_path)
54
+ print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
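
A minimal usage sketch for the new module, assuming AWS credentials and region are already configured for boto3; the bucket, key and login values below are placeholders:

from funcs.auth import authenticate_user, download_file_from_s3

# authenticate_user is the callable handed to Gradio's launch(auth=...); it returns
# True only when Cognito accepts the username/password pair.
if authenticate_user("jane.doe@example.com", "example-password"):
    download_file_from_s3("example-bucket", "configs/topics.csv", "output/topics.csv")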
funcs/clean_funcs.py CHANGED
@@ -8,26 +8,40 @@ custom_words = []
8
  my_stop_words = custom_words
9
 
10
  # #### Some of my cleaning functions
 
11
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
12
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 
13
  email_pattern_regex = r'\S*@\S*\s?'
14
  num_pattern_regex = r'[0-9]+'
15
- nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
16
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
17
  multiple_spaces_regex = r'\s{2,}'
18
 
19
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
 
20
  texts = pl.Series(texts).str.strip_chars()
21
- text = texts.str.replace_all(html_pattern_regex, ' ')
22
- text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
23
- text = text.str.replace_all(email_pattern_regex, ' ')
24
- text = text.str.replace_all(nums_two_more_regex, ' ')
25
- text = text.str.replace_all(postcode_pattern_regex, ' ')
26
- text = text.str.replace_all(multiple_spaces_regex, ' ')
27
-
28
- text = text.to_list()
29
 
30
- return text
31
 
32
  def regex_clean(texts, custom_regex, progress=gr.Progress()):
33
  texts = pl.Series(texts).str.strip_chars()
 
8
  my_stop_words = custom_words
9
 
10
  # #### Some of my cleaning functions
11
+ url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
12
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
13
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
14
+ non_ascii_pattern = r'[^\x00-\x7F]+'
15
  email_pattern_regex = r'\S*@\S*\s?'
16
  num_pattern_regex = r'[0-9]+'
17
+ nums_two_more_regex = r'\b\d+[.,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Matches numbers of two or more digits, including numbers with a decimal point or comma in between
18
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
19
  multiple_spaces_regex = r'\s{2,}'
20
 
21
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
22
+ # Convert to polars Series
23
  texts = pl.Series(texts).str.strip_chars()
 
24
 
25
+ # Define a list of patterns and their replacements
26
+ patterns = [
27
+ (url_pattern, ' '),
28
+ (html_pattern_regex, ' '),
29
+ (html_start_pattern_end_dots_regex, ' '),
30
+ (non_ascii_pattern, ' '),
31
+ (email_pattern_regex, ' '),
32
+ (nums_two_more_regex, ' '),
33
+ (postcode_pattern_regex, ' '),
34
+ (multiple_spaces_regex, ' ')
35
+ ]
36
+
37
+ # Apply each regex replacement
38
+ for pattern, replacement in patterns:
39
+ texts = texts.str.replace_all(pattern, replacement)
40
+
41
+ # Convert the series back to a list
42
+ texts = texts.to_list()
43
+
44
+ return texts
45
 
46
  def regex_clean(texts, custom_regex, progress=gr.Progress()):
47
  texts = pl.Series(texts).str.strip_chars()
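
A rough sketch of what the refactored initial_clean now does with the pattern list. The sample strings are invented, and since the custom_regex argument is unused inside this function, an empty DataFrame stands in for it:

import pandas as pd
from funcs.clean_funcs import initial_clean

samples = [
    "Contact me at jane.doe@example.com or see https://example.com/report  ",
    "<p>Invoice 2024 total 1,250.99</p> sent to SW1A 1AA",
]
cleaned = initial_clean(samples, pd.DataFrame())
print(cleaned)
# URLs, HTML tags, non-ASCII characters, emails, multi-digit numbers and UK postcodes
# are each replaced with a space, then runs of spaces are collapsed by the final pattern.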
funcs/helper_functions.py CHANGED
@@ -1,4 +1,3 @@
1
- import sys
2
  import os
3
  import zipfile
4
  import re
@@ -45,35 +44,66 @@ def ensure_output_folder_exists():
45
  else:
46
  print(f"The 'output/' folder already exists.")
47
 
48
- def get_connection_params(request: gr.Request):
49
- '''
50
- Get connection parameter values from request object.
51
- '''
52
  if request:
53
 
54
  # print("Request headers dictionary:", request.headers)
55
  # print("All host elements", request.client)
56
  # print("IP address:", request.client.host)
57
  # print("Query parameters:", dict(request.query_params))
 
 
58
  print("Session hash:", request.session_hash)
59
 
60
- if 'x-cognito-id' in request.headers:
61
  out_session_hash = request.headers['x-cognito-id']
62
  base_folder = "user-files/"
63
- #print("Cognito ID found:", out_session_hash)
64
 
65
  else:
66
  out_session_hash = request.session_hash
67
  base_folder = "temp-files/"
68
- #print("Cognito ID not found. Using session hash as save folder.")
69
 
70
  output_folder = base_folder + out_session_hash + "/"
71
- #print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
72
 
73
- return out_session_hash
74
  else:
75
  print("No session parameters found.")
76
- return ""
77
 
78
  def detect_file_type(filename):
79
  """Detect the file type based on its extension."""
@@ -286,6 +316,24 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
286
  columns_found = [column for column in columns_to_check if column in topic_model.get_document_info(docs).columns]
287
  doc_dets = topic_model.get_document_info(docs)[columns_found]
288
 
289
  # If you have created a 'sentence split' dataset from the cleaning options, map these sentences back to the original document.
290
  try:
291
  if split_sentence_drop == "Yes":
@@ -296,21 +344,42 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
296
  grouped = doc_dets.groupby('parent_document_index')
297
 
298
  # 2. Aggregate Topics and Probabilities:
299
  def aggregate_topics(group):
300
  original_text = ' '.join(group['Document'])
301
- topics = group['Topic'].tolist()
 
 
302
 
303
  if 'Name' in group.columns:
304
- topic_names = group['Name'].tolist()
 
305
  else:
306
  topic_names = None
307
 
308
  if 'Probability' in group.columns:
309
- probabilities = group['Probability'].tolist()
 
310
  else:
311
- probabilities = None # Or any other default value you prefer
 
 
312
 
313
- return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
314
 
315
  #result_df = grouped.apply(aggregate_topics).reset_index()
316
  doc_det_agg = grouped.apply(lambda x: aggregate_topics(x)).reset_index()
 
 
1
  import os
2
  import zipfile
3
  import re
 
44
  else:
45
  print(f"The 'output/' folder already exists.")
46
 
47
+ async def get_connection_params(request: gr.Request):
48
+ base_folder = ""
49
+
 
50
  if request:
51
+ #print("request user:", request.username)
52
+
53
+ #request_data = await request.json() # Parse JSON body
54
+ #print("All request data:", request_data)
55
+ #context_value = request_data.get('context')
56
+ #if 'context' in request_data:
57
+ # print("Request context dictionary:", request_data['context'])
58
 
59
  # print("Request headers dictionary:", request.headers)
60
  # print("All host elements", request.client)
61
  # print("IP address:", request.client.host)
62
  # print("Query parameters:", dict(request.query_params))
63
+ # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
64
+ #print("Request dictionary to object:", request.request.body())
65
  print("Session hash:", request.session_hash)
66
 
67
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
68
+ CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
69
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
70
+
71
+ # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
72
+ CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
73
+ #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
74
+
75
+ if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
76
+ if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
77
+ supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
78
+ if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
79
+ print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
80
+ else:
81
+ raise ValueError("Custom Cloudfront header value does not match expected value.")
82
+
83
+ # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
84
+
85
+ if request.username:
86
+ out_session_hash = request.username
87
+ base_folder = "user-files/"
88
+
89
+ elif 'x-cognito-id' in request.headers:
90
  out_session_hash = request.headers['x-cognito-id']
91
  base_folder = "user-files/"
92
+ print("Cognito ID found:", out_session_hash)
93
 
94
  else:
95
  out_session_hash = request.session_hash
96
  base_folder = "temp-files/"
97
+ # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
98
 
99
  output_folder = base_folder + out_session_hash + "/"
100
+ #if bucket_name:
101
+ # print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
102
 
103
+ return out_session_hash, output_folder
104
  else:
105
  print("No session parameters found.")
106
+ return "",""
107
 
108
  def detect_file_type(filename):
109
  """Detect the file type based on its extension."""
 
316
  columns_found = [column for column in columns_to_check if column in topic_model.get_document_info(docs).columns]
317
  doc_dets = topic_model.get_document_info(docs)[columns_found]
318
 
319
+ ### If there are full topic probabilities, join these on to the document details df
320
+ def is_valid_dataframe(df):
321
+ """
322
+ Checks if the given object is a non-empty pandas DataFrame.
323
+
324
+ Args:
325
+ df: The object to check.
326
+
327
+ Returns:
328
+ True if df is a non-empty DataFrame, False otherwise.
329
+ """
330
+ if df is None: # Check for None first
331
+ return False
332
+ return isinstance(df, pd.DataFrame) and not df.empty
333
+
334
+ if is_valid_dataframe(topic_model.probabilities_):
335
+ doc_dets = doc_dets.merge(topic_model.probabilities_, left_index=True, right_index=True, how="left")
336
+
337
  # If you have created a 'sentence split' dataset from the cleaning options, map these sentences back to the original document.
338
  try:
339
  if split_sentence_drop == "Yes":
 
344
  grouped = doc_dets.groupby('parent_document_index')
345
 
346
  # 2. Aggregate Topics and Probabilities:
347
+ # def aggregate_topics(group):
348
+ # original_text = ' '.join(group['Document'])
349
+ # topics = group['Topic'].tolist()
350
+
351
+ # if 'Name' in group.columns:
352
+ # topic_names = group['Name'].tolist()
353
+ # else:
354
+ # topic_names = None
355
+
356
+ # if 'Probability' in group.columns:
357
+ # probabilities = group['Probability'].tolist()
358
+ # else:
359
+ # probabilities = None # Or any other default value you prefer
360
+
361
+ # return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
362
+
363
  def aggregate_topics(group):
364
  original_text = ' '.join(group['Document'])
365
+
366
+ # Filter out topics starting with '-1'
367
+ topics = [topic for topic in group['Topic'].tolist() if not str(topic).startswith('-1')]
368
 
369
  if 'Name' in group.columns:
370
+ # Filter out topic names corresponding to excluded topics
371
+ topic_names = [name for topic, name in zip(group['Topic'], group['Name'].tolist()) if not str(topic).startswith('-1')]
372
  else:
373
  topic_names = None
374
 
375
  if 'Probability' in group.columns:
376
+ # Filter out probabilities corresponding to excluded topics
377
+ probabilities = [prob for topic, prob in zip(group['Topic'], group['Probability'].tolist()) if not str(topic).startswith('-1')]
378
  else:
379
+ probabilities = None
380
+
381
+ return pd.Series({'Document': original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
382
 
 
383
 
384
  #result_df = grouped.apply(aggregate_topics).reset_index()
385
  doc_det_agg = grouped.apply(lambda x: aggregate_topics(x)).reset_index()
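
A toy illustration (not from the commit) of the new aggregation rule: rows assigned to the outlier topic -1 ('other') are dropped from the topic, name and probability lists before sentences are stitched back into their parent document, while their text is still kept in the joined 'Document' string. Simplified here by assuming the 'Name' and 'Probability' columns are present:

import pandas as pd

doc_dets = pd.DataFrame({
    "parent_document_index": [0, 0, 0],
    "Document": ["First sentence.", "Outlier sentence.", "Last sentence."],
    "Topic": [2, -1, 5],
    "Name": ["2_budget_cost", "-1_misc", "5_parks_green"],
    "Probability": [0.91, 0.12, 0.77],
})

def aggregate_topics(group):
    keep = ~group["Topic"].astype(str).str.startswith("-1")   # drop topic -1 rows
    return pd.Series({
        "Document": " ".join(group["Document"]),
        "Topic numbers": group.loc[keep, "Topic"].tolist(),
        "Topic names": group.loc[keep, "Name"].tolist(),
        "Probabilities": group.loc[keep, "Probability"].tolist(),
    })

print(doc_dets.groupby("parent_document_index").apply(aggregate_topics))
# Topic numbers -> [2, 5]; names and probabilities are filtered the same way.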
funcs/topic_core_funcs.py CHANGED
@@ -52,7 +52,7 @@ def change_default_vis_col(in_colnames:List[str]):
52
  else:
53
  return gr.Dropdown()
54
 
55
- def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
56
  """
57
  Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
58
 
@@ -65,6 +65,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
65
  drop_duplicate_text (str): Option to drop duplicate text ("Yes" or "No").
66
  anonymise_drop (str): Option to anonymize data ("Yes" or "No").
67
  sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
 
68
  embeddings_state (dict): State of the embeddings.
69
  progress (gr.Progress, optional): Progress tracker for the cleaning process.
70
 
@@ -140,6 +141,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
140
  anon_toc = time.perf_counter()
141
  time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
142
 
 
 
143
  if sentence_split_drop == "Yes":
144
  progress(0.6, desc= "Splitting text into sentences")
145
 
@@ -149,11 +152,14 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
149
  anon_tic = time.perf_counter()
150
 
151
  data = expand_sentences_spacy(data, in_colnames_list_first)
152
- data = data[data[in_colnames_list_first].str.len() >= 25] # Keep only rows with at least 25 characters
 
153
  data.reset_index(inplace=True, drop=True)
154
 
155
  anon_toc = time.perf_counter()
156
- time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
 
 
157
 
158
  out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
159
  data.to_csv(out_data_name)
@@ -252,6 +258,9 @@ def extract_topics(
252
  elif calc_probs == "Yes":
253
  print("Calculating all probabilities.")
254
  calc_probs = True
 
 
 
255
 
256
  if not in_colnames:
257
  error_message = "Please enter one column name to use for cleaning and finding topics."
@@ -279,7 +288,7 @@ def extract_topics(
279
  # Attempt to load the model from each local location
280
  for location in local_embeddings_locations:
281
  try:
282
- embedding_model = SentenceTransformer(location, truncate_dim=512)
283
  print(f"Found local model installation at: {location}")
284
  break # Exit the loop if the model is found
285
  except Exception as e:
@@ -287,7 +296,7 @@ def extract_topics(
287
  continue
288
  else:
289
  # If the loop completes without finding the model in any local location
290
- embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
291
  print("Could not find local model installation. Downloading from Huggingface")
292
 
293
  #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
@@ -343,6 +352,7 @@ def extract_topics(
343
  assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
344
 
345
  if calc_probs == True:
 
346
  topics_probs_out = pd.DataFrame(topic_model.probabilities_)
347
  topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
348
  topics_probs_out.to_csv(topics_probs_out_name)
@@ -385,6 +395,10 @@ def extract_topics(
385
  assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
386
 
387
  if calc_probs == True:
 
 
 
 
388
  topics_probs_out = pd.DataFrame(topic_model.probabilities_)
389
  topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
390
  topics_probs_out.to_csv(topics_probs_out_name)
@@ -424,7 +438,7 @@ def extract_topics(
424
 
425
  # Tidy up topic label format a bit to have commas and spaces by default
426
  if not candidate_topics:
427
- print("Zero shot topics found, so not renaming")
428
  new_topic_labels = topic_model.generate_topic_labels(nr_words=3, separator=", ")
429
  topic_model.set_topic_labels(new_topic_labels)
430
  if candidate_topics:
@@ -447,7 +461,7 @@ def extract_topics(
447
  # If you want to save your embedding files
448
  if return_intermediate_files == "Yes":
449
  print("Saving embeddings to file")
450
- if high_quality_mode == "Yes":
451
  embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
452
  else:
453
  if embeddings_super_compress == "No":
 
52
  else:
53
  return gr.Dropdown()
54
 
55
+ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
56
  """
57
  Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
58
 
 
65
  drop_duplicate_text (str): Option to drop duplicate text ("Yes" or "No").
66
  anonymise_drop (str): Option to anonymize data ("Yes" or "No").
67
  sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
68
+ min_sentence_length (int): Minimum character length of sentences kept after splitting.
69
  embeddings_state (dict): State of the embeddings.
70
  progress (gr.Progress, optional): Progress tracker for the cleaning process.
71
 
 
141
  anon_toc = time.perf_counter()
142
  time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
143
 
144
+ print(time_out)
145
+
146
  if sentence_split_drop == "Yes":
147
  progress(0.6, desc= "Splitting text into sentences")
148
 
 
152
  anon_tic = time.perf_counter()
153
 
154
  data = expand_sentences_spacy(data, in_colnames_list_first)
155
+ data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows longer than min_sentence_length characters
156
+ data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
157
  data.reset_index(inplace=True, drop=True)
158
 
159
  anon_toc = time.perf_counter()
160
+ time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
161
+
162
+ print(time_out)
163
 
164
  out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
165
  data.to_csv(out_data_name)
 
258
  elif calc_probs == "Yes":
259
  print("Calculating all probabilities.")
260
  calc_probs = True
261
+
262
+ if max_topics_slider == 0:
263
+ max_topics_slider = 'auto'
264
 
265
  if not in_colnames:
266
  error_message = "Please enter one column name to use for cleaning and finding topics."
 
288
  # Attempt to load the model from each local location
289
  for location in local_embeddings_locations:
290
  try:
291
+ embedding_model = SentenceTransformer(location)#, truncate_dim=512)
292
  print(f"Found local model installation at: {location}")
293
  break # Exit the loop if the model is found
294
  except Exception as e:
 
296
  continue
297
  else:
298
  # If the loop completes without finding the model in any local location
299
+ embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
300
  print("Could not find local model installation. Downloading from Huggingface")
301
 
302
  #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
 
352
  assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
353
 
354
  if calc_probs == True:
355
+
356
  topics_probs_out = pd.DataFrame(topic_model.probabilities_)
357
  topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
358
  topics_probs_out.to_csv(topics_probs_out_name)
 
395
  assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
396
 
397
  if calc_probs == True:
398
+
399
+ assigned_topics, probs = topic_model.transform(docs, embeddings_out)
400
+ print("Probs:", probs)
401
+ topic_model.probabilities_ = probs
402
  topics_probs_out = pd.DataFrame(topic_model.probabilities_)
403
  topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
404
  topics_probs_out.to_csv(topics_probs_out_name)
 
438
 
439
  # Tidy up topic label format a bit to have commas and spaces by default
440
  if not candidate_topics:
441
+ print("Zero shot topics not found, so not renaming")
442
  new_topic_labels = topic_model.generate_topic_labels(nr_words=3, separator=", ")
443
  topic_model.set_topic_labels(new_topic_labels)
444
  if candidate_topics:
 
461
  # If you want to save your embedding files
462
  if return_intermediate_files == "Yes":
463
  print("Saving embeddings to file")
464
+ if high_quality_mode == "No":
465
  embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
466
  else:
467
  if embeddings_super_compress == "No":
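
A sketch of the new default-topic behaviour, assuming max_topics_slider ends up as BERTopic's nr_topics argument (which the 0 -> 'auto' mapping above suggests); nr_topics="auto" asks BERTopic to merge similar topics automatically rather than capping the count:

from bertopic import BERTopic

max_topics_slider = 0                 # the new UI default
if max_topics_slider == 0:
    max_topics_slider = "auto"        # let BERTopic decide how far to merge topics

topic_model = BERTopic(nr_topics=max_topics_slider, calculate_probabilities=True)
# With a non-zero slider value, nr_topics would instead cap the number of topics kept.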
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
 
2
  transformers==4.41.2
3
  accelerate==0.26.1
4
  torch==2.3.1
 
1
  gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
2
+ boto3
3
  transformers==4.41.2
4
  accelerate==0.26.1
5
  torch==2.3.1
requirements_gpu.txt CHANGED
@@ -1,11 +1,12 @@
1
  gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
 
2
  transformers==4.41.2
3
  accelerate==0.26.1
4
  bertopic==0.16.2
5
  spacy==3.7.4
6
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
7
  pyarrow==14.0.2
8
- openpyxl==3.1.2
9
  Faker==22.2.0
10
  presidio_analyzer==2.2.354
11
  presidio_anonymizer==2.2.354
 
1
  gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
2
+ boto3
3
  transformers==4.41.2
4
  accelerate==0.26.1
5
  bertopic==0.16.2
6
  spacy==3.7.4
7
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
8
  pyarrow==14.0.2
9
+ openpyxl==3.1.3
10
  Faker==22.2.0
11
  presidio_analyzer==2.2.354
12
  presidio_anonymizer==2.2.354