Spaces: seanpedrickcase (Running)

Commit 1e2bb3e
Parent(s): 55f0ce3
Only aggregate topics that are not the 'other' (outlier) topic; allow a minimum sentence length when splitting text; the default max_topics value (0) now merges topics automatically. Added Cognito auth functionality (boto3 with AWS).
Files changed:
- app.py +21 -12
- funcs/anonymiser.py +49 -6
- funcs/auth.py +54 -0
- funcs/clean_funcs.py +24 -10
- funcs/helper_functions.py +85 -16
- funcs/topic_core_funcs.py +21 -7
- requirements.txt +1 -0
- requirements_gpu.txt +2 -1
app.py
CHANGED
@@ -1,14 +1,12 @@
-
-# import os
-# os.system("pip install scipy==1.11.4")
-
+import os
 import gradio as gr
 import pandas as pd
 import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, optimise_zero_shot, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model, change_default_vis_col
-from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params
+from funcs.helper_functions import initial_file_load, custom_regex_load, ensure_output_folder_exists, output_folder, get_connection_params, get_or_create_env_var
 from sklearn.feature_extraction.text import CountVectorizer
+from funcs.auth import authenticate_user, download_file_from_s3
 
 min_word_occurence_slider_default = 0.01
 max_word_occurence_slider_default = 0.95
@@ -34,6 +32,7 @@ with block:
     vectoriser_state = gr.State(CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_word_occurence_slider_default, max_df=max_word_occurence_slider_default))
 
     session_hash_state = gr.State("")
+    s3_output_folder_state = gr.State("")
 
     gr.Markdown(
     """
@@ -55,10 +54,14 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html,
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
             drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective and slow!")
+        #with gr.Row():
             split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split text into sentences. Useful for small datasets.")
+            #additional_custom_delimiters_drop = gr.Dropdown(choices=["and", ",", "as well as", "also"], multiselect=True, label="Additional custom delimiters to split sentences.")
+            min_sentence_length_num = gr.Number(value=5, label="Min char length of split sentences")
+
         with gr.Row():
             custom_regex = gr.UploadButton(label="Import custom regex removal file", file_count="multiple")
             gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Strings matching this pattern will be removed. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
@@ -76,8 +79,8 @@ with block:
     with gr.Accordion("Topic modelling settings - change documents per topic, max topics, frequency of terms", open = False):
 
         with gr.Row():
-            min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value =
-            max_topics_slider = gr.Slider(minimum =
+            min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 5, step = 1, label = "Minimum number of similar documents needed to make a topic.")
+            max_topics_slider = gr.Slider(minimum = 0, maximum = 500, value = 0, step = 1, label = "Maximum number of topics. If set to 0, then will choose topics to merge automatically.")
         with gr.Row():
            min_word_occurence_slider = gr.Slider(minimum = 0.001, maximum = 0.9, value = min_word_occurence_slider_default, step = 0.001, label = "Keep terms that appear in this minimum proportion of documents. Avoids creating topics with very uncommon words.")
            max_word_occurence_slider = gr.Slider(minimum = 0.1, maximum = 1.0, value =max_word_occurence_slider_default, step = 0.01, label = "Keep terms that appear in less than this maximum proportion of documents. Avoids very common words in topic names.")
@@ -131,7 +134,7 @@ with block:
 
     # Clean data
     custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
-    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
+    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop, min_sentence_length_num, embeddings_state], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state, embeddings_state], api_name="clean")
 
     # Optimise for keeping only zero-shot topics
     zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
@@ -152,8 +155,14 @@ with block:
     plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, quality_mode_drop, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
 
     # Get session hash from connection parameters
-    block.load(get_connection_params, inputs=None, outputs=[session_hash_state])
+    block.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
+
+COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
+print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
+
 
-# Launch the Gradio app
 if __name__ == "__main__":
-
+    if os.environ['COGNITO_AUTH'] == "1":
+        block.queue().launch(show_error=True, auth=authenticate_user)
+    else:
+        block.queue().launch(show_error=True, inbrowser=True)
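
Note on the new launch logic: authentication is gated on the COGNITO_AUTH environment variable, which get_or_create_env_var defaults to '0'. A minimal sketch of exercising the same toggle, assuming only that Gradio's auth parameter accepts a callable returning True/False (the placeholder Blocks app below is invented for illustration):

import os
import gradio as gr
from funcs.auth import authenticate_user

# Hypothetical stand-in for the real topic modelling Blocks app.
with gr.Blocks() as demo:
    gr.Markdown("Placeholder UI")

# COGNITO_AUTH defaults to '0'; set it to '1' before launching to require a Cognito login.
os.environ.setdefault("COGNITO_AUTH", "0")

if os.environ["COGNITO_AUTH"] == "1":
    # Gradio calls authenticate_user(username, password) and expects a boolean back.
    demo.queue().launch(show_error=True, auth=authenticate_user)
else:
    demo.queue().launch(show_error=True, inbrowser=True)
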
funcs/anonymiser.py
CHANGED
@@ -1,5 +1,6 @@
 from spacy.cli import download
 import spacy
+from spacy.pipeline import Sentencizer
 from funcs.presidio_analyzer_custom import analyze_dict
 spacy.prefer_gpu()
 
@@ -24,11 +25,6 @@ def spacy_model_installed(model_name):
 model_name = "en_core_web_sm"
 nlp = spacy_model_installed(model_name)
 
-#spacy.load(model_name)
-# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
-#os.system("pip uninstall -y gradio")
-#os.system("pip install gradio==3.50.0")
-#os.system("python -m spacy download en_core_web_lg")
 
 import re
 import secrets
@@ -43,16 +39,63 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecogn
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 
+from typing import List
+
 # Function to Split Text and Create DataFrame using SpaCy
-def expand_sentences_spacy(df, colname, nlp=nlp):
+def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
     expanded_data = []
+
+    # if not custom_delimiters:
+    #     custom_delimiters = []
+
     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
+
+    # sentencizer = Sentencizer()
+
+    # new_punct_chars = sentencizer.default_punct_chars
+    # new_punct_chars.extend(custom_delimiters)
+
+    # config = {"punct_chars": new_punct_chars}
+    # nlp.add_pipe("sentencizer", config=config)
+
     for index, row in df.iterrows():
         doc = nlp(row[colname])
         for sent in doc.sents:
             expanded_data.append({'document_index': row['index'], colname: sent.text})
     return pd.DataFrame(expanded_data)
 
+# def expand_sentences_spacy(df, colname, custom_delimiters:List[str]=[], nlp=nlp):
+
+#     #print("Custom delimiters:", custom_delimiters)
+
+#     expanded_data = []
+#     df = df.drop('index', axis = 1, errors="ignore").reset_index(names='index')
+
+#     sentencizer = Sentencizer()
+
+#     new_punct_chars = sentencizer.default_punct_chars
+#     if custom_delimiters:
+#         new_punct_chars.extend(custom_delimiters)
+
+#     pattern = "(" + "|".join(re.escape(punct) for punct in new_punct_chars) + ")"
+#     #print("Patterns:", pattern)
+#     split_list = []
+
+#     for idx, string in enumerate(df[colname]):
+#         new_split = re.split(pattern, string)
+#         for n, sentence in enumerate(new_split):
+#             if sentence:
+#                 # If there is a split delimiter in the 'sentence' after, add it to the previous sentence as it will be removed at a later step
+#                 if n + 1 < len(new_split):
+#                     if new_split[n + 1]:
+#                         # If the next split is in the list of split characters, then add it to this current sentence
+#                         if new_split[n + 1] in new_punct_chars:
+#                             split_list.append({'document_index': idx, colname: sentence + new_split[n + 1]})
+#                         else:
+#                             split_list.append({'document_index': idx, colname: sentence})
+
+#     return pd.DataFrame(split_list)
+
 def anon_consistent_names(df):
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")
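
For context on the sentence-splitting path that pre_clean now calls with a minimum length: expand_sentences_spacy runs each row through the spaCy pipeline and emits one row per detected sentence, keyed to the parent row index. A small self-contained sketch (sample data invented; the currently unused custom_delimiters argument is omitted):

import pandas as pd
import spacy

# Assumes en_core_web_sm is installed, as the module above ensures.
nlp = spacy.load("en_core_web_sm")

def expand_sentences_spacy(df, colname, nlp=nlp):
    # One output row per sentence, keeping the original row position as 'document_index'.
    expanded_data = []
    df = df.drop('index', axis=1, errors="ignore").reset_index(names='index')
    for index, row in df.iterrows():
        doc = nlp(row[colname])
        for sent in doc.sents:
            expanded_data.append({'document_index': row['index'], colname: sent.text})
    return pd.DataFrame(expanded_data)

sample = pd.DataFrame({"text": ["First sentence. Second sentence.", "Only one here."]})
print(expand_sentences_spacy(sample, "text"))
# Expected: three rows, with document_index values 0, 0 and 1.
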
funcs/auth.py
ADDED
@@ -0,0 +1,54 @@
+import boto3
+from funcs.helper_functions import get_or_create_env_var
+
+client_id = get_or_create_env_var('AWS_CLIENT_ID', 'aws_client_placeholder') # This client id is borrowed from async gradio app client
+print(f'The value of AWS_CLIENT_ID is {client_id}')
+
+user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', 'aws_user_pool_placeholder')
+print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+
+def authenticate_user(username, password, user_pool_id=user_pool_id, client_id=client_id):
+    """Authenticates a user against an AWS Cognito user pool.
+
+    Args:
+        user_pool_id (str): The ID of the Cognito user pool.
+        client_id (str): The ID of the Cognito user pool client.
+        username (str): The username of the user.
+        password (str): The password of the user.
+
+    Returns:
+        bool: True if the user is authenticated, False otherwise.
+    """
+
+    client = boto3.client('cognito-idp')  # Cognito Identity Provider client
+
+    try:
+        response = client.initiate_auth(
+            AuthFlow='USER_PASSWORD_AUTH',
+            AuthParameters={
+                'USERNAME': username,
+                'PASSWORD': password,
+            },
+            ClientId=client_id
+        )
+
+        # If successful, you'll receive an AuthenticationResult in the response
+        if response.get('AuthenticationResult'):
+            return True
+        else:
+            return False
+
+    except client.exceptions.NotAuthorizedException:
+        return False
+    except client.exceptions.UserNotFoundException:
+        return False
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return False
+
+
+def download_file_from_s3(bucket_name, key, local_file_path):
+
+    s3 = boto3.client('s3')
+    s3.download_file(bucket_name, key, local_file_path)
+    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
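
A hedged usage sketch for the new module: AWS_CLIENT_ID and AWS_USER_POOL_ID are read from the environment (with placeholder defaults), and authenticate_user is the callable that app.py hands to Gradio's auth parameter. The credentials, bucket and key below are invented, and the calls assume boto3 can find valid AWS credentials:

import os

# Hypothetical values; a real deployment would set these before importing funcs.auth.
os.environ.setdefault("AWS_CLIENT_ID", "example-cognito-app-client-id")
os.environ.setdefault("AWS_USER_POOL_ID", "eu-west-2_ExamplePool")

from funcs.auth import authenticate_user, download_file_from_s3

# True only if Cognito's USER_PASSWORD_AUTH flow succeeds for this username/password.
ok = authenticate_user("example_user", "example_password")
print("Authenticated:", ok)

# Fetch a supporting file from S3 (bucket and key are placeholders).
download_file_from_s3("example-bucket", "configs/custom_regex.csv", "custom_regex.csv")
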
funcs/clean_funcs.py
CHANGED
@@ -8,26 +8,40 @@ custom_words = []
 my_stop_words = custom_words
 
 # #### Some of my cleaning functions
+url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
 html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
+non_ascii_pattern = r'[^\x00-\x7F]+'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
+nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
 
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
+    # Convert to polars Series
     texts = pl.Series(texts).str.strip_chars()
-    text = texts.str.replace_all(html_pattern_regex, ' ')
-    text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
-    text = text.str.replace_all(email_pattern_regex, ' ')
-    text = text.str.replace_all(nums_two_more_regex, ' ')
-    text = text.str.replace_all(postcode_pattern_regex, ' ')
-    text = text.str.replace_all(multiple_spaces_regex, ' ')
-
-    text = text.to_list()
 
-
+    # Define a list of patterns and their replacements
+    patterns = [
+        (url_pattern, ' '),
+        (html_pattern_regex, ' '),
+        (html_start_pattern_end_dots_regex, ' '),
+        (non_ascii_pattern, ' '),
+        (email_pattern_regex, ' '),
+        (nums_two_more_regex, ' '),
+        (postcode_pattern_regex, ' '),
+        (multiple_spaces_regex, ' ')
+    ]
+
+    # Apply each regex replacement
+    for pattern, replacement in patterns:
+        texts = texts.str.replace_all(pattern, replacement)
+
+    # Convert the series back to a list
+    texts = texts.to_list()
+
+    return texts
 
 def regex_clean(texts, custom_regex, progress=gr.Progress()):
     texts = pl.Series(texts).str.strip_chars()
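
The rewritten initial_clean applies a single list of (pattern, replacement) pairs with Polars' replace_all, which is what makes the new URL and non-ASCII patterns easy to slot in. A rough standalone sketch of that loop (patterns abbreviated for brevity; not the module's full list):

import polars as pl

# Abbreviated patterns for illustration only; the module defines several more.
url_pattern = r'http[s]?://\S+'
email_pattern_regex = r'\S*@\S*\s?'
multiple_spaces_regex = r'\s{2,}'

patterns = [
    (url_pattern, ' '),
    (email_pattern_regex, ' '),
    (multiple_spaces_regex, ' '),
]

texts = pl.Series([
    "Contact someone@example.com or see https://example.com for details.",
]).str.strip_chars()

# Apply each regex replacement in turn, as initial_clean now does.
for pattern, replacement in patterns:
    texts = texts.str.replace_all(pattern, replacement)

print(texts.to_list())
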
funcs/helper_functions.py
CHANGED
@@ -1,4 +1,3 @@
-import sys
 import os
 import zipfile
 import re
@@ -45,35 +44,66 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")
 
-def get_connection_params(request: gr.Request):
-
-
-    '''
+async def get_connection_params(request: gr.Request):
+    base_folder = ""
+
     if request:
+        #print("request user:", request.username)
+
+        #request_data = await request.json() # Parse JSON body
+        #print("All request data:", request_data)
+        #context_value = request_data.get('context')
+        #if 'context' in request_data:
+        #    print("Request context dictionary:", request_data['context'])
 
         # print("Request headers dictionary:", request.headers)
         # print("All host elements", request.client)
         # print("IP address:", request.client.host)
         # print("Query parameters:", dict(request.query_params))
+        # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
+        #print("Request dictionary to object:", request.request.body())
         print("Session hash:", request.session_hash)
 
-
+        # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER
+        CUSTOM_CLOUDFRONT_HEADER_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER', '')
+        #print(f'The value of CUSTOM_CLOUDFRONT_HEADER is {CUSTOM_CLOUDFRONT_HEADER_var}')
+
+        # Retrieving or setting CUSTOM_CLOUDFRONT_HEADER_VALUE
+        CUSTOM_CLOUDFRONT_HEADER_VALUE_var = get_or_create_env_var('CUSTOM_CLOUDFRONT_HEADER_VALUE', '')
+        #print(f'The value of CUSTOM_CLOUDFRONT_HEADER_VALUE_var is {CUSTOM_CLOUDFRONT_HEADER_VALUE_var}')
+
+        if CUSTOM_CLOUDFRONT_HEADER_var and CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
+            if CUSTOM_CLOUDFRONT_HEADER_var in request.headers:
+                supplied_cloudfront_custom_value = request.headers[CUSTOM_CLOUDFRONT_HEADER_var]
+                if supplied_cloudfront_custom_value == CUSTOM_CLOUDFRONT_HEADER_VALUE_var:
+                    print("Custom Cloudfront header found:", supplied_cloudfront_custom_value)
+                else:
+                    raise(ValueError, "Custom Cloudfront header value does not match expected value.")
+
+        # Get output save folder from 1 - username passed in from direct Cognito login, 2 - Cognito ID header passed through a Lambda authenticator, 3 - the session hash.
+
+        if request.username:
+            out_session_hash = request.username
+            base_folder = "user-files/"
+
+        elif 'x-cognito-id' in request.headers:
             out_session_hash = request.headers['x-cognito-id']
             base_folder = "user-files/"
-
+            print("Cognito ID found:", out_session_hash)
 
         else:
             out_session_hash = request.session_hash
             base_folder = "temp-files/"
-            #print("Cognito ID not found. Using session hash as save folder
+            # print("Cognito ID not found. Using session hash as save folder:", out_session_hash)
 
         output_folder = base_folder + out_session_hash + "/"
-        #
+        #if bucket_name:
+        #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)
 
-        return out_session_hash
+        return out_session_hash, output_folder
     else:
         print("No session parameters found.")
-        return ""
+        return "",""
 
 def detect_file_type(filename):
     """Detect the file type based on its extension."""
@@ -286,6 +316,24 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
     columns_found = [column for column in columns_to_check if column in topic_model.get_document_info(docs).columns]
     doc_dets = topic_model.get_document_info(docs)[columns_found]
 
+    ### If there are full topic probabilities, join these on to the document details df
+    def is_valid_dataframe(df):
+        """
+        Checks if the given object is a non-empty pandas DataFrame.
+
+        Args:
+            df: The object to check.
+
+        Returns:
+            True if df is a non-empty DataFrame, False otherwise.
+        """
+        if df is None: # Check for None first
+            return False
+        return isinstance(df, pd.DataFrame) and not df.empty
+
+    if is_valid_dataframe(topic_model.probabilities_):
+        doc_dets = doc_dets.merge(topic_model.probabilities_, left_index=True, right_index=True, how="left")
+
     # If you have created a 'sentence split' dataset from the cleaning options, map these sentences back to the original document.
     try:
         if split_sentence_drop == "Yes":
@@ -296,21 +344,42 @@ def save_topic_outputs(topic_model: BERTopic, data_file_name_no_ext: str, output
             grouped = doc_dets.groupby('parent_document_index')
 
             # 2. Aggregate Topics and Probabilities:
+            # def aggregate_topics(group):
+            #     original_text = ' '.join(group['Document'])
+            #     topics = group['Topic'].tolist()
+
+            #     if 'Name' in group.columns:
+            #         topic_names = group['Name'].tolist()
+            #     else:
+            #         topic_names = None
+
+            #     if 'Probability' in group.columns:
+            #         probabilities = group['Probability'].tolist()
+            #     else:
+            #         probabilities = None # Or any other default value you prefer
+
+            #     return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
+
             def aggregate_topics(group):
                 original_text = ' '.join(group['Document'])
-
+
+                # Filter out topics starting with '-1'
+                topics = [topic for topic in group['Topic'].tolist() if not str(topic).startswith('-1')]
 
                 if 'Name' in group.columns:
-
+                    # Filter out topic names corresponding to excluded topics
+                    topic_names = [name for topic, name in zip(group['Topic'], group['Name'].tolist()) if not str(topic).startswith('-1')]
                 else:
                     topic_names = None
 
                 if 'Probability' in group.columns:
-                    probabilities
+                    # Filter out probabilities corresponding to excluded topics
+                    probabilities = [prob for topic, prob in zip(group['Topic'], group['Probability'].tolist()) if not str(topic).startswith('-1')]
                 else:
-                    probabilities = None
+                    probabilities = None
+
+                return pd.Series({'Document': original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
 
-                return pd.Series({'Document':original_text, 'Topic numbers': topics, 'Topic names': topic_names, 'Probabilities': probabilities})
 
             #result_df = grouped.apply(aggregate_topics).reset_index()
             doc_det_agg = grouped.apply(lambda x: aggregate_topics(x)).reset_index()
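
The aggregation change above is what the commit message means by only aggregating topics that are not 'other': BERTopic's outlier topic (-1) is filtered out before sentence-level assignments are rolled back up to their parent document. A toy illustration of that filtering on one grouped block (column names follow the diff; the data is made up):

import pandas as pd

# One parent document whose three sentences were assigned topics 2, -1 and 5.
group = pd.DataFrame({
    'Document': ['First sentence.', 'Noise sentence.', 'Third sentence.'],
    'Topic': [2, -1, 5],
    'Name': ['2_budget_parks', '-1_outliers', '5_roads_safety'],
    'Probability': [0.91, 0.12, 0.77],
})

original_text = ' '.join(group['Document'])

# Drop the outlier topic (-1) and its matching names and probabilities.
topics = [t for t in group['Topic'].tolist() if not str(t).startswith('-1')]
topic_names = [n for t, n in zip(group['Topic'], group['Name'].tolist()) if not str(t).startswith('-1')]
probabilities = [p for t, p in zip(group['Topic'], group['Probability'].tolist()) if not str(t).startswith('-1')]

print(pd.Series({'Document': original_text, 'Topic numbers': topics,
                 'Topic names': topic_names, 'Probabilities': probabilities}))
# Topic numbers -> [2, 5]; the -1 'other' assignment is excluded.
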
funcs/topic_core_funcs.py
CHANGED
@@ -52,7 +52,7 @@ def change_default_vis_col(in_colnames:List[str]):
     else:
         return gr.Dropdown()
 
-def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
+def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str, custom_regex: pd.DataFrame, clean_text: str, drop_duplicate_text: str, anonymise_drop: str, sentence_split_drop: str, min_sentence_length: int, embeddings_state: dict, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> tuple:
     """
     Pre-processes the input data by cleaning text, removing duplicates, anonymizing data, and splitting sentences based on the provided options.
 
@@ -65,6 +65,7 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
     drop_duplicate_text (str): Option to drop duplicate text ("Yes" or "No").
     anonymise_drop (str): Option to anonymize data ("Yes" or "No").
     sentence_split_drop (str): Option to split text into sentences ("Yes" or "No").
+    min_sentence_length (int): Minimum length of sentences after split (integer value of character length)
     embeddings_state (dict): State of the embeddings.
     progress (gr.Progress, optional): Progress tracker for the cleaning process.
 
@@ -140,6 +141,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
         anon_toc = time.perf_counter()
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
 
+        print(time_out)
+
     if sentence_split_drop == "Yes":
         progress(0.6, desc= "Splitting text into sentences")
 
@@ -149,11 +152,14 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
         anon_tic = time.perf_counter()
 
         data = expand_sentences_spacy(data, in_colnames_list_first)
-        data = data[data[in_colnames_list_first].str.len()
+        data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with at more than 5 characters
+        data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
         data.reset_index(inplace=True, drop=True)
 
         anon_toc = time.perf_counter()
-        time_out = f"
+        time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
+
+        print(time_out)
 
     out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
     data.to_csv(out_data_name)
@@ -252,6 +258,9 @@ def extract_topics(
     elif calc_probs == "Yes":
         print("Calculating all probabilities.")
         calc_probs = True
+
+    if max_topics_slider == 0:
+        max_topics_slider = 'auto'
 
     if not in_colnames:
         error_message = "Please enter one column name to use for cleaning and finding topics."
@@ -279,7 +288,7 @@ def extract_topics(
     # Attempt to load the model from each local location
     for location in local_embeddings_locations:
         try:
-            embedding_model = SentenceTransformer(location
+            embedding_model = SentenceTransformer(location)#, truncate_dim=512)
             print(f"Found local model installation at: {location}")
             break # Exit the loop if the model is found
         except Exception as e:
@@ -287,7 +296,7 @@ def extract_topics(
             continue
     else:
         # If the loop completes without finding the model in any local location
-        embedding_model = SentenceTransformer(embeddings_name
+        embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
         print("Could not find local model installation. Downloading from Huggingface")
 
     #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
@@ -343,6 +352,7 @@ def extract_topics(
         assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
         if calc_probs == True:
+
             topics_probs_out = pd.DataFrame(topic_model.probabilities_)
             topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
             topics_probs_out.to_csv(topics_probs_out_name)
@@ -385,6 +395,10 @@ def extract_topics(
         assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
         if calc_probs == True:
+
+            assigned_topics, probs = topic_model.transform(docs, embeddings_out)
+            print("Probs:", probs)
+            topic_model.probabilities_ = probs
             topics_probs_out = pd.DataFrame(topic_model.probabilities_)
             topics_probs_out_name = output_folder + "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
             topics_probs_out.to_csv(topics_probs_out_name)
@@ -424,7 +438,7 @@ def extract_topics(
 
     # Tidy up topic label format a bit to have commas and spaces by default
     if not candidate_topics:
-        print("Zero shot topics found, so not renaming")
+        print("Zero shot topics not found, so not renaming")
         new_topic_labels = topic_model.generate_topic_labels(nr_words=3, separator=", ")
         topic_model.set_topic_labels(new_topic_labels)
     if candidate_topics:
@@ -447,7 +461,7 @@ def extract_topics(
     # If you want to save your embedding files
     if return_intermediate_files == "Yes":
         print("Saving embeddings to file")
-        if high_quality_mode == "
+        if high_quality_mode == "No":
             embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
         else:
             if embeddings_super_compress == "No":
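
On the new max_topics handling: a slider value of 0 is converted to the string 'auto' before model fitting, which is the value BERTopic's nr_topics parameter uses for automatic topic merging. A small sketch of that mapping, assuming max_topics_slider ultimately feeds nr_topics (the exact BERTopic constructor call in extract_topics is not shown in these hunks):

from bertopic import BERTopic

max_topics_slider = 0  # the new UI default

# 0 becomes 'auto' (let BERTopic merge topics automatically);
# any other value acts as a hard cap on the number of topics.
nr_topics = 'auto' if max_topics_slider == 0 else max_topics_slider

topic_model = BERTopic(nr_topics=nr_topics, min_topic_size=5)
# topics, probs = topic_model.fit_transform(docs)  # would be fitted on real documents
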
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
+boto3
 transformers==4.41.2
 accelerate==0.26.1
 torch==2.3.1
requirements_gpu.txt
CHANGED
@@ -1,11 +1,12 @@
 gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
+boto3
 transformers==4.41.2
 accelerate==0.26.1
 bertopic==0.16.2
 spacy==3.7.4
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
 pyarrow==14.0.2
-openpyxl==3.1.
+openpyxl==3.1.3
 Faker==22.2.0
 presidio_analyzer==2.2.354
 presidio_anonymizer==2.2.354