seanpedrickcase committed on
Commit
7810536
·
1 Parent(s): 12224f5

Can now redact text or csv/xlsx files. Can redact multiple files. Embeds redactions in an image-based file by default
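
A rough sketch (not part of the commit itself) of how the new tabular anonymisation in tools/data_anonymise.py can be driven, assuming the repo's tools package and the presidio/spaCy dependencies in requirements.txt are installed; the example data and allow-list term are made up:

```python
import pandas as pd
from tools.data_anonymise import anonymise_script  # new module added in this commit

# A one-row frame standing in for a column selected from an uploaded csv/xlsx file
df = pd.DataFrame({"notes": ["Contact Jane Smith on 020 7946 0000 about Lambeth 2030."]})

# "replace" swaps each detected entity for its entity label; the other strategies
# exposed in the UI are redact, hash, mask, encrypt and fake_first_name.
scrubbed_df, message = anonymise_script(
    df,
    anon_strat="replace",
    language="en",
    chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
    allow_list=[["Lambeth 2030"]],  # terms to leave untouched
)
print(message)
print(scrubbed_df)
```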

Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- RUN pip install --no-cache-dir gradio==4.33.0
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
 
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ RUN pip install --no-cache-dir gradio==4.36.1
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
app.py CHANGED
@@ -3,10 +3,11 @@ import os
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
7
  from tools.file_redaction import choose_and_run_redactor
8
- from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
9
- from tools.aws_functions import load_data_from_aws
 
10
  import gradio as gr
11
 
12
  add_folder_to_path("_internal/tesseract/")
@@ -18,9 +19,7 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
18
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
19
  language = 'en'
20
 
21
-
22
  # Create the gradio interface
23
-
24
  block = gr.Blocks(theme = gr.themes.Base())
25
 
26
  with block:
@@ -32,61 +31,92 @@ with block:
32
  gr.Markdown(
33
  """
34
  # Document redaction
35
- Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
36
 
37
- WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed. Also, the output from the Text analysis ending 'as_text.pdf' is an annotated pdf, which is a layer on top of the text that can be removed. So the text has not truly been redacted. Use the '...as_img.pdf' versions instead for safer redaction.
 
 
38
 
39
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
40
  """)
41
 
42
- with gr.Tab("Redact document"):
43
-
44
- with gr.Accordion("Input document", open = True):
45
- in_file = gr.File(label="Choose document file", file_count= "single")
46
- in_redaction_method = gr.Radio(label="Redaction method - text analysis is faster but will fail on images or image-based PDFs.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
47
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
48
- in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
49
- in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
50
-
51
- redact_btn = gr.Button("Redact document", variant="primary")
52
 
53
  with gr.Row():
54
  output_summary = gr.Textbox(label="Output summary")
55
  output_file = gr.File(label="Output file")
56
 
57
  with gr.Row():
58
- convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
59
-
60
- with gr.Tab(label="Advanced options"):
61
- with gr.Accordion(label = "AWS data access", open = True):
62
- aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
63
- with gr.Row():
64
- in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
65
- load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
66
-
67
- aws_log_box = gr.Textbox(label="AWS data load status")
68
 
69
- ### Loading AWS data ###
70
- load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
71
72
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
73
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
74
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
75
- outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
76
-
77
- convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
78
- outputs=[output_summary, output_file])
79
 
80
- # Simple run for HF spaces or local on your computer
81
- #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
82
 
83
- # Simple run for AWS server
84
- block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
 
85
 
86
- # Download OpenSSL from here:
87
- # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
88
- #block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
89
- # ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid
90
 
91
- # Running on local server without https
92
- #block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
 
 
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df
7
  from tools.file_redaction import choose_and_run_redactor
8
+ from tools.file_conversion import prepare_image_or_text_pdf
9
+ from tools.data_anonymise import do_anonymise
10
+ #from tools.aws_functions import load_data_from_aws
11
  import gradio as gr
12
 
13
  add_folder_to_path("_internal/tesseract/")
 
19
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
20
  language = 'en'
21
 
 
22
  # Create the gradio interface
 
23
  block = gr.Blocks(theme = gr.themes.Base())
24
 
25
  with block:
 
31
  gr.Markdown(
32
  """
33
  # Document redaction
 
34
 
35
+ Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' tab to change options such as which types of information to redact (e.g. people, places) or which terms to exclude from redaction.
36
+
37
+ WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
38
 
39
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
40
  """)
41
 
42
+ with gr.Tab("PDFs/images"):
43
+
44
+ with gr.Accordion("Redact document", open = True):
45
+ in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png'])
46
+ redact_btn = gr.Button("Redact document(s)", variant="primary")
47
 
48
  with gr.Row():
49
  output_summary = gr.Textbox(label="Output summary")
50
  output_file = gr.File(label="Output file")
51
 
52
  with gr.Row():
53
+ convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
54
 
55
+ with gr.Tab(label="Open text or Excel/csv files"):
56
+ gr.Markdown(
57
+ """
58
+ ### Choose open text or a tabular data file (xlsx or csv) to redact.
59
+ """
60
+ )
61
+ with gr.Accordion("Paste open text", open = False):
62
+ in_text = gr.Textbox(label="Enter open text", lines=10)
63
+ with gr.Accordion("Upload xlsx (first sheet read only) or csv file(s)", open = False):
64
+ in_file_text = gr.File(label="Choose xlsx (first sheet read only) or csv files", file_count= "multiple", file_types=['.xlsx', '.csv', '.parquet', '.csv.gz'])
65
+
66
+ in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select columns that you want to anonymise. Ensure that at least one named column exists in all files.")
67
+
68
+ match_btn = gr.Button("Anonymise text", variant="primary")
69
+
70
+ with gr.Row():
71
+ text_output_summary = gr.Textbox(label="Output result")
72
+ text_output_file = gr.File(label="Output file")
73
+
74
+ with gr.Tab(label="Redaction settings"):
75
+ gr.Markdown(
76
+ """
77
+ Define redaction settings that affect both document and open text redaction.
78
+ """)
79
+ with gr.Accordion("Settings for documents", open = True):
80
+ in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster but is not useful for image-based PDFs. Image-based analysis is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
81
+ with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
82
+ anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
83
+
84
+ with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
85
+ in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
86
+ with gr.Row():
87
+ in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
88
+ in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
89
+
90
+ # AWS options - not yet implemented
91
+ # with gr.Tab(label="Advanced options"):
92
+ # with gr.Accordion(label = "AWS data access", open = True):
93
+ # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
94
+ # with gr.Row():
95
+ # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
96
+ # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
97
+
98
+ # aws_log_box = gr.Textbox(label="AWS data load status")
99
 
100
+ # ### Loading AWS data ###
101
+ # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
102
+
103
+
104
+ # Document redaction
105
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
106
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
107
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
108
+ outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")#.\
109
+ #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
110
+ #outputs=[output_summary, output_file])
 
111
 
112
+ #convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
113
+ # outputs=[output_summary, output_file], api_name="convert_to_img")
114
 
115
+ # Open text interaction
116
+ in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
117
+ match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
118
119
 
120
+ # Launch the Gradio app
121
+ if __name__ == "__main__":
122
+ block.queue().launch(show_error=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
requirements.txt CHANGED
@@ -10,10 +10,6 @@ spacy # Not specified as latest versions create a conflict with latest versions
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio # Not specified as latest versions create a conflict with latest versions of spacy
12
  boto3==1.34.103
13
- # Following are not currently necessary for the app, may be added for improved pdf parsing in future
14
- #unstructured
15
- #unstructured_inference # This is big! Only necessary if you want to use the high res strategy in pdf_partition
16
- #unstructured_pytesseract
17
- #pillow-heif
18
- #python-docx
19
- #python-pptx
 
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio # Not specified as latest versions create a conflict with latest versions of spacy
12
  boto3==1.34.103
13
+ faker
14
+ openpyxl
15
+ pyarrow
 
 
 
 
tools/aws_functions.py CHANGED
@@ -3,45 +3,44 @@ import pandas as pd
3
  import boto3
4
  import tempfile
5
  import os
 
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
-
9
- bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
10
-
11
- try:
12
- session = boto3.Session() # profile_name="default"
13
- except Exception as e:
14
- print(e)
15
-
16
- # sts = session.client("sts")
17
- # Create a Session with the IAM role ARN
18
- # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
19
- # response = sts.assume_role(
20
- # RoleArn=aws_role,
21
- # RoleSessionName="ecs-test-session"
22
- # )
23
- # print(response)
24
-
25
-
26
- def get_assumed_role_info():
27
- sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
28
- response = sts.get_caller_identity()
29
-
30
- # Extract ARN of the assumed role
31
- assumed_role_arn = response['Arn']
32
-
33
- # Extract the name of the assumed role from the ARN
34
- assumed_role_name = assumed_role_arn.split('/')[-1]
35
-
36
- return assumed_role_arn, assumed_role_name
37
-
38
- try:
39
- assumed_role_arn, assumed_role_name = get_assumed_role_info()
40
-
41
- print("Assumed Role ARN:", assumed_role_arn)
42
- print("Assumed Role Name:", assumed_role_name)
43
- except Exception as e:
44
- print(e)
45
 
46
  # Download direct from S3 - requires login credentials
47
  def download_file_from_s3(bucket_name, key, local_file_path):
@@ -50,8 +49,6 @@ def download_file_from_s3(bucket_name, key, local_file_path):
50
  s3.download_file(bucket_name, key, local_file_path)
51
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
52
 
53
- #download_file_from_s3(bucket_name, object_key, local_file_loc)
54
-
55
  def download_folder_from_s3(bucket_name, s3_folder, local_folder):
56
  """
57
  Download all files from an S3 folder to a local folder.
@@ -77,7 +74,6 @@ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
77
  except Exception as e:
78
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
79
 
80
-
81
  def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
82
  """
83
  Download specific files from an S3 folder to a local folder.
@@ -111,8 +107,6 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
111
  except Exception as e:
112
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
113
 
114
-
115
-
116
  def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
117
 
118
  temp_dir = tempfile.mkdtemp()
 
3
  import boto3
4
  import tempfile
5
  import os
6
+ from tools.helper_functions import get_or_create_env_var
7
 
8
  PandasDataFrame = Type[pd.DataFrame]
9
+ bucket_name=""
10
+
11
+ # Get AWS credentials if required
12
+
13
+ aws_var = "RUN_AWS_FUNCTIONS"
14
+ aws_var_default = "0"
15
+ aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
16
+ print(f'The value of {aws_var} is {aws_var_val}')
17
+
18
+ if aws_var_val == "1":
19
+ try:
20
+ bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
21
+ session = boto3.Session() # profile_name="default"
22
+ except Exception as e:
23
+ print(e)
24
+
25
+ def get_assumed_role_info():
26
+ sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
27
+ response = sts.get_caller_identity()
28
+
29
+ # Extract ARN of the assumed role
30
+ assumed_role_arn = response['Arn']
31
+
32
+ # Extract the name of the assumed role from the ARN
33
+ assumed_role_name = assumed_role_arn.split('/')[-1]
34
+
35
+ return assumed_role_arn, assumed_role_name
36
+
37
+ try:
38
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
39
+
40
+ print("Assumed Role ARN:", assumed_role_arn)
41
+ print("Assumed Role Name:", assumed_role_name)
42
+ except Exception as e:
43
+ print(e)
 
 
44
 
45
  # Download direct from S3 - requires login credentials
46
  def download_file_from_s3(bucket_name, key, local_file_path):
 
49
  s3.download_file(bucket_name, key, local_file_path)
50
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
51
 
 
 
52
  def download_folder_from_s3(bucket_name, s3_folder, local_folder):
53
  """
54
  Download all files from an S3 folder to a local folder.
 
74
  except Exception as e:
75
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
76
 
 
77
  def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
78
  """
79
  Download specific files from an S3 folder to a local folder.
 
107
  except Exception as e:
108
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
109
 
 
 
110
  def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
111
 
112
  temp_dir = tempfile.mkdtemp()
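
The AWS access in tools/aws_functions.py is now opt-in via an environment variable rather than always running at import. A hedged sketch of enabling it (the bucket name is a placeholder, not from the commit):

```python
import os

# Set these before importing tools.aws_functions; with RUN_AWS_FUNCTIONS unset or "0",
# the module skips the boto3 session and STS role lookup entirely.
os.environ["RUN_AWS_FUNCTIONS"] = "1"
os.environ["DOCUMENT_REDACTION_BUCKET"] = "my-redaction-bucket"  # placeholder bucket name

from tools.aws_functions import load_data_from_aws
```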
tools/data_anonymise.py ADDED
@@ -0,0 +1,289 @@
1
+ import re
2
+ import secrets
3
+ import base64
4
+ import time
5
+ import pandas as pd
6
+
7
+ from faker import Faker
8
+
9
+ from gradio import Progress
10
+ from typing import List
11
+
12
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
13
+ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
14
+ from presidio_anonymizer.entities import OperatorConfig
15
+
16
+ from tools.helper_functions import output_folder, get_file_path_end, read_file
17
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
18
+
19
+ # Use custom version of analyze_dict to be able to track progress
20
+ from tools.presidio_analyzer_custom import analyze_dict
21
+
22
+
23
+ fake = Faker("en_UK")
24
+ def fake_first_name(x):
25
+ return fake.first_name()
26
+
27
+ def anon_consistent_names(df):
28
+ # ## Pick out common names and replace them with the same person value
29
+ df_dict = df.to_dict(orient="list")
30
+
31
+ analyzer = AnalyzerEngine()
32
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
33
+
34
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
35
+ analyzer_results = list(analyzer_results)
36
+
37
+ # + tags=[]
38
+ text = analyzer_results[3].value
39
+
40
+ # + tags=[]
41
+ recognizer_result = str(analyzer_results[3].recognizer_results)
42
+
43
+ # + tags=[]
44
+ recognizer_result
45
+
46
+ # + tags=[]
47
+ data_str = recognizer_result # abbreviated for brevity
48
+
49
+ # Adjusting the parse_dict function to handle trailing ']'
50
+ # Splitting the main data string into individual list strings
51
+ list_strs = data_str[1:-1].split('], [')
52
+
53
+ def parse_dict(s):
54
+ s = s.strip('[]') # Removing any surrounding brackets
55
+ items = s.split(', ')
56
+ d = {}
57
+ for item in items:
58
+ key, value = item.split(': ')
59
+ if key == 'score':
60
+ d[key] = float(value)
61
+ elif key in ['start', 'end']:
62
+ d[key] = int(value)
63
+ else:
64
+ d[key] = value
65
+ return d
66
+
67
+ # Re-running the improved processing code
68
+
69
+ result = []
70
+
71
+ for lst_str in list_strs:
72
+ # Splitting each list string into individual dictionary strings
73
+ dict_strs = lst_str.split(', type: ')
74
+ dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
75
+
76
+ # Parsing each dictionary string
77
+ dicts = [parse_dict(d) for d in dict_strs]
78
+ result.append(dicts)
79
+
80
+ #result
81
+
82
+ # + tags=[]
83
+ names = []
84
+
85
+ for idx, paragraph in enumerate(text):
86
+ paragraph_texts = []
87
+ for dictionary in result[idx]:
88
+ if dictionary['type'] == 'PERSON':
89
+ paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
90
+ names.append(paragraph_texts)
91
+
92
+ # + tags=[]
93
+ # Flatten the list of lists and extract unique names
94
+ unique_names = list(set(name for sublist in names for name in sublist))
95
+
96
+ # + tags=[]
97
+ fake_names = pd.Series(unique_names).apply(fake_first_name)
98
+
99
+ # + tags=[]
100
+ mapping_df = pd.DataFrame(data={"Unique names":unique_names,
101
+ "Fake names": fake_names})
102
+
103
+ # + tags=[]
104
+ # Convert mapping dataframe to dictionary
105
+ # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
106
+ name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
107
+
108
+ # + tags=[]
109
+ name_map
110
+
111
+ # + tags=[]
112
+ scrubbed_df_consistent_names = df.replace(name_map, regex = True)
113
+
114
+ # + tags=[]
115
+ scrubbed_df_consistent_names
116
+
117
+ return scrubbed_df_consistent_names
118
+
119
+ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
120
+ # DataFrame to dict
121
+ df_dict = df.to_dict(orient="list")
122
+
123
+ if allow_list:
124
+ allow_list_flat = [item for sublist in allow_list for item in sublist]
125
+
126
+ #analyzer = nlp_analyser #AnalyzerEngine()
127
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
128
+
129
+ anonymizer = AnonymizerEngine()
130
+
131
+ batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
132
+
133
+ # analyzer_results = batch_analyzer.analyze_dict(df_dict, language=language,
134
+ # entities=chosen_redact_entities,
135
+ # score_threshold=score_threshold,
136
+ # return_decision_process=False,
137
+ # allow_list=allow_list_flat)
138
+
139
+ print("Identifying personal information")
140
+ analyse_tic = time.perf_counter()
141
+
142
+ print("Allow list:", allow_list)
143
+
144
+ # Use custom analyzer to be able to track progress with Gradio
145
+ analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
146
+ entities=chosen_redact_entities,
147
+ score_threshold=score_threshold,
148
+ return_decision_process=False,
149
+ allow_list=allow_list_flat)
150
+ analyzer_results = list(analyzer_results)
151
+ #analyzer_results
152
+
153
+ analyse_toc = time.perf_counter()
154
+ analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
155
+ print(analyse_time_out)
156
+
157
+ # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
158
+ key = secrets.token_bytes(16) # 128 bits = 16 bytes
159
+ key_string = base64.b64encode(key).decode('utf-8')
160
+
161
+ # Create faker function (note that it has to receive a value)
162
+
163
+ fake = Faker("en_UK")
164
+
165
+ def fake_first_name(x):
166
+ return fake.first_name()
167
+
168
+ # Set up the anonymization configuration WITHOUT DATE_TIME
169
+ replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
170
+ redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
171
+ hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
172
+ mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
173
+ people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
174
+ fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
175
+
176
+
177
+ if anon_strat == "replace": chosen_mask_config = replace_config
178
+ if anon_strat == "redact": chosen_mask_config = redact_config
179
+ if anon_strat == "hash": chosen_mask_config = hash_config
180
+ if anon_strat == "mask": chosen_mask_config = mask_config
181
+ if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
182
+ elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
183
+
184
+ # I think in general people will want to keep date / times
185
+ keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
186
+
187
+ combined_config = {**chosen_mask_config, **keep_date_config}
188
+ combined_config
189
+
190
+ anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
191
+
192
+ scrubbed_df = pd.DataFrame(anonymizer_results)
193
+
194
+ # Create reporting message
195
+ out_message = "Successfully anonymised"
196
+
197
+ if anon_strat == "encrypt":
198
+ out_message = out_message + ". Your decryption key is " + key_string + "."
199
+
200
+ return scrubbed_df, out_message
201
+
202
+ def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
203
+
204
+ def check_lists(list1, list2):
205
+ return any(string in list2 for string in list1)
206
+
207
+ def get_common_strings(list1, list2):
208
+ """
209
+ Finds the common strings between two lists.
210
+
211
+ Args:
212
+ list1: The first list of strings.
213
+ list2: The second list of strings.
214
+
215
+ Returns:
216
+ A list containing the common strings.
217
+ """
218
+ common_strings = []
219
+ for string in list1:
220
+ if string in list2:
221
+ common_strings.append(string)
222
+ return common_strings
223
+
224
+ # Load file
225
+
226
+ anon_df = pd.DataFrame()
227
+ out_files_list = []
228
+
229
+ # Check if files and text exist
230
+ if not in_file:
231
+ if in_text:
232
+ in_file=['open_text']
233
+ else:
234
+ out_message = "Please enter text or a file to redact."
235
+ return out_message, None
236
+
237
+ for match_file in progress.tqdm(in_file, desc="Anonymising files", unit = "file"):
238
+
239
+ if match_file=='open_text':
240
+ anon_df = pd.DataFrame(data={'text':[in_text]})
241
+ chosen_cols=['text']
242
+ out_file_part = match_file
243
+ else:
244
+ anon_df = read_file(match_file)
245
+ out_file_part = get_file_path_end(match_file.name)
246
+
247
+
248
+
249
+ # Check for chosen col, skip file if not found
250
+ all_cols_original_order = list(anon_df.columns)
251
+
252
+ any_cols_found = check_lists(chosen_cols, all_cols_original_order)
253
+
254
+ if any_cols_found == False:
255
+ out_message = "No chosen columns found in dataframe: " + out_file_part
256
+ print(out_message)
257
+ continue
258
+ else:
259
+ chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
260
+
261
+ # Split dataframe to keep only selected columns
262
+ print("Remaining columns to redact:", chosen_cols_in_anon_df)
263
+
264
+ anon_df_part = anon_df[chosen_cols_in_anon_df]
265
+ anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
266
+
267
+ # Anonymise the selected columns
268
+ anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
269
+
270
+ # Rejoin the dataframe together
271
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
272
+ anon_df_out = anon_df_out[all_cols_original_order]
273
+
274
+ # Export file
275
+
276
+
277
+ # out_file_part = re.sub(r'\.csv', '', match_file.name)
278
+
279
+ anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat + ".csv"
280
+
281
+ anon_df_out.to_csv(anon_export_file_name, index = None)
282
+
283
+ out_files_list.append(anon_export_file_name)
284
+
285
+ # Print result text to output text box if just anonymising open text
286
+ if match_file=='open_text':
287
+ out_message = anon_df_out['text'][0]
288
+
289
+ return out_message, out_files_list
tools/file_conversion.py CHANGED
@@ -45,9 +45,10 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
45
  images = []
46
 
47
  # Open the PDF file
48
- for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
 
49
 
50
- print("Current page: ", str(page_num))
51
 
52
  # Convert one page to image
53
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
@@ -72,48 +73,61 @@ def process_file(file_path):
72
  if file_extension in ['.jpg', '.jpeg', '.png']:
73
  print(f"{file_path} is an image file.")
74
  # Perform image processing here
75
- out_path = [Image.open(file_path)]
76
 
77
  # Check if the file is a PDF
78
  elif file_extension == '.pdf':
79
  print(f"{file_path} is a PDF file. Converting to image set")
80
  # Run your function for processing PDF files here
81
- out_path = convert_pdf_to_images(file_path)
82
 
83
  else:
84
  print(f"{file_path} is not an image or PDF file.")
85
- out_path = ['']
86
 
87
- return out_path
88
 
89
- def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
 
 
90
 
91
  out_message = ''
92
  out_file_paths = []
93
 
94
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
95
-
96
- if file_path:
97
- file_path_without_ext = get_file_path_end(file_path)
98
- else:
99
- out_message = "No file selected"
100
- print(out_message)
101
- return out_message, out_file_paths
102
-
103
- if in_redact_method == "Image analysis":
104
- # Analyse and redact image-based pdf or image
105
- if is_pdf_or_image(file_path) == False:
106
- return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
107
-
108
- out_file_path = process_file(file_path)
109
-
110
- elif in_redact_method == "Text analysis":
111
- if is_pdf(file_path) == False:
112
- return "Please upload a PDF file for text analysis.", None
113
-
114
- out_file_path = file_path
115
 
116
- return out_message, out_file_path
117
 
118
 
119
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
@@ -122,14 +136,20 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
122
  out_file_paths = out_text_file_path
123
 
124
  # Convert annotated text pdf back to image to give genuine redactions
125
- print("Creating image version of results")
 
126
  pdf_text_image_paths = process_file(out_text_file_path[0])
127
- out_text_image_file_path = output_folder + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
130
- out_file_paths.append(out_text_image_file_path)
131
 
132
- out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
133
 
134
  return out_message, out_file_paths
135
 
 
45
  images = []
46
 
47
  # Open the PDF file
48
+ #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
+ for page_num in range(0,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
50
 
51
+ # print("Current page: ", str(page_num + 1))
52
 
53
  # Convert one page to image
54
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
 
73
  if file_extension in ['.jpg', '.jpeg', '.png']:
74
  print(f"{file_path} is an image file.")
75
  # Perform image processing here
76
+ img_object = [Image.open(file_path)]
77
 
78
  # Check if the file is a PDF
79
  elif file_extension == '.pdf':
80
  print(f"{file_path} is a PDF file. Converting to image set")
81
  # Run your function for processing PDF files here
82
+ img_object = convert_pdf_to_images(file_path)
83
 
84
  else:
85
  print(f"{file_path} is not an image or PDF file.")
86
+ img_object = ['']
87
 
88
+ # print('Image object is:', img_object)
89
 
90
+ return img_object
91
+
92
+ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
93
 
94
  out_message = ''
95
  out_file_paths = []
96
 
97
+ #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
98
+
99
+ #for file in progress.tqdm(file_paths, desc="Preparing files"):
100
+ for file in file_paths:
101
+ file_path = file.name
102
+
103
+ #if file_path:
104
+ # file_path_without_ext = get_file_path_end(file_path)
105
+ if not file_path:
106
+ out_message = "No file selected"
107
+ print(out_message)
108
+ return out_message, out_file_paths
109
+
110
+ if in_redact_method == "Image analysis":
111
+ # Analyse and redact image-based pdf or image
112
+ if is_pdf_or_image(file_path) == False:
113
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
114
+ print(out_message)
115
+ return out_message, None
116
+
117
+ out_file_path = process_file(file_path)
118
+ print("Out file path at image conversion step:", out_file_path)
119
+
120
+ elif in_redact_method == "Text analysis":
121
+ if is_pdf(file_path) == False:
122
+ out_message = "Please upload a PDF file for text analysis."
123
+ print(out_message)
124
+ return out_message, None
125
+
126
+ out_file_path = file_path
127
+
128
+ out_file_paths.append(out_file_path)
129
 
130
+ return out_message, out_file_paths
131
 
132
 
133
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
136
  out_file_paths = out_text_file_path
137
 
138
  # Convert annotated text pdf back to image to give genuine redactions
139
+ print("Creating image version of redacted PDF to embed redactions.")
140
+
141
  pdf_text_image_paths = process_file(out_text_file_path[0])
142
+ out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
143
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
144
 
145
+ # out_file_paths.append(out_text_image_file_path)
146
+
147
+ out_file_paths = [out_text_image_file_path]
148
+
149
+ out_message = "PDF " + file_path_without_ext + " converted to image-based file."
150
+ print(out_message)
151
 
152
+ print("Out file paths:", out_file_paths)
153
 
154
  return out_message, out_file_paths
155
 
tools/file_redaction.py CHANGED
@@ -5,7 +5,7 @@ from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
  from presidio_image_redactor.entities import ImageRecognizerResult
6
  from pdfminer.high_level import extract_pages
7
  from tools.file_conversion import process_file
8
- from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
@@ -13,64 +13,89 @@ from collections import defaultdict # For efficient grouping
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
15
  from tools.helper_functions import get_file_path_end, output_folder
16
- from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
17
  import gradio as gr
18
 
19
 
20
- def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
- out_message = ''
25
  out_file_paths = []
26
 
27
  if in_allow_list:
28
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
29
 
30
- if file_path:
31
- file_path_without_ext = get_file_path_end(file_path)
32
- else:
33
- out_message = "No file selected"
34
- print(out_message)
35
- return out_message, out_file_paths
36
 
37
- if in_redact_method == "Image analysis":
38
- # Analyse and redact image-based pdf or image
39
- # if is_pdf_or_image(file_path) == False:
40
- # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
41
-
42
- pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
43
- out_image_file_path = output_folder + file_path_without_ext + "_result_as_img.pdf"
44
- pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
45
-
46
- out_file_paths.append(out_image_file_path)
47
- out_message = "Image-based PDF successfully redacted and saved to file."
48
-
49
- elif in_redact_method == "Text analysis":
50
- if is_pdf(file_path) == False:
51
- return "Please upload a PDF file for text analysis.", None
52
-
53
- # Analyse text-based pdf
54
- pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
55
- out_text_file_path = output_folder + file_path_without_ext + "_result_as_text.pdf"
56
- pdf_text.save(out_text_file_path)
57
-
58
- out_file_paths.append(out_text_file_path)
59
-
60
- out_message = "Text-based PDF successfully redacted and saved to file."
61
-
62
- else:
63
- out_message = "No redaction method selected"
64
- print(out_message)
65
- return out_message, out_file_paths
66
 
67
  toc = time.perf_counter()
68
  out_time = f"Time taken: {toc - tic:0.1f} seconds."
69
  print(out_time)
70
 
71
- out_message = out_message + "\n\n" + out_time
 
72
 
73
- return out_message, out_file_paths, out_file_paths
74
 
75
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
76
  merged_bboxes = []
@@ -115,7 +140,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
115
 
116
  out_message = "PDF does not exist as images. Converting pages to image"
117
  print(out_message)
118
- progress(0, desc=out_message)
119
 
120
  image_paths = process_file(file_path)
121
 
@@ -124,9 +149,10 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
124
 
125
  out_message = "Redacting pages"
126
  print(out_message)
127
- progress(0.1, desc=out_message)
128
 
129
- for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
 
130
 
131
  print("Redacting page ", str(i + 1))
132
 
@@ -171,7 +197,6 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
171
 
172
  return images
173
 
174
-
175
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
176
  '''
177
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
@@ -189,9 +214,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
189
 
190
  page_num = 0
191
 
192
- for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
193
-
194
- print("Page number is: ", page_num)
195
 
196
  annotations_on_page = []
197
  analyzed_bounding_boxes = []
@@ -309,88 +334,3 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
309
  analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
310
 
311
  return pdf
312
-
313
-
314
- # for page_num, annotations_on_page in enumerate(annotations_all_pages):
315
- # # 2. Normalize annotation heights on the same line:
316
- # line_heights = {} # {y_coordinate: max_height}
317
-
318
- # # Get line heights for every annotation
319
- # for annotation in annotations_on_page:
320
- # if 'Rect' in annotation:
321
- # y = annotation['Rect'][1]
322
- # height = annotation['Rect'][3] - annotation['Rect'][1]
323
- # line_heights[y] = max(line_heights.get(y, 0), height)
324
-
325
- # # Update line heights for annotations
326
- # for annotation in annotations_on_page:
327
- # if 'Rect' in annotation:
328
- # y = annotation['Rect'][1]
329
- # annotation['Rect'][3] = y + line_heights[y]
330
-
331
- # # Update QuadPoints to match the new Rect coordinates
332
- # x1, y1, x2, y2 = annotation['Rect'] # Extract coordinates from Rect
333
- # annotation['QuadPoints'] = [
334
- # x1, y2, # Top left
335
- # x2, y2, # Top right
336
- # x1, y1, # Bottom left
337
- # x2, y1 # Bottom right
338
- # ]
339
-
340
-
341
- # def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
342
- # '''
343
- # take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
344
- # '''
345
-
346
- # if not image_paths:
347
-
348
- # out_message = "PDF does not exist as images. Converting pages to image"
349
- # print(out_message)
350
- # progress(0, desc=out_message)
351
-
352
- # image_paths = process_file(file_path)
353
-
354
- # # Create a new PDF
355
- # #pdf = pikepdf.new()
356
-
357
- # images = []
358
- # number_of_pages = len(image_paths)
359
-
360
- # out_message = "Redacting pages"
361
- # print(out_message)
362
- # progress(0.1, desc=out_message)
363
-
364
- # for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
365
-
366
- # print("Redacting page ", str(i + 1))
367
-
368
- # # Get the image to redact using PIL lib (pillow)
369
- # image = image_paths[i] #Image.open(image_paths[i])
370
-
371
- # # %%
372
- # image_analyser = ImageAnalyzerEngine(nlp_analyser)
373
- # engine = ImageRedactorEngine(image_analyser)
374
-
375
- # if language == 'en':
376
- # ocr_lang = 'eng'
377
- # else: ocr_lang = language
378
-
379
- # # %%
380
- # # Redact the image with pink color
381
- # redacted_image = engine.redact(image,
382
- # fill=(0, 0, 0),
383
- # ocr_kwargs={"lang": ocr_lang},
384
- # allow_list=allow_list,
385
- # ad_hoc_recognizers= None,
386
- # **{
387
- # "language": language,
388
- # "entities": chosen_redact_entities,
389
- # "score_threshold": score_threshold
390
- # },
391
- # )
392
-
393
- # images.append(redacted_image)
394
-
395
-
396
- # return images
 
5
  from presidio_image_redactor.entities import ImageRecognizerResult
6
  from pdfminer.high_level import extract_pages
7
  from tools.file_conversion import process_file
8
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
 
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
15
  from tools.helper_functions import get_file_path_end, output_folder
16
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
17
  import gradio as gr
18
 
19
 
20
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
+ out_message = []
25
  out_file_paths = []
26
 
27
  if in_allow_list:
28
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
29
 
30
 
31
+ print("File paths:", file_paths)
32
+
33
+ for file in progress.tqdm(file_paths, desc="Redacting files", unit = "files"):
34
+ file_path = file.name
35
+
36
+ if file_path:
37
+ file_path_without_ext = get_file_path_end(file_path)
38
+ if is_pdf(file_path) == False:
39
+ # If user has not submitted a pdf, assume it's an image
40
+ print("File is not a pdf, assuming that image analysis needs to be used.")
41
+ in_redact_method = "Image analysis"
42
+ else:
43
+ out_message = "No file selected"
44
+ print(out_message)
45
+ return out_message, out_file_paths
46
+
47
+ if in_redact_method == "Image analysis":
48
+ # Analyse and redact image-based pdf or image
49
+ # if is_pdf_or_image(file_path) == False:
50
+ # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
51
+
52
+ print("Redacting file as image-based pdf")
53
+ pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
54
+ out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
55
+ pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
56
+
57
+ out_file_paths.append(out_image_file_path)
58
+ out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
59
+
60
+ elif in_redact_method == "Text analysis":
61
+ if is_pdf(file_path) == False:
62
+ return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
63
+
64
+ # Analyse text-based pdf
65
+ print('Redacting file as text-based PDF')
66
+ pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
67
+ out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
68
+ pdf_text.save(out_text_file_path)
69
+
70
+ #out_file_paths.append(out_text_file_path)
71
+ out_message_new = "File " + file_path_without_ext + " successfully redacted."
72
+ out_message.append(out_message_new)
73
+
74
+ # Convert message
75
+ convert_message="Converting PDF to image-based PDF to embed redactions."
76
+ #progress(0.8, desc=convert_message)
77
+ print(convert_message)
78
+
79
+ # Convert document to image-based document to 'embed' redactions
80
+ img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
81
+ out_file_paths.extend(img_output_file_path)
82
+
83
+ # Add confirmation for converting to image if you want
84
+ # out_message.append(img_output_summary)
85
+
86
+ else:
87
+ out_message = "No redaction method selected"
88
+ print(out_message)
89
+ return out_message, out_file_paths
90
 
91
  toc = time.perf_counter()
92
  out_time = f"Time taken: {toc - tic:0.1f} seconds."
93
  print(out_time)
94
 
95
+ out_message_out = '\n'.join(out_message)
96
+ out_message_out = out_message_out + "\n\n" + out_time
97
 
98
+ return out_message_out, out_file_paths, out_file_paths
99
 
100
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
101
  merged_bboxes = []
 
140
 
141
  out_message = "PDF does not exist as images. Converting pages to image"
142
  print(out_message)
143
+ #progress(0, desc=out_message)
144
 
145
  image_paths = process_file(file_path)
146
 
 
149
 
150
  out_message = "Redacting pages"
151
  print(out_message)
152
+ #progress(0.1, desc=out_message)
153
 
154
+ #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
155
+ for i in range(0, number_of_pages):
156
 
157
  print("Redacting page ", str(i + 1))
158
 
 
197
 
198
  return images
199
 
 
200
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
201
  '''
202
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
 
214
 
215
  page_num = 0
216
 
217
+ #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
218
+ for page in pdf.pages:
219
+ print("Page number is: ", page_num + 1)
220
 
221
  annotations_on_page = []
222
  analyzed_bounding_boxes = []
 
334
  analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
335
 
336
  return pdf
tools/helper_functions.py CHANGED
@@ -1,4 +1,6 @@
1
  import os
 
 
2
 
3
  def get_or_create_env_var(var_name, default_value):
4
  # Get the environment variable if it exists
@@ -29,6 +31,36 @@ def get_file_path_end(file_path):
29
 
30
  return filename_without_extension
31
 
32
  def ensure_output_folder_exists():
33
  """Checks if the 'output/' folder exists, creates it if not."""
34
 
@@ -41,6 +73,20 @@ def ensure_output_folder_exists():
41
  else:
42
  print(f"The 'output/' folder already exists.")
43
 
44
 
45
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
46
  def add_folder_to_path(folder_path: str):
 
1
  import os
2
+ import gradio as gr
3
+ import pandas as pd
4
 
5
  def get_or_create_env_var(var_name, default_value):
6
  # Get the environment variable if it exists
 
31
 
32
  return filename_without_extension
33
 
34
+ def detect_file_type(filename):
35
+ """Detect the file type based on its extension."""
36
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
37
+ return 'csv'
38
+ elif filename.endswith('.xlsx'):
39
+ return 'xlsx'
40
+ elif filename.endswith('.parquet'):
41
+ return 'parquet'
42
+ elif filename.endswith('.pdf'):
43
+ return 'pdf'
44
+ elif filename.endswith('.jpg'):
45
+ return 'jpg'
46
+ elif filename.endswith('.jpeg'):
47
+ return 'jpeg'
48
+ elif filename.endswith('.png'):
49
+ return 'png'
50
+ else:
51
+ raise ValueError("Unsupported file type.")
52
+
53
+ def read_file(filename):
54
+ """Read the file based on its detected type."""
55
+ file_type = detect_file_type(filename)
56
+
57
+ if file_type == 'csv':
58
+ return pd.read_csv(filename, low_memory=False)
59
+ elif file_type == 'xlsx':
60
+ return pd.read_excel(filename)
61
+ elif file_type == 'parquet':
62
+ return pd.read_parquet(filename)
63
+
64
  def ensure_output_folder_exists():
65
  """Checks if the 'output/' folder exists, creates it if not."""
66
 
 
73
  else:
74
  print(f"The 'output/' folder already exists.")
75
 
76
+ def put_columns_in_df(in_file):
77
+ new_choices = []
78
+ concat_choices = []
79
+
80
+ for file in in_file:
81
+ df = read_file(file.name)
82
+ new_choices = list(df.columns)
83
+
84
+ concat_choices.extend(new_choices)
85
+
86
+ # Drop duplicate columns
87
+ concat_choices = list(set(concat_choices))
88
+
89
+ return gr.Dropdown(choices=concat_choices, value=concat_choices)
90
 
91
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
92
  def add_folder_to_path(folder_path: str):
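
A small, hypothetical use of the new file helpers added above (the file name is illustrative):

```python
from tools.helper_functions import detect_file_type, read_file

print(detect_file_type("responses.xlsx"))  # -> 'xlsx'
df = read_file("responses.xlsx")           # first sheet only, via pd.read_excel
```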
tools/presidio_analyzer_custom.py ADDED
@@ -0,0 +1,119 @@
1
+ import gradio as gr
2
+ from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+ from tqdm import tqdm
4
+
5
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
+ from presidio_analyzer.nlp_engine import NlpArtifacts
7
+
8
+
9
+
10
+ def analyze_iterator_custom(
11
+ self,
12
+ texts: Iterable[Union[str, bool, float, int]],
13
+ language: str,
14
+ list_length:int,
15
+ progress=gr.Progress(),
16
+ **kwargs,
17
+ ) -> List[List[RecognizerResult]]:
18
+ """
19
+ Analyze an iterable of strings.
20
+
21
+ :param texts: An list containing strings to be analyzed.
22
+ :param language: Input language
23
+ :param list_length: Length of the input list.
24
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
25
+ """
26
+
27
+ # validate types
28
+ texts = self._validate_types(texts)
29
+
30
+ # Process the texts as batch for improved performance
31
+ nlp_artifacts_batch: Iterator[
32
+ Tuple[str, NlpArtifacts]
33
+ ] = self.analyzer_engine.nlp_engine.process_batch(
34
+ texts=texts, language=language
35
+ )
36
+
37
+
38
+
39
+ list_results = []
40
+
41
+ # Uncomment this if you want to show progress within a file
42
+ #for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
43
+ for text, nlp_artifacts in nlp_artifacts_batch:
44
+ results = self.analyzer_engine.analyze(
45
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
46
+ )
47
+
48
+ list_results.append(results)
49
+
50
+ return list_results
51
+
52
+ def analyze_dict(
53
+ self,
54
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
55
+ language: str,
56
+ keys_to_skip: Optional[List[str]] = None,
57
+ **kwargs,
58
+ ) -> Iterator[DictAnalyzerResult]:
59
+ """
60
+ Analyze a dictionary of keys (strings) and values/iterable of values.
61
+
62
+ Non-string values are returned as is.
63
+
64
+ :param input_dict: The input dictionary for analysis
65
+ :param language: Input language
66
+ :param keys_to_skip: Keys to ignore during analysis
67
+ :param kwargs: Additional keyword arguments
68
+ for the `AnalyzerEngine.analyze` method.
69
+ Use this to pass arguments to the analyze method,
70
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
71
+ See `AnalyzerEngine.analyze` for the full list.
72
+ """
73
+
74
+ context = []
75
+ if "context" in kwargs:
76
+ context = kwargs["context"]
77
+ del kwargs["context"]
78
+
79
+ if not keys_to_skip:
80
+ keys_to_skip = []
81
+
82
+
83
+ for key, value in input_dict.items():
84
+ if not value or key in keys_to_skip:
85
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
86
+ continue # skip this key as requested
87
+
88
+ # Add the key as an additional context
89
+ specific_context = context[:]
90
+ specific_context.append(key)
91
+
92
+ if type(value) in (str, int, bool, float):
93
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
94
+ text=str(value), language=language, context=[key], **kwargs
95
+ )
96
+ elif isinstance(value, dict):
97
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
98
+ results = self.analyze_dict(
99
+ input_dict=value,
100
+ language=language,
101
+ context=specific_context,
102
+ keys_to_skip=new_keys_to_skip,
103
+ **kwargs,
104
+ )
105
+ elif isinstance(value, Iterable):
106
+ # Recursively iterate nested dicts
107
+ list_length = len(value)
108
+
109
+ results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
110
+ texts=value,
111
+ language=language,
112
+ context=specific_context,
113
+ list_length=list_length,
114
+ **kwargs,
115
+ )
116
+ else:
117
+ raise ValueError(f"type {type(value)} is unsupported.")
118
+
119
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
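
A rough sketch of calling the custom analyze_dict outside the Gradio app, assuming presidio-analyzer and the en_core_web_lg model listed in requirements.txt are installed:

```python
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from tools.presidio_analyzer_custom import analyze_dict

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())

# List values are analysed row by row; results mirror BatchAnalyzerEngine.analyze_dict,
# with the custom iterator available for reporting progress back to Gradio.
for res in analyze_dict(batch_analyzer, {"notes": ["Call Jane on 07700 900123"]}, language="en"):
    print(res.key, res.recognizer_results)
```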