seanpedrickcase committed on
Commit
7810536
·
1 Parent(s): 12224f5

Can now redact text or csv/xlsx files. Can redact multiple files. Embeds redactions in an image-based file by default
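
A rough sketch (not part of the commit itself) of how the new tabular anonymisation in tools/data_anonymise.py can be driven, assuming the repo's tools package and the presidio/spaCy dependencies in requirements.txt are installed; the example data and allow-list term are made up:

```python
import pandas as pd
from tools.data_anonymise import anonymise_script  # new module added in this commit

# A one-row frame standing in for a column selected from an uploaded csv/xlsx file
df = pd.DataFrame({"notes": ["Contact Jane Smith on 020 7946 0000 about Lambeth 2030."]})

# "replace" swaps each detected entity for its entity label; the other strategies
# exposed in the UI are redact, hash, mask, encrypt and fake_first_name.
scrubbed_df, message = anonymise_script(
    df,
    anon_strat="replace",
    language="en",
    chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
    allow_list=[["Lambeth 2030"]],  # terms to leave untouched
)
print(message)
print(scrubbed_df)
```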

Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- RUN pip install --no-cache-dir gradio==4.33.0
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
 
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ RUN pip install --no-cache-dir gradio==4.36.1
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
app.py CHANGED
@@ -3,10 +3,11 @@ import os
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
7
  from tools.file_redaction import choose_and_run_redactor
8
- from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
9
- from tools.aws_functions import load_data_from_aws
 
10
  import gradio as gr
11
 
12
  add_folder_to_path("_internal/tesseract/")
@@ -18,9 +19,7 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
18
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
19
  language = 'en'
20
 
21
-
22
  # Create the gradio interface
23
-
24
  block = gr.Blocks(theme = gr.themes.Base())
25
 
26
  with block:
@@ -32,61 +31,92 @@ with block:
32
  gr.Markdown(
33
  """
34
  # Document redaction
35
- Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
36
 
37
- WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed. Also, the output from the Text analysis ending 'as_text.pdf' is an annotated pdf, which is a layer on top of the text that can be removed. So the text has not truly been redacted. Use the '...as_img.pdf' versions instead for safer redaction.
 
 
38
 
39
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
40
  """)
41
 
42
- with gr.Tab("Redact document"):
43
-
44
- with gr.Accordion("Input document", open = True):
45
- in_file = gr.File(label="Choose document file", file_count= "single")
46
- in_redaction_method = gr.Radio(label="Redaction method - text analysis is faster but will fail on images or image-based PDFs.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
47
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
48
- in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
49
- in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
50
-
51
- redact_btn = gr.Button("Redact document", variant="primary")
52
 
53
  with gr.Row():
54
  output_summary = gr.Textbox(label="Output summary")
55
  output_file = gr.File(label="Output file")
56
 
57
  with gr.Row():
58
- convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
59
-
60
- with gr.Tab(label="Advanced options"):
61
- with gr.Accordion(label = "AWS data access", open = True):
62
- aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
63
- with gr.Row():
64
- in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
65
- load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
66
-
67
- aws_log_box = gr.Textbox(label="AWS data load status")
68
 
69
- ### Loading AWS data ###
70
- load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
71
72
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
73
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
74
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
75
- outputs=[output_summary, output_file, output_file_list_state], api_name="redact")
76
-
77
- convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
78
- outputs=[output_summary, output_file])
79
 
80
- # Simple run for HF spaces or local on your computer
81
- #block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
82
 
83
- # Simple run for AWS server
84
- block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
 
85
 
86
- # Download OpenSSL from here:
87
- # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
88
- #block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
89
- # ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid
90
 
91
- # Running on local server without https
92
- #block.queue().launch(server_name="0.0.0.0", server_port=7861, ssl_verify=False)
 
 
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df
7
  from tools.file_redaction import choose_and_run_redactor
8
+ from tools.file_conversion import prepare_image_or_text_pdf
9
+ from tools.data_anonymise import do_anonymise
10
+ #from tools.aws_functions import load_data_from_aws
11
  import gradio as gr
12
 
13
  add_folder_to_path("_internal/tesseract/")
 
19
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
20
  language = 'en'
21
 
 
22
  # Create the gradio interface
 
23
  block = gr.Blocks(theme = gr.themes.Base())
24
 
25
  with block:
 
31
  gr.Markdown(
32
  """
33
  # Document redaction
 
34
 
35
+ Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' tab to change options such as which types of information to redact (e.g. people, places) or which terms to exclude from redaction.
36
+
37
+ WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
38
 
39
  Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
40
  """)
41
 
42
+ with gr.Tab("PDFs/images"):
43
+
44
+ with gr.Accordion("Redact document", open = True):
45
+ in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png'])
46
+ redact_btn = gr.Button("Redact document(s)", variant="primary")
47
 
48
  with gr.Row():
49
  output_summary = gr.Textbox(label="Output summary")
50
  output_file = gr.File(label="Output file")
51
 
52
  with gr.Row():
53
+ convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
54
 
55
+ with gr.Tab(label="Open text or Excel/csv files"):
56
+ gr.Markdown(
57
+ """
58
+ ### Choose open text or a tabular data file (xlsx or csv) to redact.
59
+ """
60
+ )
61
+ with gr.Accordion("Paste open text", open = False):
62
+ in_text = gr.Textbox(label="Enter open text", lines=10)
63
+ with gr.Accordion("Upload xlsx (first sheet read only) or csv file(s)", open = False):
64
+ in_file_text = gr.File(label="Choose xlsx (first sheet read only) or csv files", file_count= "multiple", file_types=['.xlsx', '.csv', '.parquet', '.csv.gz'])
65
+
66
+ in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select columns that you want to anonymise. Ensure that at least one named column exists in all files.")
67
+
68
+ match_btn = gr.Button("Anonymise text", variant="primary")
69
+
70
+ with gr.Row():
71
+ text_output_summary = gr.Textbox(label="Output result")
72
+ text_output_file = gr.File(label="Output file")
73
+
74
+ with gr.Tab(label="Redaction settings"):
75
+ gr.Markdown(
76
+ """
77
+ Define redaction settings that affect both document and open text redaction.
78
+ """)
79
+ with gr.Accordion("Settings for documents", open = True):
80
+ in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster but is not useful for image-based PDFs. Image-based analysis is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
81
+ with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
82
+ anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
83
+
84
+ with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
85
+ in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
86
+ with gr.Row():
87
+ in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
88
+ in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
89
+
90
+ # AWS options - not yet implemented
91
+ # with gr.Tab(label="Advanced options"):
92
+ # with gr.Accordion(label = "AWS data access", open = True):
93
+ # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
94
+ # with gr.Row():
95
+ # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
96
+ # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
97
+
98
+ # aws_log_box = gr.Textbox(label="AWS data load status")
99
 
100
+ # ### Loading AWS data ###
101
+ # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
102
+
103
+
104
+ # Document redaction
105
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
106
  outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
107
  then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
108
+ outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")#.\
109
+ #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
110
+ #outputs=[output_summary, output_file])
 
111
 
112
+ #convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
113
+ # outputs=[output_summary, output_file], api_name="convert_to_img")
114
 
115
+ # Open text interaction
116
+ in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
117
+ match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")
118
119
 
120
+ # Launch the Gradio app
121
+ if __name__ == "__main__":
122
+ block.queue().launch(show_error=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
requirements.txt CHANGED
@@ -10,10 +10,6 @@ spacy # Not specified as latest versions create a conflict with latest versions
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio # Not specified as latest versions create a conflict with latest versions of spacy
12
  boto3==1.34.103
13
- # Following are not currently necessary for the app, may be added for improved pdf parsing in future
14
- #unstructured
15
- #unstructured_inference # This is big! Only necessary if you want to use the high res strategy in pdf_partition
16
- #unstructured_pytesseract
17
- #pillow-heif
18
- #python-docx
19
- #python-pptx
 
10
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
11
  gradio # Not specified as latest versions create a conflict with latest versions of spacy
12
  boto3==1.34.103
13
+ faker
14
+ openpyxl
15
+ pyarrow
 
 
 
 
tools/aws_functions.py CHANGED
@@ -3,45 +3,44 @@ import pandas as pd
3
  import boto3
4
  import tempfile
5
  import os
 
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
-
9
- bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
10
-
11
- try:
12
- session = boto3.Session() # profile_name="default"
13
- except Exception as e:
14
- print(e)
15
-
16
- # sts = session.client("sts")
17
- # Create a Session with the IAM role ARN
18
- # aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
19
- # response = sts.assume_role(
20
- # RoleArn=aws_role,
21
- # RoleSessionName="ecs-test-session"
22
- # )
23
- # print(response)
24
-
25
-
26
- def get_assumed_role_info():
27
- sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
28
- response = sts.get_caller_identity()
29
-
30
- # Extract ARN of the assumed role
31
- assumed_role_arn = response['Arn']
32
-
33
- # Extract the name of the assumed role from the ARN
34
- assumed_role_name = assumed_role_arn.split('/')[-1]
35
-
36
- return assumed_role_arn, assumed_role_name
37
-
38
- try:
39
- assumed_role_arn, assumed_role_name = get_assumed_role_info()
40
-
41
- print("Assumed Role ARN:", assumed_role_arn)
42
- print("Assumed Role Name:", assumed_role_name)
43
- except Exception as e:
44
- print(e)
45
 
46
  # Download direct from S3 - requires login credentials
47
  def download_file_from_s3(bucket_name, key, local_file_path):
@@ -50,8 +49,6 @@ def download_file_from_s3(bucket_name, key, local_file_path):
50
  s3.download_file(bucket_name, key, local_file_path)
51
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
52
 
53
- #download_file_from_s3(bucket_name, object_key, local_file_loc)
54
-
55
  def download_folder_from_s3(bucket_name, s3_folder, local_folder):
56
  """
57
  Download all files from an S3 folder to a local folder.
@@ -77,7 +74,6 @@ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
77
  except Exception as e:
78
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
79
 
80
-
81
  def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
82
  """
83
  Download specific files from an S3 folder to a local folder.
@@ -111,8 +107,6 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
111
  except Exception as e:
112
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
113
 
114
-
115
-
116
  def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
117
 
118
  temp_dir = tempfile.mkdtemp()
 
3
  import boto3
4
  import tempfile
5
  import os
6
+ from tools.helper_functions import get_or_create_env_var
7
 
8
  PandasDataFrame = Type[pd.DataFrame]
9
+ bucket_name=""
10
+
11
+ # Get AWS credentials if required
12
+
13
+ aws_var = "RUN_AWS_FUNCTIONS"
14
+ aws_var_default = "0"
15
+ aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
16
+ print(f'The value of {aws_var} is {aws_var_val}')
17
+
18
+ if aws_var_val == "1":
19
+ try:
20
+ bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
21
+ session = boto3.Session() # profile_name="default"
22
+ except Exception as e:
23
+ print(e)
24
+
25
+ def get_assumed_role_info():
26
+ sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
27
+ response = sts.get_caller_identity()
28
+
29
+ # Extract ARN of the assumed role
30
+ assumed_role_arn = response['Arn']
31
+
32
+ # Extract the name of the assumed role from the ARN
33
+ assumed_role_name = assumed_role_arn.split('/')[-1]
34
+
35
+ return assumed_role_arn, assumed_role_name
36
+
37
+ try:
38
+ assumed_role_arn, assumed_role_name = get_assumed_role_info()
39
+
40
+ print("Assumed Role ARN:", assumed_role_arn)
41
+ print("Assumed Role Name:", assumed_role_name)
42
+ except Exception as e:
43
+ print(e)
 
 
44
 
45
  # Download direct from S3 - requires login credentials
46
  def download_file_from_s3(bucket_name, key, local_file_path):
 
49
  s3.download_file(bucket_name, key, local_file_path)
50
  print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")
51
 
 
 
52
  def download_folder_from_s3(bucket_name, s3_folder, local_folder):
53
  """
54
  Download all files from an S3 folder to a local folder.
 
74
  except Exception as e:
75
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
76
 
 
77
  def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
78
  """
79
  Download specific files from an S3 folder to a local folder.
 
107
  except Exception as e:
108
  print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
109
 
 
 
110
  def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):
111
 
112
  temp_dir = tempfile.mkdtemp()
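
The AWS access in tools/aws_functions.py is now opt-in via an environment variable rather than always running at import. A hedged sketch of enabling it (the bucket name is a placeholder, not from the commit):

```python
import os

# Set these before importing tools.aws_functions; with RUN_AWS_FUNCTIONS unset or "0",
# the module skips the boto3 session and STS role lookup entirely.
os.environ["RUN_AWS_FUNCTIONS"] = "1"
os.environ["DOCUMENT_REDACTION_BUCKET"] = "my-redaction-bucket"  # placeholder bucket name

from tools.aws_functions import load_data_from_aws
```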
tools/data_anonymise.py ADDED
@@ -0,0 +1,289 @@
1
+ import re
2
+ import secrets
3
+ import base64
4
+ import time
5
+ import pandas as pd
6
+
7
+ from faker import Faker
8
+
9
+ from gradio import Progress
10
+ from typing import List
11
+
12
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
13
+ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
14
+ from presidio_anonymizer.entities import OperatorConfig
15
+
16
+ from tools.helper_functions import output_folder, get_file_path_end, read_file
17
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
18
+
19
+ # Use custom version of analyze_dict to be able to track progress
20
+ from tools.presidio_analyzer_custom import analyze_dict
21
+
22
+
23
+ fake = Faker("en_UK")
24
+ def fake_first_name(x):
25
+ return fake.first_name()
26
+
27
+ def anon_consistent_names(df):
28
+ # ## Pick out common names and replace them with the same person value
29
+ df_dict = df.to_dict(orient="list")
30
+
31
+ analyzer = AnalyzerEngine()
32
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
33
+
34
+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
35
+ analyzer_results = list(analyzer_results)
36
+
37
+ # + tags=[]
38
+ text = analyzer_results[3].value
39
+
40
+ # + tags=[]
41
+ recognizer_result = str(analyzer_results[3].recognizer_results)
42
+
43
+ # + tags=[]
44
+ recognizer_result
45
+
46
+ # + tags=[]
47
+ data_str = recognizer_result # abbreviated for brevity
48
+
49
+ # Adjusting the parse_dict function to handle trailing ']'
50
+ # Splitting the main data string into individual list strings
51
+ list_strs = data_str[1:-1].split('], [')
52
+
53
+ def parse_dict(s):
54
+ s = s.strip('[]') # Removing any surrounding brackets
55
+ items = s.split(', ')
56
+ d = {}
57
+ for item in items:
58
+ key, value = item.split(': ')
59
+ if key == 'score':
60
+ d[key] = float(value)
61
+ elif key in ['start', 'end']:
62
+ d[key] = int(value)
63
+ else:
64
+ d[key] = value
65
+ return d
66
+
67
+ # Re-running the improved processing code
68
+
69
+ result = []
70
+
71
+ for lst_str in list_strs:
72
+ # Splitting each list string into individual dictionary strings
73
+ dict_strs = lst_str.split(', type: ')
74
+ dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
75
+
76
+ # Parsing each dictionary string
77
+ dicts = [parse_dict(d) for d in dict_strs]
78
+ result.append(dicts)
79
+
80
+ #result
81
+
82
+ # + tags=[]
83
+ names = []
84
+
85
+ for idx, paragraph in enumerate(text):
86
+ paragraph_texts = []
87
+ for dictionary in result[idx]:
88
+ if dictionary['type'] == 'PERSON':
89
+ paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
90
+ names.append(paragraph_texts)
91
+
92
+ # + tags=[]
93
+ # Flatten the list of lists and extract unique names
94
+ unique_names = list(set(name for sublist in names for name in sublist))
95
+
96
+ # + tags=[]
97
+ fake_names = pd.Series(unique_names).apply(fake_first_name)
98
+
99
+ # + tags=[]
100
+ mapping_df = pd.DataFrame(data={"Unique names":unique_names,
101
+ "Fake names": fake_names})
102
+
103
+ # + tags=[]
104
+ # Convert mapping dataframe to dictionary
105
+ # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
106
+ name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
107
+
108
+ # + tags=[]
109
+ name_map
110
+
111
+ # + tags=[]
112
+ scrubbed_df_consistent_names = df.replace(name_map, regex = True)
113
+
114
+ # + tags=[]
115
+ scrubbed_df_consistent_names
116
+
117
+ return scrubbed_df_consistent_names
118
+
119
+ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
120
+ # DataFrame to dict
121
+ df_dict = df.to_dict(orient="list")
122
+
123
+ if allow_list:
124
+ allow_list_flat = [item for sublist in allow_list for item in sublist]
125
+
126
+ #analyzer = nlp_analyser #AnalyzerEngine()
127
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
128
+
129
+ anonymizer = AnonymizerEngine()
130
+
131
+ batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
132
+
133
+ # analyzer_results = batch_analyzer.analyze_dict(df_dict, language=language,
134
+ # entities=chosen_redact_entities,
135
+ # score_threshold=score_threshold,
136
+ # return_decision_process=False,
137
+ # allow_list=allow_list_flat)
138
+
139
+ print("Identifying personal information")
140
+ analyse_tic = time.perf_counter()
141
+
142
+ print("Allow list:", allow_list)
143
+
144
+ # Use custom analyzer to be able to track progress with Gradio
145
+ analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
146
+ entities=chosen_redact_entities,
147
+ score_threshold=score_threshold,
148
+ return_decision_process=False,
149
+ allow_list=allow_list_flat)
150
+ analyzer_results = list(analyzer_results)
151
+ #analyzer_results
152
+
153
+ analyse_toc = time.perf_counter()
154
+ analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
155
+ print(analyse_time_out)
156
+
157
+ # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
158
+ key = secrets.token_bytes(16) # 128 bits = 16 bytes
159
+ key_string = base64.b64encode(key).decode('utf-8')
160
+
161
+ # Create faker function (note that it has to receive a value)
162
+
163
+ fake = Faker("en_UK")
164
+
165
+ def fake_first_name(x):
166
+ return fake.first_name()
167
+
168
+ # Set up the anonymization configuration WITHOUT DATE_TIME
169
+ replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
170
+ redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
171
+ hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
172
+ mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
173
+ people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
174
+ fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')
175
+
176
+
177
+ if anon_strat == "replace": chosen_mask_config = replace_config
178
+ if anon_strat == "redact": chosen_mask_config = redact_config
179
+ if anon_strat == "hash": chosen_mask_config = hash_config
180
+ if anon_strat == "mask": chosen_mask_config = mask_config
181
+ if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
182
+ elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
183
+
184
+ # I think in general people will want to keep date / times
185
+ keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
186
+
187
+ combined_config = {**chosen_mask_config, **keep_date_config}
188
+ combined_config
189
+
190
+ anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
191
+
192
+ scrubbed_df = pd.DataFrame(anonymizer_results)
193
+
194
+ # Create reporting message
195
+ out_message = "Successfully anonymised"
196
+
197
+ if anon_strat == "encrypt":
198
+ out_message = out_message + ". Your decryption key is " + key_string + "."
199
+
200
+ return scrubbed_df, out_message
201
+
202
+ def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
203
+
204
+ def check_lists(list1, list2):
205
+ return any(string in list2 for string in list1)
206
+
207
+ def get_common_strings(list1, list2):
208
+ """
209
+ Finds the common strings between two lists.
210
+
211
+ Args:
212
+ list1: The first list of strings.
213
+ list2: The second list of strings.
214
+
215
+ Returns:
216
+ A list containing the common strings.
217
+ """
218
+ common_strings = []
219
+ for string in list1:
220
+ if string in list2:
221
+ common_strings.append(string)
222
+ return common_strings
223
+
224
+ # Load file
225
+
226
+ anon_df = pd.DataFrame()
227
+ out_files_list = []
228
+
229
+ # Check if files and text exist
230
+ if not in_file:
231
+ if in_text:
232
+ in_file=['open_text']
233
+ else:
234
+ out_message = "Please enter text or a file to redact."
235
+ return out_message, None
236
+
237
+ for match_file in progress.tqdm(in_file, desc="Anonymising files", unit = "file"):
238
+
239
+ if match_file=='open_text':
240
+ anon_df = pd.DataFrame(data={'text':[in_text]})
241
+ chosen_cols=['text']
242
+ out_file_part = match_file
243
+ else:
244
+ anon_df = read_file(match_file)
245
+ out_file_part = get_file_path_end(match_file.name)
246
+
247
+
248
+
249
+ # Check for chosen col, skip file if not found
250
+ all_cols_original_order = list(anon_df.columns)
251
+
252
+ any_cols_found = check_lists(chosen_cols, all_cols_original_order)
253
+
254
+ if any_cols_found == False:
255
+ out_message = "No chosen columns found in dataframe: " + out_file_part
256
+ print(out_message)
257
+ continue
258
+ else:
259
+ chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
260
+
261
+ # Split dataframe to keep only selected columns
262
+ print("Remaining columns to redact:", chosen_cols_in_anon_df)
263
+
264
+ anon_df_part = anon_df[chosen_cols_in_anon_df]
265
+ anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
266
+
267
+ # Anonymise the selected columns
268
+ anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)
269
+
270
+ # Rejoin the dataframe together
271
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
272
+ anon_df_out = anon_df_out[all_cols_original_order]
273
+
274
+ # Export file
275
+
276
+
277
+ # out_file_part = re.sub(r'\.csv', '', match_file.name)
278
+
279
+ anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat + ".csv"
280
+
281
+ anon_df_out.to_csv(anon_export_file_name, index = None)
282
+
283
+ out_files_list.append(anon_export_file_name)
284
+
285
+ # Print result text to output text box if just anonymising open text
286
+ if match_file=='open_text':
287
+ out_message = anon_df_out['text'][0]
288
+
289
+ return out_message, out_files_list
tools/file_conversion.py CHANGED
@@ -45,9 +45,10 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
45
  images = []
46
 
47
  # Open the PDF file
48
- for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
 
49
 
50
- print("Current page: ", str(page_num))
51
 
52
  # Convert one page to image
53
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
@@ -72,48 +73,61 @@ def process_file(file_path):
72
  if file_extension in ['.jpg', '.jpeg', '.png']:
73
  print(f"{file_path} is an image file.")
74
  # Perform image processing here
75
- out_path = [Image.open(file_path)]
76
 
77
  # Check if the file is a PDF
78
  elif file_extension == '.pdf':
79
  print(f"{file_path} is a PDF file. Converting to image set")
80
  # Run your function for processing PDF files here
81
- out_path = convert_pdf_to_images(file_path)
82
 
83
  else:
84
  print(f"{file_path} is not an image or PDF file.")
85
- out_path = ['']
86
 
87
- return out_path
88
 
89
- def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
 
 
90
 
91
  out_message = ''
92
  out_file_paths = []
93
 
94
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
95
-
96
- if file_path:
97
- file_path_without_ext = get_file_path_end(file_path)
98
- else:
99
- out_message = "No file selected"
100
- print(out_message)
101
- return out_message, out_file_paths
102
-
103
- if in_redact_method == "Image analysis":
104
- # Analyse and redact image-based pdf or image
105
- if is_pdf_or_image(file_path) == False:
106
- return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
107
-
108
- out_file_path = process_file(file_path)
109
-
110
- elif in_redact_method == "Text analysis":
111
- if is_pdf(file_path) == False:
112
- return "Please upload a PDF file for text analysis.", None
113
-
114
- out_file_path = file_path
115
 
116
- return out_message, out_file_path
117
 
118
 
119
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
@@ -122,14 +136,20 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
122
  out_file_paths = out_text_file_path
123
 
124
  # Convert annotated text pdf back to image to give genuine redactions
125
- print("Creating image version of results")
 
126
  pdf_text_image_paths = process_file(out_text_file_path[0])
127
- out_text_image_file_path = output_folder + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
130
- out_file_paths.append(out_text_image_file_path)
131
 
132
- out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
133
 
134
  return out_message, out_file_paths
135
 
 
45
  images = []
46
 
47
  # Open the PDF file
48
+ #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
49
+ for page_num in range(0,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
50
 
51
+ # print("Current page: ", str(page_num + 1))
52
 
53
  # Convert one page to image
54
  image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
 
73
  if file_extension in ['.jpg', '.jpeg', '.png']:
74
  print(f"{file_path} is an image file.")
75
  # Perform image processing here
76
+ img_object = [Image.open(file_path)]
77
 
78
  # Check if the file is a PDF
79
  elif file_extension == '.pdf':
80
  print(f"{file_path} is a PDF file. Converting to image set")
81
  # Run your function for processing PDF files here
82
+ img_object = convert_pdf_to_images(file_path)
83
 
84
  else:
85
  print(f"{file_path} is not an image or PDF file.")
86
+ img_object = ['']
87
 
88
+ # print('Image object is:', img_object)
89
 
90
+ return img_object
91
+
92
+ def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
93
 
94
  out_message = ''
95
  out_file_paths = []
96
 
97
+ #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
98
+
99
+ #for file in progress.tqdm(file_paths, desc="Preparing files"):
100
+ for file in file_paths:
101
+ file_path = file.name
102
+
103
+ #if file_path:
104
+ # file_path_without_ext = get_file_path_end(file_path)
105
+ if not file_path:
106
+ out_message = "No file selected"
107
+ print(out_message)
108
+ return out_message, out_file_paths
109
+
110
+ if in_redact_method == "Image analysis":
111
+ # Analyse and redact image-based pdf or image
112
+ if is_pdf_or_image(file_path) == False:
113
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
114
+ print(out_message)
115
+ return out_message, None
116
+
117
+ out_file_path = process_file(file_path)
118
+ print("Out file path at image conversion step:", out_file_path)
119
+
120
+ elif in_redact_method == "Text analysis":
121
+ if is_pdf(file_path) == False:
122
+ out_message = "Please upload a PDF file for text analysis."
123
+ print(out_message)
124
+ return out_message, None
125
+
126
+ out_file_path = file_path
127
+
128
+ out_file_paths.append(out_file_path)
129
 
130
+ return out_message, out_file_paths
131
 
132
 
133
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
136
  out_file_paths = out_text_file_path
137
 
138
  # Convert annotated text pdf back to image to give genuine redactions
139
+ print("Creating image version of redacted PDF to embed redactions.")
140
+
141
  pdf_text_image_paths = process_file(out_text_file_path[0])
142
+ out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
143
  pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
144
 
145
+ # out_file_paths.append(out_text_image_file_path)
146
+
147
+ out_file_paths = [out_text_image_file_path]
148
+
149
+ out_message = "PDF " + file_path_without_ext + " converted to image-based file."
150
+ print(out_message)
151
 
152
+ print("Out file paths:", out_file_paths)
153
 
154
  return out_message, out_file_paths
155
 
tools/file_redaction.py CHANGED
@@ -5,7 +5,7 @@ from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
  from presidio_image_redactor.entities import ImageRecognizerResult
6
  from pdfminer.high_level import extract_pages
7
  from tools.file_conversion import process_file
8
- from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
@@ -13,64 +13,89 @@ from collections import defaultdict # For efficient grouping
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
15
  from tools.helper_functions import get_file_path_end, output_folder
16
- from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
17
  import gradio as gr
18
 
19
 
20
- def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
- out_message = ''
25
  out_file_paths = []
26
 
27
  if in_allow_list:
28
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
29
 
30
- if file_path:
31
- file_path_without_ext = get_file_path_end(file_path)
32
- else:
33
- out_message = "No file selected"
34
- print(out_message)
35
- return out_message, out_file_paths
36
 
37
- if in_redact_method == "Image analysis":
38
- # Analyse and redact image-based pdf or image
39
- # if is_pdf_or_image(file_path) == False:
40
- # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
41
-
42
- pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
43
- out_image_file_path = output_folder + file_path_without_ext + "_result_as_img.pdf"
44
- pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
45
-
46
- out_file_paths.append(out_image_file_path)
47
- out_message = "Image-based PDF successfully redacted and saved to file."
48
-
49
- elif in_redact_method == "Text analysis":
50
- if is_pdf(file_path) == False:
51
- return "Please upload a PDF file for text analysis.", None
52
-
53
- # Analyse text-based pdf
54
- pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
55
- out_text_file_path = output_folder + file_path_without_ext + "_result_as_text.pdf"
56
- pdf_text.save(out_text_file_path)
57
-
58
- out_file_paths.append(out_text_file_path)
59
-
60
- out_message = "Text-based PDF successfully redacted and saved to file."
61
-
62
- else:
63
- out_message = "No redaction method selected"
64
- print(out_message)
65
- return out_message, out_file_paths
66
 
67
  toc = time.perf_counter()
68
  out_time = f"Time taken: {toc - tic:0.1f} seconds."
69
  print(out_time)
70
 
71
- out_message = out_message + "\n\n" + out_time
 
72
 
73
- return out_message, out_file_paths, out_file_paths
74
 
75
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
76
  merged_bboxes = []
@@ -115,7 +140,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
115
 
116
  out_message = "PDF does not exist as images. Converting pages to image"
117
  print(out_message)
118
- progress(0, desc=out_message)
119
 
120
  image_paths = process_file(file_path)
121
 
@@ -124,9 +149,10 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
124
 
125
  out_message = "Redacting pages"
126
  print(out_message)
127
- progress(0.1, desc=out_message)
128
 
129
- for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
 
130
 
131
  print("Redacting page ", str(i + 1))
132
 
@@ -171,7 +197,6 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
171
 
172
  return images
173
 
174
-
175
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
176
  '''
177
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
@@ -189,9 +214,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
189
 
190
  page_num = 0
191
 
192
- for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
193
-
194
- print("Page number is: ", page_num)
195
 
196
  annotations_on_page = []
197
  analyzed_bounding_boxes = []
@@ -309,88 +334,3 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
309
  analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
310
 
311
  return pdf
312
-
313
-
314
- # for page_num, annotations_on_page in enumerate(annotations_all_pages):
315
- # # 2. Normalize annotation heights on the same line:
316
- # line_heights = {} # {y_coordinate: max_height}
317
-
318
- # # Get line heights for every annotation
319
- # for annotation in annotations_on_page:
320
- # if 'Rect' in annotation:
321
- # y = annotation['Rect'][1]
322
- # height = annotation['Rect'][3] - annotation['Rect'][1]
323
- # line_heights[y] = max(line_heights.get(y, 0), height)
324
-
325
- # # Update line heights for annotations
326
- # for annotation in annotations_on_page:
327
- # if 'Rect' in annotation:
328
- # y = annotation['Rect'][1]
329
- # annotation['Rect'][3] = y + line_heights[y]
330
-
331
- # # Update QuadPoints to match the new Rect coordinates
332
- # x1, y1, x2, y2 = annotation['Rect'] # Extract coordinates from Rect
333
- # annotation['QuadPoints'] = [
334
- # x1, y2, # Top left
335
- # x2, y2, # Top right
336
- # x1, y1, # Bottom left
337
- # x2, y1 # Bottom right
338
- # ]
339
-
340
-
341
- # def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
342
- # '''
343
- # take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
344
- # '''
345
-
346
- # if not image_paths:
347
-
348
- # out_message = "PDF does not exist as images. Converting pages to image"
349
- # print(out_message)
350
- # progress(0, desc=out_message)
351
-
352
- # image_paths = process_file(file_path)
353
-
354
- # # Create a new PDF
355
- # #pdf = pikepdf.new()
356
-
357
- # images = []
358
- # number_of_pages = len(image_paths)
359
-
360
- # out_message = "Redacting pages"
361
- # print(out_message)
362
- # progress(0.1, desc=out_message)
363
-
364
- # for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
365
-
366
- # print("Redacting page ", str(i + 1))
367
-
368
- # # Get the image to redact using PIL lib (pillow)
369
- # image = image_paths[i] #Image.open(image_paths[i])
370
-
371
- # # %%
372
- # image_analyser = ImageAnalyzerEngine(nlp_analyser)
373
- # engine = ImageRedactorEngine(image_analyser)
374
-
375
- # if language == 'en':
376
- # ocr_lang = 'eng'
377
- # else: ocr_lang = language
378
-
379
- # # %%
380
- # # Redact the image with pink color
381
- # redacted_image = engine.redact(image,
382
- # fill=(0, 0, 0),
383
- # ocr_kwargs={"lang": ocr_lang},
384
- # allow_list=allow_list,
385
- # ad_hoc_recognizers= None,
386
- # **{
387
- # "language": language,
388
- # "entities": chosen_redact_entities,
389
- # "score_threshold": score_threshold
390
- # },
391
- # )
392
-
393
- # images.append(redacted_image)
394
-
395
-
396
- # return images
 
5
  from presidio_image_redactor.entities import ImageRecognizerResult
6
  from pdfminer.high_level import extract_pages
7
  from tools.file_conversion import process_file
8
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
 
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
15
  from tools.helper_functions import get_file_path_end, output_folder
16
+ from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
17
  import gradio as gr
18
 
19
 
20
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
23
 
24
+ out_message = []
25
  out_file_paths = []
26
 
27
  if in_allow_list:
28
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
29
 
30
 
31
+ print("File paths:", file_paths)
32
+
33
+ for file in progress.tqdm(file_paths, desc="Redacting files", unit = "files"):
34
+ file_path = file.name
35
+
36
+ if file_path:
37
+ file_path_without_ext = get_file_path_end(file_path)
38
+ if is_pdf(file_path) == False:
39
+ # If user has not submitted a pdf, assume it's an image
40
+ print("File is not a pdf, assuming that image analysis needs to be used.")
41
+ in_redact_method = "Image analysis"
42
+ else:
43
+ out_message = "No file selected"
44
+ print(out_message)
45
+ return out_message, out_file_paths
46
+
47
+ if in_redact_method == "Image analysis":
48
+ # Analyse and redact image-based pdf or image
49
+ # if is_pdf_or_image(file_path) == False:
50
+ # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
51
+
52
+ print("Redacting file as image-based pdf")
53
+ pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
54
+ out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
55
+ pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
56
+
57
+ out_file_paths.append(out_image_file_path)
58
+ out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
59
+
60
+ elif in_redact_method == "Text analysis":
61
+ if is_pdf(file_path) == False:
62
+ return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
63
+
64
+ # Analyse text-based pdf
65
+ print('Redacting file as text-based PDF')
66
+ pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
67
+ out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
68
+ pdf_text.save(out_text_file_path)
69
+
70
+ #out_file_paths.append(out_text_file_path)
71
+ out_message_new = "File " + file_path_without_ext + " successfully redacted."
72
+ out_message.append(out_message_new)
73
+
74
+ # Convert message
75
+ convert_message="Converting PDF to image-based PDF to embed redactions."
76
+ #progress(0.8, desc=convert_message)
77
+ print(convert_message)
78
+
79
+ # Convert document to image-based document to 'embed' redactions
80
+ img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
81
+ out_file_paths.extend(img_output_file_path)
82
+
83
+ # Add confirmation for converting to image if you want
84
+ # out_message.append(img_output_summary)
85
+
86
+ else:
87
+ out_message = "No redaction method selected"
88
+ print(out_message)
89
+ return out_message, out_file_paths
90
 
91
  toc = time.perf_counter()
92
  out_time = f"Time taken: {toc - tic:0.1f} seconds."
93
  print(out_time)
94
 
95
+ out_message_out = '\n'.join(out_message)
96
+ out_message_out = out_message_out + "\n\n" + out_time
97
 
98
+ return out_message_out, out_file_paths, out_file_paths
99
 
100
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
101
  merged_bboxes = []
 
140
 
141
  out_message = "PDF does not exist as images. Converting pages to image"
142
  print(out_message)
143
+ #progress(0, desc=out_message)
144
 
145
  image_paths = process_file(file_path)
146
 
 
149
 
150
  out_message = "Redacting pages"
151
  print(out_message)
152
+ #progress(0.1, desc=out_message)
153
 
154
+ #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
155
+ for i in range(0, number_of_pages):
156
 
157
  print("Redacting page ", str(i + 1))
158
 
 
197
 
198
  return images
199
 
 
200
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
201
  '''
202
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
 
214
 
215
  page_num = 0
216
 
217
+ #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
218
+ for page in pdf.pages:
219
+ print("Page number is: ", page_num + 1)
220
 
221
  annotations_on_page = []
222
  analyzed_bounding_boxes = []
 
334
  analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
335
 
336
  return pdf
tools/helper_functions.py CHANGED
@@ -1,4 +1,6 @@
1
  import os
 
 
2
 
3
  def get_or_create_env_var(var_name, default_value):
4
  # Get the environment variable if it exists
@@ -29,6 +31,36 @@ def get_file_path_end(file_path):
29
 
30
  return filename_without_extension
31
 
32
  def ensure_output_folder_exists():
33
  """Checks if the 'output/' folder exists, creates it if not."""
34
 
@@ -41,6 +73,20 @@ def ensure_output_folder_exists():
41
  else:
42
  print(f"The 'output/' folder already exists.")
43
 
44
 
45
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
46
  def add_folder_to_path(folder_path: str):
 
1
  import os
2
+ import gradio as gr
3
+ import pandas as pd
4
 
5
  def get_or_create_env_var(var_name, default_value):
6
  # Get the environment variable if it exists
 
31
 
32
  return filename_without_extension
33
 
34
+ def detect_file_type(filename):
35
+ """Detect the file type based on its extension."""
36
+ if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
37
+ return 'csv'
38
+ elif filename.endswith('.xlsx'):
39
+ return 'xlsx'
40
+ elif filename.endswith('.parquet'):
41
+ return 'parquet'
42
+ elif filename.endswith('.pdf'):
43
+ return 'pdf'
44
+ elif filename.endswith('.jpg'):
45
+ return 'jpg'
46
+ elif filename.endswith('.jpeg'):
47
+ return 'jpeg'
48
+ elif filename.endswith('.png'):
49
+ return 'png'
50
+ else:
51
+ raise ValueError("Unsupported file type.")
52
+
53
+ def read_file(filename):
54
+ """Read the file based on its detected type."""
55
+ file_type = detect_file_type(filename)
56
+
57
+ if file_type == 'csv':
58
+ return pd.read_csv(filename, low_memory=False)
59
+ elif file_type == 'xlsx':
60
+ return pd.read_excel(filename)
61
+ elif file_type == 'parquet':
62
+ return pd.read_parquet(filename)
63
+
64
  def ensure_output_folder_exists():
65
  """Checks if the 'output/' folder exists, creates it if not."""
66
 
 
73
  else:
74
  print(f"The 'output/' folder already exists.")
75
 
76
+ def put_columns_in_df(in_file):
77
+ new_choices = []
78
+ concat_choices = []
79
+
80
+ for file in in_file:
81
+ df = read_file(file.name)
82
+ new_choices = list(df.columns)
83
+
84
+ concat_choices.extend(new_choices)
85
+
86
+ # Drop duplicate columns
87
+ concat_choices = list(set(concat_choices))
88
+
89
+ return gr.Dropdown(choices=concat_choices, value=concat_choices)
90
 
91
  # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
92
  def add_folder_to_path(folder_path: str):
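
A small, hypothetical use of the new file helpers added above (the file name is illustrative):

```python
from tools.helper_functions import detect_file_type, read_file

print(detect_file_type("responses.xlsx"))  # -> 'xlsx'
df = read_file("responses.xlsx")           # first sheet only, via pd.read_excel
```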
tools/presidio_analyzer_custom.py ADDED
@@ -0,0 +1,119 @@
1
+ import gradio as gr
2
+ from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+ from tqdm import tqdm
4
+
5
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
+ from presidio_analyzer.nlp_engine import NlpArtifacts
7
+
8
+
9
+
10
+ def analyze_iterator_custom(
11
+ self,
12
+ texts: Iterable[Union[str, bool, float, int]],
13
+ language: str,
14
+ list_length:int,
15
+ progress=gr.Progress(),
16
+ **kwargs,
17
+ ) -> List[List[RecognizerResult]]:
18
+ """
19
+ Analyze an iterable of strings.
20
+
21
+ :param texts: An list containing strings to be analyzed.
22
+ :param language: Input language
23
+ :param list_length: Length of the input list.
24
+ :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
25
+ """
26
+
27
+ # validate types
28
+ texts = self._validate_types(texts)
29
+
30
+ # Process the texts as batch for improved performance
31
+ nlp_artifacts_batch: Iterator[
32
+ Tuple[str, NlpArtifacts]
33
+ ] = self.analyzer_engine.nlp_engine.process_batch(
34
+ texts=texts, language=language
35
+ )
36
+
37
+
38
+
39
+ list_results = []
40
+
41
+ # Uncomment this if you want to show progress within a file
42
+ #for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
43
+ for text, nlp_artifacts in nlp_artifacts_batch:
44
+ results = self.analyzer_engine.analyze(
45
+ text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
46
+ )
47
+
48
+ list_results.append(results)
49
+
50
+ return list_results
51
+
52
+ def analyze_dict(
53
+ self,
54
+ input_dict: Dict[str, Union[Any, Iterable[Any]]],
55
+ language: str,
56
+ keys_to_skip: Optional[List[str]] = None,
57
+ **kwargs,
58
+ ) -> Iterator[DictAnalyzerResult]:
59
+ """
60
+ Analyze a dictionary of keys (strings) and values/iterable of values.
61
+
62
+ Non-string values are returned as is.
63
+
64
+ :param input_dict: The input dictionary for analysis
65
+ :param language: Input language
66
+ :param keys_to_skip: Keys to ignore during analysis
67
+ :param kwargs: Additional keyword arguments
68
+ for the `AnalyzerEngine.analyze` method.
69
+ Use this to pass arguments to the analyze method,
70
+ such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
71
+ See `AnalyzerEngine.analyze` for the full list.
72
+ """
73
+
74
+ context = []
75
+ if "context" in kwargs:
76
+ context = kwargs["context"]
77
+ del kwargs["context"]
78
+
79
+ if not keys_to_skip:
80
+ keys_to_skip = []
81
+
82
+
83
+ for key, value in input_dict.items():
84
+ if not value or key in keys_to_skip:
85
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
86
+ continue # skip this key as requested
87
+
88
+ # Add the key as an additional context
89
+ specific_context = context[:]
90
+ specific_context.append(key)
91
+
92
+ if type(value) in (str, int, bool, float):
93
+ results: List[RecognizerResult] = self.analyzer_engine.analyze(
94
+ text=str(value), language=language, context=[key], **kwargs
95
+ )
96
+ elif isinstance(value, dict):
97
+ new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
98
+ results = self.analyze_dict(
99
+ input_dict=value,
100
+ language=language,
101
+ context=specific_context,
102
+ keys_to_skip=new_keys_to_skip,
103
+ **kwargs,
104
+ )
105
+ elif isinstance(value, Iterable):
106
+ # Recursively iterate nested dicts
107
+ list_length = len(value)
108
+
109
+ results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
110
+ texts=value,
111
+ language=language,
112
+ context=specific_context,
113
+ list_length=list_length,
114
+ **kwargs,
115
+ )
116
+ else:
117
+ raise ValueError(f"type {type(value)} is unsupported.")
118
+
119
+ yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
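
A rough sketch of calling the custom analyze_dict outside the Gradio app, assuming presidio-analyzer and the en_core_web_lg model listed in requirements.txt are installed:

```python
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from tools.presidio_analyzer_custom import analyze_dict

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())

# List values are analysed row by row; results mirror BatchAnalyzerEngine.analyze_dict,
# with the custom iterator available for reporting progress back to Gradio.
for res in analyze_dict(batch_analyzer, {"notes": ["Call Jane on 07700 900123"]}, language="en"):
    print(res.key, res.recognizer_results)
```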