Commit 7810536
Parent(s): 12224f5

Can now redact text or csv/xlsx files. Can redact multiple files. Embeds redactions as an image-based file by default.

Files changed:
- Dockerfile +1 -1
- app.py +73 -43
- requirements.txt +3 -7
- tools/aws_functions.py +36 -42
- tools/data_anonymise.py +289 -0
- tools/file_conversion.py +53 -33
- tools/file_redaction.py +73 -133
- tools/helper_functions.py +46 -0
- tools/presidio_analyzer_custom.py +119 -0
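For reference, a minimal sketch of how the new open-text anonymisation path added in this commit might be exercised directly in Python. The function name and argument list come from tools/data_anonymise.py in this diff; the entity names and the non-empty allow list mirror the defaults wired up in app.py, the configured output folder is assumed to already exist, and in practice the function is normally driven through the Gradio UI rather than called like this.

# Hedged sketch only: assumes the repo's tools package (and its spaCy model) is importable
# and that the configured output folder already exists.
from tools.data_anonymise import do_anonymise

summary, out_files = do_anonymise(
    in_file=None,                                   # no uploaded files, so the open-text branch is used
    in_text="Contact John Smith on 07700 900123.",  # illustrative text only
    anon_strat="replace",                           # replace / redact / hash / mask / encrypt / fake_first_name
    chosen_cols=[],                                 # ignored for open text; set to ['text'] internally
    language="en",
    chosen_redact_entities=["PERSON", "PHONE_NUMBER"],
    allow_list=[[""]],                              # same shape as the gr.Dataframe default in app.py
)
print(summary)    # for open text, the anonymised text is returned as the summary
print(out_files)  # path(s) to the exported *_anon_replace.csv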
Dockerfile
CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

-RUN pip install --no-cache-dir gradio==4.
+RUN pip install --no-cache-dir gradio==4.36.1

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
app.py
CHANGED
@@ -3,10 +3,11 @@ import os
# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df
from tools.file_redaction import choose_and_run_redactor
-from tools.file_conversion import prepare_image_or_text_pdf
-from tools.
+from tools.file_conversion import prepare_image_or_text_pdf
+from tools.data_anonymise import do_anonymise
+#from tools.aws_functions import load_data_from_aws
import gradio as gr

add_folder_to_path("_internal/tesseract/")

@@ -18,9 +19,7 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
language = 'en'

-
# Create the gradio interface
-
block = gr.Blocks(theme = gr.themes.Base())

with block:

@@ -32,61 +31,92 @@ with block:
    gr.Markdown(
    """
    # Document redaction
-   Take an image-based PDF or image file, or text-based PDF document and redact any personal information. 'Image analysis' will convert PDF pages to image and the identify text via OCR methods before redaction, and also works with JPG or PNG files. 'Text analysis' will analyse only selectable text that exists in the original PDF before redaction. Choose 'Image analysis' if you are not sure of the type of PDF document you are working with.
-
+   Redact personal information from documents, open text, or xlsx/csv tabular data. See the 'Redaction settings' to change various settings such as which types of information to redact (e.g. people, places), or terms to exclude from redaction.
+
+   WARNING: This is a beta product. It is not 100% accurate, and it will miss some personal information. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.

    Other redaction entities are possible to include in this app easily, especially country-specific entities. If you want to use these, clone the repo locally and add entity names from [this link](https://microsoft.github.io/presidio/supported_entities/) to the 'full_entity_list' variable in app.py.
    """)

-   with gr.Tab("
-   with gr.Accordion("
-   in_file = gr.File(label="Choose document
-   in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
-   in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
-   in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
-   redact_btn = gr.Button("Redact document", variant="primary")
+   with gr.Tab("PDFs/images"):
+       with gr.Accordion("Redact document", open = True):
+           in_file = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png'])
+           redact_btn = gr.Button("Redact document(s)", variant="primary")

        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output file")

        with gr.Row():
-           convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
+           convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)

-   with gr.Tab(label="Advanced options"):
-       with gr.Accordion(label = "AWS data access", open = True):
-           aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
-           with gr.Row():
-               in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
-               load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
-
-           aws_log_box = gr.Textbox(label="AWS data load status")
+   with gr.Tab(label="Open text or Excel/csv files"):
+       gr.Markdown(
+       """
+       ### Choose open text or a tabular data file (xlsx or csv) to redact.
+       """
+       )
+       with gr.Accordion("Paste open text", open = False):
+           in_text = gr.Textbox(label="Enter open text", lines=10)
+       with gr.Accordion("Upload xlsx (first sheet read only) or csv file(s)", open = False):
+           in_file_text = gr.File(label="Choose an xlsx (first sheet read only) or csv files", file_count= "multiple", file_types=['.xlsx', '.csv', '.parquet', '.csv.gz'])
+
+       in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select columns that you want to anonymise. Ensure that at least one named column exists in all files.")
+
+       match_btn = gr.Button("Anonymise text", variant="primary")
+
+       with gr.Row():
+           text_output_summary = gr.Textbox(label="Output result")
+           text_output_file = gr.File(label="Output file")
+
+   with gr.Tab(label="Redaction settings"):
+       gr.Markdown(
+       """
+       Define redaction settings that affect both document and open text redaction.
+       """)
+       with gr.Accordion("Settings for documents", open = True):
+           in_redaction_method = gr.Radio(label="Default document redaction method - text analysis is faster is not useful for image-based PDFs. Imaged-based is slightly less accurate in general.", value = "Text analysis", choices=["Text analysis", "Image analysis"])
+       with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
+           anon_strat = gr.Radio(choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace")
+
+       with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
+           in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact (click close to down arrow for full list)")
+           with gr.Row():
+               in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
+               in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+
+   # AWS options - not yet implemented
+   # with gr.Tab(label="Advanced options"):
+   #     with gr.Accordion(label = "AWS data access", open = True):
+   #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
+   #         with gr.Row():
+   #             in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
+   #             load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
+
+   #         aws_log_box = gr.Textbox(label="AWS data load status")

+   # ### Loading AWS data ###
+   # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])

+   # Document redaction
    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list],
        outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
        then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
-       outputs=[output_summary, output_file, output_file_list_state], api_name="
-
-       outputs=[output_summary, output_file])
+       outputs=[output_summary, output_file, output_file_list_state], api_name="redact_doc")#.\
+       #then(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
+       #outputs=[output_summary, output_file])

-   #
-   #
-
-   #
-
-   # Download OpenSSL from here:
-   # Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
-   #block.queue().launch(ssl_verify=False, share=False, debug=False, server_name="0.0.0.0",server_port=443,
-   # ssl_certfile="cert.pem", ssl_keyfile="key.pem") # port 443 for https. Certificates currently not valid
-
-   #
+   #convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file_list_state],
+   #    outputs=[output_summary, output_file], api_name="convert_to_img")
+
+   # Open text interaction
+   in_file_text.upload(fn=put_columns_in_df, inputs=[in_file_text], outputs=[in_colnames])
+   match_btn.click(fn=do_anonymise, inputs=[in_file_text, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list], outputs=[text_output_summary, text_output_file], api_name="redact_text")

+# Launch the Gradio app
+if __name__ == "__main__":
+    block.queue().launch(show_error=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
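The redaction wiring above chains two steps with Gradio's event API: redact_btn.click(...) runs the preparation function, and .then(...) feeds its outputs into the redactor. A minimal, self-contained sketch of that click/then pattern (placeholder functions only, not the app's real ones):

import gradio as gr

def prepare(text):
    return f"prepared: {text}"

def run(prepared):
    return f"result for '{prepared}'"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    mid = gr.Textbox(label="Prepared")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Go")
    # click() returns an event that .then() can chain onto, as app.py does above
    btn.click(fn=prepare, inputs=inp, outputs=mid).then(fn=run, inputs=mid, outputs=out)

if __name__ == "__main__":
    demo.queue().launch()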
requirements.txt
CHANGED
@@ -10,10 +10,6 @@ spacy # Not specified as latest versions create a conflict with latest versions
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
gradio # Not specified as latest versions create a conflict with latest versions of spacy
boto3==1.34.103
-
-
-
-#unstructured_pytesseract
-#pillow-heif
-#python-docx
-#python-pptx
+faker
+openpyxl
+pyarrow
tools/aws_functions.py
CHANGED
@@ -3,45 +3,44 @@ import pandas as pd
import boto3
import tempfile
import os
+from tools.helper_functions import get_or_create_env_var

PandasDataFrame = Type[pd.DataFrame]
-
-#
-except Exception as e:
-    print(e)
+bucket_name=""
+
+# Get AWS credentials if required
+
+aws_var = "RUN_AWS_FUNCTIONS"
+aws_var_default = "0"
+aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
+print(f'The value of {aws_var} is {aws_var_val}')
+
+if aws_var_val == "1":
+    try:
+        bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
+        session = boto3.Session() # profile_name="default"
+    except Exception as e:
+        print(e)
+
+def get_assumed_role_info():
+    sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
+    response = sts.get_caller_identity()
+
+    # Extract ARN of the assumed role
+    assumed_role_arn = response['Arn']
+
+    # Extract the name of the assumed role from the ARN
+    assumed_role_name = assumed_role_arn.split('/')[-1]
+
+    return assumed_role_arn, assumed_role_name
+
+try:
+    assumed_role_arn, assumed_role_name = get_assumed_role_info()
+
+    print("Assumed Role ARN:", assumed_role_arn)
+    print("Assumed Role Name:", assumed_role_name)
+except Exception as e:
+    print(e)

# Download direct from S3 - requires login credentials
def download_file_from_s3(bucket_name, key, local_file_path):

@@ -50,8 +49,6 @@ def download_file_from_s3(bucket_name, key, local_file_path):
    s3.download_file(bucket_name, key, local_file_path)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")

-   #download_file_from_s3(bucket_name, object_key, local_file_loc)
-
def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    """
    Download all files from an S3 folder to a local folder.

@@ -77,7 +74,6 @@ def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    except Exception as e:
        print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)

-
def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    """
    Download specific files from an S3 folder to a local folder.

@@ -111,8 +107,6 @@ def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    except Exception as e:
        print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)

-
-
def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):

    temp_dir = tempfile.mkdtemp()
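The new module-level guard in tools/aws_functions.py only touches AWS when the RUN_AWS_FUNCTIONS environment variable is set to "1". A standalone sketch of that opt-in pattern, using only os and boto3 (the bucket environment variable is assumed to be set by the deployment):

import os
import boto3

# Opt-in flag: AWS setup only runs when explicitly enabled, mirroring tools/aws_functions.py
run_aws = os.environ.get("RUN_AWS_FUNCTIONS", "0")

bucket_name = ""
if run_aws == "1":
    try:
        bucket_name = os.environ["DOCUMENT_REDACTION_BUCKET"]  # assumed to be provided by the deployment
        session = boto3.Session()
        print("AWS session created for bucket:", bucket_name)
    except Exception as e:
        print("AWS setup failed:", e)
else:
    print("RUN_AWS_FUNCTIONS is not '1'; skipping AWS setup.")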
tools/data_anonymise.py
ADDED
@@ -0,0 +1,289 @@
import re
import secrets
import base64
import time
import pandas as pd

from faker import Faker

from gradio import Progress
from typing import List

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

from tools.helper_functions import output_folder, get_file_path_end, read_file
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold

# Use custom version of analyze_dict to be able to track progress
from tools.presidio_analyzer_custom import analyze_dict


fake = Faker("en_UK")
def fake_first_name(x):
    return fake.first_name()

def anon_consistent_names(df):
    # ## Pick out common names and replace them with the same person value
    df_dict = df.to_dict(orient="list")

    analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = list(analyzer_results)

    # + tags=[]
    text = analyzer_results[3].value

    # + tags=[]
    recognizer_result = str(analyzer_results[3].recognizer_results)

    # + tags=[]
    recognizer_result

    # + tags=[]
    data_str = recognizer_result # abbreviated for brevity

    # Adjusting the parse_dict function to handle trailing ']'
    # Splitting the main data string into individual list strings
    list_strs = data_str[1:-1].split('], [')

    def parse_dict(s):
        s = s.strip('[]') # Removing any surrounding brackets
        items = s.split(', ')
        d = {}
        for item in items:
            key, value = item.split(': ')
            if key == 'score':
                d[key] = float(value)
            elif key in ['start', 'end']:
                d[key] = int(value)
            else:
                d[key] = value
        return d

    # Re-running the improved processing code

    result = []

    for lst_str in list_strs:
        # Splitting each list string into individual dictionary strings
        dict_strs = lst_str.split(', type: ')
        dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings

        # Parsing each dictionary string
        dicts = [parse_dict(d) for d in dict_strs]
        result.append(dicts)

    #result

    # + tags=[]
    names = []

    for idx, paragraph in enumerate(text):
        paragraph_texts = []
        for dictionary in result[idx]:
            if dictionary['type'] == 'PERSON':
                paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
        names.append(paragraph_texts)

    # + tags=[]
    # Flatten the list of lists and extract unique names
    unique_names = list(set(name for sublist in names for name in sublist))

    # + tags=[]
    fake_names = pd.Series(unique_names).apply(fake_first_name)

    # + tags=[]
    mapping_df = pd.DataFrame(data={"Unique names":unique_names,
                                    "Fake names": fake_names})

    # + tags=[]
    # Convert mapping dataframe to dictionary
    # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
    name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

    # + tags=[]
    name_map

    # + tags=[]
    scrubbed_df_consistent_names = df.replace(name_map, regex = True)

    # + tags=[]
    scrubbed_df_consistent_names

    return scrubbed_df_consistent_names

def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[str], allow_list:List[str]=[], progress=Progress(track_tqdm=False)):
    # DataFrame to dict
    df_dict = df.to_dict(orient="list")

    if allow_list:
        allow_list_flat = [item for sublist in allow_list for item in sublist]

    #analyzer = nlp_analyser #AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

    anonymizer = AnonymizerEngine()

    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)

    # analyzer_results = batch_analyzer.analyze_dict(df_dict, language=language,
    #     entities=chosen_redact_entities,
    #     score_threshold=score_threshold,
    #     return_decision_process=False,
    #     allow_list=allow_list_flat)

    print("Identifying personal information")
    analyse_tic = time.perf_counter()

    print("Allow list:", allow_list)

    # Use custom analyzer to be able to track progress with Gradio
    analyzer_results = analyze_dict(batch_analyzer, df_dict, language=language,
                                    entities=chosen_redact_entities,
                                    score_threshold=score_threshold,
                                    return_decision_process=False,
                                    allow_list=allow_list_flat)
    analyzer_results = list(analyzer_results)
    #analyzer_results

    analyse_toc = time.perf_counter()
    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
    print(analyse_time_out)

    # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
    key = secrets.token_bytes(16) # 128 bits = 16 bytes
    key_string = base64.b64encode(key).decode('utf-8')

    # Create faker function (note that it has to receive a value)

    fake = Faker("en_UK")

    def fake_first_name(x):
        return fake.first_name()

    # Set up the anonymization configuration WITHOUT DATE_TIME
    replace_config = eval('{"DEFAULT": OperatorConfig("replace")}')
    redact_config = eval('{"DEFAULT": OperatorConfig("redact")}')
    hash_config = eval('{"DEFAULT": OperatorConfig("hash")}')
    mask_config = eval('{"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}')
    people_encrypt_config = eval('{"PERSON": OperatorConfig("encrypt", {"key": key_string})}') # The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
    fake_first_name_config = eval('{"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}')


    if anon_strat == "replace": chosen_mask_config = replace_config
    if anon_strat == "redact": chosen_mask_config = redact_config
    if anon_strat == "hash": chosen_mask_config = hash_config
    if anon_strat == "mask": chosen_mask_config = mask_config
    if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config

    # I think in general people will want to keep date / times
    keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')

    combined_config = {**chosen_mask_config, **keep_date_config}
    combined_config

    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)

    scrubbed_df = pd.DataFrame(anonymizer_results)

    # Create reporting message
    out_message = "Successfully anonymised"

    if anon_strat == "encrypt":
        out_message = out_message + ". Your decryption key is " + key_string + "."

    return scrubbed_df, out_message

def do_anonymise(in_file, in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):

    def check_lists(list1, list2):
        return any(string in list2 for string in list1)

    def get_common_strings(list1, list2):
        """
        Finds the common strings between two lists.

        Args:
            list1: The first list of strings.
            list2: The second list of strings.

        Returns:
            A list containing the common strings.
        """
        common_strings = []
        for string in list1:
            if string in list2:
                common_strings.append(string)
        return common_strings

    # Load file

    anon_df = pd.DataFrame()
    out_files_list = []

    # Check if files and text exist
    if not in_file:
        if in_text:
            in_file=['open_text']
        else:
            out_message = "Please enter text or a file to redact."
            return out_message, None

    for match_file in progress.tqdm(in_file, desc="Anonymising files", unit = "file"):

        if match_file=='open_text':
            anon_df = pd.DataFrame(data={'text':[in_text]})
            chosen_cols=['text']
            out_file_part = match_file
        else:
            anon_df = read_file(match_file)
            out_file_part = get_file_path_end(match_file.name)

        # Check for chosen col, skip file if not found
        all_cols_original_order = list(anon_df.columns)

        any_cols_found = check_lists(chosen_cols, all_cols_original_order)

        if any_cols_found == False:
            out_message = "No chosen columns found in dataframe: " + out_file_part
            print(out_message)
            continue
        else:
            chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)

        # Split dataframe to keep only selected columns
        print("Remaining columns to redact:", chosen_cols_in_anon_df)

        anon_df_part = anon_df[chosen_cols_in_anon_df]
        anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)

        # Anonymise the selected columns
        anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, allow_list)

        # Rejoin the dataframe together
        anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
        anon_df_out = anon_df_out[all_cols_original_order]

        # Export file

        # out_file_part = re.sub(r'\.csv', '', match_file.name)

        anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat + ".csv"

        anon_df_out.to_csv(anon_export_file_name, index = None)

        out_files_list.append(anon_export_file_name)

        # Print result text to output text box if just anonymising open text
        if match_file=='open_text':
            out_message = anon_df_out['text'][0]

    return out_message, out_files_list
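A hedged sketch of the Presidio batch flow that anonymise_script builds on: analyse a dict of column values, then anonymise the detected spans with a 'replace' operator. The class and method names follow the Presidio calls already used in this file; the toy data is illustrative only.

import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

df = pd.DataFrame({"notes": ["Call John Smith tomorrow", "Invoice sent to Jane Doe"]})
df_dict = df.to_dict(orient="list")

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine())

# Detect entities column by column, then replace them with generic placeholders
analyzer_results = list(batch_analyzer.analyze_dict(df_dict, language="en", entities=["PERSON"]))
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results,
                                                     operators={"DEFAULT": OperatorConfig("replace")})

print(pd.DataFrame(anonymizer_results))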
tools/file_conversion.py
CHANGED
@@ -45,9 +45,10 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
    images = []

    # Open the PDF file
-   for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+   #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+   for page_num in range(0,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):

-       print("Current page: ", str(page_num))
+       # print("Current page: ", str(page_num + 1))

        # Convert one page to image
        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)

@@ -72,48 +73,61 @@ def process_file(file_path):
    if file_extension in ['.jpg', '.jpeg', '.png']:
        print(f"{file_path} is an image file.")
        # Perform image processing here
-
+       img_object = [Image.open(file_path)]

    # Check if the file is a PDF
    elif file_extension == '.pdf':
        print(f"{file_path} is a PDF file. Converting to image set")
        # Run your function for processing PDF files here
-
+       img_object = convert_pdf_to_images(file_path)

    else:
        print(f"{file_path} is not an image or PDF file.")
-
+       img_object = ['']

+   # print('Image object is:', img_object)

+   return img_object
+
+def prepare_image_or_text_pdf(file_paths:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):

    out_message = ''
    out_file_paths = []

-   in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
-
-   return out_message,
+   #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+
+   #for file in progress.tqdm(file_paths, desc="Preparing files"):
+   for file in file_paths:
+       file_path = file.name
+
+       #if file_path:
+       #    file_path_without_ext = get_file_path_end(file_path)
+       if not file_path:
+           out_message = "No file selected"
+           print(out_message)
+           return out_message, out_file_paths
+
+       if in_redact_method == "Image analysis":
+           # Analyse and redact image-based pdf or image
+           if is_pdf_or_image(file_path) == False:
+               out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+               print(out_message)
+               return out_message, None
+
+           out_file_path = process_file(file_path)
+           print("Out file path at image conversion step:", out_file_path)
+
+       elif in_redact_method == "Text analysis":
+           if is_pdf(file_path) == False:
+               out_message = "Please upload a PDF file for text analysis."
+               print(out_message)
+               return out_message, None
+
+           out_file_path = file_path
+
+       out_file_paths.append(out_file_path)

+   return out_message, out_file_paths


def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):

@@ -122,14 +136,20 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    out_file_paths = out_text_file_path

    # Convert annotated text pdf back to image to give genuine redactions
-   print("Creating image version of
+   print("Creating image version of redacted PDF to embed redactions.")
+
    pdf_text_image_paths = process_file(out_text_file_path[0])
-   out_text_image_file_path = output_folder + file_path_without_ext + "
+   out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])

-   out_file_paths.append(out_text_image_file_path)
+   # out_file_paths.append(out_text_image_file_path)
+
+   out_file_paths = [out_text_image_file_path]
+
+   out_message = "PDF " + file_path_without_ext + " converted to image-based file."
+   print(out_message)

-
+   print("Out file paths:", out_file_paths)

    return out_message, out_file_paths
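convert_pdf_to_images above converts one page at a time with pdf2image's convert_from_path, which keeps memory use down for long PDFs. A minimal sketch of that per-page pattern, assuming poppler is installed and page_count has been obtained separately (e.g. from the PDF metadata):

from pdf2image import convert_from_path

def pdf_pages_to_images(pdf_path, page_count, dpi=300):
    images = []
    for page_num in range(page_count):
        # first_page/last_page are 1-indexed, so this converts exactly one page per call
        page_images = convert_from_path(pdf_path, first_page=page_num + 1,
                                        last_page=page_num + 1, dpi=dpi,
                                        use_cropbox=True, use_pdftocairo=False)
        images.extend(page_images)
    return images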
tools/file_redaction.py
CHANGED
@@ -5,7 +5,7 @@ from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
from presidio_image_redactor.entities import ImageRecognizerResult
from pdfminer.high_level import extract_pages
from tools.file_conversion import process_file
-from pdfminer.layout import LTTextContainer, LTChar, LTTextLine
+from pdfminer.layout import LTTextContainer, LTChar, LTTextLine #, LTAnno
from pikepdf import Pdf, Dictionary, Name
from gradio import Progress
import time

@@ -13,64 +13,89 @@ from collections import defaultdict # For efficient grouping

from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_conversion import process_file, is_pdf,
+from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf
import gradio as gr


-def choose_and_run_redactor(
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):

    tic = time.perf_counter()

-   out_message =
+   out_message = []
    out_file_paths = []

    if in_allow_list:
        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

-   if file_path:
-       file_path_without_ext = get_file_path_end(file_path)
-   else:
-       out_message = "No file selected"
-       print(out_message)
-       return out_message, out_file_paths
-
-   return
+   print("File paths:", file_paths)
+
+   for file in progress.tqdm(file_paths, desc="Redacting files", unit = "files"):
+       file_path = file.name
+
+       if file_path:
+           file_path_without_ext = get_file_path_end(file_path)
+           if is_pdf(file_path) == False:
+               # If user has not submitted a pdf, assume it's an image
+               print("File is not a pdf, assuming that image analysis needs to be used.")
+               in_redact_method = "Image analysis"
+       else:
+           out_message = "No file selected"
+           print(out_message)
+           return out_message, out_file_paths
+
+       if in_redact_method == "Image analysis":
+           # Analyse and redact image-based pdf or image
+           # if is_pdf_or_image(file_path) == False:
+           #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+
+           print("Redacting file as image-based pdf")
+           pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
+           out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+           pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+
+           out_file_paths.append(out_image_file_path)
+           out_message.append("File '" + file_path_without_ext + "' successfully redacted and saved to file.")
+
+       elif in_redact_method == "Text analysis":
+           if is_pdf(file_path) == False:
+               return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None
+
+           # Analyse text-based pdf
+           print('Redacting file as text-based PDF')
+           pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
+           out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
+           pdf_text.save(out_text_file_path)
+
+           #out_file_paths.append(out_text_file_path)
+           out_message_new = "File " + file_path_without_ext + " successfully redacted."
+           out_message.append(out_message_new)
+
+           # Convert message
+           convert_message="Converting PDF to image-based PDF to embed redactions."
+           #progress(0.8, desc=convert_message)
+           print(convert_message)
+
+           # Convert document to image-based document to 'embed' redactions
+           img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
+           out_file_paths.extend(img_output_file_path)
+
+           # Add confirmation for converting to image if you want
+           # out_message.append(img_output_summary)
+
+       else:
+           out_message = "No redaction method selected"
+           print(out_message)
+           return out_message, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

-
-   return
+   out_message_out = '\n'.join(out_message)
+   out_message_out = out_message_out + "\n\n" + out_time

+   return out_message_out, out_file_paths, out_file_paths

def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
    merged_bboxes = []

@@ -115,7 +140,7 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

    out_message = "PDF does not exist as images. Converting pages to image"
    print(out_message)
-   progress(0, desc=out_message)
+   #progress(0, desc=out_message)

    image_paths = process_file(file_path)

@@ -124,9 +149,10 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

    out_message = "Redacting pages"
    print(out_message)
-   progress(0.1, desc=out_message)
+   #progress(0.1, desc=out_message)

-   for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+   #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+   for i in range(0, number_of_pages):

        print("Redacting page ", str(i + 1))

@@ -171,7 +197,6 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

    return images

-
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
    '''
    Redact chosen entities from a pdf that is made up of multiple pages that are not images.

@@ -189,9 +214,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]

    page_num = 0

-   for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
-
-   print("Page number is: ", page_num)
+   #for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
+   for page in pdf.pages:
+       print("Page number is: ", page_num + 1)

    annotations_on_page = []
    analyzed_bounding_boxes = []

@@ -309,88 +334,3 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
    analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")

    return pdf
-
-
-# for page_num, annotations_on_page in enumerate(annotations_all_pages):
-#     # 2. Normalize annotation heights on the same line:
-#     line_heights = {} # {y_coordinate: max_height}
-
-#     # Get line heights for every annotation
-#     for annotation in annotations_on_page:
-#         if 'Rect' in annotation:
-#             y = annotation['Rect'][1]
-#             height = annotation['Rect'][3] - annotation['Rect'][1]
-#             line_heights[y] = max(line_heights.get(y, 0), height)
-
-#     # Update line heights for annotations
-#     for annotation in annotations_on_page:
-#         if 'Rect' in annotation:
-#             y = annotation['Rect'][1]
-#             annotation['Rect'][3] = y + line_heights[y]
-
-#             # Update QuadPoints to match the new Rect coordinates
-#             x1, y1, x2, y2 = annotation['Rect'] # Extract coordinates from Rect
-#             annotation['QuadPoints'] = [
-#                 x1, y2, # Top left
-#                 x2, y2, # Top right
-#                 x1, y1, # Bottom left
-#                 x2, y1  # Bottom right
-#             ]
-
-
-# def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
-#     '''
-#     take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
-#     '''
-
-#     if not image_paths:
-
-#         out_message = "PDF does not exist as images. Converting pages to image"
-#         print(out_message)
-#         progress(0, desc=out_message)
-
-#         image_paths = process_file(file_path)
-
-#     # Create a new PDF
-#     #pdf = pikepdf.new()
-
-#     images = []
-#     number_of_pages = len(image_paths)
-
-#     out_message = "Redacting pages"
-#     print(out_message)
-#     progress(0.1, desc=out_message)
-
-#     for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
-
-#         print("Redacting page ", str(i + 1))
-
-#         # Get the image to redact using PIL lib (pillow)
-#         image = image_paths[i] #Image.open(image_paths[i])
-
-#         # %%
-#         image_analyser = ImageAnalyzerEngine(nlp_analyser)
-#         engine = ImageRedactorEngine(image_analyser)
-
-#         if language == 'en':
-#             ocr_lang = 'eng'
-#         else: ocr_lang = language
-
-#         # %%
-#         # Redact the image with pink color
-#         redacted_image = engine.redact(image,
-#             fill=(0, 0, 0),
-#             ocr_kwargs={"lang": ocr_lang},
-#             allow_list=allow_list,
-#             ad_hoc_recognizers= None,
-#             **{
-#                 "language": language,
-#                 "entities": chosen_redact_entities,
-#                 "score_threshold": score_threshold
-#             },
-#         )
-
-#         images.append(redacted_image)
-
-#     return images
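The 'embeds redactions as an image-based file' behaviour in the commit message comes down to saving the list of redacted page images as a single PDF with Pillow, as choose_and_run_redactor does above. A small sketch of that save step, assuming pdf_images is a non-empty list of PIL images:

from PIL import Image

def save_images_as_pdf(pdf_images, out_path, resolution=100.0):
    # The first image starts the PDF; the remaining images are appended as extra pages
    pdf_images[0].save(out_path, "PDF", resolution=resolution,
                       save_all=True, append_images=pdf_images[1:])
    return out_path

# Example with hypothetical blank pages:
# pages = [Image.new("RGB", (1240, 1754), "white") for _ in range(3)]
# save_images_as_pdf(pages, "output/example_redacted_as_img.pdf")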
tools/helper_functions.py
CHANGED
@@ -1,4 +1,6 @@
import os
+import gradio as gr
+import pandas as pd

def get_or_create_env_var(var_name, default_value):
    # Get the environment variable if it exists

@@ -29,6 +31,36 @@ def get_file_path_end(file_path):

    return filename_without_extension

+def detect_file_type(filename):
+    """Detect the file type based on its extension."""
+    if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+        return 'csv'
+    elif filename.endswith('.xlsx'):
+        return 'xlsx'
+    elif filename.endswith('.parquet'):
+        return 'parquet'
+    elif filename.endswith('.pdf'):
+        return 'pdf'
+    elif filename.endswith('.jpg'):
+        return 'jpg'
+    elif filename.endswith('.jpeg'):
+        return 'jpeg'
+    elif filename.endswith('.png'):
+        return 'png'
+    else:
+        raise ValueError("Unsupported file type.")
+
+def read_file(filename):
+    """Read the file based on its detected type."""
+    file_type = detect_file_type(filename)
+
+    if file_type == 'csv':
+        return pd.read_csv(filename, low_memory=False)
+    elif file_type == 'xlsx':
+        return pd.read_excel(filename)
+    elif file_type == 'parquet':
+        return pd.read_parquet(filename)
+
def ensure_output_folder_exists():
    """Checks if the 'output/' folder exists, creates it if not."""

@@ -41,6 +73,20 @@ def ensure_output_folder_exists():
    else:
        print(f"The 'output/' folder already exists.")

+def put_columns_in_df(in_file):
+    new_choices = []
+    concat_choices = []
+
+    for file in in_file:
+        df = read_file(file.name)
+        new_choices = list(df.columns)
+
+        concat_choices.extend(new_choices)
+
+    # Drop duplicate columns
+    concat_choices = list(set(concat_choices))
+
+    return gr.Dropdown(choices=concat_choices, value=concat_choices)

# Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
def add_folder_to_path(folder_path: str):
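put_columns_in_df feeds the column-selection dropdown in app.py by taking the union of the columns across every uploaded file. A self-contained sketch of the same idea with plain pandas (no Gradio file objects), using hypothetical file paths:

import pandas as pd

def gather_columns(file_paths):
    # Union of column names across all uploaded tabular files
    columns = []
    for path in file_paths:
        df = pd.read_csv(path) if path.endswith(".csv") else pd.read_excel(path)
        columns.extend(df.columns)
    return sorted(set(columns))

# Example with hypothetical files:
# print(gather_columns(["cases.csv", "contacts.xlsx"]))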
tools/presidio_analyzer_custom.py
ADDED
@@ -0,0 +1,119 @@
import gradio as gr
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
from tqdm import tqdm

from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpArtifacts



def analyze_iterator_custom(
    self,
    texts: Iterable[Union[str, bool, float, int]],
    language: str,
    list_length:int,
    progress=gr.Progress(),
    **kwargs,
) -> List[List[RecognizerResult]]:
    """
    Analyze an iterable of strings.

    :param texts: An list containing strings to be analyzed.
    :param language: Input language
    :param list_length: Length of the input list.
    :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
    """

    # validate types
    texts = self._validate_types(texts)

    # Process the texts as batch for improved performance
    nlp_artifacts_batch: Iterator[
        Tuple[str, NlpArtifacts]
    ] = self.analyzer_engine.nlp_engine.process_batch(
        texts=texts, language=language
    )


    list_results = []

    # Uncomment this if you want to show progress within a file
    #for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
    for text, nlp_artifacts in nlp_artifacts_batch:
        results = self.analyzer_engine.analyze(
            text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
        )

        list_results.append(results)

    return list_results

def analyze_dict(
    self,
    input_dict: Dict[str, Union[Any, Iterable[Any]]],
    language: str,
    keys_to_skip: Optional[List[str]] = None,
    **kwargs,
) -> Iterator[DictAnalyzerResult]:
    """
    Analyze a dictionary of keys (strings) and values/iterable of values.

    Non-string values are returned as is.

    :param input_dict: The input dictionary for analysis
    :param language: Input language
    :param keys_to_skip: Keys to ignore during analysis
    :param kwargs: Additional keyword arguments
        for the `AnalyzerEngine.analyze` method.
        Use this to pass arguments to the analyze method,
        such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
        See `AnalyzerEngine.analyze` for the full list.
    """

    context = []
    if "context" in kwargs:
        context = kwargs["context"]
        del kwargs["context"]

    if not keys_to_skip:
        keys_to_skip = []


    for key, value in input_dict.items():
        if not value or key in keys_to_skip:
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
            continue  # skip this key as requested

        # Add the key as an additional context
        specific_context = context[:]
        specific_context.append(key)

        if type(value) in (str, int, bool, float):
            results: List[RecognizerResult] = self.analyzer_engine.analyze(
                text=str(value), language=language, context=[key], **kwargs
            )
        elif isinstance(value, dict):
            new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
            results = self.analyze_dict(
                input_dict=value,
                language=language,
                context=specific_context,
                keys_to_skip=new_keys_to_skip,
                **kwargs,
            )
        elif isinstance(value, Iterable):
            # Recursively iterate nested dicts
            list_length = len(value)

            results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
                texts=value,
                language=language,
                context=specific_context,
                list_length=list_length,
                **kwargs,
            )
        else:
            raise ValueError(f"type {type(value)} is unsupported.")

        yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)