seanpedrickcase committed
Commit 0b2c988
Parent(s):
Initial commit
- .dockerignore +16 -0
- .github/workflows/check_file_size.yml +16 -0
- .github/workflows/sync_to_hf.yml +20 -0
- .gitignore +16 -0
- Dockerfile +58 -0
- README.md +14 -0
- app.py +199 -0
- how_to_create_exe_dist.txt +38 -0
- requirements.txt +20 -0
- tld/.tld_set_snapshot +0 -0
- tools/__init__.py +0 -0
- tools/anonymiser.py +296 -0
- tools/aws_functions.py +164 -0
- tools/clean_funcs.py +194 -0
- tools/file_conversion.py +140 -0
- tools/file_redaction.py +236 -0
- tools/helper_functions.py +126 -0
- tools/load_spacy_model_custom_recognisers.py +168 -0
- tools/presidio_analyzer_custom.py +114 -0
- tools/unstructured_funcs.py +884 -0
.dockerignore
ADDED
@@ -0,0 +1,16 @@
*.csv
*.pdf
*.url
*.jpg
*.png
*.ipynb
examples/*
processing/*
output/*
tools/__pycache__/*
old_code/*
tesseract/*
poppler/*
build/*
dist/*
build_deps/*
.github/workflows/check_file_size.yml
ADDED
@@ -0,0 +1,16 @@
name: Check file size
on: # or directly `on: [push]` to run the action on every push on any branch
  pull_request:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Check large files
        uses: ActionsDesk/lfs-warning@v2.0
        with:
          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/document_rag_preparation main
.gitignore
ADDED
@@ -0,0 +1,16 @@
*.csv
*.pdf
*.url
*.jpg
*.png
*.ipynb
examples/*
processing/*
output/*
tools/__pycache__/*
old_code/*
tesseract/*
poppler/*
build/*
dist/*
build_deps/*
Dockerfile
ADDED
@@ -0,0 +1,58 @@
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

# Install system dependencies. Need to specify -y for poppler to get it to install
RUN apt-get update \
    && apt-get install -y \
        tesseract-ocr \
        poppler-utils \
        libgl1-mesa-glx \
        libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

RUN pip install --no-cache-dir gradio==4.31.5

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Change ownership of /home/user directory
#RUN chown -R user:user /home/user

# Make output folders
RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
RUN mkdir -p /home/user/app/tld && chown -R user:user /home/user/app/tld

# Switch to the "user" user
USER user

# Set environment variables
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
    GRADIO_THEME=huggingface \
    TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
    #GRADIO_TEMP_DIR=$HOME/tmp \
    #GRADIO_ROOT_PATH=/address-match \
    # gunicorn keep alive timeout limit extended for GUI-based work - https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker?tab=readme-ov-file#timeout
    KEEP_ALIVE=60 \
    SYSTEM=spaces

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
COPY --chown=user . $HOME/app
#COPY . $HOME/app

CMD ["python", "app.py"]
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Document RAG preparation
emoji: 📖
colorFrom: yellow
colorTo: purple
sdk: docker
app_file: app.py
pinned: true
license: apache-2.0
---

# Document RAG preparation

Extract text from documents and convert it into tabular format using the Unstructured package. The outputs can then be used downstream, e.g. for RAG or other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), and code files (.py, .js, etc.). Outputs CSVs and files in a 'Document' format commonly used as input to vector databases (e.g. ChromaDB) or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
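As a rough illustration of the workflow the README describes (not part of this repository's code), partitioning a document and exporting its elements as a table with the standard Unstructured APIs might look like the sketch below; the file names are placeholders, and the app's own `partition_file` wrapper in `tools/unstructured_funcs.py` adds more options than shown here.

```python
# Minimal sketch only: document-to-table flow using Unstructured's public APIs.
# "report.pdf" and "report_elements.csv" are placeholder file names.
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dataframe

elements = partition(filename="report.pdf", strategy="fast")  # list of Element objects
df = convert_to_dataframe(elements)  # one row per element, with text and metadata columns
df.to_csv("report_elements.csv", index=False)  # tabular output for downstream RAG steps
```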
app.py
ADDED
@@ -0,0 +1,199 @@
import os

# By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, custom_regex_load
from tools.unstructured_funcs import partition_file, clean_elements, export_elements_as_table_to_file, filter_elements_and_metadata, chunk_all_elements, minimum_chunk_length, start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks, multipage_sections, overlap_all
#from tools.aws_functions import load_data_from_aws
from tools.clean_funcs import pre_clean, full_entity_list, chosen_redact_entities
import gradio as gr
import pandas as pd
import numpy as np
from typing import Type, List
from unstructured.documents.elements import Element

# Creating an alias for pandas DataFrame using Type
PandasDataFrame = Type[pd.DataFrame]

add_folder_to_path("_internal/tesseract/")
add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")

ensure_output_folder_exists()

language = 'en'
default_meta_keys_to_filter = ["file_directory", "filetype"]
default_element_types_to_filter = ['UncategorizedText', 'Header']


def get_element_metadata(elements, prefix=""):
    """Recursively retrieves element names and metadata in the desired format."""
    result = []

    for element in elements:
        # print("Element metadata: ", element.metadata)
        # print("Element metadata dict: ", element.metadata.__dict__)

        if hasattr(element, 'metadata') and isinstance(element.metadata.__dict__, dict):
            for key, value in element.metadata.__dict__.items(): # Iterate over key-value pairs in the metadata dictionary
                new_prefix = f"{prefix}." if prefix else ""
                if isinstance(value, dict): # Nested metadata
                    result.extend(get_element_metadata([value], new_prefix)) # Recurse with the nested dictionary as a single-item list
                else: # Leaf element
                    meta_element_to_add = f"{new_prefix}{key}"
                    if meta_element_to_add not in result:
                        result.append(meta_element_to_add)
        else:
            print(f"Warning: Element {element} does not have a metadata dictionary.") # Handle elements without metadata gracefully

    return result

def update_filter_dropdowns(elements_table:PandasDataFrame, elements:List[Element]):
    if 'text' in elements_table.columns:
        elements_table_filt = elements_table.drop('text', axis=1)
    else:
        elements_table_filt = elements_table

    # Error handling for missing 'type' column
    if 'type' not in elements_table_filt.columns:
        print("Warning: 'type' column not found in the DataFrame.")
        return gr.Dropdown(label="Element types (not available)"), gr.Dropdown(label="Metadata properties (not available)")

    element_types_to_filter = elements_table_filt['type'].unique().tolist()
    meta_keys_to_filter = get_element_metadata(elements)

    #print("Element types:", element_types_to_filter)
    #print("Meta keys:", meta_keys_to_filter)

    element_types_to_filter_shortlist = [x for x in default_element_types_to_filter if x in element_types_to_filter]
    meta_keys_to_filter_shortlist = [x for x in default_meta_keys_to_filter if x in meta_keys_to_filter]

    return gr.Dropdown(
        value=element_types_to_filter_shortlist, choices=element_types_to_filter, multiselect=True, interactive=True, label="Choose element types to exclude from element list"
    ), gr.Dropdown(
        value=meta_keys_to_filter_shortlist, choices=meta_keys_to_filter, multiselect=True, interactive=True, label="Choose metadata keys to filter out"
    )

# Create the gradio interface

block = gr.Blocks(theme=gr.themes.Base())

with block:

    elements_state = gr.State([])
    elements_table_state = gr.State(pd.DataFrame())
    metadata_keys_state = gr.State([])
    output_image_files_state = gr.State([])
    output_file_list_state = gr.State([])
    in_colnames_state = gr.State("text")

    data_state = gr.State(pd.DataFrame())
    embeddings_state = gr.State(np.array([]))
    embeddings_type_state = gr.State("")
    topic_model_state = gr.State()
    assigned_topics_state = gr.State([])
    custom_regex_state = gr.State(pd.DataFrame())
    docs_state = gr.State()
    data_file_name_no_ext_state = gr.State()
    label_list_state = gr.State(pd.DataFrame())
    output_name_state = gr.State("")

    gr.Markdown(
    """
    # Document RAG preparation
    Extract text from documents and convert it into tabular format using the Unstructured package. The outputs can then be used downstream, e.g. for RAG or other processes that require tabular data. Currently supports the following file types: .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.). Outputs CSVs and files in a 'Document' format commonly used as input to vector databases (e.g. ChromaDB) or Langchain embedding datastore integrations. See [here](https://docs.unstructured.io/open-source/core-functionality/overview) for more details about what is going on under the hood.
    """)

    with gr.Tab("Partition document"):

        with gr.Accordion("Upload files - accepts .pdf, .docx, .odt, .pptx, .html, text files (.txt, .md, .rst), image files (.png, .jpg, .heic), email exports (.msg, .eml), tabular files (.csv, .xlsx), or code files (.py, .js, etc.)", open=True):
            in_file = gr.File(label="Choose file", file_count="multiple", height=100)
            in_pdf_partition_strategy = gr.Radio(label="PDF partition strategy", value="fast", choices=["fast", "ocr_only", "hi_res"])

        partition_btn = gr.Button("Partition documents (outputs appear below)", variant='primary')

        with gr.Accordion("Clean, anonymise, or filter text elements", open=False):
            with gr.Accordion("Filter element types from text and information from metadata", open=False):
                element_types_to_filter = gr.Dropdown(value=default_element_types_to_filter, choices=default_element_types_to_filter, multiselect=True, interactive=True, label="Choose element types to exclude from element list")
                meta_keys_to_filter = gr.Dropdown(value=default_meta_keys_to_filter, choices=default_meta_keys_to_filter, multiselect=True, interactive=True, label="Choose metadata keys to filter out")

                filter_meta_btn = gr.Button("Filter elements/metadata")

            with gr.Accordion("Clean/anonymise text", open=False):
                with gr.Row():
                    clean_options = gr.Dropdown(choices=["Convert bytes to string", "Replace quotes", "Clean non ASCII", "Clean ordered list", "Group paragraphs",
                                                         "Remove trailing punctuation", "Remove all punctuation", "Clean text", "Remove extra whitespace", "Remove dashes", "Remove bullets",
                                                         "Make lowercase"],
                                                value=["Clean ordered list", "Group paragraphs", "Clean non ASCII", "Remove extra whitespace", "Remove dashes", "Remove bullets"],
                                                label="Clean options", multiselect=True, interactive=True)

                with gr.Accordion("Clean with custom regex", open=False):
                    gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case-insensitive removal of this term.""")
                    clean_text = gr.Dropdown(value="No", choices=["Yes", "No"], multiselect=False, label="Remove custom regex.")
                    with gr.Row():
                        custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
                        custom_regex_text = gr.Textbox(label="Custom regex load status")

                with gr.Accordion("Anonymise text", open=False):
                    anonymise_drop = gr.Dropdown(value="No", choices=["Yes", "No"], multiselect=False, label="Anonymise data. Personal details are redacted - not 100% effective. Please check results afterwards!")
                    with gr.Row():
                        anon_strat = gr.Dropdown(value="redact", choices=["redact", "replace"], multiselect=False, label="Anonymisation strategy. Choose from redact (simply remove text), or replace with entity type (e.g. <PERSON>)")
                        anon_entities_drop = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Choose entities to find and anonymise in your open text")

                unstructured_clean_btn = gr.Button("Clean data")

        with gr.Accordion("Chunk text", open=False):
            with gr.Row():
                chunking_method_rad = gr.Radio(value="Chunk within title", choices=["Chunk within title", "Basic chunking"], interactive=True)
                multipage_sections_drop = gr.Dropdown(choices=["Yes", "No"], value="Yes", label="Continue chunk over page breaks.", interactive=True)
                overlap_all_drop = gr.Dropdown(choices=["Yes", "No"], value="Yes", label="Overlap over adjacent element text if needed.", interactive=True)
            with gr.Row():
                minimum_chunk_length_slide = gr.Slider(value=minimum_chunk_length, minimum=100, maximum=10000, step=100, label="Minimum chunk character length. Chunk will overlap next title if character limit not reached.", interactive=True)
                start_new_chunk_after_end_of_this_element_length_slide = gr.Slider(value=start_new_chunk_after_end_of_this_element_length, minimum=100, maximum=10000, step=100, label="'Soft' maximum chunk character length - chunk will continue until end of current element when length reached")
                hard_max_character_length_chunks_slide = gr.Slider(value=hard_max_character_length_chunks, minimum=100, maximum=10000, step=100, label="'Hard' maximum chunk character length. Chunk will not be longer than this.", interactive=True)

            chunk_btn = gr.Button("Chunk document")

        # Save chunked data to file
        with gr.Accordion("File outputs", open=True):
            with gr.Row():
                output_summary = gr.Textbox(label="Output summary")
                output_file = gr.File(label="Output file")

    # AWS functions not yet implemented in this app
    # with gr.Tab(label="AWS data load"):
    #     with gr.Accordion(label="AWS data access", open=True):
    #         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
    #         with gr.Row():
    #             in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
    #             load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
    #
    #         aws_log_box = gr.Textbox(label="AWS data load status")

    # Partition data, then update filter dropdowns from loaded data
    partition_btn.click(fn=partition_file, inputs=[in_file, in_pdf_partition_strategy],
                        outputs=[output_summary, elements_state, output_file, output_name_state, elements_table_state], api_name="partition").\
        then(fn=update_filter_dropdowns, inputs=[elements_table_state, elements_state], outputs=[element_types_to_filter, meta_keys_to_filter])

    # Clean data
    ## Filter metadata

    filter_meta_btn.click(fn=filter_elements_and_metadata, inputs=[elements_state, element_types_to_filter, meta_keys_to_filter], outputs=[elements_state]).\
        then(fn=export_elements_as_table_to_file, inputs=[elements_state, output_name_state], outputs=[output_summary, output_file])

    ## General text clean and anonymisation

    ### Custom regex load
    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])

    unstructured_clean_btn.click(fn=clean_elements, inputs=[elements_state, clean_options, output_name_state], outputs=[elements_state, output_summary, output_file, output_name_state]).\
        then(fn=pre_clean, inputs=[elements_state, in_colnames_state, custom_regex_state, clean_text, output_name_state, anonymise_drop, anon_strat, anon_entities_drop], outputs=[output_summary, output_file, elements_state, output_name_state])

    ## Chunk data
    chunk_btn.click(fn=chunk_all_elements, inputs=[elements_state, output_name_state, chunking_method_rad, minimum_chunk_length_slide, start_new_chunk_after_end_of_this_element_length_slide, hard_max_character_length_chunks_slide, multipage_sections_drop, overlap_all_drop], outputs=[output_summary, output_file, output_name_state])

    # Loading AWS data - not yet implemented in this app
    # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])

# Simple run
block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
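Because the partition button registers a named endpoint (api_name="partition"), the partition step can in principle also be driven programmatically. The snippet below is a rough sketch using gradio_client; the server URL, file path, and exact argument format for file inputs are assumptions and may differ between gradio and gradio_client versions.

```python
# Rough sketch only: calling the /partition endpoint exposed by app.py via
# gradio_client. The URL, file path, and file-argument format are assumptions.
from gradio_client import Client

client = Client("http://localhost:7860")
result = client.predict(
    ["example.pdf"],  # in_file: list of files to partition (placeholder path)
    "fast",           # in_pdf_partition_strategy
    api_name="/partition",
)
print(result)
```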
how_to_create_exe_dist.txt
ADDED
@@ -0,0 +1,38 @@
1. Create a minimal environment to run the app in conda, e.g. 'conda create --name new_env'

2. Activate the environment: 'conda activate new_env'

3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'

NOTE: to ensure that spaCy models are loaded into the program correctly via requirements.txt, follow this guide: https://spacy.io/usage/models#models-download

4. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for en_core_web_sm (a spaCy model); an illustrative sketch of such a hook follows this list. Put these in the build_deps\ subfolder.

5. pip install pyinstaller

6. In the command line, cd to the folder that contains app.py.

7. Run the following (this helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):

a) In the command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRagPrepApp_0.1 app.py

# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.

b) Open the created spec file in Notepad. Add the following to the end of the Analysis section, then save:

a = Analysis(
    ...
    module_collection_mode={
        'gradio': 'py',  # Collect gradio package as source .py files
    }
)

c) Back in the command line, run this: pyinstaller --clean --noconfirm DocRagPrepApp_0.1.spec

8. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\<app_name>').

9. In 'dist\<app_name>', try double-clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**

10. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
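For reference, PyInstaller hook files of the kind mentioned in step 4 above usually follow the standard collect_data_files / collect_submodules pattern. The sketch below is illustrative only and is not necessarily identical to the hook files shipped in this repository's build_deps folder.

```python
# build_deps/hook-en_core_web_sm.py - illustrative sketch of a PyInstaller hook
# that bundles the en_core_web_sm spaCy model's data files and submodules.
from PyInstaller.utils.hooks import collect_data_files, collect_submodules

datas = collect_data_files("en_core_web_sm")
hiddenimports = collect_submodules("en_core_web_sm")
```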
requirements.txt
ADDED
@@ -0,0 +1,20 @@
pandas==2.2.2
spacy # Not specified as latest versions create a conflict with latest versions of gradio
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
gradio # Not specified as latest versions create a conflict with latest versions of spacy
boto3==1.34.103
unstructured
unstructured[pdf]
unstructured[docx]
unstructured[pptx]
unstructured[html]
unstructured[text]
unstructured[xlsx]
unstructured[odt]
unstructured[jpg]
unstructured[msg]
Faker==22.2.0
presidio_analyzer==2.2.351
presidio_anonymizer==2.2.351
polars==0.20.6
tld/.tld_set_snapshot
ADDED
The diff for this file is too large to render.
See raw diff
tools/__init__.py
ADDED
File without changes
tools/anonymiser.py
ADDED
@@ -0,0 +1,296 @@
from spacy.cli import download
import spacy
from tools.presidio_analyzer_custom import analyze_dict
from tools.load_spacy_model_custom_recognisers import nlp_analyser
from typing import List
from unstructured.documents.elements import Element

spacy.prefer_gpu()

def spacy_model_installed(model_name):
    try:
        import en_core_web_sm
        en_core_web_sm.load()
        print("Successfully imported spaCy model")
        #nlp = spacy.load("en_core_web_sm")
        #print(nlp._path)
    except:
        download(model_name)
        spacy.load(model_name)
        print("Successfully imported spaCy model")
        #print(nlp._path)


#if not is_model_installed(model_name):
#    os.system(f"python -m spacy download {model_name}")
model_name = "en_core_web_sm"
spacy_model_installed(model_name)

#spacy.load(model_name)
# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
#os.system("pip uninstall -y gradio")
#os.system("pip install gradio==3.50.0")
#os.system("python -m spacy download en_core_web_lg")

import re
import secrets
import base64
import time

import pandas as pd

from faker import Faker

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig


def anon_consistent_names(df):
    # ## Pick out common names and replace them with the same person value
    df_dict = df.to_dict(orient="list")

    analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = list(analyzer_results)

    text = analyzer_results[3].value

    recognizer_result = str(analyzer_results[3].recognizer_results)

    data_str = recognizer_result # abbreviated for brevity

    # Adjusting the parse_dict function to handle trailing ']'
    # Splitting the main data string into individual list strings
    list_strs = data_str[1:-1].split('], [')

    def parse_dict(s):
        s = s.strip('[]') # Removing any surrounding brackets
        items = s.split(', ')
        d = {}
        for item in items:
            key, value = item.split(': ')
            if key == 'score':
                d[key] = float(value)
            elif key in ['start', 'end']:
                d[key] = int(value)
            else:
                d[key] = value
        return d

    # Re-running the improved processing code
    result = []

    for lst_str in list_strs:
        # Splitting each list string into individual dictionary strings
        dict_strs = lst_str.split(', type: ')
        dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings

        # Parsing each dictionary string
        dicts = [parse_dict(d) for d in dict_strs]
        result.append(dicts)

    #result

    names = []

    for idx, paragraph in enumerate(text):
        paragraph_texts = []
        for dictionary in result[idx]:
            if dictionary['type'] == 'PERSON':
                paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
        names.append(paragraph_texts)

    # Flatten the list of lists and extract unique names
    unique_names = list(set(name for sublist in names for name in sublist))

    fake_names = pd.Series(unique_names).apply(fake_first_name)

    mapping_df = pd.DataFrame(data={"Unique names": unique_names,
                                    "Fake names": fake_names})

    # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
    name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

    scrubbed_df_consistent_names = df.replace(name_map, regex=True)

    return scrubbed_df_consistent_names

def detect_file_type(filename):
    """Detect the file type based on its extension."""
    if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
        return 'csv'
    elif filename.endswith('.xlsx'):
        return 'xlsx'
    elif filename.endswith('.parquet'):
        return 'parquet'
    else:
        raise ValueError("Unsupported file type.")

def read_file(filename):
    """Read the file based on its detected type."""
    file_type = detect_file_type(filename)

    if file_type == 'csv':
        return pd.read_csv(filename, low_memory=False)
    elif file_type == 'xlsx':
        return pd.read_excel(filename)
    elif file_type == 'parquet':
        return pd.read_parquet(filename)

def anonymise_script(text_list:List[Element], anon_strat:str, nlp_analyser=None):

    #print(df.shape)
    #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "")
    #print("Length of input series blank at start is: ", df_chosen_col_mask.value_counts())

    # DataFrame to dict
    df_dict = pd.DataFrame(data={"text": text_list}).to_dict(orient="list")

    if nlp_analyser:
        analyzer = nlp_analyser
    else:
        analyzer = AnalyzerEngine()

    # Add titles to analyzer list
    titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                          deny_list=["Mr", "Mrs", "Miss", "Ms", "mr", "mrs", "miss", "ms"])

    analyzer.registry.add_recognizer(titles_recognizer)

    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    anonymizer = AnonymizerEngine()

    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)

    print("Identifying personal data")
    analyse_tic = time.perf_counter()
    #analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
    #print(analyzer_results)
    analyzer_results = list(analyzer_results)

    analyse_toc = time.perf_counter()
    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
    print(analyse_time_out)

    # Generate a 128-bit AES key, then encode the key using base64 to get a string representation
    key = secrets.token_bytes(16) # 128 bits = 16 bytes
    key_string = base64.b64encode(key).decode('utf-8')

    # Create faker function (note that it has to receive a value)
    fake = Faker("en_UK")

    def fake_first_name(x):
        return fake.first_name()

    # Set up the anonymisation configuration WITHOUT DATE_TIME
    replace_config = {"DEFAULT": OperatorConfig("replace")}
    redact_config = {"DEFAULT": OperatorConfig("redact")}
    hash_config = {"DEFAULT": OperatorConfig("hash")}
    mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True})}
    people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})} # The encryption uses an AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
    fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}

    if anon_strat == "replace": chosen_mask_config = replace_config
    if anon_strat == "redact": chosen_mask_config = redact_config
    if anon_strat == "hash": chosen_mask_config = hash_config
    if anon_strat == "mask": chosen_mask_config = mask_config
    if anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
    elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config

    # I think in general people will want to keep date / times - NOT FOR TOPIC MODELLING
    #keep_date_config = {"DATE_TIME": OperatorConfig("keep")}

    combined_config = {**chosen_mask_config} #, **keep_date_config}

    print("Anonymising personal data")
    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)

    #print(anonymizer_results)

    scrubbed_df = pd.DataFrame(data={"text": anonymizer_results["text"]})

    scrubbed_series = scrubbed_df["text"]

    #print(scrubbed_series[0:6])
    #print("Length of output series is: ", len(scrubbed_series))
    #print("Length of input series at end is: ", len(df[chosen_col]))

    #scrubbed_values_mask = (scrubbed_series.isnull()) | (scrubbed_series.str.strip() == "")
    #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "")

    #print("Length of input series blank at end is: ", df_chosen_col_mask.value_counts())
    #print("Length of output series blank is: ", scrubbed_values_mask.value_counts())

    # Create reporting message
    out_message = "Successfully anonymised"

    if anon_strat == "encrypt":
        out_message = out_message + ". Your decryption key is " + key_string + "."

    return scrubbed_series, out_message

def do_anonymise(in_file:str, anon_strat:str, chosen_cols:List[str]):

    # Load file
    anon_df = pd.DataFrame()

    if in_file:
        for match_file in in_file:
            match_temp_file = pd.read_csv(match_file.name, delimiter=",", low_memory=False) #, encoding='cp1252')
            anon_df = pd.concat([anon_df, match_temp_file])

    # Split dataframe to keep only selected columns
    all_cols_original_order = list(anon_df.columns)
    anon_df_part = anon_df[chosen_cols]
    anon_df_remain = anon_df.drop(chosen_cols, axis=1)

    # Anonymise the selected columns
    anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat)

    # Rejoin the dataframe together
    anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis=1)
    anon_df_out = anon_df_out[all_cols_original_order]

    # Export file
    out_file_part = re.sub(r'\.csv', '', match_file.name)

    anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"

    anon_df_out.to_csv(anon_export_file_name, index=None)

    return out_message, anon_export_file_name
tools/aws_functions.py
ADDED
@@ -0,0 +1,164 @@
from typing import Type
import pandas as pd
import boto3
import tempfile
import os

PandasDataFrame = Type[pd.DataFrame]
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']

try:
    session = boto3.Session() # profile_name="default"
except Exception as e:
    print(e)

# sts = session.client("sts")
# Create a Session with the IAM role ARN
# aws_role = os.environ['AWS_ROLE_DATA_TEXT_SEARCH']
# response = sts.assume_role(
#     RoleArn=aws_role,
#     RoleSessionName="ecs-test-session"
# )
# print(response)


def get_assumed_role_info():
    sts = boto3.client('sts', region_name='eu-west-2', endpoint_url='https://sts.eu-west-2.amazonaws.com')
    response = sts.get_caller_identity()

    # Extract ARN of the assumed role
    assumed_role_arn = response['Arn']

    # Extract the name of the assumed role from the ARN
    assumed_role_name = assumed_role_arn.split('/')[-1]

    return assumed_role_arn, assumed_role_name

try:
    assumed_role_arn, assumed_role_name = get_assumed_role_info()

    print("Assumed Role ARN:", assumed_role_arn)
    print("Assumed Role Name:", assumed_role_name)
except Exception as e:
    print(e)

# Download direct from S3 - requires login credentials
def download_file_from_s3(bucket_name, key, local_file_path):

    s3 = boto3.client('s3')
    s3.download_file(bucket_name, key, local_file_path)
    print(f"File downloaded from S3: s3://{bucket_name}/{key} to {local_file_path}")

#download_file_from_s3(bucket_name, object_key, local_file_loc)

def download_folder_from_s3(bucket_name, s3_folder, local_folder):
    """
    Download all files from an S3 folder to a local folder.
    """
    s3 = boto3.client('s3')

    # List objects in the specified S3 folder
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

    # Download each object
    for obj in response.get('Contents', []):
        # Extract object key and construct local file path
        object_key = obj['Key']
        local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def download_files_from_s3(bucket_name, s3_folder, local_folder, filenames):
    """
    Download specific files from an S3 folder to a local folder.
    """
    s3 = boto3.client('s3')

    print("Trying to download file: ", filenames)

    if filenames == '*':
        # List all objects in the S3 folder
        print("Trying to download all files in AWS folder: ", s3_folder)
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)

        print("Found files in AWS folder: ", response.get('Contents', []))

        filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]

        print("Found filenames in AWS folder: ", filenames)

    for filename in filenames:
        object_key = os.path.join(s3_folder, filename)
        local_file_path = os.path.join(local_folder, filename)

        # Create directories if necessary
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object
        try:
            s3.download_file(bucket_name, object_key, local_file_path)
            print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
        except Exception as e:
            print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)


def load_data_from_aws(in_aws_keyword_file, aws_password="", bucket_name=bucket_name):

    temp_dir = tempfile.mkdtemp()
    local_address_stub = temp_dir + '/doc-redaction/'
    files = []

    if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
        out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
        return files, out_message

    if aws_password:
        if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:

            s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'

            local_folder_path = local_address_stub

            # Check if folder exists
            if not os.path.exists(local_folder_path):
                print(f"Folder {local_folder_path} does not exist! Making folder.")

                os.mkdir(local_folder_path)

            # Check if folder is empty
            if len(os.listdir(local_folder_path)) == 0:
                print(f"Folder {local_folder_path} is empty")
                # Download data
                download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')

                print("AWS data downloaded")

            else:
                print(f"Folder {local_folder_path} is not empty")

            #files = os.listdir(local_folder_stub)
            #print(files)

            files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]

            out_message = "Data successfully loaded from AWS"
            print(out_message)

        else:
            out_message = "Data not loaded from AWS"
            print(out_message)
    else:
        out_message = "No password provided. Please ask the data team for access if you need this."
        print(out_message)

    return files, out_message
tools/clean_funcs.py
ADDED
@@ -0,0 +1,194 @@
import re
import string
import polars as pl
import gradio as gr
import time
from datetime import datetime
import tools.anonymiser as anon
from unstructured.staging.base import convert_to_dataframe

from typing import List
from unstructured.documents.elements import Element

from tools.unstructured_funcs import export_elements_as_table_to_file

today_rev = datetime.now().strftime("%Y%m%d")

chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']

# Adding custom words to the stopwords
custom_words = []
my_stop_words = custom_words

# #### Some of my cleaning functions
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
email_pattern_regex = r'\S*@\S*\s?'
num_pattern_regex = r'[0-9]+'
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
multiple_spaces_regex = r'\s{2,}'

def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean_text:str, data_file_name_no_ext:str="combined_elements", anonymise_drop:List[str]="No", anon_strat:str="redact", anon_entities:List[str]=chosen_redact_entities, progress=gr.Progress(track_tqdm=True)):
    '''
    Clean open text in tabular format with custom regex or anonymisation.
    '''

    output_text = ""
    output_list = []

    progress(0, desc="Cleaning data")

    if not in_colnames:
        error_message = "Please enter one column name to use for cleaning and finding topics."
        print(error_message)
        return error_message, None, data_file_name_no_ext, None, None

    all_tic = time.perf_counter()

    output_list = []
    #file_list = [string.name for string in in_files]

    in_colnames_list_first = in_colnames[0]

    if clean_text == "Yes":
        clean_tic = time.perf_counter()
        print("Starting data clean.")

        for element in data:
            if not custom_regex.empty:
                cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
            else:
                cleaned_data = initial_clean([element.text], [])

            element.text = cleaned_data[0]
            print(element.text)

        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
        print(clean_time_out)

    if anonymise_drop == "Yes":
        progress(0.6, desc="Anonymising data")

        data_file_name_no_ext = data_file_name_no_ext + "_anon"

        anon_tic = time.perf_counter()

        data_list = []

        for element in data:
            data_list.append(element.text)

        data_anon_col, anonymisation_success = anon.anonymise_script(data_list, anon_strat=anon_strat)

        for i, element in enumerate(data):
            element.text = data_anon_col[i]

        print(anonymisation_success)

        anon_toc = time.perf_counter()
        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"

    alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(data, data_file_name_no_ext, file_name_suffix="_clean")

    all_toc = time.perf_counter()
    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
    print(time_out)

    output_text = "Data clean completed."

    return output_text, out_files, data, output_file_base


def initial_clean(texts, custom_regex, progress=gr.Progress()):
    #texts = pl.Series(texts).str.strip_chars()
    #text = texts.str.replace_all(html_pattern_regex, ' ')
    #text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
    #text = text.str.replace_all(email_pattern_regex, ' ')
    #text = text.str.replace_all(nums_two_more_regex, ' ')
    #text = text.str.replace_all(postcode_pattern_regex, ' ')

    text = pl.Series(texts)

    # Allow for custom regex patterns to be removed
    if len(custom_regex) > 0:
        for pattern in custom_regex:
            raw_string_pattern = rf"{pattern}"
            #print(f"Removing regex pattern: {raw_string_pattern}")
            text = text.str.replace_all(raw_string_pattern, " ")
            #print("Text without pattern: ", text[0])

    #text = text.str.replace_all(multiple_spaces_regex, ' ')

    text = text.to_list()

    return text

def remove_hyphens(text_text):
    return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)


def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens

def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def remove_short_tokens(tokens):
    return [token for token in tokens if len(token) > 3]


def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
    # Identify duplicates in the data: https://stackoverflow.com/questions/44191465/efficiently-identify-duplicates-in-large-list-500-000
    # Only identifies the second duplicate

    seen = set()
    dups = []

    for i, doi in enumerate(data_samples_ready):
        if doi not in seen:
            seen.add(doi)
        else:
            dups.append(i)
    #data_samples_ready[dupes[0:]]

    # To see a specific duplicated value you know the position of
    #matching = [s for s in data_samples_ready if data_samples_ready[83] in s]
    #matching

    # Remove duplicates only (keep first instance)
    #data_samples_ready = list( dict.fromkeys(data_samples_ready) ) # This way would keep one version of the duplicates

    ### Remove all duplicates including original instance

    # Identify ALL duplicates including initial values
    # https://stackoverflow.com/questions/11236006/identify-duplicate-values-in-a-list-in-python

    from collections import defaultdict
    D = defaultdict(list)
    for i, item in enumerate(data_samples_ready):
        D[item].append(i)
    D = {k: v for k, v in D.items() if len(v) > 1}

    # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
    L = list(D.values())
    flat_list_dups = [item for sublist in L for item in sublist]

    # https://stackoverflow.com/questions/11303225/how-to-remove-multiple-indexes-from-a-list-at-the-same-time
    for index in sorted(flat_list_dups, reverse=True):
        del data_samples_ready[index]
        del data_samples_clean[index]
        del data_samples[index]

    # Remove blanks
    data_samples_ready = [i for i in data_samples_ready if i]
    data_samples_clean = [i for i in data_samples_clean if i]
    data_samples = [i for i in data_samples if i]

    return data_samples_ready, data_samples_clean, flat_list_dups, data_samples
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
from tools.helper_functions import get_file_path_end
|
3 |
+
from PIL import Image
|
4 |
+
import os
|
5 |
+
from gradio import Progress
|
6 |
+
from typing import List
|
7 |
+
|
8 |
+
def is_pdf_or_image(filename):
|
9 |
+
"""
|
10 |
+
Check if a file name is a PDF or an image file.
|
11 |
+
|
12 |
+
Args:
|
13 |
+
filename (str): The name of the file.
|
14 |
+
|
15 |
+
Returns:
|
16 |
+
bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
|
17 |
+
"""
|
18 |
+
if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
|
19 |
+
output = True
|
20 |
+
else:
|
21 |
+
output = False
|
22 |
+
return output
|
23 |
+
|
24 |
+
def is_pdf(filename):
|
25 |
+
"""
|
26 |
+
Check if a file name is a PDF.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
filename (str): The name of the file.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
bool: True if the file name ends with ".pdf", False otherwise.
|
33 |
+
"""
|
34 |
+
return filename.lower().endswith(".pdf")
|
35 |
+
|
36 |
+
# %%
|
37 |
+
## Convert pdf to image if necessary
|
38 |
+
|
39 |
+
def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
|
40 |
+
|
41 |
+
# Get the number of pages in the PDF
|
42 |
+
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
43 |
+
print("Number of pages in PDF: ", str(page_count))
|
44 |
+
|
45 |
+
images = []
|
46 |
+
|
47 |
+
# Open the PDF file
|
48 |
+
for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
|
49 |
+
|
50 |
+
print("Current page: ", str(page_num))
|
51 |
+
|
52 |
+
# Convert one page to image
|
53 |
+
image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
|
54 |
+
|
55 |
+
# If no images are returned, break the loop
|
56 |
+
if not image:
|
57 |
+
break
|
58 |
+
|
59 |
+
images.extend(image)
|
60 |
+
|
61 |
+
print("PDF has been converted to images.")
|
62 |
+
|
63 |
+
return images
|
64 |
+
|
65 |
+
|
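For reference, a minimal standalone sketch of the same one-page-at-a-time conversion using pdf2image directly (the PDF path is an assumption, and poppler-utils must be installed, as in the Dockerfile):

from pdf2image import convert_from_path, pdfinfo_from_path

pdf_path = "examples/sample.pdf"  # hypothetical input file
page_count = pdfinfo_from_path(pdf_path)["Pages"]

pages = []
for page_num in range(page_count):
    # first_page/last_page are 1-indexed, so this converts exactly one page per loop
    page_images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
    pages.extend(page_images)

print(f"Converted {len(pages)} pages to PIL images")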
66 |
+
# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
67 |
+
def process_file(file_path):
|
68 |
+
# Get the file extension
|
69 |
+
file_extension = os.path.splitext(file_path)[1].lower()
|
70 |
+
|
71 |
+
# Check if the file is an image type
|
72 |
+
if file_extension in ['.jpg', '.jpeg', '.png']:
|
73 |
+
print(f"{file_path} is an image file.")
|
74 |
+
# Perform image processing here
|
75 |
+
out_path = [Image.open(file_path)]
|
76 |
+
|
77 |
+
# Check if the file is a PDF
|
78 |
+
elif file_extension == '.pdf':
|
79 |
+
print(f"{file_path} is a PDF file. Converting to image set")
|
80 |
+
# Run your function for processing PDF files here
|
81 |
+
out_path = convert_pdf_to_images(file_path)
|
82 |
+
|
83 |
+
else:
|
84 |
+
print(f"{file_path} is not an image or PDF file.")
|
85 |
+
out_path = ['']
|
86 |
+
|
87 |
+
return out_path
|
88 |
+
|
89 |
+
def prepare_image_or_text_pdf(file_path:str, in_redact_method:str, in_allow_list:List[List[str]]=None):
|
90 |
+
|
91 |
+
out_message = ''
|
92 |
+
out_file_paths = []
|
93 |
+
|
94 |
+
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist] if in_allow_list else []
|
95 |
+
|
96 |
+
if file_path:
|
97 |
+
file_path_without_ext = get_file_path_end(file_path)
|
98 |
+
else:
|
99 |
+
out_message = "No file selected"
|
100 |
+
print(out_message)
|
101 |
+
return out_message, out_file_paths
|
102 |
+
|
103 |
+
if in_redact_method == "Image analysis":
|
104 |
+
# Analyse and redact image-based pdf or image
|
105 |
+
if not is_pdf_or_image(file_path):
|
106 |
+
return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
107 |
+
|
108 |
+
out_file_path = process_file(file_path)
|
109 |
+
|
110 |
+
elif in_redact_method == "Text analysis":
|
111 |
+
if not is_pdf(file_path):
|
112 |
+
return "Please upload a PDF file for text analysis.", None
|
113 |
+
|
114 |
+
out_file_path = file_path
|
115 |
+
|
116 |
+
return out_message, out_file_path
|
117 |
+
|
118 |
+
|
119 |
+
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
120 |
+
file_path_without_ext = get_file_path_end(in_file_path)
|
121 |
+
|
122 |
+
out_file_paths = out_text_file_path
|
123 |
+
|
124 |
+
# Convert annotated text pdf back to image to give genuine redactions
|
125 |
+
print("Creating image version of results")
|
126 |
+
pdf_text_image_paths = process_file(out_text_file_path[0])
|
127 |
+
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
128 |
+
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
129 |
+
|
130 |
+
out_file_paths.append(out_text_image_file_path)
|
131 |
+
|
132 |
+
out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
|
133 |
+
|
134 |
+
return out_message, out_file_paths
|
135 |
+
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
|
tools/file_redaction.py
ADDED
@@ -0,0 +1,236 @@
1 |
+
from PIL import Image
|
2 |
+
from typing import List
|
3 |
+
import pandas as pd
|
4 |
+
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
|
5 |
+
from pdfminer.high_level import extract_pages
|
6 |
+
from tools.file_conversion import process_file
|
7 |
+
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
|
8 |
+
from pikepdf import Pdf, Dictionary, Name
|
9 |
+
from gradio import Progress
|
10 |
+
import time
|
11 |
+
|
12 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
13 |
+
from tools.helper_functions import get_file_path_end
|
14 |
+
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
15 |
+
import gradio as gr
|
16 |
+
|
17 |
+
def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
|
18 |
+
|
19 |
+
tic = time.perf_counter()
|
20 |
+
|
21 |
+
out_message = ''
|
22 |
+
out_file_paths = []
|
23 |
+
|
24 |
+
# Flatten the allow list; default to an empty list so downstream calls do not fail when none is provided
|
25 |
+
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist] if in_allow_list else []
|
26 |
+
|
27 |
+
if file_path:
|
28 |
+
file_path_without_ext = get_file_path_end(file_path)
|
29 |
+
else:
|
30 |
+
out_message = "No file selected"
|
31 |
+
print(out_message)
|
32 |
+
return out_message, out_file_paths, out_file_paths
|
33 |
+
|
34 |
+
if in_redact_method == "Image analysis":
|
35 |
+
# Analyse and redact image-based pdf or image
|
36 |
+
# if is_pdf_or_image(file_path) == False:
|
37 |
+
# return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
38 |
+
|
39 |
+
pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
|
40 |
+
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
41 |
+
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
42 |
+
|
43 |
+
out_file_paths.append(out_image_file_path)
|
44 |
+
out_message = "Image-based PDF successfully redacted and saved to file."
|
45 |
+
|
46 |
+
elif in_redact_method == "Text analysis":
|
47 |
+
if not is_pdf(file_path):
|
48 |
+
return "Please upload a PDF file for text analysis.", None
|
49 |
+
|
50 |
+
# Analyse text-based pdf
|
51 |
+
pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
52 |
+
out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
|
53 |
+
pdf_text.save(out_text_file_path)
|
54 |
+
|
55 |
+
out_file_paths.append(out_text_file_path)
|
56 |
+
|
57 |
+
out_message = "Text-based PDF successfully redacted and saved to file."
|
58 |
+
|
59 |
+
else:
|
60 |
+
out_message = "No redaction method selected"
|
61 |
+
print(out_message)
|
62 |
+
return out_message, out_file_paths, out_file_paths
|
63 |
+
|
64 |
+
toc = time.perf_counter()
|
65 |
+
out_time = f"Time taken: {toc - tic:0.1f} seconds."
|
66 |
+
print(out_time)
|
67 |
+
|
68 |
+
out_message = out_message + "\n\n" + out_time
|
69 |
+
|
70 |
+
return out_message, out_file_paths, out_file_paths
|
71 |
+
|
72 |
+
|
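A hedged sketch of how this dispatcher might be called; the file path, entity names and allow list below are illustrative only, and the default Gradio Progress trackers assume the function is wired to a Gradio event as in app.py:

from tools.file_redaction import choose_and_run_redactor

out_message, out_files, _ = choose_and_run_redactor(
    file_path="examples/sample.pdf",           # hypothetical input
    image_paths=[],                            # empty: page images are generated on demand
    language="en",
    chosen_redact_entities=["PERSON", "UKPOSTCODE", "STREETNAME"],
    in_redact_method="Text analysis",
    in_allow_list=[["Lambeth"]],               # terms that should never be redacted
)
print(out_message)
print(out_files)  # e.g. ["output/sample_result_as_text.pdf"]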
73 |
+
def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
74 |
+
'''
|
75 |
+
Take a path to an image of a document page, run it through the Presidio image analyser/redactor engines, and return the redacted page.
|
76 |
+
'''
|
77 |
+
|
78 |
+
if not image_paths:
|
79 |
+
|
80 |
+
out_message = "PDF does not exist as images. Converting pages to image"
|
81 |
+
print(out_message)
|
82 |
+
progress(0, desc=out_message)
|
83 |
+
|
84 |
+
image_paths = process_file(file_path)
|
85 |
+
|
86 |
+
# Create a new PDF
|
87 |
+
#pdf = pikepdf.new()
|
88 |
+
|
89 |
+
images = []
|
90 |
+
number_of_pages = len(image_paths)
|
91 |
+
|
92 |
+
out_message = "Redacting pages"
|
93 |
+
print(out_message)
|
94 |
+
progress(0.1, desc=out_message)
|
95 |
+
|
96 |
+
for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
|
97 |
+
|
98 |
+
print("Redacting page ", str(i + 1))
|
99 |
+
|
100 |
+
# Get the image to redact using PIL lib (pillow)
|
101 |
+
image = image_paths[i] #Image.open(image_paths[i])
|
102 |
+
|
103 |
+
# %%
|
104 |
+
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
105 |
+
engine = ImageRedactorEngine(image_analyser)
|
106 |
+
|
107 |
+
if language == 'en':
|
108 |
+
ocr_lang = 'eng'
|
109 |
+
else: ocr_lang = language
|
110 |
+
|
111 |
+
# %%
|
112 |
+
# Redact the image with pink color
|
113 |
+
redacted_image = engine.redact(image,
|
114 |
+
fill=(0, 0, 0),
|
115 |
+
ocr_kwargs={"lang": ocr_lang},
|
116 |
+
allow_list=allow_list,
|
117 |
+
ad_hoc_recognizers= None,
|
118 |
+
**{
|
119 |
+
"language": language,
|
120 |
+
"entities": chosen_redact_entities,
|
121 |
+
"score_threshold": score_threshold
|
122 |
+
},
|
123 |
+
)
|
124 |
+
|
125 |
+
images.append(redacted_image)
|
126 |
+
|
127 |
+
return images
|
128 |
+
|
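A minimal single-image sketch of the Presidio image-redaction step used above, outside the page loop; the image path is hypothetical and Tesseract must be installed (see the Dockerfile):

from PIL import Image
from presidio_image_redactor import ImageAnalyzerEngine, ImageRedactorEngine
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold

image = Image.open("examples/page_1.png")  # hypothetical page image
engine = ImageRedactorEngine(ImageAnalyzerEngine(nlp_analyser))

redacted_image = engine.redact(
    image,
    fill=(0, 0, 0),                 # draw black boxes over detected entities
    ocr_kwargs={"lang": "eng"},
    language="en",
    entities=["PERSON", "PHONE_NUMBER"],
    score_threshold=score_threshold,
)
redacted_image.save("output/page_1_redacted.png")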
129 |
+
def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
130 |
+
'''
|
131 |
+
Redact chosen entities from a pdf that is made up of multiple pages that are not images.
|
132 |
+
'''
|
133 |
+
|
134 |
+
combined_analyzer_results = []
|
135 |
+
analyser_explanations = []
|
136 |
+
annotations_all_pages = []
|
137 |
+
analyzed_bounding_boxes_df = pd.DataFrame()
|
138 |
+
|
139 |
+
pdf = Pdf.open(filename)
|
140 |
+
|
141 |
+
page_num = 0
|
142 |
+
|
143 |
+
for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
|
144 |
+
|
145 |
+
|
146 |
+
print("Page number is: ", page_num)
|
147 |
+
|
148 |
+
annotations_on_page = []
|
149 |
+
analyzed_bounding_boxes = []
|
150 |
+
|
151 |
+
for page_layout in extract_pages(filename, page_numbers = [page_num], maxpages=1):
|
152 |
+
analyzer_results = []
|
153 |
+
|
154 |
+
for text_container in page_layout:
|
155 |
+
if isinstance(text_container, LTTextContainer):
|
156 |
+
text_to_analyze = text_container.get_text()
|
157 |
+
|
158 |
+
analyzer_results = []
|
159 |
+
characters = []
|
160 |
+
|
161 |
+
analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
|
162 |
+
language=language,
|
163 |
+
entities=chosen_redact_entities,
|
164 |
+
score_threshold=score_threshold,
|
165 |
+
return_decision_process=False,
|
166 |
+
allow_list=allow_list)
|
167 |
+
|
168 |
+
#if analyzer_results:
|
169 |
+
# pass
|
170 |
+
#explanation = analyzer_results[0].analysis_explanation.to_dict()
|
171 |
+
#analyser_explanations.append(explanation)
|
172 |
+
characters = [char # This is what we want to include in the list
|
173 |
+
for line in text_container # Loop through each line in text_container
|
174 |
+
if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
|
175 |
+
for char in line] # Loop through each character in the line
|
176 |
+
#if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
|
177 |
+
|
178 |
+
# If any results found
|
179 |
+
print(analyzer_results)
|
180 |
+
|
181 |
+
if len(analyzer_results) > 0 and len(characters) > 0:
|
182 |
+
analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
|
183 |
+
combined_analyzer_results.extend(analyzer_results)
|
184 |
+
|
185 |
+
if len(analyzer_results) > 0:
|
186 |
+
# Create summary df of annotations to be made
|
187 |
+
analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
|
188 |
+
analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
189 |
+
analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
|
190 |
+
analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
|
191 |
+
analyzed_bounding_boxes_df_new['page'] = page_num + 1
|
192 |
+
analyzed_bounding_boxes_df = pd.concat([analyzed_bounding_boxes_df, analyzed_bounding_boxes_df_new], axis = 0)
|
193 |
+
|
194 |
+
for analyzed_bounding_box in analyzed_bounding_boxes:
|
195 |
+
bounding_box = analyzed_bounding_box["boundingBox"]
|
196 |
+
annotation = Dictionary(
|
197 |
+
Type=Name.Annot,
|
198 |
+
Subtype=Name.Highlight,
|
199 |
+
QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
|
200 |
+
Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
|
201 |
+
C=[0, 0, 0],
|
202 |
+
CA=1, # Transparency
|
203 |
+
T=analyzed_bounding_box["result"].entity_type
|
204 |
+
)
|
205 |
+
annotations_on_page.append(annotation)
|
206 |
+
|
207 |
+
annotations_all_pages.extend([annotations_on_page])
|
208 |
+
|
209 |
+
print("For page number: ", page_num, " there are ", len(annotations_all_pages[page_num]), " annotations")
|
210 |
+
page.Annots = pdf.make_indirect(annotations_on_page)
|
211 |
+
|
212 |
+
page_num += 1
|
213 |
+
|
214 |
+
# Extracting data from dictionaries
|
215 |
+
# extracted_data = []
|
216 |
+
# for item in annotations_all_pages:
|
217 |
+
# temp_dict = {}
|
218 |
+
# #print(item)
|
219 |
+
# for key, value in item.items():
|
220 |
+
# if isinstance(value, Decimal):
|
221 |
+
# temp_dict[key] = float(value)
|
222 |
+
# elif isinstance(value, list):
|
223 |
+
# temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
|
224 |
+
# else:
|
225 |
+
# temp_dict[key] = value
|
226 |
+
# extracted_data.append(temp_dict)
|
227 |
+
|
228 |
+
# Creating DataFrame
|
229 |
+
# annotations_out = pd.DataFrame(extracted_data)
|
230 |
+
#print(df)
|
231 |
+
|
232 |
+
#annotations_out.to_csv("examples/annotations.csv")
|
233 |
+
|
234 |
+
analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
|
235 |
+
|
236 |
+
return pdf
|
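For clarity, a stripped-down sketch of the pikepdf highlight-annotation pattern used in redact_text_pdf; the input path and bounding-box coordinates are assumptions:

from pikepdf import Pdf, Dictionary, Name

pdf = Pdf.open("examples/sample.pdf")          # hypothetical input
page = pdf.pages[0]
x0, y0, x1, y1 = 100, 700, 250, 715            # assumed character bounding box (PDF points)

annotation = Dictionary(
    Type=Name.Annot,
    Subtype=Name.Highlight,
    QuadPoints=[x0, y1, x1, y1, x0, y0, x1, y0],
    Rect=[x0, y0, x1, y1],
    C=[0, 0, 0],                               # black highlight colour
    CA=1,                                      # fully opaque
    T="PERSON",                                # label with the detected entity type
)
page.Annots = pdf.make_indirect([annotation])
pdf.save("output/sample_annotated.pdf")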
tools/helper_functions.py
ADDED
@@ -0,0 +1,126 @@
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
import gzip
|
4 |
+
import pickle
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
def get_file_path_end(file_path):
|
8 |
+
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
9 |
+
basename = os.path.basename(file_path)
|
10 |
+
|
11 |
+
# Then, split the basename and its extension and return only the basename without the extension
|
12 |
+
filename_without_extension, _ = os.path.splitext(basename)
|
13 |
+
|
14 |
+
#print(filename_without_extension)
|
15 |
+
|
16 |
+
return filename_without_extension
|
17 |
+
|
18 |
+
def get_file_path_end_with_ext(file_path):
|
19 |
+
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
20 |
+
basename = os.path.basename(file_path)
|
21 |
+
|
22 |
+
return basename
|
23 |
+
|
24 |
+
def ensure_output_folder_exists():
|
25 |
+
"""Checks if the 'output/' folder exists, creates it if not."""
|
26 |
+
|
27 |
+
folder_name = "output/"
|
28 |
+
|
29 |
+
if not os.path.exists(folder_name):
|
30 |
+
# Create the folder if it doesn't exist
|
31 |
+
os.makedirs(folder_name)
|
32 |
+
print(f"Created the 'output/' folder.")
|
33 |
+
else:
|
34 |
+
print(f"The 'output/' folder already exists.")
|
35 |
+
|
36 |
+
def detect_file_type(filename):
|
37 |
+
"""Detect the file type based on its extension."""
|
38 |
+
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
39 |
+
return 'csv'
|
40 |
+
elif filename.endswith('.xlsx'):
|
41 |
+
return 'xlsx'
|
42 |
+
elif filename.endswith('.parquet'):
|
43 |
+
return 'parquet'
|
44 |
+
elif filename.endswith('.pkl.gz'):
|
45 |
+
return 'pkl.gz'
|
46 |
+
elif filename.endswith('.pkl'):
|
47 |
+
return 'pkl'
|
48 |
+
elif filename.endswith('.npz'):
|
49 |
+
return 'npz'
|
50 |
+
else:
|
51 |
+
raise ValueError("Unsupported file type.")
|
52 |
+
|
53 |
+
|
54 |
+
def read_file(filename, headers=0):
|
55 |
+
"""Read the file based on its detected type."""
|
56 |
+
file_type = detect_file_type(filename)
|
57 |
+
|
58 |
+
print("Loading in file")
|
59 |
+
|
60 |
+
if file_type == 'csv':
|
61 |
+
file = pd.read_csv(filename, low_memory=False, header=headers)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
|
62 |
+
elif file_type == 'xlsx':
|
63 |
+
file = pd.read_excel(filename, header=headers)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
|
64 |
+
elif file_type == 'parquet':
|
65 |
+
file = pd.read_parquet(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
|
66 |
+
elif file_type == 'pkl.gz':
|
67 |
+
with gzip.open(filename, 'rb') as file:
|
68 |
+
file = pickle.load(file)
|
69 |
+
#file = pd.read_pickle(filename)
|
70 |
+
elif file_type == 'npz':
|
71 |
+
file = np.load(filename)['arr_0']
|
72 |
+
|
73 |
+
# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
|
74 |
+
if "compress" in filename:
|
75 |
+
file /= 100
|
76 |
+
|
77 |
+
print("File load complete")
|
78 |
+
|
79 |
+
return file
|
80 |
+
|
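A short usage sketch for the loaders above; the file names are assumptions:

from tools.helper_functions import detect_file_type, read_file

print(detect_file_type("notes.csv"))          # 'csv'
print(detect_file_type("embeddings.npz"))     # 'npz'

df = read_file("examples/data.csv")           # pandas DataFrame for csv/xlsx/parquet
docs = read_file("output/file_docs.pkl.gz")   # unpickled object for .pkl.gz files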
81 |
+
# The following function is only relevant for locally-created executable files based on this app (pyinstaller creates an _internal folder containing tesseract and poppler; these need to be added to the system PATH for the app to run)
|
82 |
+
def add_folder_to_path(folder_path: str):
|
83 |
+
'''
|
84 |
+
Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
|
85 |
+
'''
|
86 |
+
|
87 |
+
if os.path.exists(folder_path) and os.path.isdir(folder_path):
|
88 |
+
print(folder_path, "folder exists.")
|
89 |
+
|
90 |
+
# Resolve relative path to absolute path
|
91 |
+
absolute_path = os.path.abspath(folder_path)
|
92 |
+
|
93 |
+
current_path = os.environ['PATH']
|
94 |
+
if absolute_path not in current_path.split(os.pathsep):
|
95 |
+
full_path_extension = absolute_path + os.pathsep + current_path
|
96 |
+
os.environ['PATH'] = full_path_extension
|
97 |
+
print(f"Updated PATH with: ", full_path_extension)
|
98 |
+
else:
|
99 |
+
print(f"Directory {folder_path} already exists in PATH.")
|
100 |
+
else:
|
101 |
+
print(f"Folder not found at {folder_path} - not added to PATH")
|
102 |
+
|
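A hedged example of how this is used for a pyinstaller build; the bundled folder names below are assumptions and depend on the local build layout:

from tools.helper_functions import add_folder_to_path

# Hypothetical bundled binaries created alongside a pyinstaller executable
add_folder_to_path("tesseract/")
add_folder_to_path("poppler/bin/")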
103 |
+
def custom_regex_load(in_file, headers = None):
|
104 |
+
'''
|
105 |
+
When file is loaded, update the column dropdown choices and write to relevant data states.
|
106 |
+
'''
|
107 |
+
|
108 |
+
custom_regex = pd.DataFrame()
|
109 |
+
|
110 |
+
file_list = [string.name for string in in_file]
|
111 |
+
|
112 |
+
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
113 |
+
if regex_file_names:
|
114 |
+
regex_file_name = regex_file_names[0]
|
115 |
+
custom_regex = read_file(regex_file_name, headers)
|
116 |
+
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
117 |
+
|
118 |
+
output_text = "Data file loaded."
|
119 |
+
print(output_text)
|
120 |
+
else:
|
121 |
+
error = "No regex file provided."
|
122 |
+
print(error)
|
123 |
+
output_text = error
|
124 |
+
return error, custom_regex
|
125 |
+
|
126 |
+
return output_text, custom_regex
|
tools/load_spacy_model_custom_recognisers.py
ADDED
@@ -0,0 +1,168 @@
1 |
+
# %%
|
2 |
+
from typing import List
|
3 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
4 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
5 |
+
import spacy
|
6 |
+
spacy.prefer_gpu()
|
7 |
+
from spacy.cli.download import download
|
8 |
+
import re
|
9 |
+
|
10 |
+
# %%
|
11 |
+
model_name = "en_core_web_sm" #"en_core_web_trf"
|
12 |
+
score_threshold = 0.001
|
13 |
+
|
14 |
+
# %% [markdown]
|
15 |
+
# #### Custom recognisers
|
16 |
+
|
17 |
+
# %%
|
18 |
+
# Custom title recogniser
|
19 |
+
import re
|
20 |
+
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
|
21 |
+
titles_regex = '\\b' + ' \\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + ' \\b'
|
22 |
+
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
|
23 |
+
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern])
|
24 |
+
|
25 |
+
# %%
|
26 |
+
# Custom postcode recogniser
|
27 |
+
|
28 |
+
# Define the regex pattern in a Presidio `Pattern` object:
|
29 |
+
ukpostcode_pattern = Pattern(name="ukpostcode_pattern",regex="\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b", score = 1)
|
30 |
+
|
31 |
+
# Define the recognizer with one or more patterns
|
32 |
+
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
|
33 |
+
|
34 |
+
# %%
|
35 |
+
# Examples for testing
|
36 |
+
|
37 |
+
#text = "I live in 510 Broad st SE5 9NG ."
|
38 |
+
|
39 |
+
#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
|
40 |
+
#print("Result:")
|
41 |
+
#print(numbers_result)
|
42 |
+
|
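A runnable version of the commented-out test above, also exercising the titles recogniser; the sample sentences are illustrative only:

text = "I live in 510 Broad st SE5 9NG ."

postcode_results = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
title_results = titles_recogniser.analyze(text="Dear Dr Smith", entities=["TITLES"])

print(postcode_results)  # e.g. a UKPOSTCODE result covering "SE5 9NG"
print(title_results)     # e.g. a TITLES result covering "Dr "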
43 |
+
# %%
|
44 |
+
def extract_street_name(text:str) -> str:
|
45 |
+
"""
|
46 |
+
Extracts the street name and preceding word (that should contain at least one number) from the given text.
|
47 |
+
|
48 |
+
"""
|
49 |
+
|
50 |
+
street_types = [
|
51 |
+
'Street', 'St', 'Boulevard', 'Blvd', 'Highway', 'Hwy', 'Broadway', 'Freeway',
|
52 |
+
'Causeway', 'Cswy', 'Expressway', 'Way', 'Walk', 'Lane', 'Ln', 'Road', 'Rd',
|
53 |
+
'Avenue', 'Ave', 'Circle', 'Cir', 'Cove', 'Cv', 'Drive', 'Dr', 'Parkway', 'Pkwy',
|
54 |
+
'Park', 'Court', 'Ct', 'Square', 'Sq', 'Loop', 'Place', 'Pl', 'Parade', 'Estate',
|
55 |
+
'Alley', 'Arcade', 'Avenue', 'Ave', 'Bay', 'Bend', 'Brae', 'Byway', 'Close', 'Corner', 'Cove',
|
56 |
+
'Crescent', 'Cres', 'Cul-de-sac', 'Dell', 'Drive', 'Dr', 'Esplanade', 'Glen', 'Green', 'Grove', 'Heights', 'Hts',
|
57 |
+
'Mews', 'Parade', 'Path', 'Piazza', 'Promenade', 'Quay', 'Ridge', 'Row', 'Terrace', 'Ter', 'Track', 'Trail', 'View', 'Villas',
|
58 |
+
'Marsh', 'Embankment', 'Cut', 'Hill', 'Passage', 'Rise', 'Vale', 'Side'
|
59 |
+
]
|
60 |
+
|
61 |
+
# Dynamically construct the regex pattern with all possible street types
|
62 |
+
street_types_pattern = '|'.join(rf"{re.escape(street_type)}" for street_type in street_types)
|
63 |
+
|
64 |
+
# The overall regex pattern to capture the street name and preceding word(s)
|
65 |
+
|
66 |
+
pattern = rf'(?P<preceding_word>\w*\d\w*)\s*'
|
67 |
+
pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
|
68 |
+
|
69 |
+
# Find all matches in text
|
70 |
+
matches = re.finditer(pattern, text, re.IGNORECASE)
|
71 |
+
|
72 |
+
start_positions = []
|
73 |
+
end_positions = []
|
74 |
+
|
75 |
+
for match in matches:
|
76 |
+
preceding_word = match.group('preceding_word').strip()
|
77 |
+
street_name = match.group('street_name').strip()
|
78 |
+
start_pos = match.start()
|
79 |
+
end_pos = match.end()
|
80 |
+
print(f"Start: {start_pos}, End: {end_pos}")
|
81 |
+
print(f"Preceding words: {preceding_word}")
|
82 |
+
print(f"Street name: {street_name}")
|
83 |
+
print()
|
84 |
+
|
85 |
+
start_positions.append(start_pos)
|
86 |
+
end_positions.append(end_pos)
|
87 |
+
|
88 |
+
return start_positions, end_positions
|
89 |
+
|
90 |
+
|
91 |
+
# %%
|
92 |
+
# Some examples for testing
|
93 |
+
|
94 |
+
#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
|
95 |
+
#text = "Roberto lives in Five 10 Broad st in Oregon"
|
96 |
+
#text = "Roberto lives in 55 Oregon Square"
|
97 |
+
#text = "There is 51a no way I will do that"
|
98 |
+
#text = "I am writing to apply for"
|
99 |
+
|
100 |
+
#extract_street_name(text)
|
101 |
+
|
102 |
+
# %%
|
103 |
+
class StreetNameRecognizer(EntityRecognizer):
|
104 |
+
|
105 |
+
def load(self) -> None:
|
106 |
+
"""No loading is required."""
|
107 |
+
pass
|
108 |
+
|
109 |
+
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
110 |
+
"""
|
111 |
+
Logic for detecting a specific PII
|
112 |
+
"""
|
113 |
+
|
114 |
+
start_pos, end_pos = extract_street_name(text)
|
115 |
+
|
116 |
+
results = []
|
117 |
+
|
118 |
+
for i in range(0, len(start_pos)):
|
119 |
+
|
120 |
+
result = RecognizerResult(
|
121 |
+
entity_type="STREETNAME",
|
122 |
+
start = start_pos[i],
|
123 |
+
end = end_pos[i],
|
124 |
+
score= 1
|
125 |
+
)
|
126 |
+
|
127 |
+
results.append(result)
|
128 |
+
|
129 |
+
return results
|
130 |
+
|
131 |
+
street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
|
132 |
+
|
133 |
+
# %%
|
134 |
+
# Create a class inheriting from SpacyNlpEngine
|
135 |
+
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
136 |
+
def __init__(self, loaded_spacy_model):
|
137 |
+
super().__init__()
|
138 |
+
self.nlp = {"en": loaded_spacy_model}
|
139 |
+
|
140 |
+
# %%
|
141 |
+
# Load spacy model
|
142 |
+
try:
|
143 |
+
import en_core_web_lg
|
144 |
+
nlp = en_core_web_lg.load()
|
145 |
+
print("Successfully imported spaCy model")
|
146 |
+
|
147 |
+
except:
|
148 |
+
download("en_core_web_lg")
|
149 |
+
nlp = spacy.load("en_core_web_lg")
|
150 |
+
print("Successfully downloaded and imported spaCy model")
|
151 |
+
|
152 |
+
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
153 |
+
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
# %%
|
158 |
+
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
159 |
+
default_score_threshold=score_threshold,
|
160 |
+
supported_languages=["en"],
|
161 |
+
log_decision_process=True,
|
162 |
+
)
|
163 |
+
|
164 |
+
# %%
|
165 |
+
nlp_analyser.registry.add_recognizer(street_recogniser)
|
166 |
+
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
167 |
+
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
168 |
+
|
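A small sketch of the combined analyser with the custom recognisers registered above; the example sentence is illustrative only:

results = nlp_analyser.analyze(
    text="Mr Jones lives at 12 Acre Lane, SW2 5SG.",
    language="en",
    entities=["TITLES", "STREETNAME", "UKPOSTCODE", "PERSON"],
    score_threshold=score_threshold,
    return_decision_process=False,
)
for result in results:
    print(result.entity_type, result.start, result.end, result.score)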
tools/presidio_analyzer_custom.py
ADDED
@@ -0,0 +1,114 @@
1 |
+
import gradio as gr
|
2 |
+
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
|
3 |
+
from tqdm import tqdm
|
4 |
+
|
5 |
+
from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine
|
6 |
+
from presidio_analyzer.nlp_engine import NlpArtifacts
|
7 |
+
|
8 |
+
def analyze_iterator_custom(
|
9 |
+
self,
|
10 |
+
texts: Iterable[Union[str, bool, float, int]],
|
11 |
+
language: str,
|
12 |
+
list_length:int,
|
13 |
+
progress=gr.Progress(),
|
14 |
+
**kwargs,
|
15 |
+
) -> List[List[RecognizerResult]]:
|
16 |
+
"""
|
17 |
+
Analyze an iterable of strings.
|
18 |
+
|
19 |
+
:param texts: A list containing strings to be analyzed.
|
20 |
+
:param language: Input language
|
21 |
+
:param list_length: Length of the input list.
|
22 |
+
:param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
|
23 |
+
"""
|
24 |
+
|
25 |
+
# validate types
|
26 |
+
texts = self._validate_types(texts)
|
27 |
+
|
28 |
+
# Process the texts as batch for improved performance
|
29 |
+
nlp_artifacts_batch: Iterator[
|
30 |
+
Tuple[str, NlpArtifacts]
|
31 |
+
] = self.analyzer_engine.nlp_engine.process_batch(
|
32 |
+
texts=texts, language=language
|
33 |
+
)
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
list_results = []
|
38 |
+
for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total = list_length, desc = "Analysing text for personal information", unit = "rows"):
|
39 |
+
results = self.analyzer_engine.analyze(
|
40 |
+
text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
|
41 |
+
)
|
42 |
+
|
43 |
+
list_results.append(results)
|
44 |
+
|
45 |
+
return list_results
|
46 |
+
|
47 |
+
def analyze_dict(
|
48 |
+
self,
|
49 |
+
input_dict: Dict[str, Union[Any, Iterable[Any]]],
|
50 |
+
language: str,
|
51 |
+
keys_to_skip: Optional[List[str]] = None,
|
52 |
+
**kwargs,
|
53 |
+
) -> Iterator[DictAnalyzerResult]:
|
54 |
+
"""
|
55 |
+
Analyze a dictionary of keys (strings) and values/iterable of values.
|
56 |
+
|
57 |
+
Non-string values are returned as is.
|
58 |
+
|
59 |
+
:param input_dict: The input dictionary for analysis
|
60 |
+
:param language: Input language
|
61 |
+
:param keys_to_skip: Keys to ignore during analysis
|
62 |
+
:param kwargs: Additional keyword arguments
|
63 |
+
for the `AnalyzerEngine.analyze` method.
|
64 |
+
Use this to pass arguments to the analyze method,
|
65 |
+
such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
|
66 |
+
See `AnalyzerEngine.analyze` for the full list.
|
67 |
+
"""
|
68 |
+
|
69 |
+
context = []
|
70 |
+
if "context" in kwargs:
|
71 |
+
context = kwargs["context"]
|
72 |
+
del kwargs["context"]
|
73 |
+
|
74 |
+
if not keys_to_skip:
|
75 |
+
keys_to_skip = []
|
76 |
+
|
77 |
+
|
78 |
+
for key, value in input_dict.items():
|
79 |
+
if not value or key in keys_to_skip:
|
80 |
+
yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
|
81 |
+
continue # skip this key as requested
|
82 |
+
|
83 |
+
# Add the key as an additional context
|
84 |
+
specific_context = context[:]
|
85 |
+
specific_context.append(key)
|
86 |
+
|
87 |
+
if type(value) in (str, int, bool, float):
|
88 |
+
results: List[RecognizerResult] = self.analyzer_engine.analyze(
|
89 |
+
text=str(value), language=language, context=[key], **kwargs
|
90 |
+
)
|
91 |
+
elif isinstance(value, dict):
|
92 |
+
new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
|
93 |
+
results = self.analyze_dict(
|
94 |
+
input_dict=value,
|
95 |
+
language=language,
|
96 |
+
context=specific_context,
|
97 |
+
keys_to_skip=new_keys_to_skip,
|
98 |
+
**kwargs,
|
99 |
+
)
|
100 |
+
elif isinstance(value, Iterable):
|
101 |
+
# Analyze each item of the iterable as a batch, with a progress bar
|
102 |
+
list_length = len(value)
|
103 |
+
|
104 |
+
results: List[List[RecognizerResult]] = analyze_iterator_custom(self,
|
105 |
+
texts=value,
|
106 |
+
language=language,
|
107 |
+
context=specific_context,
|
108 |
+
list_length=list_length,
|
109 |
+
**kwargs,
|
110 |
+
)
|
111 |
+
else:
|
112 |
+
raise ValueError(f"type {type(value)} is unsupported.")
|
113 |
+
|
114 |
+
yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
|
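A hedged sketch of how these custom batch functions might be called. They mirror presidio_analyzer's BatchAnalyzerEngine methods (which expose .analyzer_engine and batch NLP processing) while adding a Gradio progress bar, so the assumption here is that they are invoked with a BatchAnalyzerEngine instance; the DataFrame contents are illustrative and the progress bar expects a Gradio context:

import pandas as pd
from presidio_analyzer import BatchAnalyzerEngine
from tools.load_spacy_model_custom_recognisers import nlp_analyser
from tools.presidio_analyzer_custom import analyze_dict

batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
df = pd.DataFrame({"notes": ["Call Mr Jones on Monday", "Invoice 123 was paid"]})

# analyze_dict yields one DictAnalyzerResult per column, analysing each cell in batch
for column_result in analyze_dict(batch_analyzer, df.to_dict(orient="list"), language="en"):
    print(column_result.key, column_result.recognizer_results)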
tools/unstructured_funcs.py
ADDED
@@ -0,0 +1,884 @@
1 |
+
from unstructured.partition.auto import partition
|
2 |
+
from unstructured.chunking.title import chunk_by_title
|
3 |
+
from unstructured.chunking.basic import chunk_elements
|
4 |
+
from unstructured.documents.elements import Element, Title, CompositeElement
|
5 |
+
from unstructured.staging.base import convert_to_dataframe
|
6 |
+
from typing import Type, List, Literal, Tuple
|
7 |
+
|
8 |
+
from unstructured.cleaners.core import replace_unicode_quotes, clean_non_ascii_chars, clean_ordered_bullets, group_broken_paragraphs, replace_unicode_quotes, clean, clean_trailing_punctuation, remove_punctuation, bytes_string_to_string
|
9 |
+
import gradio as gr
|
10 |
+
import time
|
11 |
+
import pandas as pd
|
12 |
+
import re
|
13 |
+
import gzip
|
14 |
+
import pickle
|
15 |
+
from pydantic import BaseModel, Field
|
16 |
+
|
17 |
+
from tools.helper_functions import get_file_path_end, get_file_path_end_with_ext
|
18 |
+
|
19 |
+
# Creating an alias for pandas DataFrame using Type
|
20 |
+
PandasDataFrame = Type[pd.DataFrame]
|
21 |
+
|
22 |
+
# %%
|
23 |
+
# pdf partitioning strategy vars
|
24 |
+
pdf_partition_strat = "ocr_only" # ["fast", "ocr_only", "hi_res"]
|
25 |
+
|
26 |
+
# %%
|
27 |
+
# Element metadata modification vars
|
28 |
+
meta_keys_to_filter = ["file_directory", "filetype"]
|
29 |
+
element_types_to_filter = ['UncategorizedText', 'Header']
|
30 |
+
|
31 |
+
# %%
|
32 |
+
# Clean function vars
|
33 |
+
|
34 |
+
bytes_to_string=False
|
35 |
+
replace_quotes=True
|
36 |
+
clean_non_ascii=False
|
37 |
+
clean_ordered_list=True
|
38 |
+
group_paragraphs=True
|
39 |
+
trailing_punctuation=False
|
40 |
+
all_punctuation=False
|
41 |
+
clean_text=True
|
42 |
+
extra_whitespace=True
|
43 |
+
dashes=True
|
44 |
+
bullets=True
|
45 |
+
lowercase=False
|
46 |
+
|
47 |
+
# %%
|
48 |
+
# Chunking vars
|
49 |
+
|
50 |
+
minimum_chunk_length = 2000
|
51 |
+
start_new_chunk_after_end_of_this_element_length = 2000
|
52 |
+
hard_max_character_length_chunks = 3000
|
53 |
+
multipage_sections=True
|
54 |
+
overlap_all=True
|
55 |
+
include_orig_elements=True
|
56 |
+
|
57 |
+
# %%
|
58 |
+
class Document(BaseModel):
|
59 |
+
"""Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
|
60 |
+
|
61 |
+
page_content: str
|
62 |
+
"""String text."""
|
63 |
+
metadata: dict = Field(default_factory=dict)
|
64 |
+
"""Arbitrary metadata about the page content (e.g., source, relationships to other
|
65 |
+
documents, etc.).
|
66 |
+
"""
|
67 |
+
type: Literal["Document"] = "Document"
|
68 |
+
|
69 |
+
# %%
|
70 |
+
def create_title_id_dict(elements:List[Element]):
|
71 |
+
|
72 |
+
# Assuming the object is stored in a variable named 'elements_list'
|
73 |
+
titles = [item.text for item in elements if isinstance(item, Title)]
|
74 |
+
|
75 |
+
#### Get all elements under these titles
|
76 |
+
chapter_ids = {}
|
77 |
+
for element in elements:
|
78 |
+
for chapter in titles:
|
79 |
+
if element.text == chapter and element.category == "Title":
|
80 |
+
chapter_ids[element._element_id] = chapter
|
81 |
+
break
|
82 |
+
|
83 |
+
chapter_to_id = {v: k for k, v in chapter_ids.items()}
|
84 |
+
|
85 |
+
return chapter_ids, chapter_to_id
|
86 |
+
|
87 |
+
# %%
|
88 |
+
def filter_elements(elements:List[Element], excluded_elements: List[str] = ['']):
|
89 |
+
"""
|
90 |
+
Filter out elements from a list based on their categories.
|
91 |
+
|
92 |
+
Args:
|
93 |
+
elements: The list of elements to filter.
|
94 |
+
excluded_elements: A list of element categories to exclude.
|
95 |
+
|
96 |
+
Returns:
|
97 |
+
A new list containing the filtered elements.
|
98 |
+
"""
|
99 |
+
filtered_elements = []
|
100 |
+
for element in elements:
|
101 |
+
if element.category not in excluded_elements:
|
102 |
+
filtered_elements.append(element)
|
103 |
+
return filtered_elements
|
104 |
+
|
105 |
+
# %%
|
106 |
+
def remove_keys_from_meta(
|
107 |
+
elements: List[Element],
|
108 |
+
meta_remove_keys: List[str],
|
109 |
+
excluded_element_types: List[str] = []
|
110 |
+
) -> List[Element]:
|
111 |
+
'''
|
112 |
+
Remove specified metadata keys from an Unstructured Element object
|
113 |
+
'''
|
114 |
+
|
115 |
+
for element in elements:
|
116 |
+
if element.category not in excluded_element_types:
|
117 |
+
for key in meta_remove_keys:
|
118 |
+
try:
|
119 |
+
del element.metadata.__dict__[key] # Directly modify metadata
|
120 |
+
except KeyError:
|
121 |
+
print(f"Key '{key}' not found in element metadata.")
|
122 |
+
|
123 |
+
return elements
|
124 |
+
|
125 |
+
def filter_elements_and_metadata(
|
126 |
+
elements: List[Element],
|
127 |
+
excluded_categories: List[str] = [],
|
128 |
+
meta_remove_keys: List[str] = [],
|
129 |
+
) -> List[Element]:
|
130 |
+
"""
|
131 |
+
Filters elements based on categories and removes specified metadata keys.
|
132 |
+
|
133 |
+
Args:
|
134 |
+
elements: The list of elements to process.
|
135 |
+
excluded_categories: A list of element categories to exclude.
|
136 |
+
meta_remove_keys: A list of metadata keys to remove.
|
137 |
+
|
138 |
+
Returns:
|
139 |
+
A new list containing the processed elements.
|
140 |
+
"""
|
141 |
+
|
142 |
+
filtered_elements = []
|
143 |
+
for element in elements:
|
144 |
+
if element.category not in excluded_categories:
|
145 |
+
for key in meta_remove_keys:
|
146 |
+
try:
|
147 |
+
del element.metadata.__dict__[key]
|
148 |
+
except KeyError:
|
149 |
+
# Better logging/error handling instead of just printing
|
150 |
+
# Use a proper logger or raise a warning/exception
|
151 |
+
pass
|
152 |
+
filtered_elements.append(element)
|
153 |
+
|
154 |
+
return filtered_elements
|
155 |
+
|
156 |
+
# %%
|
157 |
+
def add_parent_title_to_meta(elements:List[Element], chapter_ids:List[str], excluded_element_types:List[str]=['']) -> List[Element]:
|
158 |
+
'''
|
159 |
+
Add parent title to Unstructured metadata elements
|
160 |
+
|
161 |
+
'''
|
162 |
+
for element in elements:
|
163 |
+
if element.category in excluded_element_types:
|
164 |
+
pass
|
165 |
+
|
166 |
+
else:
|
167 |
+
meta = element.metadata.to_dict()
|
168 |
+
|
169 |
+
if "parent_id" in meta and meta["parent_id"] in chapter_ids and "title_name" not in meta:
|
170 |
+
title_name = chapter_ids[meta["parent_id"]]
|
171 |
+
# Directly modify the existing element metadata object
|
172 |
+
element.metadata.title_name = title_name
|
173 |
+
|
174 |
+
return elements
|
175 |
+
|
176 |
+
|
177 |
+
def chunk_all_elements(elements:List[Element], file_name_base:str, chunk_type:str = "Basic_chunking", minimum_chunk_length:int=minimum_chunk_length, start_new_chunk_after_end_of_this_element_length:int=start_new_chunk_after_end_of_this_element_length, hard_max_character_length_chunks:int=hard_max_character_length_chunks, multipage_sections:bool=multipage_sections, overlap_all:bool=overlap_all, include_orig_elements:bool=include_orig_elements):
|
178 |
+
|
179 |
+
'''
|
180 |
+
Use Unstructured.io functions to chunk an Element object by Title or across all elements.
|
181 |
+
'''
|
182 |
+
output_files = []
|
183 |
+
output_summary = ""
|
184 |
+
|
185 |
+
chapter_ids, chapter_to_id = create_title_id_dict(elements)
|
186 |
+
|
187 |
+
### Break text down into chunks
|
188 |
+
|
189 |
+
try:
|
190 |
+
|
191 |
+
if chunk_type == "Chunk within title":
|
192 |
+
chunks = chunk_by_title(
|
193 |
+
elements,
|
194 |
+
include_orig_elements=include_orig_elements,
|
195 |
+
combine_text_under_n_chars=minimum_chunk_length,
|
196 |
+
new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
|
197 |
+
max_characters=hard_max_character_length_chunks,
|
198 |
+
multipage_sections=multipage_sections,
|
199 |
+
overlap_all=overlap_all
|
200 |
+
)
|
201 |
+
|
202 |
+
else:
|
203 |
+
chunks = chunk_elements(
|
204 |
+
elements,
|
205 |
+
include_orig_elements=include_orig_elements,
|
206 |
+
new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
|
207 |
+
max_characters=hard_max_character_length_chunks,
|
208 |
+
overlap_all=overlap_all
|
209 |
+
)
|
210 |
+
|
211 |
+
except Exception as output_summary:
|
212 |
+
print(output_summary)
|
213 |
+
return output_summary, output_files, file_name_base
|
214 |
+
|
215 |
+
chunk_sections, chunk_df, chunks_out = element_chunks_to_document(chunks, chapter_ids)
|
216 |
+
|
217 |
+
file_name_suffix = "_chunk"
|
218 |
+
|
219 |
+
# The new file name does not overwrite the old file name as the 'chunked' elements are only used as an output, and not an input to other functions
|
220 |
+
output_summary, output_files, file_name_base_new = export_elements_as_table_to_file(chunks_out, file_name_base, file_name_suffix, chunk_sections)
|
221 |
+
|
222 |
+
return output_summary, output_files, file_name_base
|
223 |
+
|
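A hypothetical follow-on showing how the chunker above might be called with elements returned by the partitioning step; the element list, file name and sizes are assumptions, and the 'output/' folder must already exist:

summary, chunk_files, name_base = chunk_all_elements(
    elements,                                   # hypothetical: output of the partitioning step
    file_name_base="sample_doc",
    chunk_type="Chunk within title",
    minimum_chunk_length=2000,
    hard_max_character_length_chunks=3000,
)
print(summary)
print(chunk_files)  # a CSV of chunks plus a gzipped pickle of Document objects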
224 |
+
# %%
|
225 |
+
def element_chunks_to_document(chunks:CompositeElement, chapter_ids:List[str]) -> Tuple[List[Document], PandasDataFrame, List[str]]:
|
226 |
+
'''
|
227 |
+
Take an Unstructured.io chunk_by_title output with the original parsed document elements and turn it into a Document format commonly used by vector databases, and a Pandas dataframe.
|
228 |
+
'''
|
229 |
+
chunk_sections = []
|
230 |
+
current_title_id = ''
|
231 |
+
current_title = ''
|
232 |
+
last_page = ''
|
233 |
+
chunk_df_list = []
|
234 |
+
|
235 |
+
for chunk in chunks:
|
236 |
+
chunk_meta = chunk.metadata.to_dict()
|
237 |
+
true_element_ids = []
|
238 |
+
element_categories = []
|
239 |
+
titles = []
|
240 |
+
titles_id = []
|
241 |
+
|
242 |
+
if "page_number" in chunk_meta:
|
243 |
+
last_page = chunk_meta["page_number"]
|
244 |
+
|
245 |
+
chunk_text = chunk.text
|
246 |
+
#chunk_page_number = chunk.metadata.to_dict()["page_number"]
|
247 |
+
|
248 |
+
# If the same element text is found, add the element_id to the chunk (not perfect: this will fail if the same text is seen multiple times)
|
249 |
+
for element in chunk.metadata.orig_elements:
|
250 |
+
|
251 |
+
#element_text = element.text
|
252 |
+
element_id = element._element_id
|
253 |
+
element_category = element.category
|
254 |
+
element_meta = element.metadata.to_dict()
|
255 |
+
|
256 |
+
if "page_number" in element_meta:
|
257 |
+
element_page_number = element_meta["page_number"]
|
258 |
+
last_page = element_page_number
|
259 |
+
|
260 |
+
true_element_ids.append(element_id)
|
261 |
+
element_categories.append(element_category)
|
262 |
+
|
263 |
+
|
264 |
+
# Set new metadata for chunk
|
265 |
+
if "page_number" in element_meta:
|
266 |
+
chunk_meta["last_page_number"] = last_page
|
267 |
+
|
268 |
+
chunk_meta["true_element_ids"] = true_element_ids
|
269 |
+
|
270 |
+
for loop_id in chunk_meta['true_element_ids']:
|
271 |
+
if loop_id in chapter_ids:
|
272 |
+
current_title = chapter_ids[loop_id]
|
273 |
+
current_title_id = loop_id
|
274 |
+
|
275 |
+
titles.append(current_title)
|
276 |
+
titles_id.append(current_title_id)
|
277 |
+
|
278 |
+
chunk_meta['titles'] = titles
|
279 |
+
chunk_meta['titles_id'] = titles_id
|
280 |
+
|
281 |
+
# Remove original elements data for documents
|
282 |
+
chunk_meta.pop('orig_elements')
|
283 |
+
|
284 |
+
chunk_dict_for_df = chunk_meta.copy()
|
285 |
+
chunk_dict_for_df['text'] = chunk.text
|
286 |
+
|
287 |
+
chunk_df_list.append(chunk_dict_for_df)
|
288 |
+
|
289 |
+
|
290 |
+
chunk_doc = [Document(page_content=chunk_text, metadata=chunk_meta)]
|
291 |
+
chunk_sections.extend(chunk_doc)
|
292 |
+
|
293 |
+
## Write metadata back to elements
|
294 |
+
chunk.metadata.__dict__ = chunk_meta
|
295 |
+
|
296 |
+
chunk_df = pd.DataFrame(chunk_df_list)
|
297 |
+
|
298 |
+
# print("Doc format: ", chunk_sections)
|
299 |
+
|
300 |
+
return chunk_sections, chunk_df, chunks
|
301 |
+
|
302 |
+
# %%
|
303 |
+
def write_elements_to_documents(elements:List[Element]):
|
304 |
+
'''
|
305 |
+
Take Unstructured.io parsed elements and write them into a 'Document' format commonly used by vector databases.
|
306 |
+
'''
|
307 |
+
|
308 |
+
doc_sections = []
|
309 |
+
|
310 |
+
for element in elements:
|
311 |
+
meta = element.metadata.to_dict()
|
312 |
+
|
313 |
+
meta["type"] = element.category
|
314 |
+
meta["element_id"] = element._element_id
|
315 |
+
|
316 |
+
element_doc = [Document(page_content=element.text, metadata= meta)]
|
317 |
+
doc_sections.extend(element_doc)
|
318 |
+
|
319 |
+
#print("Doc format: ", doc_sections)
|
320 |
+
|
321 |
+
|
322 |
+
return doc_sections
|
323 |
+
|
324 |
+
# %%
|
325 |
+
def clean_elements(elements:List[Element], dropdown_options: List[str] = [''],
|
326 |
+
output_name:str = "combined_elements",
|
327 |
+
bytes_to_string:bool=False,
|
328 |
+
replace_quotes:bool=True,
|
329 |
+
clean_non_ascii:bool=False,
|
330 |
+
clean_ordered_list:bool=True,
|
331 |
+
group_paragraphs:bool=True,
|
332 |
+
trailing_punctuation:bool=False,
|
333 |
+
all_punctuation:bool=False,
|
334 |
+
clean_text:bool=True,
|
335 |
+
extra_whitespace:bool=True,
|
336 |
+
dashes:bool=True,
|
337 |
+
bullets:bool=True,
|
338 |
+
lowercase:bool=False) -> List[Element]:
|
339 |
+
|
340 |
+
'''
|
341 |
+
Apply Unstructured cleaning processes to a list of parse elements.
|
342 |
+
'''
|
343 |
+
|
344 |
+
out_files = []
|
345 |
+
output_summary = ""
|
346 |
+
|
347 |
+
# Set variables to True based on dropdown selections
|
348 |
+
for option in dropdown_options:
|
349 |
+
if option == "Convert bytes to string":
|
350 |
+
bytes_to_string = True
|
351 |
+
elif option == "Replace quotes":
|
352 |
+
replace_quotes = True
|
353 |
+
elif option == "Clean non ASCII":
|
354 |
+
clean_non_ascii = True
|
355 |
+
elif option == "Clean ordered list":
|
356 |
+
clean_ordered_list = True
|
357 |
+
elif option == "Group paragraphs":
|
358 |
+
group_paragraphs = True
|
359 |
+
elif option == "Remove trailing punctuation":
|
360 |
+
trailing_punctuation = True
|
361 |
+
elif option == "Remove all punctuation":
|
362 |
+
all_punctuation = True
|
363 |
+
elif option == "Clean text":
|
364 |
+
clean_text = True
|
365 |
+
elif option == "Remove extra whitespace":
|
366 |
+
extra_whitespace = True
|
367 |
+
elif option == "Remove dashes":
|
368 |
+
dashes = True
|
369 |
+
elif option == "Remove bullets":
|
370 |
+
bullets = True
|
371 |
+
elif option == "Make lowercase":
|
372 |
+
lowercase = True
|
373 |
+
|
374 |
+
|
375 |
+
cleaned_elements = elements.copy()
|
376 |
+
|
377 |
+
for element in cleaned_elements:
|
378 |
+
|
379 |
+
try:
|
380 |
+
if element: # Check if element is not None or empty
|
381 |
+
if bytes_to_string:
|
382 |
+
element.apply(bytes_string_to_string)
|
383 |
+
if replace_quotes:
|
384 |
+
element.apply(replace_unicode_quotes)
|
385 |
+
if clean_non_ascii:
|
386 |
+
element.apply(clean_non_ascii_chars)
|
387 |
+
if clean_ordered_list:
|
388 |
+
element.apply(clean_ordered_bullets)
|
389 |
+
if group_paragraphs:
|
390 |
+
element.apply(group_broken_paragraphs)
|
391 |
+
if trailing_punctuation:
|
392 |
+
element.apply(clean_trailing_punctuation)
|
393 |
+
if all_punctuation:
|
394 |
+
element.apply(remove_punctuation)
|
395 |
+
if group_paragraphs:
|
396 |
+
element.apply(group_broken_paragraphs)
|
397 |
+
if clean_text:
|
398 |
+
element.apply(lambda x: clean(x, extra_whitespace=extra_whitespace, dashes=dashes, bullets=bullets, lowercase=lowercase))
|
399 |
+
except Exception as e:
|
400 |
+
print(e)
|
401 |
+
element = element
|
402 |
+
|
403 |
+
alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(cleaned_elements, output_name, file_name_suffix="_clean")
|
404 |
+
|
405 |
+
output_summary = "Text elements successfully cleaned."
|
406 |
+
print(output_summary)
|
407 |
+
|
408 |
+
return cleaned_elements, output_summary, out_files, output_file_base
|
409 |
+
|
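A hypothetical cleaning pass using the dropdown-style option strings handled above; the element list and output name are assumptions:

cleaned_elements, summary, out_files, name_base = clean_elements(
    elements,                                   # hypothetical: output of the partitioning step
    dropdown_options=["Replace quotes", "Group paragraphs", "Remove extra whitespace"],
    output_name="sample_doc",
)
print(summary)   # "Text elements successfully cleaned."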
410 |
+
# %% [markdown]
|
411 |
+
def export_elements_as_table_to_file(elements:List[Element], file_name_base:str, file_name_suffix:str="", chunk_documents:List[Document]=[]):
|
412 |
+
'''
|
413 |
+
Export elements as a table.
|
414 |
+
'''
|
415 |
+
output_summary = ""
|
416 |
+
out_files = []
|
417 |
+
|
418 |
+
# Convert to dataframe format
|
419 |
+
out_table = convert_to_dataframe(elements)
|
420 |
+
|
421 |
+
# If the file suffix already exists in the output file name, don't add it again.
|
422 |
+
if file_name_suffix not in file_name_base:
|
423 |
+
out_file_name_base = file_name_base + file_name_suffix
|
424 |
+
|
425 |
+
else:
|
426 |
+
out_file_name_base = file_name_base
|
427 |
+
|
428 |
+
out_file_name = "output/" + out_file_name_base + ".csv"
|
429 |
+
|
430 |
+
out_table.to_csv(out_file_name)
|
431 |
+
out_files.append(out_file_name)
|
432 |
+
|
433 |
+
# Convert to document format
|
434 |
+
if chunk_documents:
|
435 |
+
out_documents = chunk_documents
|
436 |
+
else:
|
437 |
+
out_documents = write_elements_to_documents(elements)
|
438 |
+
|
439 |
+
|
440 |
+
|
441 |
+
out_file_name_docs = "output/" + out_file_name_base + "_docs.pkl.gz"
|
442 |
+
with gzip.open(out_file_name_docs, 'wb') as file:
|
443 |
+
pickle.dump(out_documents, file)
|
444 |
+
|
445 |
+
out_files.append(out_file_name_docs)
|
446 |
+
|
447 |
+
output_summary = "File successfully exported."
|
448 |
+
|
449 |
+
return output_summary, out_files, out_file_name_base
|
450 |
+
|
451 |
+
# # Partition PDF
|
452 |
+
|
453 |
+
def get_file_type(filename):
|
454 |
+
pattern = r"\.(\w+)$" # Match a dot followed by one or more word characters at the end of the string
|
455 |
+
|
456 |
+
match = re.search(pattern, filename)
|
457 |
+
if match:
|
458 |
+
file_type = match.group(1) # Extract the captured file type (without the dot)
|
459 |
+
print(file_type) # e.g. "pdf"
|
460 |
+
else:
|
461 |
+
print("No file type found.")
|
462 |
+
|
463 |
+
return file_type
|
464 |
+
|
465 |
+
# %%
|
466 |
+
def partition_file(filenames:List[str], pdf_partition_strat:str = pdf_partition_strat, progress = gr.Progress()):
|
467 |
+
'''
|
468 |
+
Partition document files into text elements using the Unstructured package. Currently supports PDF, docx, pptx, html, several image file types, text document types, email messages, code files.
|
469 |
+
'''
|
470 |
+
|
471 |
+
out_message = ""
|
472 |
+
combined_elements = []
|
473 |
+
out_files = []
|
474 |
+
|
475 |
+
for file in progress.tqdm(filenames, desc="Partitioning files", unit="files"):
|
476 |
+
|
477 |
+
try:
|
478 |
+
|
479 |
+
tic = time.perf_counter()
|
480 |
+
print(file)
|
481 |
+
|
482 |
+
file_name = get_file_path_end_with_ext(file)
|
483 |
+
file_name_base = get_file_path_end(file)
|
484 |
+
file_type = get_file_type(file_name)
|
485 |
+
|
486 |
+
image_file_type_list = ["jpg", "jpeg", "png", "heic"]
|
487 |
+
|
488 |
+
if file_type in image_file_type_list:
|
489 |
+
print("File is an image. Using OCR method to partition.")
|
490 |
+
file_elements = partition(file, strategy="ocr_only")
|
491 |
+
else:
|
492 |
+
file_elements = partition(file, strategy=pdf_partition_strat)
|
493 |
+
|
494 |
+
toc = time.perf_counter()
|
495 |
+
|
496 |
+
|
497 |
+
new_out_message = f"Successfully partitioned file: {file_name} in {toc - tic:0.1f} seconds\n"
|
498 |
+
print(new_out_message)
|
499 |
+
|
500 |
+
out_message = out_message + new_out_message
|
501 |
+
combined_elements.extend(file_elements)
|
502 |
+
|
503 |
+
except Exception as e:
|
504 |
+
new_out_message = f"Failed to partition file: {file_name} due to {e}. Partitioning halted."
|
505 |
+
print(new_out_message)
|
506 |
+
out_message = out_message + new_out_message
|
507 |
+
break
|
508 |
+
|
509 |
+
out_table = convert_to_dataframe(combined_elements)
|
510 |
+
|
511 |
+
# If multiple files, overwrite default file name for outputs
|
512 |
+
if len(filenames) > 1:
|
513 |
+
file_name_base = "combined_files"
|
514 |
+
|
515 |
+
alt_out_message, out_files, output_file_base = export_elements_as_table_to_file(combined_elements, file_name_base, file_name_suffix="_elements")
|
516 |
+
|
517 |
+
return out_message, combined_elements, out_files, output_file_base, out_table
|
518 |
+
|
519 |
+
# %%
|
520 |
+
def modify_metadata_elements(elements_out_cleaned:List[Element], meta_keys_to_filter:List[str]=meta_keys_to_filter, element_types_to_filter:List[str]=element_types_to_filter) -> List[Element]:
|
521 |
+
|
522 |
+
'''
|
523 |
+
Take an element object, add parent title names to metadata. Remove specified metadata keys or element types from element list.
|
524 |
+
'''
|
525 |
+
|
526 |
+
chapter_ids, chapter_to_id = create_title_id_dict(elements_out_cleaned.copy())
|
527 |
+
elements_out_meta_mod = add_parent_title_to_meta(elements_out_cleaned.copy(), chapter_ids)
|
528 |
+
elements_out_meta_mod_meta_filt = remove_keys_from_meta(elements_out_meta_mod.copy(), meta_keys_to_filter)
|
529 |
+
elements_out_filtered_meta_mod = filter_elements(elements_out_meta_mod_meta_filt, element_types_to_filter)
|
530 |
+
|
531 |
+
return elements_out_filtered_meta_mod
|
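Putting the pieces together, a minimal end-to-end sketch in the spirit of the commented-out example below: partition a file, modify element metadata, then chunk. The file path is hypothetical and the 'output/' folder must exist.

filenames = ["examples/sample_report.pdf"]     # hypothetical input

out_message, elements, out_files, name_base, out_table = partition_file(
    filenames, pdf_partition_strat="fast"
)
elements_mod = modify_metadata_elements(elements)
chunk_summary, chunk_files, _ = chunk_all_elements(elements_mod, name_base)

print(out_message)
print(chunk_summary)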
532 |
+
# %%
# file_stub = "C:/Users/SPedrickCase/OneDrive - Lambeth Council/Apps/doc_rag_prep/examples/"
# filenames = []
# pdf_filename = [file_stub + "Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf"]
# filenames.extend(pdf_filename)

# html_filename = [file_stub + "transport-strategy.html"]
# filenames.extend(html_filename)

# docx_filename = [file_stub + "FINAL Policy and Procedure for Writing Housing Policies.docx"]
# filenames.extend(docx_filename)

# out_message, elements_parse, out_files, output_file_base, out_table = partition_file(filenames=filenames, pdf_partition_strat="ocr_only")

# for element in elements_parse[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
# elements_out = elements_parse.copy()

# %% [markdown]
# ### Process with document layout detection - fast strategy
#
# The "fast" strategy will extract the text using pdfminer and process the raw text with partition_text. If the PDF text is not extractable, partition_pdf will fall back to "ocr_only". We recommend using the "fast" strategy in most cases where the PDF has extractable text.
# elements_out_parse = partition_pdf(filename=filename, strategy="fast")
# for element in elements_out_parse[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
# elements_out = elements_out_parse.copy()
# ### OCR only
#
# The "ocr_only" strategy runs the document through Tesseract for OCR and then runs the raw text through partition_text. Currently, "hi_res" has difficulty ordering elements for documents with multiple columns. If you have a document with multiple columns that does not have extractable text, we recommend using the "ocr_only" strategy. "ocr_only" falls back to "fast" if Tesseract is not available and the document has extractable text.
# elements_out_parse = partition_pdf(filename=filename, strategy="ocr_only")
# for element in elements_out_parse[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")
# elements_out = elements_out_parse.copy()
# ### Hi-res partitioning
#
# The "hi_res" strategy will identify the layout of the document using detectron2. The advantage of "hi_res" is that it uses the document layout to gain additional information about document elements. We recommend using this strategy if your use case is highly sensitive to correct classifications for document elements. If detectron2 is not available, the "hi_res" strategy will fall back to the "ocr_only" strategy.
# elements_out = partition_pdf(filename=filename, strategy="hi_res")
# for element in elements_out[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

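# %%
# Optional timing sketch for comparing the strategies described above (illustrative only;
# the filename is an example, and partition_pdf is assumed to be imported at the top of
# this file, as in the cells above):
# for strat in ["fast", "ocr_only", "hi_res"]:
#     tic = time.perf_counter()
#     strat_elements = partition_pdf(filename="../examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf", strategy=strat)
#     print(f"{strat}: {len(strat_elements)} elements in {time.perf_counter() - tic:0.1f} seconds")
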
# %% [markdown]
# ## Clean data

# %%
# elements_out_cleaned = clean_elements(elements_out.copy(), bytes_to_string=False,
#                                       replace_quotes=True,
#                                       clean_non_ascii=False,
#                                       clean_ordered_list=True,
#                                       group_paragraphs=True,
#                                       trailing_punctuation=False,
#                                       all_punctuation=False,
#                                       clean_text=True,
#                                       extra_whitespace=True,
#                                       dashes=True,
#                                       bullets=True,
#                                       lowercase=False)

# %% [markdown]
# ## Add/remove elements to/from metadata

# %% [markdown]
# ### Write to table, dictionary, document format

# %%
### Dataframe format

# elements_out_filtered_df = convert_to_dataframe(elements_out_filtered_meta_mod)

# elements_out_filtered_df.to_csv("table.csv")
# elements_out_filtered_df.head(6)

# # %%
# ### Dictionary format

# elements_out_filtered_dict = convert_to_dict(elements_out_filtered_meta_mod)
# elements_out_filtered_dict[20]

# # %% [markdown]
# # ### Document format for embeddings

# # %%
# doc_sections = write_elements_to_documents(elements_out_filtered_meta_mod, element_types_to_filter)

# doc_sections[0:10]

# # %% [markdown]
# # ### Break text down into chunks

# # %%
# chunks_by_title = chunk_by_title(
#     elements_out_filtered_meta_mod,
#     include_orig_elements=True,
#     combine_text_under_n_chars=minimum_chunk_length,
#     new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
#     max_characters=hard_max_character_length_chunks,
#     multipage_sections=True,
#     overlap_all=True
# )

# chunk_sections, chunk_df = element_chunks_to_document(chunks_by_title, chapter_ids)
# chunk_df.to_csv("chunked_df.csv")
# print(chunk_sections[2])

# # %%
# chunks_basic = chunk_elements(
#     elements_out_filtered_meta_mod,
#     include_orig_elements=True,
#     new_after_n_chars=start_new_chunk_after_end_of_this_element_length,
#     max_characters=hard_max_character_length_chunks,
#     overlap_all=True
# )

# chunk_basic_sections, chunk_basic_df = element_chunks_to_document(chunks_basic, chapter_ids)
# chunk_basic_df.to_csv("chunked_basic_df.csv")

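# # %%
# Quick sanity check of the chunking parameters above (a sketch, assuming the chunk_by_title
# cell has been run): inspect how many chunks were produced and how long each one is.
# print(len(chunks_by_title))
# print([len(chunk.text) for chunk in chunks_by_title[:10]])
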
# %% [markdown]
# # Partition Word document
#
# You cannot get location metadata for bounding boxes from Word documents

# %%
# word_filename = "../examples/FINAL Policy and Procedure for Writing Housing Policies.docx"

# # %%
# docx_elements = partition(filename=word_filename)
# for element in docx_elements:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

# # %%
# docx_elements[5].text

# # %%
# docx_elements[5].category

# # %%
# docx_elements[5].metadata.to_dict()

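# # %%
# Quick check of the note above that Word documents carry no bounding-box location metadata
# (a sketch, assuming the docx cells above have been run; a PDF partitioned with "hi_res"
# would populate this field instead):
# print(docx_elements[5].metadata.coordinates)  # expected to be None for a .docx
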
# # %% [markdown]
# # ## Find elements associated with chapters

# # %%
# chapter_ids, chapter_to_id = create_title_id_dict(docx_elements)

# chapter_ids

# # %%
# doc_sections = write_elements_to_documents(docx_elements.copy(), chapter_ids)

# # %%
# doc_sections

# # %% [markdown]
# # ### Chunk documents

# # %%
# chunks = chunk_by_title(
#     docx_elements,
#     include_orig_elements=False,
#     combine_text_under_n_chars=0,
#     new_after_n_chars=500,
#     max_characters=1000,
#     multipage_sections=True,
#     overlap_all=True
# )

# # %%
# print(chunks)

# # %%
# chunk_sections = element_chunks_to_document(chunks.copy(), docx_elements.copy(), chapter_ids)

# # %%
# chunk_sections[5].page_content

# # %%
# chunk_sections[5].metadata["true_element_ids"]

# # %%
# for element in docx_elements:
#     if element._element_id in chunk_sections[5].metadata["true_element_ids"]:
#         print(element.text)

# # %% [markdown]
# # # Partition PPTX document

# # %%
# pptx_filename = "../examples/LOTI presentation Jan 2024.pptx"

# # %%
# pptx_elements = partition(filename=pptx_filename)
# for element in pptx_elements[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

# # %%
# chapter_ids, chapter_to_id = create_title_id_dict(pptx_elements)
# chapter_ids

# # %%
# pptx_sections = write_elements_to_documents(pptx_elements.copy(), chapter_ids)

# # %%
# pptx_sections

# # %%
# pptx_chunks = chunk_by_title(
#     pptx_elements,
#     include_orig_elements=False,
#     combine_text_under_n_chars=0,
#     new_after_n_chars=500,
#     max_characters=1000,
#     multipage_sections=True,
#     overlap_all=True
# )

# # %%
# pptx_chunk_sections = element_chunks_to_document(pptx_chunks.copy(), pptx_elements.copy(), chapter_ids)

# # %% [markdown]
# # ### Load documents into a vectorDB (Not necessary)

# # %%
# import chromadb

# # %%
# client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
# client.reset()

# # %%
# collection = client.create_collection(
#     name="policy_statements",
#     metadata={"hnsw:space": "cosine"}
# )

# # %%
# chapter_ids

# # %%
# for element in docx_elements:
#     parent_id = element.metadata.parent_id
#     #print(element.text)
#     #print(parent_id)
#     #print(element.metadata.to_dict())
#     if parent_id:
#         try:
#             print(parent_id)
#             chapter = chapter_ids[parent_id]
#             print(chapter)
#         except KeyError:
#             chapter = "None"
#     else:
#         chapter = "None"
#     collection.add(
#         documents=[element.text],
#         ids=[element._element_id],
#         metadatas=[{"chapter": chapter}]
#     )

# # %% [markdown]
# # #### See the elements in the VectorDB and perform a metadata-filtered search

# # %%
# results = collection.peek()
# print(results["documents"])

# # %%
# print(collection.metadata)

# # %%
# import json

# result = collection.query(
#     query_texts=["What should policies do?"],
#     n_results=2,
#     where={"chapter": '3.0 Policy Statements'},
# )
# print(json.dumps(result, indent=2))

# # %%
# collection = client.create_collection(
#     name="policy_statements_chunk",
#     metadata={"hnsw:space": "cosine"}
# )

# # %%
# for element in chunks:
#     parent_id = element.metadata.parent_id
#     #print(element.text)
#     #print(parent_id)
#     #print(element.metadata.to_dict())
#     if parent_id:
#         try:
#             print(parent_id)
#             chapter = chapter_ids[parent_id]
#             print(chapter)
#         except KeyError:
#             chapter = "None"
#     else:
#         chapter = "None"

#     print(element._element_id)
#     collection.add(
#         documents=[element.text],
#         ids=[element._element_id],
#         metadatas=[{"chapter": chapter}]
#     )

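# # %%
# The chunk-level collection can be queried in the same way as the element-level one above
# (a sketch; the query text and chapter value are illustrative):
# chunk_result = collection.query(
#     query_texts=["What should policies do?"],
#     n_results=2,
#     where={"chapter": '3.0 Policy Statements'},
# )
# print(json.dumps(chunk_result, indent=2))
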
# # %% [markdown]
# # # Partition HTML

# # %%
# html_filename = "../examples/transport-strategy.html"

# # %%
# html_elements = partition(filename=html_filename)
# for element in html_elements[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

# # %% [markdown]
# # # Partition image

# # %%
# img_filename = "../examples/example_complaint_letter.jpg"

# # %%
# img_elements = partition(filename=img_filename)
# for element in img_elements[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

# # %% [markdown]
# # # Partition XLSX

# # %%
# xlsx_filename = "../examples/fuel-poverty-sub-regional-tables-2020-2018-data.xlsx"

# # %%
# xlsx_elements = partition(filename=xlsx_filename)
# for element in xlsx_elements[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")

# # %% [markdown]
# # # Partition .py

# # %%
# py_filename = "../examples/app.py"

# # %%
# py_elements = partition(filename=py_filename)
# for element in py_elements[:10]:
#     print(f"{element.category.upper()}: {element.text} - Metadata: {element.metadata.to_dict()}")