Spaces:

seanpedrickcase
/

document_redaction

Running

# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for the document redaction app (one-folder build).
from PyInstaller.utils.hooks import collect_data_files

# Used for both the executable and the output folder created by COLLECT.
app_name = 'DocRedactApp_0.1'

# Folders shipped alongside the app as (source, destination) pairs.
datas = [
    ('tesseract/', 'tesseract/'),
    ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/'),
]

# Add the data files that the gradio packages need at runtime.
for package_name in ('gradio_client', 'gradio'):
    datas += collect_data_files(package_name)

a = Analysis(
    ['app.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
    hookspath=['build_deps'],  # Custom hook- files live in this subfolder
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
    module_collection_mode={
        'gradio': 'py',  # Collect gradio package as source .py files
    },
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name=app_name,
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name=app_name,
)

app.py CHANGED Viewed

@@ -3,12 +3,16 @@ import os
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 from tools.aws_functions import load_data_from_aws
 import gradio as gr
-#file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
@@ -57,8 +61,8 @@ with block:
         with gr.Accordion(label = "AWS data access", open = True):
                 aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
                 with gr.Row():
-                    in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
-                    load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
                 aws_log_box = gr.Textbox(label="AWS data load status")

 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 from tools.aws_functions import load_data_from_aws
 import gradio as gr
+add_folder_to_path("_internal/tesseract/")
+add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")
+ensure_output_folder_exists()
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
         with gr.Accordion(label = "AWS data access", open = True):
                 aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
                 with gr.Row():
+                    in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
+                    load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
                 aws_log_box = gr.Textbox(label="AWS data load status")

how_to_create_exe_dist.txt ADDED Viewed

	@@ -0,0 +1,38 @@

+1. Create minimal environment to run the app in conda. E.g. 'conda create --name new_env'
+2. Activate the environment 'conda activate new_env'
+3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'
+NOTE: to ensure that the spaCy models listed in requirements.txt are installed and loaded into the program correctly, follow this guide: https://spacy.io/usage/models#models-download
+6. If necessary, create hook- files to tell pyinstaller to include specific packages in the exe build. Examples are provided for en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder
+7. pip install pyinstaller
+8. In command line, cd to the folder that contains app.py.
+9. Run the following to build the app as a folder containing the .exe file and its dependencies (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client  --collect-data=gradio  --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders  --name DocRedactApp_0.1 app.py
+# Add --onefile  to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
+b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
+a = Analysis(
+    ...
+    module_collection_mode={
+        'gradio': 'py',  # Collect gradio package as source .py files
+    }
+)
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.1.spec
+10. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\DocRedactApp_0.1').
+11. In 'dist\DocRedactApp_0.1' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
+12. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.

tools/helper_functions.py CHANGED Viewed

@@ -10,3 +10,38 @@ def get_file_path_end(file_path):
     #print(filename_without_extension)
     return filename_without_extension

     #print(filename_without_extension)
     return filename_without_extension
def ensure_output_folder_exists():
    """Make sure the 'output/' folder exists, creating it if it does not.

    The folder name is relative, so it is resolved against the current
    working directory. A message is printed saying whether the folder was
    created or was already present.
    """
    folder_name = "output/"

    if os.path.isdir(folder_name):
        print("The 'output/' folder already exists.")
        return

    # exist_ok=True avoids a crash if the folder is created by something else
    # between the check above and this call.
    os.makedirs(folder_name, exist_ok=True)
    print("Created the 'output/' folder.")
# The following function is only relevant for locally-created executables based on this app.
# When using pyinstaller, an _internal folder is created that contains tesseract and poppler;
# these need to be added to the system path to enable the app to run.
def add_folder_to_path(folder_path: str):
    '''
    Prepend a folder to the PATH environment variable of the current process.

    If folder_path is an existing directory, it is resolved to an absolute path
    and placed at the front of PATH, unless that exact absolute path is already
    one of the PATH entries. If the folder does not exist, PATH is left untouched.
    A message describing the outcome is printed in every case.

    Args:
        folder_path: Relative or absolute path of the folder to add.
    '''
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    # Resolve relative path to absolute path
    absolute_path = os.path.abspath(folder_path)

    # PATH may be unset; treat that the same as an empty PATH instead of raising KeyError
    current_path = os.environ.get('PATH', '')
    path_entries = current_path.split(os.pathsep) if current_path else []

    if absolute_path in path_entries:
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Joining the entries avoids a trailing separator when PATH was empty
    full_path_extension = os.pathsep.join([absolute_path] + path_entries)
    os.environ['PATH'] = full_path_extension
    print("Updated PATH with: ", full_path_extension)

tools/load_spacy_model_custom_recognisers.py CHANGED Viewed

@@ -3,6 +3,8 @@ from typing import List
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 import spacy
 import re
 # %%
@@ -136,8 +138,16 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
         self.nlp = {"en": loaded_spacy_model}
 # %%
-# Load a model a-priori
-nlp = spacy.load(model_name)
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)

 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 import spacy
+spacy.prefer_gpu()
+from spacy.cli.download import download
 import re
 # %%
         self.nlp = {"en": loaded_spacy_model}
 # %%
# Load the spaCy model. Try the installed en_core_web_lg package first; if that
# fails for any reason, download the model and load it by name instead.
try:
    import en_core_web_lg
    nlp = en_core_web_lg.load()
    print("Successfully imported spaCy model")
except Exception:
    # Catch Exception rather than using a bare except, so that KeyboardInterrupt
    # and SystemExit still stop the program instead of triggering a download.
    download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
    print("Successfully downloaded and imported spaCy model")
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)