seanpedrickcase committed on
Commit
2a4b347
1 Parent(s): 3810d26

Version 0.1. Adapted code for pyinstaller local executable conversion (Windows)

Browse files
.dockerignore CHANGED
@@ -8,4 +8,9 @@ examples/*
8
  processing/*
9
  output/*
10
  tools/__pycache__/*
11
- old_code/*
 
 
 
 
 
 
8
  processing/*
9
  output/*
10
  tools/__pycache__/*
11
+ old_code/*
12
+ tesseract/*
13
+ poppler/*
14
+ build/*
15
+ dist/*
16
+ build_deps/*
.gitignore CHANGED
@@ -8,4 +8,9 @@ examples/*
8
  processing/*
9
  output/*
10
  tools/__pycache__/*
11
- old_code/*
 
 
 
 
 
 
8
  processing/*
9
  output/*
10
  tools/__pycache__/*
11
+ old_code/*
12
+ tesseract/*
13
+ poppler/*
14
+ build/*
15
+ dist/*
16
+ build_deps/*
DocRedactApp_0.1.spec ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- mode: python ; coding: utf-8 -*-
2
+ from PyInstaller.utils.hooks import collect_data_files
3
+
4
+ datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
5
+ datas += collect_data_files('gradio_client')
6
+ datas += collect_data_files('gradio')
7
+
8
+
9
+ a = Analysis(
10
+ ['app.py'],
11
+ pathex=[],
12
+ binaries=[],
13
+ datas=datas,
14
+ hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
15
+ hookspath=['build_deps'],
16
+ hooksconfig={},
17
+ runtime_hooks=[],
18
+ excludes=[],
19
+ noarchive=False,
20
+ optimize=0,
21
+ module_collection_mode={
22
+ 'gradio': 'py', # Collect gradio package as source .py files
23
+ }
24
+ )
25
+ pyz = PYZ(a.pure)
26
+
27
+ exe = EXE(
28
+ pyz,
29
+ a.scripts,
30
+ [],
31
+ exclude_binaries=True,
32
+ name='DocRedactApp_0.1',
33
+ debug=False,
34
+ bootloader_ignore_signals=False,
35
+ strip=False,
36
+ upx=True,
37
+ console=True,
38
+ disable_windowed_traceback=False,
39
+ argv_emulation=False,
40
+ target_arch=None,
41
+ codesign_identity=None,
42
+ entitlements_file=None,
43
+ )
44
+ coll = COLLECT(
45
+ exe,
46
+ a.binaries,
47
+ a.datas,
48
+ strip=False,
49
+ upx=True,
50
+ upx_exclude=[],
51
+ name='DocRedactApp_0.1',
52
+ )
app.py CHANGED
@@ -3,12 +3,16 @@ import os
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
 
6
  from tools.file_redaction import choose_and_run_redactor
7
  from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
8
  from tools.aws_functions import load_data_from_aws
9
  import gradio as gr
10
 
11
- #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
 
 
 
12
 
13
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
14
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
@@ -57,8 +61,8 @@ with block:
57
  with gr.Accordion(label = "AWS data access", open = True):
58
  aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
59
  with gr.Row():
60
- in_aws_file = gr.Dropdown(label="Choose keyword file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
61
- load_aws_data_button = gr.Button(value="Load keyword data from AWS", variant="secondary")
62
 
63
  aws_log_box = gr.Textbox(label="AWS data load status")
64
 
 
3
  # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
4
  os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
5
 
6
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path
7
  from tools.file_redaction import choose_and_run_redactor
8
  from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
9
  from tools.aws_functions import load_data_from_aws
10
  import gradio as gr
11
 
12
+ add_folder_to_path("_internal/tesseract/")
13
+ add_folder_to_path("_internal/poppler/poppler-24.02.0/Library/bin/")
14
+
15
+ ensure_output_folder_exists()
16
 
17
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
18
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 
61
  with gr.Accordion(label = "AWS data access", open = True):
62
  aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
63
  with gr.Row():
64
+ in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
65
+ load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
66
 
67
  aws_log_box = gr.Textbox(label="AWS data load status")
68
 
how_to_create_exe_dist.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1. Create minimal environment to run the app in conda. E.g. 'conda create --name new_env'
2
+
3
+ 2. Activate the environment 'conda activate new_env'
4
+
5
+ 3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'
6
+
7
+ NOTE: to ensure that the spaCy models listed in requirements.txt are installed and loaded correctly, follow this guide: https://spacy.io/usage/models#models-download
8
+
9
+ 6. If necessary, create hook files (named 'hook-<package>.py') to tell pyinstaller to include specific packages in the exe build. Examples are provided for en_core_web_sm (a spaCy model). Put these in the build_deps\ subfolder.
10
+
11
+ 7. pip install pyinstaller
12
+
13
+ 8. In command line, cd to the folder that contains app.py.
14
+
15
+ 9. Run the following to build the app as an .exe inside a folder alongside its dependencies; see the note under step a) if you want one single .exe file instead (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
+
17
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.1 app.py
18
+
19
+ # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
+
21
+
22
+ b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
23
+
24
+ a = Analysis(
25
+ ...
26
+ module_collection_mode={
27
+ 'gradio': 'py', # Collect gradio package as source .py files
28
+ }
29
+ )
30
+
31
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.1.spec
32
+
33
+
34
+ 9. A 'dist' folder will be created with the executable inside along with all dependencies ('dist\DocRedactApp_0.1').
35
+
36
+ 10. In 'dist\DocRedactApp_0.1' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
37
+
38
+ 11. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
tools/helper_functions.py CHANGED
@@ -10,3 +10,38 @@ def get_file_path_end(file_path):
10
  #print(filename_without_extension)
11
 
12
  return filename_without_extension
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  #print(filename_without_extension)
11
 
12
  return filename_without_extension
13
+
14
+ def ensure_output_folder_exists():
15
+ """Checks if the 'output/' folder exists, creates it if not."""
16
+
17
+ folder_name = "output/"
18
+
19
+ if not os.path.exists(folder_name):
20
+ # Create the folder if it doesn't exist
21
+ os.makedirs(folder_name)
22
+ print(f"Created the 'output/' folder.")
23
+ else:
24
+ print(f"The 'output/' folder already exists.")
25
+
26
+
27
+ # The following function is only relevant for locally-created executable files based on this app. When using pyinstaller, it creates an _internal folder that contains tesseract and poppler; these need to be added to the system path to enable the app to run.
28
+ def add_folder_to_path(folder_path: str):
29
+ '''
30
+ Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
31
+ '''
32
+
33
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
34
+ print(folder_path, "folder exists.")
35
+
36
+ # Resolve relative path to absolute path
37
+ absolute_path = os.path.abspath(folder_path)
38
+
39
+ current_path = os.environ['PATH']
40
+ if absolute_path not in current_path.split(os.pathsep):
41
+ full_path_extension = absolute_path + os.pathsep + current_path
42
+ os.environ['PATH'] = full_path_extension
43
+ print(f"Updated PATH with: ", full_path_extension)
44
+ else:
45
+ print(f"Directory {folder_path} already exists in PATH.")
46
+ else:
47
+ print(f"Folder not found at {folder_path} - not added to PATH")
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -3,6 +3,8 @@ from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
 
 
6
  import re
7
 
8
  # %%
@@ -136,8 +138,16 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
136
  self.nlp = {"en": loaded_spacy_model}
137
 
138
  # %%
139
- # Load a model a-priori
140
- nlp = spacy.load(model_name)
 
 
 
 
 
 
 
 
141
 
142
  # Pass the loaded model to the new LoadedSpacyNlpEngine
143
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
 
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
6
+ spacy.prefer_gpu()
7
+ from spacy.cli.download import download
8
  import re
9
 
10
  # %%
 
138
  self.nlp = {"en": loaded_spacy_model}
139
 
140
  # %%
141
+ # Load spacy model
142
+ try:
143
+ import en_core_web_lg
144
+ nlp = en_core_web_lg.load()
145
+ print("Successfully imported spaCy model")
146
+
147
+ except:
148
+ download("en_core_web_lg")
149
+ nlp = spacy.load("en_core_web_lg")
150
+ print("Successfully downloaded and imported spaCy model")
151
 
152
  # Pass the loaded model to the new LoadedSpacyNlpEngine
153
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)