Commit
·
20d940b
1
Parent(s):
713ca11
Zoom and rotate features from forked gradio_annotation package. Fixed csv/xlsx redaction. Updated guide on creating exe.
Browse files- DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec +18 -4
- app.py +1 -1
- how_to_create_exe_dist.txt +24 -6
- requirements.txt +7 -3
- tools/data_anonymise.py +6 -0
- tools/redaction_review.py +2 -2
DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec
RENAMED
@@ -1,17 +1,31 @@
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
|
|
3 |
|
4 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
|
|
|
|
5 |
datas += collect_data_files('gradio_client')
|
6 |
datas += collect_data_files('gradio')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
a = Analysis(
|
10 |
['app.py'],
|
11 |
pathex=[],
|
12 |
-
binaries=
|
13 |
datas=datas,
|
14 |
-
hiddenimports=
|
15 |
hookspath=['build_deps'],
|
16 |
hooksconfig={},
|
17 |
runtime_hooks=[],
|
@@ -29,7 +43,7 @@ exe = EXE(
|
|
29 |
a.scripts,
|
30 |
[],
|
31 |
exclude_binaries=True,
|
32 |
-
name='DocRedactApp_0.2',
|
33 |
debug=False,
|
34 |
bootloader_ignore_signals=False,
|
35 |
strip=False,
|
@@ -48,5 +62,5 @@ coll = COLLECT(
|
|
48 |
strip=False,
|
49 |
upx=True,
|
50 |
upx_exclude=[],
|
51 |
-
name='DocRedactApp_0.2',
|
52 |
)
|
|
|
1 |
# -*- mode: python ; coding: utf-8 -*-
|
2 |
from PyInstaller.utils.hooks import collect_data_files
|
3 |
+
from PyInstaller.utils.hooks import collect_all
|
4 |
|
5 |
datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
|
6 |
+
binaries = []
|
7 |
+
hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
|
8 |
datas += collect_data_files('gradio_client')
|
9 |
datas += collect_data_files('gradio')
|
10 |
+
datas += collect_data_files('gradio_image_annotation')
|
11 |
+
tmp_ret = collect_all('gradio_image_annotation')
|
12 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
13 |
+
tmp_ret = collect_all('safehttpx')
|
14 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
15 |
+
tmp_ret = collect_all('presidio_analyzer')
|
16 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
17 |
+
tmp_ret = collect_all('presidio_anonymizer')
|
18 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
19 |
+
tmp_ret = collect_all('presidio_image_redactor')
|
20 |
+
datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
|
21 |
|
22 |
|
23 |
a = Analysis(
|
24 |
['app.py'],
|
25 |
pathex=[],
|
26 |
+
binaries=binaries,
|
27 |
datas=datas,
|
28 |
+
hiddenimports=hiddenimports,
|
29 |
hookspath=['build_deps'],
|
30 |
hooksconfig={},
|
31 |
runtime_hooks=[],
|
|
|
43 |
a.scripts,
|
44 |
[],
|
45 |
exclude_binaries=True,
|
46 |
+
name='DocRedactApp_0.2.0',
|
47 |
debug=False,
|
48 |
bootloader_ignore_signals=False,
|
49 |
strip=False,
|
|
|
62 |
strip=False,
|
63 |
upx=True,
|
64 |
upx_exclude=[],
|
65 |
+
name='DocRedactApp_0.2.0',
|
66 |
)
|
app.py
CHANGED
@@ -453,7 +453,7 @@ with app:
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
-
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
|
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
455 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
456 |
+
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
457 |
|
458 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
459 |
|
how_to_create_exe_dist.txt
CHANGED
@@ -12,9 +12,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
-
9.Run the following
|
16 |
|
17 |
-
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
@@ -28,11 +28,29 @@ a = Analysis(
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
-
|
32 |
|
|
|
33 |
|
34 |
-
9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
|
35 |
|
36 |
-
|
37 |
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
8. In command line, cd to the folder that contains app.py.
|
14 |
|
15 |
+
9. Run the following (this helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
|
16 |
|
17 |
+
a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.2.0 app.py
|
18 |
|
19 |
# Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
|
20 |
|
|
|
28 |
}
|
29 |
)
|
30 |
|
31 |
+
hook-presidio-image-redactor.py
|
32 |
|
33 |
+
c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.0.spec
|
34 |
|
|
|
35 |
|
36 |
+
9. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\redaction').
|
37 |
|
38 |
+
10. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...)' function to this:
|
39 |
+
|
40 |
+
def create_or_modify_pyi(
|
41 |
+
component_class: type, class_name: str, events: list[str | EventListener]
|
42 |
+
):
|
43 |
+
source_file = Path(inspect.getfile(component_class))
|
44 |
+
|
45 |
+
try:
|
46 |
+
# Try to read the source file
|
47 |
+
source_code = source_file.read_text(encoding="utf-8")
|
48 |
+
except FileNotFoundError:
|
49 |
+
# If source file not found, skip pyi generation
|
50 |
+
return None
|
51 |
+
|
52 |
+
11. Copy the poppler and tesseract folders into the folder where the .exe is located.
|
53 |
+
|
54 |
+
12. In 'dist\redaction', double-click the .exe file. After a short delay, the command prompt should show the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
|
55 |
+
|
56 |
+
13. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
|
requirements.txt
CHANGED
@@ -12,14 +12,18 @@ scikit-learn==1.5.2
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
-
gradio==5.
|
16 |
-
boto3==1.
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
-
gradio_image_annotation==0.2.5
|
|
|
|
|
|
|
|
|
23 |
numpy==1.26.4
|
24 |
awslambdaric==3.0.0
|
25 |
|
|
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
15 |
+
gradio==5.15.0
|
16 |
+
boto3==1.36.15
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
python-levenshtein==0.26.1
|
21 |
spaczz==0.6.1
|
22 |
+
#gradio_image_annotation==0.2.5
|
23 |
+
# The following version includes rotation and image zoom options
|
24 |
+
git+https://github.com/seanpedrick-case/gradio_image_annotator
|
25 |
+
|
26 |
+
rapidfuzz==3.12.1
|
27 |
numpy==1.26.4
|
28 |
awslambdaric==3.0.0
|
29 |
|
tools/data_anonymise.py
CHANGED
@@ -389,6 +389,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
|
|
|
|
|
|
|
|
|
|
392 |
if not out_file_paths:
|
393 |
out_file_paths = []
|
394 |
|
@@ -473,6 +478,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
out_file_part = get_file_name_without_type(anon_file.name)
|
|
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
|
|
389 |
if isinstance(out_message, str):
|
390 |
out_message = [out_message]
|
391 |
|
392 |
+
print("log_files_output_paths:",log_files_output_paths)
|
393 |
+
|
394 |
+
if isinstance(log_files_output_paths, str):
|
395 |
+
log_files_output_paths = []
|
396 |
+
|
397 |
if not out_file_paths:
|
398 |
out_file_paths = []
|
399 |
|
|
|
478 |
sheet_name = ""
|
479 |
anon_df = read_file(anon_file)
|
480 |
out_file_part = get_file_name_without_type(anon_file.name)
|
481 |
+
|
482 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
483 |
|
484 |
# Increase latest file completed count unless we are at the last file
|
tools/redaction_review.py
CHANGED
@@ -500,8 +500,8 @@ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
|
500 |
redact_annot.set('interior-color', colour_str)
|
501 |
#redact_annot.set('fill-color', colour_str)
|
502 |
#redact_annot.set('outline-color', colour_str)
|
503 |
-
redact_annot.set('overlay-color', colour_str)
|
504 |
-
redact_annot.set('overlay-text', row['label'])
|
505 |
redact_annot.set('opacity', "0.5")
|
506 |
|
507 |
# Add appearance dictionary
|
|
|
500 |
redact_annot.set('interior-color', colour_str)
|
501 |
#redact_annot.set('fill-color', colour_str)
|
502 |
#redact_annot.set('outline-color', colour_str)
|
503 |
+
#redact_annot.set('overlay-color', colour_str)
|
504 |
+
#redact_annot.set('overlay-text', row['label'])
|
505 |
redact_annot.set('opacity', "0.5")
|
506 |
|
507 |
# Add appearance dictionary
|