seanpedrickcase committed on
Commit
20d940b
·
1 Parent(s): 713ca11

Zoom and rotate features from forked gradio_annotation package. Fixed csv/xlsx redaction. Updated guide on creating exe.

Browse files
DocRedactApp_0.2.spec → DocRedactApp_0.2.0.spec RENAMED
@@ -1,17 +1,31 @@
1
  # -*- mode: python ; coding: utf-8 -*-
2
  from PyInstaller.utils.hooks import collect_data_files
 
3
 
4
  datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
 
 
5
  datas += collect_data_files('gradio_client')
6
  datas += collect_data_files('gradio')
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  a = Analysis(
10
  ['app.py'],
11
  pathex=[],
12
- binaries=[],
13
  datas=datas,
14
- hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
15
  hookspath=['build_deps'],
16
  hooksconfig={},
17
  runtime_hooks=[],
@@ -29,7 +43,7 @@ exe = EXE(
29
  a.scripts,
30
  [],
31
  exclude_binaries=True,
32
- name='DocRedactApp_0.2',
33
  debug=False,
34
  bootloader_ignore_signals=False,
35
  strip=False,
@@ -48,5 +62,5 @@ coll = COLLECT(
48
  strip=False,
49
  upx=True,
50
  upx_exclude=[],
51
- name='DocRedactApp_0.2',
52
  )
 
1
  # -*- mode: python ; coding: utf-8 -*-
2
  from PyInstaller.utils.hooks import collect_data_files
3
+ from PyInstaller.utils.hooks import collect_all
4
 
5
  datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
6
+ binaries = []
7
+ hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
8
  datas += collect_data_files('gradio_client')
9
  datas += collect_data_files('gradio')
10
+ datas += collect_data_files('gradio_image_annotation')
11
+ tmp_ret = collect_all('gradio_image_annotation')
12
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
13
+ tmp_ret = collect_all('safehttpx')
14
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
15
+ tmp_ret = collect_all('presidio_analyzer')
16
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
17
+ tmp_ret = collect_all('presidio_anonymizer')
18
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
19
+ tmp_ret = collect_all('presidio_image_redactor')
20
+ datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
21
 
22
 
23
  a = Analysis(
24
  ['app.py'],
25
  pathex=[],
26
+ binaries=binaries,
27
  datas=datas,
28
+ hiddenimports=hiddenimports,
29
  hookspath=['build_deps'],
30
  hooksconfig={},
31
  runtime_hooks=[],
 
43
  a.scripts,
44
  [],
45
  exclude_binaries=True,
46
+ name='DocRedactApp_0.2.0',
47
  debug=False,
48
  bootloader_ignore_signals=False,
49
  strip=False,
 
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
+ name='DocRedactApp_0.2.0',
66
  )
app.py CHANGED
@@ -453,7 +453,7 @@ with app:
453
  # TABULAR DATA REDACTION
454
  ###
455
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
456
- then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_full_file_name_textbox, data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
457
 
458
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
459
 
 
453
  # TABULAR DATA REDACTION
454
  ###
455
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
456
+ then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
457
 
458
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
459
 
how_to_create_exe_dist.txt CHANGED
@@ -12,9 +12,9 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
12
 
13
  8. In command line, cd to the folder that contains app.py.
14
 
15
- 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --name DocRedactApp_0.2 app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
@@ -28,11 +28,29 @@ a = Analysis(
28
  }
29
  )
30
 
31
- c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.spec
32
 
 
33
 
34
- 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
35
 
36
- 10. In 'dist\data_text_search' try double clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
37
 
38
- 11. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  8. In command line, cd to the folder that contains app.py.
14
 
15
+ 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
16
 
17
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.2.0 app.py
18
 
19
  # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
20
 
 
28
  }
29
  )
30
 
31
+ hook-presidio-image-redactor.py
32
 
33
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.2.0.spec
34
 
 
35
 
36
+ 10. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\redaction').
37
 
38
+ 11. Go to dist/APP-NAME/gradio/component_meta.py and modify the start of the 'create_or_modify_pyi(...' function to this:
39
+
40
+ def create_or_modify_pyi(
41
+     component_class: type, class_name: str, events: list[str | EventListener]
42
+ ):
43
+     source_file = Path(inspect.getfile(component_class))
44
+
45
+     try:
46
+         # Try to read the source file
47
+         source_code = source_file.read_text(encoding="utf-8")
48
+     except FileNotFoundError:
49
+         # If source file not found, skip pyi generation
50
+         return None
51
+
52
+ 12. Copy the poppler and tesseract folders into the location where the .exe is.
53
+
54
+ 13. In 'dist\redaction' try double-clicking on the .exe file. After a short delay, the command prompt should inform you about the IP address of the app that is now running. Copy the IP address. **Do not close this window!**
55
+
56
+ 14. In an Internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
requirements.txt CHANGED
@@ -12,14 +12,18 @@ scikit-learn==1.5.2
12
  spacy==3.8.3
13
  #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.12.0
16
- boto3==1.35.83
17
  pyarrow==18.1.0
18
  openpyxl==3.1.2
19
  Faker==22.2.0
20
  python-levenshtein==0.26.1
21
  spaczz==0.6.1
22
- gradio_image_annotation==0.2.5
 
 
 
 
23
  numpy==1.26.4
24
  awslambdaric==3.0.0
25
 
 
12
  spacy==3.8.3
13
  #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
+ gradio==5.15.0
16
+ boto3==1.36.15
17
  pyarrow==18.1.0
18
  openpyxl==3.1.2
19
  Faker==22.2.0
20
  python-levenshtein==0.26.1
21
  spaczz==0.6.1
22
+ #gradio_image_annotation==0.2.5
23
+ # The following version includes rotation and image zoom options
24
+ git+https://github.com/seanpedrick-case/gradio_image_annotator
25
+
26
+ rapidfuzz==3.12.1
27
  numpy==1.26.4
28
  awslambdaric==3.0.0
29
 
tools/data_anonymise.py CHANGED
@@ -389,6 +389,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
389
  if isinstance(out_message, str):
390
  out_message = [out_message]
391
 
 
 
 
 
 
392
  if not out_file_paths:
393
  out_file_paths = []
394
 
@@ -473,6 +478,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
  out_file_part = get_file_name_without_type(anon_file.name)
 
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
 
389
  if isinstance(out_message, str):
390
  out_message = [out_message]
391
 
392
+ print("log_files_output_paths:",log_files_output_paths)
393
+
394
+ if isinstance(log_files_output_paths, str):
395
+ log_files_output_paths = []
396
+
397
  if not out_file_paths:
398
  out_file_paths = []
399
 
 
478
  sheet_name = ""
479
  anon_df = read_file(anon_file)
480
  out_file_part = get_file_name_without_type(anon_file.name)
481
+
482
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
483
 
484
  # Increase latest file completed count unless we are at the last file
tools/redaction_review.py CHANGED
@@ -500,8 +500,8 @@ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
500
  redact_annot.set('interior-color', colour_str)
501
  #redact_annot.set('fill-color', colour_str)
502
  #redact_annot.set('outline-color', colour_str)
503
- redact_annot.set('overlay-color', colour_str)
504
- redact_annot.set('overlay-text', row['label'])
505
  redact_annot.set('opacity', "0.5")
506
 
507
  # Add appearance dictionary
 
500
  redact_annot.set('interior-color', colour_str)
501
  #redact_annot.set('fill-color', colour_str)
502
  #redact_annot.set('outline-color', colour_str)
503
+ #redact_annot.set('overlay-color', colour_str)
504
+ #redact_annot.set('overlay-text', row['label'])
505
  redact_annot.set('opacity', "0.5")
506
 
507
  # Add appearance dictionary