semmyk committed
Commit 42d6e84 · 1 Parent(s): 653f79c

baseline08_beta0.2.2_30Sept25: fix oauth_token. - fixing timeout: progress tracker

Files changed (4)
  1. converters/pdf_to_md.py +4 -4
  2. dummy_log.log +0 -0
  3. main.py +2 -1
  4. ui/gradio_ui.py +81 -32
converters/pdf_to_md.py CHANGED
@@ -16,7 +16,7 @@ from utils.logger import get_logger
 
 logger = get_logger(__name__)
 
-# Define global variables
+# Define global variables ##SMY: TODO: consider moving to Globals singleton constructor
 docconverter: DocumentConverter = None
 converter = None #DocumentConverter
 #converter:DocumentConverter.converter = None
@@ -138,7 +138,7 @@ class PdfToMarkdownConverter:
 
     ## moved from extraction_converter ( to standalone extract_to_md)
     #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
-    def extract(self, src_path: str, output_dir: str) -> Dict:
+    def extract(self, src_path: str, output_dir: str): #Dict:
        #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
        """
        Convert one file (PDF/HTML) to Markdown + images.
@@ -149,7 +149,7 @@ class PdfToMarkdownConverter:
        try:
            ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint which typically raise 'libgobject-2' error on Win
            # Set a new environment variable
-           set_weasyprint_library() ##utils.lib_loader.set_weasyprint_library()
+           set_weasyprint_library() ##utils.lib_loader.set_weasyprint_library()
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
@@ -206,7 +206,7 @@ class PdfToMarkdownConverter:
 
        try:
            output_dir = create_outputdir(root=src_path, output_dir_string=self.output_dir_string)
-           logger.info(f"✓ output_dir created: {output_dir}") #{create_outputdir(src_path)}"
+           logger.info(f"✓ output_dir created: {output_dir}") #{create_outputdir(src_path)}"
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception("✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
dummy_log.log ADDED
File without changes
main.py CHANGED
@@ -20,5 +20,6 @@ if __name__ == "__main__":
     os.chdir(script_dir) ##Path.cwd()
 
     demo = build_interface()
+    demo.queue() # Enables queue to prevent user's browser premature timeout > 60s
     #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
-    demo.launch(debug=True, show_error=True, ssr_mode=True) #ssr_mode=False
+    demo.launch(debug=True, show_error=True, ssr_mode=True,) #ssr_mode=False
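The demo.queue() call added here is Gradio's standard mitigation for long-running handlers: without the queue, a request that outlives the browser/proxy HTTP timeout (commonly around 60 s) is dropped before the handler finishes. A minimal sketch, assuming a deliberately slow handler rather than this repo's build_interface():

```python
# Minimal sketch: queue() routes events through Gradio's event queue,
# so jobs longer than the HTTP request timeout still deliver their results.
import time
import gradio as gr

def slow_job(name: str) -> str:
    time.sleep(90)  # longer than a typical 60 s request timeout
    return f"done: {name}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="name")
    out = gr.Textbox(label="result")
    inp.submit(slow_job, inp, out)

if __name__ == "__main__":
    demo.queue()   # must be enabled before launch()
    demo.launch()
```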
ui/gradio_ui.py CHANGED
@@ -1,29 +1,31 @@
 # ui/gradio_ui.py
 import gradio as gr
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import asyncio
 
 from pathlib import Path, WindowsPath
-import traceback ## Extract, format and print information about Python stack traces.
-from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Optional, Union #, Dict, List, Any, Tuple
 
 from huggingface_hub import get_token
-import file_handler
+from numpy import append, iterable
+
+#import file_handler
 import file_handler.file_utils
 from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
 from utils.utils import is_dict, is_list_of_dicts
 from file_handler.file_utils import zip_processed_files, process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
+from file_handler.file_utils import find_file
+from utils.get_config import get_config_value
+
 #from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
 from llm.provider_validator import is_valid_provider, suggest_providers
 from llm.llm_login import is_loggedin_huggingface, login_huggingface
-
 from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
 from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
 #from converters.md_to_pdf import MarkdownToPdfConverter
 #from converters.html_to_md import HtmlToMarkdownConverter ##SMY: PENDING: implementation
 
-from file_handler.file_utils import find_file
-
-from utils.get_config import get_config_value
+import traceback ## Extract, format and print information about Python stack traces.
 from utils.logger import get_logger
 
 logger = get_logger(__name__) ##NB: setup_logging() ## set logging
@@ -91,6 +93,7 @@ def convert_batch(
     page_range: str = None, #Optional[str] = None,
     tz_hours: str = None,
     oauth_token: gr.OAuthToken | None=None,
+    progress: gr.Progress = gr.Progress(), #Progress tracker to keep tab on pool queue executor
 ): #-> str:
     """
     Handles the conversion process using multiprocessing.
@@ -100,7 +103,8 @@ def convert_batch(
     """
 
     # login: Update the Gradio UI to improve user-friendly eXperience - commencing
-    yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"__init__.py"
+    #yield gr.update(interactive=False), f"Commencing Processing ... Getting login", {"process": "Commencing Processing"}, f"dummy_log.log"
+    #progress((0,16), f"Commencing Processing ...")
 
     # get token from logged-in user:
     api_token = get_login_token(api_token_arg=api_token_gr, oauth_token=oauth_token)
@@ -114,18 +118,21 @@ def convert_batch(
         if is_loggedin_huggingface() and (api_token is None or api_token == ""):
             api_token = get_token() ##SMY: might be redundant
 
-        elif login_huggingface(api_token):
+        elif is_loggedin_huggingface() is False and api_token:
+            login_huggingface(api_token)
             # login: Update the Gradio UI to improve user-friendly eXperience
-            yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
+            #yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
         else:
+            pass
             # login: Update the Gradio UI to improve user-friendly eXperience
-            yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
+            #yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"dummy_log.log"
 
     except Exception as exc: # Catch all exceptions
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
-        return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
-
+        return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
+
+    #progress((1,16), desc=f"Log in: {is_loggedin_huggingface}")
 
     ## debug
     #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
@@ -134,8 +141,9 @@ def convert_batch(
     if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
         logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
         #outputs=[log_output, files_individual_JSON, files_individual_downloads],
-        return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"__init__.py"]
+        return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"dummy_log.log"]
 
+    #progress((2,16), desc=f"Getting configuration values")
     # Get config values if not provided
     config_file = find_file("config.ini") ##from file_handler.file_utils
     model_id = get_config_value(config_file, "MARKER_CAP", "MODEL_ID") if not model_id else model_id
@@ -147,8 +155,11 @@ def convert_batch(
     output_dir_string = str(get_config_value(config_file, "MARKER_CAP", "OUTPUT_DIR") if not output_dir_string else output_dir_string)
     use_llm = get_config_value(config_file, "MARKER_CAP", "USE_LLM") if not use_llm else use_llm
     page_range = get_config_value(config_file,"MARKER_CAP", "PAGE_RANGE") if not page_range else page_range
-
+    #progress((3,16), desc="Retrieved configuration values")
+
     # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
+    #progress((4,16), desc=f"Initialising init_args")
+    yield gr.update(interactive=False), f"Initialising init_args", {"process": "Processing files ..."}, f"dummy_log.log"
     init_args = (
         provider,
         model_id,
@@ -174,18 +185,23 @@ def convert_batch(
 
     #global docextractor ##SMY: deprecated.
     try:
+        results = [] ## initialised pool result holder
         # Create a pool with init_worker initialiser
+        logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
+        #progress((5,16), desc=f"Initialising ProcessPoolExecutor: Processing Files ...")
+        yield gr.update(interactive=False), f"Initialising ProcessPoolExecutor: Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
        with ProcessPoolExecutor(
            max_workers=max_workers,
            initializer=init_worker,
            initargs=init_args
        ) as pool:
-            #global docextractor
-            logger.log(level=30, msg="Initialising ProcessPool: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
+            #logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
+            #progress((6,16), desc=f"Starting ProcessPool queue: Processing Files ...")
 
            # Update the Gradio UI to improve user-friendly eXperience
            #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
-            yield gr.update(interactive=False), f"Starting ProcessPool: Processing files...", {"process": "Processing files ..."}, f"__init__.py"
+
 
            # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
            # The 'docconverter' argument is implicitly handled by the initialiser
@@ -195,16 +211,43 @@ def convert_batch(
            #logs = [f.result() for f in futures]
 
            try:
+                #progress((7,16), desc=f"ProcessPoolExecutor: Creating output_dir")
+                yield gr.update(interactive=False), f"Creating output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
                pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
-                #result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
-                results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
+                #progress((8,16), desc=f"ProcessPoolExecutor: Created output_dir.")
+                yield gr.update(interactive=False), f"Created output_dir ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
+            except Exception as exc:
+                # Raise the exception to stop the Gradio app: exception to halt execution
+                logger.exception("Error during creating output_dir", exc_info=True) # Log the full traceback
+                traceback.print_exc() # Print the exception traceback
+                #return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=True), f"An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
+
+            try:
+                #progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+                # Use progress.tqdm to integrate with the executor map
+                #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
+                for result_interim in progress.tqdm(
+                    iterable=pool.map(pdf2md_converter.convert_files, pdf_files), total=len(pdf_files)
+                ):
+                    results.append(result_interim)
+                    #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
+                    # Update the Gradio UI to improve user-friendly eXperience
+                    yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
+
+                #progress((11,16), desc=f"ProcessPoolExecutor: Got Results from files conversion")
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Got Results from files conversion: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
            except Exception as exc:
                # Raise the exception to stop the Gradio app: exception to halt execution
                logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
                traceback.print_exc() # Print the exception traceback
-                #return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
-                yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}"}, f"__init__.py" ## return the exception message
-
+                return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
+                # Update the Gradio UI to improve user-friendly eXperience
+                #yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
+
            #'''
            try:
                logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
@@ -223,23 +266,24 @@ def convert_batch(
                    logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
                    i_image = log.get("images", 0)
                    # Update the Gradio UI to improve user-friendly eXperience
-                    yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"__init__.py"
+                    #yield gr.update(interactive=False), f"Processing files: {logs_files_images[logs_count]}", {"process": "Processing files"}, f"dummy_log.log"
                    logs_count = i+i_image
 
+                #progress((12,16), desc="Processing results from files conversion") ##rekickin
                #logs_files_images.append(logs_filepath) ## to del
                #logs_files_images.extend(logs_images) ## to del
            except Exception as exc:
                logger.exception("Error during processing results logs → {exc}\n{tb}", exc_info=True) # Log the full traceback
                traceback.print_exc() # Print the exception traceback
-                #return f"An error occurred during processing results logs: {str(exc)}\n{tb}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
-                yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" ## return the exception message
+                return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
+                #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
 
            #'''
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
        #traceback.print_exc() # Print the exception traceback
-        yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
+        yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
 
    '''
    logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
@@ -251,21 +295,25 @@ def convert_batch(
 
    # Zip Processed md Files and images. Insert to first index
    try: ##from file_handler.file_utils
+        #progress((13,16), desc="Zipping processed files and images")
        zipped_processed_files = zip_processed_files(root_dir=f"data/{output_dir_string}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
        logs_files_images.insert(0, zipped_processed_files)
        #logs_files_images.insert(1, "====================")
-        yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"__init__.py"
+
+        #progress((14,16), desc="Zipped processed files and images")
+        #yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"dummy_log.log"
 
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during zipping processed files → {exc}\n{tb}" , exc_info=True) # Log the full traceback
        #traceback.print_exc() # Print the exception traceback
        #return gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
-        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
+        yield gr.update(interactive=True), f"✗ An error occurred during zipping files → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" # return the exception message
 
 
    # Return processed files log
    try:
+        #progress((15,16), desc="Formatting processed log results")
        ## # Convert logs list of dicts to formatted json string
        logs_return_formatted_json_string = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
        #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
@@ -277,10 +325,11 @@ def convert_batch(
        logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
        logger.log(level=20, msg="File conversion complete. Sending outcome to Gradio:", extra={"logs_files_image_return": str(logs_files_images_return)}) ## debug: FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Error or no image_path'
 
+        #progress((16,16), desc="Complete processing and formatting file processing results")
        #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
        #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
 
-        yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
+        yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True) ##SMY: redundant
        return [gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)]
        #yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
        #return [gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return]
@@ -289,8 +338,8 @@ def convert_batch(
        tb = traceback.format_exc()
        logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
        #traceback.print_exc() # Print the exception traceback
-        #return [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
-        yield [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
+        #return [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
+        yield [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] # return the exception message
 
    #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
    #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
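The substance of the ui/gradio_ui.py change is the progress-tracker pattern: convert_batch now takes a gr.Progress parameter (injected by Gradio because of its default value), runs as a generator that yields interim gr.update(...) tuples to its four outputs, and wraps pool.map(...) in progress.tqdm(...) so the UI receives a heartbeat per converted file instead of a single response after everything finishes. The empty dummy_log.log added in this commit evidently gives the file-output component a real path to serve during those interim yields. A stripped-down sketch of the pattern under those assumptions (hypothetical work function standing in for pdf2md_converter.convert_files):

```python
# Sketch of the generator + gr.Progress + ProcessPoolExecutor pattern.
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import gradio as gr

Path("dummy_log.log").touch()  # placeholder file for the gr.File output

def work(path: str) -> str:
    return f"converted {path}"  # stand-in for a slow per-file conversion

def convert_batch(files: list[str], progress: gr.Progress = gr.Progress()):
    results = []
    yield gr.update(interactive=False), "Starting ...", {"process": "starting"}, "dummy_log.log"
    with ProcessPoolExecutor(max_workers=2) as pool:
        # progress.tqdm advances the progress bar as each mapped result arrives
        for result in progress.tqdm(pool.map(work, files), total=len(files)):
            results.append(result)
            yield gr.update(interactive=False), f"Done: {result}", {"process": "running"}, "dummy_log.log"
    yield gr.update(interactive=True), "\n".join(results), {"process": "complete"}, "dummy_log.log"

with gr.Blocks() as demo:
    files = gr.Files(file_types=[".pdf"], label="PDFs")
    btn = gr.Button("Process")
    log = gr.Textbox(label="log")
    status = gr.JSON(label="status")
    download = gr.File(label="download")
    btn.click(convert_batch, inputs=files, outputs=[btn, log, status, download])

if __name__ == "__main__":
    demo.queue()   # pairs with the main.py change above
    demo.launch()
```

Note that this pairs with the demo.queue() change in main.py: generator handlers need the queue enabled to stream interim updates to the browser.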