baseline08_beta0.2.1_30Sept25: fix oauth_token. Marker converter now initialised with/without login. - pre-load models #UX. Introduce global singleton (constructor). - update README
- converters/extraction_converter.py +23 -11
- globals.py +13 -0
- llm/llm_login.py +3 -3
- llm/openai_client.py +1 -1
- ui/gradio_ui.py +22 -16
- utils/logger.py +2 -2
converters/extraction_converter.py
CHANGED

@@ -23,6 +23,7 @@ logger = get_logger(__name__)
 
 # create/load models. Called to curtail reloading models at each instance
 def load_models():
+    """ Creates Marker's models dict. Initiates download of models """
     return create_model_dict()
 
 # Full document converter
@@ -66,7 +67,7 @@ class DocumentConverter:
         self.max_workers = max_workers ## pass to config_dict["pdftext_workers"]
         self.max_retries = max_retries ## pass to __call__
         self.output_dir = output_dir ## "output_dir": settings.DEBUG_DATA_FOLDER if debug else output_dir,
-        self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
+        self.use_llm = use_llm if use_llm else False #use_llm[0] if isinstance(use_llm, tuple) else use_llm, #False, #True,
         #self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range ##SMY: iterating twice because self.page casting as hint type tuple!
         self.page_range = page_range if page_range else None
         # self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range if isinstance(page_range, str) else None, ##Example: "0,4-8,16" ##Marker parses as List[int] #]debug #len(pdf_file)
@@ -103,6 +104,7 @@ class DocumentConverter:
 
         ##SMY: if falsely empty tuple () or None, pop the "page_range" key-value pair, else do nothing if truthy tuple value (i.e. keep as-is)
         self.config_dict.pop("page_range", None) if not self.config_dict.get("page_range") else None
+        self.config_dict.pop("use_llm", None) if not self.config_dict.get("use_llm") or self.config_dict.get("use_llm") is False or self.config_dict.get("use_llm") == 'False' else None
 
         logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"}) #, "config": str(self.config_dict)})
 
@@ -135,10 +137,18 @@
             raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}") #.with_traceback(tb)
 
         # 4) Load models if not already loaded in reload mode
+        from globals import config_load_models
         try:
-            if 'model_dict' not in globals():
-                #model_dict = self.load_models()
+            if not config_load_models.model_dict or 'model_dict' not in globals():
                 model_dict = load_models()
+                '''if 'model_dict' not in globals():
+                    #model_dict = self.load_models()
+                    model_dict = load_models()'''
+            else: model_dict = config_load_models.model_dict
+        except OSError as exc_ose:
+            tb = traceback.format_exc() #exc.__traceback__
+            logger.warning(f"⚠️ OSError: the paging file is too small (to complete reload): {exc_ose}\n{tb}")
+            pass
         except Exception as exc:
             tb = traceback.format_exc() #exc.__traceback__
             logger.exception(f"✗ Error loading models (reload): {exc}\n{tb}")
@@ -146,12 +156,13 @@
 
 
         # 5) Instantiate Marker's MarkerConverter (PdfConverter) with config managed by config_parser
-        try:
-            llm_service_str = str(self.llm_service).split("'")[1]
+        try: # Assign llm_service if api_token. ##SMY: split and slicing ##Gets the string value
+            llm_service_str = None if api_token == '' or api_token is None or self.use_llm is False else str(self.llm_service).split("'")[1] #
 
-            # sets api_key required by Marker
-            os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or None else self.openai_api_key ##
-
+            # sets api_key required by Marker ## to handle Marker's assertion test on OpenAI
+            #os.environ["OPENAI_API_KEY"] = api_token if api_token !='' or api_token is not None else self.openai_api_key ##SMY: looks lame
+            os.environ["OPENAI_API_KEY"] = api_token if api_token and api_token != '' else os.getenv("OPENAI_API_KEY") or os.getenv("GEMINI_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+            #logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token}) ##debug
 
             config_dict = config_parser.generate_config_dict()
             #config_dict["pdftext_worker"] = self.max_workers #1 ##SMY: move to get_config_dicts()
@@ -160,7 +171,7 @@
             self.converter = MarkerConverter(
                 ##artifact_dict=self.artifact_dict,
                 #artifact_dict=create_model_dict(),
-                artifact_dict=model_dict,
+                artifact_dict=model_dict if model_dict else create_model_dict(),
                 config=config_dict,
                 #config=config_parser.generate_config_dict(),
                 #llm_service=self.llm_service ##SMY expecting str but self.llm_service, is service object marker.services of type BaseServices
@@ -180,8 +191,9 @@
 
             try:
                 ## Enable higher quality processing with LLMs. ## See MarkerOpenAIService,
-                #
-                llm_service
+                # llm_service disused here
+                ##llm_service = llm_service.removeprefix("<class '").removesuffix("'>") # e.g <class 'marker.services.openai.OpenAIService'>
+                #llm_service = str(llm_service).split("'")[1] ## SMY: split and slicing
                 self.use_llm = self.use_llm[0] if isinstance(self.use_llm, tuple) else self.use_llm
                 self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range #if isinstance(self.page_range, str) else None, ##SMY: passing as hint type tuple!
 
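For reference, the new OPENAI_API_KEY line resolves a key in this order: explicit api_token, then OPENAI_API_KEY, GEMINI_API_KEY, HF_TOKEN, HUGGINGFACEHUB_API_TOKEN from the environment. A standalone sketch of the same chain (resolve_api_key is an illustrative name, not part of this commit):

import os
from typing import Optional

def resolve_api_key(api_token: Optional[str]) -> Optional[str]:
    """Sketch only: mirrors the fallback order used in the diff above."""
    if api_token:  # a non-empty explicit token wins
        return api_token
    for var in ("OPENAI_API_KEY", "GEMINI_API_KEY", "HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
        value = os.getenv(var)
        if value:
            return value
    return None

Note the committed one-liner assigns the chained os.getenv(...) result straight into os.environ; if every variable is unset that value is None, and os.environ[...] = None raises TypeError, so an explicit guard like the one above may be worth folding in.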
globals.py
ADDED

@@ -0,0 +1,13 @@
+# opted for singleton as opposed to global variable
+
+# Create a singleton object to hold all shared states
+# This ensures that only one instance of the Config class is ever created
+class Config:
+    """ Single model_dict used across the app """
+    def __init__(self):
+        self.model_dict = {}
+
+# Create a single, shared instance of the Config class
+# Other modules will import and use this instance.
+config_load_models = Config()
+
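The "singleton" guarantee here comes from Python's module cache: every `from globals import config_load_models` returns the same module-level instance, so the models are loaded once per process. A usage sketch matching the call sites in this commit:

from globals import config_load_models
from converters.extraction_converter import load_models

if not config_load_models.model_dict:   # first caller pays the model-load cost
    config_load_models.model_dict = load_models()
model_dict = config_load_models.model_dict  # later callers reuse the same dict

Nothing prevents a second Config() from being constructed, so this is a shared instance by convention rather than an enforced singleton; that is sufficient for the pre-load use case.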
llm/llm_login.py
CHANGED

@@ -41,17 +41,17 @@ def login_huggingface(token: Optional[str] = None):
             logger.info("✔️ hf_login already: whoami()", extra={"mode": "HF Oauth"})
             #return True
         else:
-            login() ##SMY: Not visible/interactive to users
+            login() ##SMY: Not visible/interactive to users on HF Space. ## ProcessPool limitation
             sleep(5) ##SMY pause for login. Helpful: pool async opex
             logger.info("✔️ hf_login already: login()", extra={"mode": "cli"})
             #return True
     except Exception as exc:
         # Respect common env var names; prefer explicit token arg when provided
-        fallback_token = token if token else get_token() or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        fallback_token = token if token else get_token() or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ##SMY: to revisit
         if fallback_token:
             try:
                 login(token=fallback_token)
-                token = fallback_token
+                #token = fallback_token ##debug
                 logger.info("✔️ hf_login through fallback", extra={"mode": "token"}) ##SMY: This only displays if token is provided
             except Exception as exc_token:
                 logger.warning("❌ hf_login_failed through fallback", extra={"error": str(exc_token)})
llm/openai_client.py
CHANGED

@@ -38,7 +38,7 @@ class OpenAIChatClient:
     ) -> None:
 
         try:
-            openai_api_key_env = dotenv.get_key(".env", "OPENAI_API_KEY")
+            openai_api_key_env = dotenv.get_key(".env", "OPENAI_API_KEY") or dotenv.get_key(".env", "GEMINI_API_KEY")
             self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to Hf provider
             self.hf_provider = hf_provider
             self.base_url = base_url #"https://router.huggingface.co/v1" #%22" #HF API proxy
ui/gradio_ui.py
CHANGED

@@ -37,9 +37,13 @@ pdf2md_converter = PdfToMarkdownConverter()
 # User eXperience: Load Marker models ahead of time if not already loaded in reload mode
 ## SMY: 29Sept2025 - Came across https://github.com/xiaoyao9184/docker-marker/tree/master/gradio
 from converters.extraction_converter import load_models
+from globals import config_load_models
 try:
-    if 'model_dict' not in globals():
-        model_dict = load_models()
+    if not config_load_models.model_dict:
+        config_load_models.model_dict = load_models()
+    '''if 'model_dict' not in globals():
+        global model_dict
+        model_dict = load_models()'''
 except Exception as exc:
     #tb = traceback.format_exc() #exc.__traceback__
     logger.exception(f"✗ Error loading models (reload): {exc}") #\n{tb}")
@@ -54,7 +58,7 @@ def get_login_token( api_token_arg, oauth_token: gr.OAuthToken | None=None,):
         oauth_token = oauth_token
     else: get_token()
 
-    return oauth_token.token ##token value
+    return oauth_token.token if oauth_token else '' ##token value or empty string
 
 # pool executor to convert files called by Gradio
 ##SMY: TODO: future: refactor to gradio_process.py and
@@ -109,15 +113,18 @@ def convert_batch(
 
         if is_loggedin_huggingface() and (api_token is None or api_token == ""):
             api_token = get_token() ##SMY: might be redundant
+
+        elif login_huggingface(api_token):
+            # login: Update the Gradio UI to improve user-friendly eXperience
+            yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
         else:
-
-
-            yield gr.update(interactive=False), f"login to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
+            # login: Update the Gradio UI to improve user-friendly eXperience
+            yield gr.update(interactive=False), f"Not logged in to HF: Processing files...", {"process": "Processing files"}, f"__init__.py"
 
     except Exception as exc: # Catch all exceptions
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
-        return gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
+        return [gr.update(interactive=True), f"✗ An error occurred during login_huggingface → {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
 
 
     ## debug
@@ -127,7 +134,7 @@ def convert_batch(
     if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
         logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
         #outputs=[log_output, files_individual_JSON, files_individual_downloads],
-        return gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"__init__.py"
+        return [gr.update(interactive=True), "Initialising ProcessPool: No files uploaded.", {"Upload":"No files uploaded"}, f"__init__.py"]
 
     # Get config values if not provided
     config_file = find_file("config.ini") ##from file_handler.file_utils
@@ -232,7 +239,6 @@
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
         #traceback.print_exc() # Print the exception traceback
-        #return gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
         yield gr.update(interactive=True), f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
 
     '''
@@ -245,7 +251,7 @@ def convert_batch(
 
     # Zip Processed md Files and images. Insert to first index
     try: ##from file_handler.file_utils
-        zipped_processed_files = zip_processed_files(root_dir=f"data/{output_dir_string}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y')
+        zipped_processed_files = zip_processed_files(root_dir=f"data/{output_dir_string}", file_paths=logs_files_images, tz_hours=tz_hours, date_format='%d%b%Y_%H-%M-%S') #date_format='%d%b%Y'
         logs_files_images.insert(0, zipped_processed_files)
         #logs_files_images.insert(1, "====================")
         yield gr.update(interactive=False), f"Processing zip and files: {logs_files_images}", {"process": "Processing files"}, f"__init__.py"
@@ -273,18 +279,18 @@
 
     #outputs=[process_button, log_output, files_individual_JSON, files_individual_downloads],
     #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
-
-
-
+
+    yield gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)
+    return [gr.update(interactive=True), gr.update(value=logs_return_formatted_json_string), gr.update(value=logs_return_formatted_json_string, visible=True), gr.update(value=logs_files_images_return, visible=True)]
     #yield gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
-    return gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return
+    #return [gr.update(interactive=True), logs_return_formatted_json_string, logs_return_formatted_json_string, logs_files_images_return]
 
 except Exception as exc:
     tb = traceback.format_exc()
     logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
     #traceback.print_exc() # Print the exception traceback
-    return gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py" # return the exception message
-
+    #return [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
+    yield [gr.update(interactive=True), f"✗ An error occurred during returning result logs→ {exc}\n{tb}", {"Error":f"Error: {exc}"}, f"__init__.py"] # return the exception message
 
     #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
     #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
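A note on the yield/return mix in convert_batch: Gradio treats a generator callback's yields as streaming updates to the outputs, and a return inside a generator only stops iteration (its value rides on StopIteration and, to our knowledge, is not surfaced by Gradio), so the last yielded tuple is what the user ends up seeing. A reduced sketch of that shape, with do_conversion as a hypothetical stand-in for the ProcessPoolExecutor block:

import gradio as gr

def convert_batch_sketch(pdf_files):
    """Sketch of the generator-callback shape; do_conversion is hypothetical."""
    if not pdf_files:
        yield gr.update(interactive=True), "No files uploaded.", {"Upload": "No files uploaded"}, None
        return  # ends the generator; the yield above is the final UI state
    yield gr.update(interactive=False), "Processing files...", {"process": "Processing files"}, None
    try:
        results = do_conversion(pdf_files)  # placeholder for the ProcessPoolExecutor block
        yield gr.update(interactive=True), "Done.", {"process": "done"}, results
    except Exception as exc:
        yield gr.update(interactive=True), f"Error: {exc}", {"Error": str(exc)}, None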
utils/logger.py
CHANGED

@@ -30,13 +30,13 @@ class JsonFormatter(logging.Formatter):
         ##SMY: TODO: local time
         self.tz_hours = tz_hours if tz_hours else 0
         self.date_format = date_format
-        self.time = datetime.now(timezone.utc) + timedelta(hours=tz_hours if tz_hours else 0)
+        self.time = datetime.now(timezone.utc) + timedelta(hours=tz_hours if tz_hours else 0) ##SMY: TODO: fix timedelta()
 
     def format(self, record: logging.LogRecord) -> str: #
         payload = {
             #"ts": datetime.now(timezone.utc).isoformat(), ## default to 'YYYY-MM-DD HH:MM:SS.mmmmmm',
             #"ts": datetime.now(timezone.utc).strftime("%Y-%m-%d : %H:%M:%S"), ## SMY: interested in datefmt="%H:%M:%S",
-            "ts": self.time.strftime(self.date_format), ## SMY: interested in datefmt="%H:%M:%S",
+            "ts": f"{self.time.strftime(self.date_format)} (UTC)", ## SMY: interested in datefmt="%H:%M:%S",
             "level": record.levelname,
             "logger": record.name,
             "message": record.getMessage(),
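The `##SMY: TODO: fix timedelta()` marks a real limitation: self.time is computed once in __init__, so every record formatted afterwards carries the formatter's creation time, and the "(UTC)" label is only accurate when tz_hours is 0. A per-record variant as a sketch (not the committed code), deriving the timestamp from record.created:

import json
import logging
from datetime import datetime, timedelta, timezone

class JsonFormatterSketch(logging.Formatter):
    """Sketch only: stamps each record at log time, unlike the committed formatter."""
    def __init__(self, date_format: str = "%d%b%Y %H:%M:%S", tz_hours: int = 0):
        super().__init__()
        self.date_format = date_format
        self.tz_hours = tz_hours or 0

    def format(self, record: logging.LogRecord) -> str:
        # record.created is the epoch time of the log call, not formatter creation
        ts = datetime.fromtimestamp(record.created, tz=timezone.utc) + timedelta(hours=self.tz_hours)
        payload = {
            "ts": ts.strftime(self.date_format),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        return json.dumps(payload)

Usage would be the standard handler.setFormatter(JsonFormatterSketch(tz_hours=1)).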