semmyk committed on
Commit 0fd441a · 0 Parent(s)

initial commit


Baseline initial commit

Files changed (48)
  1. .cursorindexingignore +3 -0
  2. .gitattributes +1 -0
  3. .gitignore +227 -0
  4. .python-version +1 -0
  5. .specstory/.gitignore +2 -0
  6. README.md +178 -0
  7. __init__.py +0 -0
  8. converters/__init__.py +1 -0
  9. converters/extraction_converter.py +264 -0
  10. converters/pdf_to_md.py +332 -0
  11. data/output_dir/.gitignore +4 -0
  12. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/.gitignore +4 -0
  13. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.md +0 -0
  14. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_0_Picture_1.jpeg +0 -0
  15. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_11_Figure_9.jpeg +0 -0
  16. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_18_Figure_1.jpeg +0 -0
  17. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_1.jpeg +0 -0
  18. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_9.jpeg +0 -0
  19. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_6_Figure_1.jpeg +0 -0
  20. data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_8_Figure_1.jpeg +0 -0
  21. data/pdf/.gitignore +3 -0
  22. data/pdf/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.pdf +3 -0
  23. file_handler/__init__.py +1 -0
  24. file_handler/file_utils.py +296 -0
  25. llm/__init__.py +0 -0
  26. llm/hf_client.py +244 -0
  27. llm/llm_login.py +70 -0
  28. llm/openai_client.py +91 -0
  29. llm/provider_validator.py +116 -0
  30. main.py +22 -0
  31. pyproject.toml +9 -0
  32. requirements.txt +5 -0
  33. tests/test_converters.py +98 -0
  34. tests/test_file_handler.py +115 -0
  35. tests/test_llm.py +115 -0
  36. tests/test_main_ui.py +148 -0
  37. tests/test_utils.py +94 -0
  38. tests/tests_converter.py +19 -0
  39. ui/__init__.py +0 -0
  40. ui/gradio_ui.py +850 -0
  41. utils/__init__.py +0 -0
  42. utils/config.ini +158 -0
  43. utils/config.py +83 -0
  44. utils/get_arg_name.py +19 -0
  45. utils/get_config.py +18 -0
  46. utils/lib_loader.py +130 -0
  47. utils/logger.py +81 -0
  48. utils/utils.py +15 -0
.cursorindexingignore ADDED
@@ -0,0 +1,3 @@
+
+ # Don't index SpecStory auto-save files, but allow explicit context inclusion via @ references
+ .specstory/**
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,227 @@
+ # data
+ # data/
+ # !pdf/
+ # !output_dir/
+
+ md_to_pdf*
+ html_to_md*
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Certificates
+ *.pem
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ # Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ # poetry.lock
+ # poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ # pdm.lock
+ # pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ # pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # Redis
+ *.rdb
+ *.aof
+ *.pid
+
+ # RabbitMQ
+ mnesia/
+ rabbitmq/
+ rabbitmq-data/
+
+ # ActiveMQ
+ activemq-data/
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
.specstory/.gitignore ADDED
@@ -0,0 +1,2 @@
+ # SpecStory explanation file
+ /.what-is-this.md
README.md ADDED
@@ -0,0 +1,178 @@
+ ---
+ #[project]
+ #name: "parserpdf"
+ name: "parser2md"
+ #title: "parserPDF"
+ title: "parser2md"
+ emoji: 📝
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.0.1
+ app_file: main.py
+ pinned: false
+ license: mit
+ short_description: 'PDF & HTML parser to markdown'
+ version: "0.1.0"
+ readme: "README.md"
+ requires-python: ">=3.12"
+ dependencies: []
+ owner: "research-semmyk"
+ ---
+ # parserPDF
+
+ [![Gradio](https://img.shields.io/badge/Gradio-SDK-amber?logo=gradio)](https://www.gradio.app/)
+ [![Python](https://img.shields.io/badge/Python->=3.12-blue?logo=python)](https://www.python.org/)
+ [![MIT License](https://img.shields.io/badge/License-MIT-yellow?logo=mit)](LICENSE)
+
+ A Gradio-based web application for converting PDF and HTML documents to Markdown. Powered by the Marker library (a pipeline of deep-learning models for document parsing), with optional LLM integration for enhanced processing. Supports batch processing of files and directories via an intuitive UI.
+
+ ## Features
+ - **PDF to Markdown**: Extract text, tables, and images from PDFs using Marker.
+ - **HTML to Markdown**: Convert HTML files to clean Markdown.
+ - **Batch Processing**: Upload multiple files or entire directories.
+ - **LLM Integration**: Optionally use Hugging Face or OpenAI models for advanced conversion (e.g., via Llama or GPT models).
+ - **Customizable Settings**: Adjust model parameters, output formats (Markdown/HTML), page ranges, and more via the UI.
+ - **Output Management**: Generated Markdown files are saved to a configurable output directory, with logs and download links.
+
+ ## Project Structure
+ ```
+ parserpdf/
+ ├── README.md                    # Project documentation
+ ├── requirements.txt             # Python dependencies
+ ├── main.py                      # Entry point – launches the Gradio UI
+ ├── pyproject.toml               # Project configuration
+ ├── .env                         # Environment variables (e.g., API tokens)
+ ├── .gitignore                   # Git ignore rules
+ ├── converters/                  # Conversion logic
+ │   ├── __init__.py
+ │   ├── extraction_converter.py  # Document extraction utilities
+ │   ├── pdf_to_md.py             # Marker-based PDF → Markdown
+ │   ├── html_to_md.py            # HTML → Markdown
+ │   └── md_to_pdf.py             # Markdown → PDF (pending full implementation)
+ ├── file_handler/                # File handling utilities
+ │   ├── __init__.py
+ │   └── file_utils.py            # Helpers for files, directories, and paths
+ ├── llm/                         # LLM client integrations
+ │   ├── __init__.py
+ │   ├── hf_client.py             # Hugging Face client wrapper
+ │   ├── openai_client.py         # Marker OpenAI client
+ │   ├── llm_login.py             # Authentication handlers
+ │   └── provider_validator.py    # Provider validation
+ ├── ui/                          # Gradio UI components
+ │   ├── __init__.py
+ │   └── gradio_ui.py             # UI layout and event handlers
+ ├── utils/                       # Utility modules
+ │   ├── __init__.py
+ │   ├── config.py                # Configuration constants
+ │   ├── config.ini               # Config file for settings
+ │   ├── logger.py                # Logging wrapper
+ │   ├── lib_loader.py            # Loads WeasyPrint library dependencies into the environment
+ │   ├── get_config.py            # Helper for reading configuration values
+ │   ├── get_arg_name.py          # Helper for retrieving argument names
+ │   └── utils.py                 # General utilities and helpers
+ ├── data/                        # Sample data and outputs (gitignored)
+ │   ├── output_dir/              # Output directory
+ │   └── pdf/                     # Sample PDFs
+ ├── logs/                        # Log files (gitignored)
+ ├── tests/                       # Unit tests
+ │   └── tests_converter.py       # Tests for converters
+ └── scrapyard/                   # Development scraps
+
+
+ [Projected]
+ ├── transformers/
+ │   ├── __init__.py
+ │   ├── marker.py                # Marker class
+ │   └── marker_utils.py          # Helpers for the Marker class
+
+ ```
+
+ ## Installation
+ 1. Clone the repository:
+ ```
+ git clone <repo-url>
+ cd parserpdf
+ ```
+
+ 2. Create a virtual environment and install dependencies:
+ ```
+ python -m venv venv
+ source venv/bin/activate  # On Windows: venv\Scripts\activate
+ pip install -r requirements.txt
+ ```
+
+ 3. Set up environment variables (optional, for LLM features):
+ - Create a `.env` file with your API tokens, e.g.:
+ ```
+ HF_TOKEN=hf_xxx
+ OPENAI_API_KEY=sk-xxx
+ ```
+
+ 4. Install Marker (if not in requirements.txt):
+ ```
+ pip install marker-pdf
+ ```
+
+ ## Usage
+ 1. Run the application:
+ ```
+ python main.py
+ ```
+
+ 2. Open the provided local URL (e.g., http://127.0.0.1:7860) in your browser.
+
+ 3. In the UI:
+ - Upload PDF/HTML files or directories via the "PDF & HTML ➜ Markdown" tab.
+ - Configure LLM/Marker settings in the accordions (e.g., select provider, model, tokens).
+ - Click "Process All Uploaded Files" to convert.
+ - View logs, JSON output, and download generated Markdown files.
+
+ ### Example Workflow
+ - Upload a PDF directory.
+ - Set the model to `meta-llama/Llama-4-Maverick-17B-128E-Instruct` (Hugging Face).
+ - Enable LLM if needed and set a page range (e.g., "1-10").
+ - Process: Markdown files with extracted text/images are written to `output_dir`.
+
+ ## Configuration
+ - Edit `utils/config.py` or `utils/config.ini` for defaults (e.g., model ID, output dir).
+ - UI overrides: Adjust sliders for max tokens, temperature, workers, etc.
+
+ ## LLM Providers
+ - **Hugging Face**: Supports inference providers such as Fireworks AI and Together AI.
+ - **OpenAI**: Compatible via router (default: https://router.huggingface.co/v1).
+ - Log in via the UI or the CLI: `huggingface-cli login`.
+
+ ## Output
+ - Markdown files are saved to `output_dir` (default: `./output_dir`).
+ - Images are extracted as JPEGs alongside the Markdown.
+ - Logs go to `logs/` and the UI textbox.
+
+ ## Limitations & TODO
+ - Markdown → PDF is pending full implementation.
+ - The HTML tab is deprecated; use the main tab for mixed uploads.
+ - Large files/directories may require increasing `max_workers`.
+ - No JSON/chunks output yet (flagged for the future).
+
+ ## Contributing
+ Fork the repo, create a branch, and submit a PR.
+
+ Ensure tests pass to verify the application's functionality:
+ ```
+ pytest tests/
+ ```
+ ### Test Structure
+ - tests/test_converters.py: Tests PDF/HTML/Markdown converters, including init, conversion, batch processing, and error handling.
+ - tests/test_file_handler.py: Tests file collection utilities (PDF/HTML/MD paths), data processing, and output directory creation.
+ - tests/test_utils.py: Tests logging setup, config loading, utility functions like is_dict/is_list_of_dicts, and configuration access.
+ - tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
+ - tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.
+
+
+ ## License
+ MIT License. See [LICENSE](LICENSE) for details.
+
+ ## Acknowledgments
+ - Built with [Gradio](https://gradio.app/) for the UI.
+ - PDF parsing via [Marker](https://github.com/VikParuchuri/marker).
+ - LLM integrations using Hugging Face Transformers and OpenAI APIs.
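
For orientation, here is a minimal sketch (not part of this commit) of the token setup the README describes: the `HF_TOKEN` from `.env` is used to authenticate against Hugging Face, which is roughly what the `llm/llm_login.py` helpers are assumed to wrap.

```
import os

from huggingface_hub import login  # the same login call hf_client.py imports

hf_token = os.environ.get("HF_TOKEN")  # typically populated from the .env file
if hf_token:
    login(token=hf_token)  # equivalent to running `huggingface-cli login`
```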
__init__.py ADDED
File without changes
converters/__init__.py ADDED
@@ -0,0 +1 @@
+
converters/extraction_converter.py ADDED
@@ -0,0 +1,264 @@
+ import os
+ from pathlib import Path
+ import traceback
+ #import time
+ from typing import Dict, Any, Type, Optional, Union
+ from pydantic import BaseModel
+
+ from marker.models import create_model_dict
+ #from marker.converters.extraction import ExtractionConverter as MarkerExtractor  ## structured pydantic extraction
+ from marker.converters.pdf import PdfConverter as MarkerConverter  ## full document conversion/extraction
+ from marker.config.parser import ConfigParser  ## process custom configuration
+ from marker.services.openai import OpenAIService as MarkerOpenAIService
+
+ #from llm.hf_client import HFChatClient
+ from llm.openai_client import OpenAIChatClient
+ from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
+ from utils.lib_loader import load_library
+
+ from utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ # Full document converter
+ class DocumentConverter:
+     """
+     Business-logic wrapper using Marker's OpenAI LLM service to
+     convert documents (PDF and HTML files) into Markdown + assets.
+     """
+
+     def __init__(self,
+                  model_id: str,
+                  hf_provider: str,
+                  temperature: float,
+                  top_p: float,
+                  api_token: str,
+                  openai_base_url: str = "https://router.huggingface.co/v1",
+                  openai_image_format: Optional[str] = "webp",
+                  max_retries: Optional[int] = 2,
+                  output_format: str = "markdown",
+                  output_dir: Optional[Union[str, Path]] = "output_dir",
+                  use_llm: Optional[bool] = None,
+                  page_range: Optional[str] = None,  ## e.g. "0,4-8,16"; Marker parses this into List[int]
+                  ):
+
+         self.model_id = model_id
+         self.openai_api_key = api_token  ## to remove the dependency on self.client.openai_api_key
+         self.openai_base_url = openai_base_url
+         self.temperature = temperature
+         self.top_p = top_p
+         self.llm_service = MarkerOpenAIService
+         self.openai_image_format = openai_image_format  ## "png" offers better compatibility
+         self.max_retries = max_retries  ## passed to __call__
+         self.output_dir = output_dir
+         self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm  ##SMY: unwrap if passed as a tuple (no trailing comma, which would re-wrap it)
+         self.page_range = page_range if page_range else None
+
+         # 0) Instantiate the LLM client (OpenAIChatClient): get a provider-agnostic chat function.
+         ##SMY: #future. Plan to integrate into Marker: it uses its own LLM services (clients). As at 1.9.2, there is no Hugging Face client service.
+         try:
+             self.client = OpenAIChatClient(
+                 model_id=model_id,
+                 hf_provider=hf_provider,
+                 api_token=api_token,
+                 temperature=temperature,
+                 top_p=top_p,
+             )
+             logger.log(level=20, msg="✔️ OpenAIChatClient instantiated:", extra={"model_id": self.client.model_id, "chatclient": str(self.client)})
+
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error initialising OpenAIChatClient: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error initialising OpenAIChatClient: {exc}\n{tb}")
+
+         # 1) Define the custom configuration for the Hugging Face LLM.
+         #    Use typing.Dict and typing.Any for flexible dictionary type hints.
+         try:
+             self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)
+
+             ##SMY: if page_range is falsy (empty tuple or None), pop the "page_range" key-value pair; otherwise keep it as-is.
+             if not self.config_dict.get("page_range"):
+                 self.config_dict.pop("page_range", None)
+
+             logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"})
+
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
+
+         # 2) Use Marker's ConfigParser to process the configuration.
+         try:
+             config_parser: ConfigParser = ConfigParser(self.config_dict)
+             logger.log(level=20, msg="✔️ parsed/processed custom config_dict:", extra={"config": str(config_parser)})
+
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
+
+         # 3) Create the artifact dictionary and retrieve the LLM service.
+         try:
+             self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict()
+
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
+
+         # 4) Instantiate Marker's PdfConverter with the config managed by config_parser.
+         try:
+             llm_service_str = str(self.llm_service).split("'")[1]  ##SMY: extract the dotted path from "<class '...'>"
+
+             # Set the api_key required by Marker.
+             os.environ["OPENAI_API_KEY"] = self.openai_api_key or api_token  ## handles Marker's assertion test on OpenAI
+             logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str, "api_token": api_token})  ##debug
+
+             self.converter = MarkerConverter(
+                 artifact_dict=create_model_dict(),
+                 config=config_parser.generate_config_dict(),
+                 llm_service=llm_service_str,  ##SMY: Marker expects the dotted-path string, not the marker.services object
+             )
+
+             logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm": self.converter.use_llm})
+             ##SMY: __init__() must return None, so the converter is exposed as self.converter rather than returned.
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error initialising MarkerConverter: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error initialising MarkerConverter: {exc}\n{tb}")
+
+     # Define the custom configuration for the HF LLM.
+     def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown") -> Dict[str, Any]:
+         """ Define the custom configuration for the Hugging Face LLM. """
+
+         try:
+             ## Enable higher-quality processing with LLMs. See MarkerOpenAIService.
+             llm_service = str(llm_service).split("'")[1]  ##SMY: extract the dotted path, e.g. from "<class 'marker.services.openai.OpenAIService'>"
+             self.use_llm = self.use_llm[0] if isinstance(self.use_llm, tuple) else self.use_llm
+             self.page_range = self.page_range[0] if isinstance(self.page_range, tuple) else self.page_range  ##SMY: unwrap if passed as a tuple
+
+             config_dict = {
+                 "output_format"      : output_format,
+                 "openai_model"       : self.model_id,
+                 "openai_api_key"     : self.client.openai_api_key,
+                 "openai_base_url"    : self.openai_base_url,
+                 "temperature"        : self.temperature,
+                 "top_p"              : self.top_p,
+                 "openai_image_format": self.openai_image_format,  ## "png" offers better compatibility
+                 "max_retries"        : self.max_retries,
+                 "output_dir"         : self.output_dir,
+                 "use_llm"            : self.use_llm,
+                 "page_range"         : self.page_range,
+             }
+             return config_dict
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
+             raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
+
+     ##SMY: flagged for deprecation
+     ##SMY: Marker prefers the default artifact dictionary (marker.models.create_model_dict) instead of overriding it.
+     def get_create_model_dict(self):
+         """
+         Wraps the LLM chat_fn into Marker's artifact_dict
+         and returns an ExtractionConverter for PDFs & HTML.
+         """
+         return create_model_dict()
+
+ ## SMY: Kept for future implementation (and historic reasoning). The classes are kept separate to avoid confusion with the original implementation.
+ '''
+ class DocumentExtractor:
+     """
+     Business-logic wrapper using HFChatClient and Marker to
+     convert documents (PDF, HTML files) into Markdown + assets.
+     Wrapper around the Marker extraction converter for PDFs & HTML.
+     """
+
+     def __init__(self,
+                  provider: str,
+                  model_id: str,
+                  hf_provider: str,
+                  endpoint_url: str,
+                  backend_choice: str,
+                  system_message: str,
+                  max_tokens: int,
+                  temperature: float,
+                  top_p: float,
+                  stream: bool,
+                  api_token: str,
+                  ):
+         # 1) Instantiate the LLM client (HFChatClient): get a provider-agnostic chat function
+         try:
+             self.client = HFChatClient(
+                 provider=provider,
+                 model_id=model_id,
+                 hf_provider=hf_provider,
+                 endpoint_url=endpoint_url,
+                 backend_choice=backend_choice,  # choices=["model-id", "provider", "endpoint"]
+                 system_message=system_message,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 stream=stream,
+                 api_token=api_token,
+             )
+             logger.log(level=20, msg="✔️ HFChatClient instantiated:", extra={"model_id": model_id, "chatclient": str(self.client)})
+
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ Error initialising HFChatClient: {exc}")
+             raise RuntimeError(f"✗ Error initialising HFChatClient: {exc}")
+
+         # 2) Build Marker's artifact dict using the client's chat method
+         self.artifact_dict = self.get_extraction_converter(self.client)
+
+         # 3) Instantiate Marker's ExtractionConverter
+         try:
+             self.extractor = MarkerExtractor(artifact_dict=self.artifact_dict)
+         except Exception as exc:
+             logger.exception(f"✗ Error initialising MarkerExtractor: {exc}")
+             raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}")
+
+     ##SMY: Marker prefers the default artifact dictionary (marker.models.create_model_dict) instead of overriding it.
+     def get_extraction_converter(self, chat_fn):
+         """
+         Wraps the LLM chat_fn into Marker's artifact_dict
+         and returns an ExtractionConverter for PDFs & HTML.
+         """
+         artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
+         return artifact_dict
+ '''
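
For reference, a condensed sketch of the Marker wiring that `DocumentConverter.__init__` performs above, using the same config keys; the values are illustrative and the input path is hypothetical.

```
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

# Custom configuration, mirroring get_config_dict() above.
config_parser = ConfigParser({
    "output_format": "markdown",
    "use_llm": False,              # True enables LLM-assisted processing
    # "page_range": "0,4-8,16",    # Marker parses this string into page indices
})

converter = PdfConverter(
    artifact_dict=create_model_dict(),                   # Marker's default model artifacts
    config=config_parser.generate_config_dict(),
    llm_service="marker.services.openai.OpenAIService",  # dotted-path string, as derived above
)

rendered = converter("data/pdf/sample.pdf")              # hypothetical input file
print(len(rendered.markdown), len(rendered.images))      # Markdown text and extracted images
```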
converters/pdf_to_md.py ADDED
@@ -0,0 +1,332 @@
+ # converters/pdf_to_md.py
+ import os
+ from pathlib import Path
+ from typing import List, Dict, Optional, Union
+ import traceback  ## extract, format and print information about Python stack traces
+ import time
+
+ #from llm.hf_client import HFChatClient
+ from converters.extraction_converter import DocumentConverter  #, DocumentExtractor  ##SMY: should disuse
+ from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir, write_markdown, dump_images
+
+ from utils import config
+ from utils.lib_loader import set_weasyprint_library
+ from utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+ # Define global variables
+ docconverter: DocumentConverter = None
+ converter = None
+
+ # Define the converter in the pool as a serialised object, passed to each worker process.
+ # Note: DocumentConverter must be "picklable".
+ def init_worker(
+     provider: str,
+     model_id: str,
+     hf_provider: str,
+     endpoint_url: str,
+     backend_choice: str,
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+     stream: bool,
+     api_token: str,
+     openai_base_url: str,      # e.g. "https://router.huggingface.co/v1"
+     openai_image_format: str,  # e.g. "webp"
+     max_workers: int,
+     max_retries: int,          # e.g. 2
+     output_format: str,        # e.g. "markdown"
+     output_dir: str,           # e.g. "output_dir"
+     use_llm: bool,
+     page_range: str,
+ ):
+     """
+     Instantiate a DocumentConverter for use in each pool worker.
+     """
+
+     # Initialise the global `converter` in each worker.
+     global docconverter
+     global converter
+
+     ##SMY: kept for future implementation. Replaced with DocumentConverter.
+     '''
+     # 1) Instantiate the DocumentExtractor
+     logger.log(level=20, msg="initialising docextractor:", extra={"model_id": model_id, "hf_provider": hf_provider})
+     try:
+         docextractor = DocumentExtractor(
+             provider=provider,
+             model_id=model_id,
+             hf_provider=hf_provider,
+             endpoint_url=endpoint_url,
+             backend_choice=backend_choice,
+             system_message=system_message,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=stream,
+             api_token=api_token,
+         )
+         logger.log(level=20, msg="✔️ docextractor initialised:", extra={"model_id": model_id, "hf_provider": hf_provider})
+     except Exception as exc:
+         tb = traceback.format_exc()
+         logger.exception(f"init_worker: Error initialising DocumentExtractor → {exc}\n{tb}", exc_info=True)
+         return f"✗ init_worker: error initialising DocumentExtractor → {exc}\n{tb}"
+
+     self.docextractor = docextractor
+     '''
+
+     # 1) Instantiate the DocumentConverter
+     logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider})  ##debug
+     try:
+         docconverter = DocumentConverter(
+             model_id,
+             hf_provider,
+             temperature,
+             top_p,
+             api_token,
+             openai_base_url,
+             openai_image_format,
+             max_retries,
+             output_format,
+             output_dir,
+             use_llm,
+             page_range,
+         )
+         logger.log(level=20, msg="✔️ docconverter initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
+     except Exception as exc:
+         tb = traceback.format_exc()
+         logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
+         return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"
+
+     converter = docconverter.converter
+
+ class PdfToMarkdownConverter:
+     """
+     Wrapper around the Marker library that converts PDFs to Markdown.
+     """
+
+     def __init__(self, options: Dict | None = None):
+         self.options = options or {}
+         self.output_dir_string = ''
+
+     # The global `converter` is set (re-initialised) in each worker after init_worker runs.
+
+     ## moved from extraction_converter (to a standalone extract-to-md step)
+     def extract(self, src_path: str, output_dir: str) -> Dict:
+         """
+         Convert one file (PDF/HTML) to Markdown + images.
+         Writes a `.md` file and any extracted images under `output_dir`.
+         Returns a dict with metadata, e.g. {"file": <file.name>, "images": <count>, "filepath": <filepath>}.
+         """
+
+         try:
+             ##SMY: TODO: convert HTML to PDF. Marker will by default attempt WeasyPrint, which typically raises a 'libgobject-2' error on Windows.
+             # Set the environment variable(s) WeasyPrint needs.
+             set_weasyprint_library()  ## utils.lib_loader.set_weasyprint_library()
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True)  # log the full traceback
+             raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")
+
+         # Run the Marker conversion (with LLM if use_llm is true).
+         try:
+             rendered = converter(src_path)
+             logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True)  # log the full traceback
+             return f"✗ error during extraction → {exc}\n{tb}"
+
+         # Write the Markdown file.
+         try:
+             md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered)
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ error creating md_file → {exc}\n{tb}", exc_info=True)
+
+         # Dump the extracted images.
+         try:
+             images_count, image_path = dump_images(src_path, output_dir, rendered)
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ error counting and creating image_path → {exc}\n{tb}", exc_info=True)
+
+         return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path}
+
+     def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2) -> Union[Dict, str]:
+         """
+         Worker task: convert a file with retry/backoff.
+         Returns the metadata dict on success, or a short log line on failure.
+         """
+
+         try:
+             output_dir = create_outputdir(root=src_path, output_dir_string=self.output_dir_string)
+             logger.info(f"✓ output_dir created: {output_dir}")
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ error creating output_dir → {exc}\n{tb}", exc_info=True)
+             return f"✗ error creating output_dir → {exc}\n{tb}"
+
+         try:
+             if not Path(src_path).name.endswith((".pdf", ".html")):  #,".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"
+                 logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
+                 return f"skipped {Path(src_path).name}"
+         except Exception as exc:
+             tb = traceback.format_exc()
+             logger.exception(f"✗ error during suffix extraction → {exc}\n{tb}", exc_info=True)
+             return f"✗ error during suffix extraction → {exc}"
+
+         for attempt in range(1, max_retries + 1):
+             try:
+                 info = self.extract(str(src_path), str(output_dir.stem))
+                 logger.log(level=20, msg=f"✓ : info about extracted {Path(src_path).name}: ", extra={"info": str(info)})
+                 return info  ##SMY: simply return the dict; formatting is left to the calling Gradio code
+             except Exception as exc:
+                 if attempt == max_retries:
+                     tb = traceback.format_exc()
+                     return f"✗ {Path(src_path).name} → {exc}\n{tb}"
+
+                 # Exponential backoff before retry.
+                 logger.warning(f"Attempt {attempt} failed for {Path(src_path).name}: {exc}. Retrying in {2 ** attempt}s...")
+                 time.sleep(2 ** attempt)
+
+     ## SMY: unused
+     #===================== discarded
+     '''
+     def convert(self, pdf_path: Path) -> str:
+         """
+         Convert a single PDF file to a Markdown string.
+
+         Parameters
+         ----------
+         pdf_path : pathlib.Path
+             Path to the source PDF.
+
+         Returns
+         -------
+         str
+             The extracted Markdown content.
+         """
+         logger.info(f"Converting {pdf_path} → Markdown")
+         try:
+             md_text = self.marker.extract_markdown(str(pdf_path))
+             return md_text
+         except Exception as exc:
+             logger.exception("Marker failed to convert PDF.")
+             raise RuntimeError(f"Failed to convert {pdf_path}") from exc
+
+     def batch_convert(self, pdf_paths: List[Path]) -> Dict[str, str]:
+         """
+         Convert multiple PDFs and return a mapping of filename → Markdown.
+
+         Parameters
+         ----------
+         pdf_paths : list[pathlib.Path]
+             List of PDF files to process.
+
+         Returns
+         -------
+         dict
+             Mapping from original file name (without extension) to Markdown string.
+         """
+         results = {}
+         for p in pdf_paths:
+             try:
+                 md = self.convert(p)
+                 key = p.stem  # filename without .pdf
+                 results[key] = md
+             except Exception as exc:
+                 logger.warning(f"Skipping {p}: {exc}")
+         return results
+
+     def convert_file(self, src_path: Path, extractor: DocumentConverter):
+         """
+         Converts one PDF or HTML file to Markdown + images
+         with retry/backoff on errors.
+         """
+         path = src_path
+         out_dir = path.parent / self.OUTPUT_DIR
+         out_dir.mkdir(parents=True, exist_ok=True)
+
+         for attempt in range(1, self.MAX_RETRIES + 1):
+             try:
+                 rendered = extractor.converter(str(path), use_llm=True)
+
+                 # Write Markdown
+                 md_file = out_dir / f"{path.stem}.md"
+                 md_file.write_text(rendered.markdown, encoding="utf-8")
+
+                 # Dump images
+                 for name, content in rendered.images.items():
+                     (out_dir / name).write_bytes(content)
+
+                 print(f"[ok] {path.name}")
+                 return
+
+             except Exception as e:
+                 if attempt == self.MAX_RETRIES:
+                     print(f"[fail] {path.name} after {attempt} attempts")
+                     traceback.print_exc()
+                 else:
+                     backoff = 2 ** attempt
+                     print(f"[retry] {path.name} in {backoff}s ({e})")
+                     time.sleep(backoff)
+     '''
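
As a sketch of how `init_worker` above is intended to be used (the actual wiring lives in the Gradio UI code), each pool worker builds its own global `converter` once via the initializer, and `convert_files` then runs per file. All values below are placeholders.

```
from concurrent.futures import ProcessPoolExecutor
from functools import partial

from converters import pdf_to_md

# Bind init_worker's many parameters once; the partial is picklable because
# init_worker is a module-level function. Values here are examples only.
init = partial(
    pdf_to_md.init_worker,
    provider="huggingface", model_id="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
    hf_provider="fireworks-ai", endpoint_url="", backend_choice="provider",
    system_message="", max_tokens=4096, temperature=0.1, top_p=0.95,
    stream=False, api_token="hf_xxx",
    openai_base_url="https://router.huggingface.co/v1",
    openai_image_format="webp", max_workers=4, max_retries=2,
    output_format="markdown", output_dir="output_dir",
    use_llm=False, page_range=None,
)

with ProcessPoolExecutor(max_workers=4, initializer=init) as pool:
    conv = pdf_to_md.PdfToMarkdownConverter()
    results = list(pool.map(conv.convert_files, ["data/pdf/sample.pdf"]))
```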
data/output_dir/.gitignore ADDED
@@ -0,0 +1,4 @@
+ **
+ # !*.md
+ !COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/
+ !.gitignore
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/.gitignore ADDED
@@ -0,0 +1,4 @@
+ !*.md
+ !*.jpeg
+ # !COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/
+ !.gitignore
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.md ADDED
The diff for this file is too large to render. See raw diff
 
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_0_Picture_1.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_11_Figure_9.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_18_Figure_1.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_1.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_9.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_6_Figure_1.jpeg ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_8_Figure_1.jpeg ADDED
data/pdf/.gitignore ADDED
@@ -0,0 +1,3 @@
+ **
+ !*.pdf
+ !.gitignore
data/pdf/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e971eac65cce7be288302e2b1faf8c622b62bb9c8fedb60a3f88ff385c3104c
+ size 2137689
file_handler/__init__.py ADDED
@@ -0,0 +1 @@
+
file_handler/file_utils.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file_handler/file_utils.py
2
+ #import os
3
+ from pathlib import Path
4
+ from itertools import chain
5
+ from typing import List, Union, Any, Mapping
6
+ from PIL import Image
7
+
8
+ import utils.config as config
9
+
10
+ ##SMY: Might be deprecated vis duplicated. See marker/marker/config/parser.py ~ https://github.com/datalab-to/marker/blob/master/marker/config/parser.py#L169
11
+ #def create_outputdir(root: Union[str, Path], out_dir:Union[str, Path] = None) -> Path: #List[Path]:
12
+ def create_outputdir(root: Union[str, Path], output_dir_string:str = None) -> Path: #List[Path]:
13
+ """ Create output dir under the input folder """
14
+
15
+ ''' ##preserved for future implementation if needed again
16
+ root = root if isinstance(root, Path) else Path(root)
17
+ #root = Path(root)
18
+ if not root.exists():
19
+ raise FileNotFoundError(f"Root path {root} does not exist: cannot create output dir.")
20
+ out_dir = out_dir if out_dir else "output_md" ## SMY: default to outputdir in config file = "output_md"
21
+ output_dir = root.parent / out_dir #"md_output" ##SMY: concatenating output str with src Path
22
+ '''
23
+
24
+ ## map to img_path. Opt to putting output within same output_md folder rather than individual source folders
25
+ output_dir_string = output_dir_string if output_dir_string else "output_dir" ##redundant SMY: default to outputdir in config file = "output_md"
26
+ output_dir = Path("data") / output_dir_string #"output_md" ##SMY: concatenating output str with src Path
27
+ output_dir.mkdir(mode=0o2644, parents=True, exist_ok=True)
28
+ return output_dir
29
+
30
+ def is_file_with_extension(path_obj: Path) -> bool:
31
+ """
32
+ Checks if a pathlib.Path object is a file and has a non-empty extension.
33
+ """
34
+ path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
35
+ return path_obj.is_file() and bool(path_obj.suffix)
36
+
37
+ def process_dicts_data(data:Union[dict, list[dict]]):
38
+ """ Returns formatted JSON string for a single dictionary or a list of dictionaries"""
39
+ import json
40
+ from pathlib import WindowsPath
41
+ #from typing import dict, list
42
+
43
+ # Serialise WindowsPath objects to strings using custom json.JSoNEncoder subclass
44
+ class PathEncoder(json.JSONEncoder):
45
+ def default(self, obj):
46
+ if isinstance(obj, WindowsPath):
47
+ return str(obj)
48
+ # Let the base class default method raise the TypeError for other types
49
+ return json.JSONEncoder.default(self, obj)
50
+
51
+ # Convert the list of dicts to a formatted JSON string
52
+ formatted_string = json.dumps(data, indent=4, cls=PathEncoder)
53
+
54
+ return formatted_string
55
+
56
+ ##NB: Python =>3.10, X | Y equiv to the type checker as Union[X, Y]
57
+ def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
58
+ """
59
+ Recursively walk *root* and return a list of all PDF files.
60
+ """
61
+ root = Path(root)
62
+ patterns = ["*.pdf", "*.html"] #, "*.htm*"]
63
+ if not root.exists():
64
+ raise FileNotFoundError(f"Root path {root} does not exist.")
65
+ #pdfs_htmls = [p for p in root.rglob("*.pdf", "*.html", "*.htm*") if p.is_file()]
66
+ #pdfs_htmls = [chain.from_iterable(root.rglob(pattern) for pattern in patterns)]
67
+ # Use itertools.chain to combine the generators from multiple rglob calls
68
+ pdfs_htmls = list(chain.from_iterable(root.rglob(pattern) for pattern in patterns))
69
+
70
+ return pdfs_htmls
71
+
72
+ def collect_pdf_paths(root: Union[str, Path]) -> List[Path]:
73
+ """
74
+ Recursively walk *root* and return a list of all PDF files.
75
+ """
76
+ root = Path(root)
77
+ if not root.exists():
78
+ raise FileNotFoundError(f"Root path {root} does not exist.")
79
+ pdfs = [p for p in root.rglob("*.pdf") if p.is_file()]
80
+ return pdfs
81
+
82
+ def collect_html_paths(root: Union[str, Path]) -> List[Path]:
83
+ """
84
+ Recursively walk *root* and return a list of all PDF files.
85
+ """
86
+ root = Path(root)
87
+ if not root.exists():
88
+ raise FileNotFoundError(f"Root path {root} does not exist.")
89
+ htmls = [p for p in root.rglob("*.html", ".htm") if p.is_file()]
90
+
91
+ ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint which typically raise 'libgobject-2' error on Win
92
+
93
+ return htmls
94
+
95
+ def collect_markdown_paths(root: Union[str, Path]) -> List[Path]:
96
+ """
97
+ Recursively walk *root* and return a list of all Markdown files.
98
+ """
99
+ root = Path(root)
100
+ md_files = [p for p in root.rglob("*.md") if p.is_file()]
101
+ return md_files
102
+
103
+ #m __future__ import annotations
104
+ def write_markdown(
105
+ src_path: Union[str, Path],
106
+ output_dir: Union[str, Path],
107
+ rendered: Any,
108
+ ) -> Path:
109
+
110
+ """
111
+ Write the Markdown representation of a source file to an output directory.
112
+
113
+ Parameters
114
+ ----------
115
+ src_path : str | Path
116
+ Path to the original source file. Only its base name is used for naming
117
+ the resulting Markdown file.
118
+ output_dir : str | Path
119
+ Directory where the Markdown file will be written. It was created if it does not
120
+ exist with create_outputdir().
121
+ rendered : object
122
+ Object that provides a ``markdown`` attribute containing the text to write.
123
+
124
+ Returns
125
+ -------
126
+ pathlib.Path
127
+ The full path of the written Markdown file.
128
+
129
+ Raises
130
+ ------
131
+ FileNotFoundError
132
+ If *src_path* does not point to an existing file.
133
+ OSError
134
+ If writing the file fails for any reason (e.g. permission denied).
135
+ AttributeError
136
+ If *rendered* does not expose a ``markdown`` attribute.
137
+
138
+ Notes
139
+ -----
140
+ The function is intentionally lightweight: it only handles path resolution,
141
+ directory creation, and file I/O. All rendering logic should be performed before
142
+ calling this helper.
143
+ """
144
+ src = Path(src_path)
145
+ if not src.is_file():
146
+ raise FileNotFoundError(f"Source file does not exist: {src}")
147
+
148
+ #out_dir = Path(output_dir)
149
+ #out_dir.mkdir(parents=True, exist_ok=True)
150
+
151
+ md_name = f"{src.stem}.md"
152
+ if isinstance(output_dir, Path):
153
+ md_path = output_dir / f"{src.stem}" / md_name
154
+ else:
155
+ #md_path = Path(src.parent) / f"{Path(output_dir).stem}" / f"{src.stem}" / md_name
156
+
157
+ ## Opt to putting output within same output_md folder rather than individual source folders
158
+ #md_path = Path("data\\pdf") / "output_md" / f"{src.stem}" / md_name ##debug
159
+ md_path = Path("data") / output_dir / f"{src.stem}" / md_name ##debug
160
+ ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
161
+ md_path.parent.mkdir(mode=0o2644, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
162
+ md_path.parent.chmod(0)
163
+
164
+ try:
165
+ markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
166
+ except AttributeError as exc: # pragma: no cover
167
+ raise AttributeError(
168
+ "Extractor Rendered object must have a 'markdown' attribute"
169
+ ) from exc
170
+
171
+ with md_path.open(mode="w", encoding="utf-8") as md_f:
172
+ md_f.write(markdown_text) ##SMY: write markdown content to markdown file
173
+
174
+ return md_path ##SMY: return the markdown file #✓
175
+ #return {"files": md_path} ##SMY: return dict of file with markdown filename.
176
+
177
+ # Dummp Markdown extracted images
178
+ def dump_images(
179
+ src_path: Union[str, Path],
180
+ output_dir: Union[str, Path],
181
+ rendered: Any,
182
+ ) -> int:
183
+
184
+ """
185
+ Dump the images of the Markdown representation of a source file to an output directory.
186
+
187
+ Parameters
188
+ ----------
189
+ src_path : str | Path
190
+ Path to the original source file. Only its base name is used for naming
191
+ the resulting Markdown file.
192
+ output_dir : str | Path
193
+ Directory where the Markdown file will be written. It was created if it does not
194
+ exist with create_outputdir().
195
+ rendered : object
196
+ Object that provides a ``markdown`` attribute containing the text to write.
197
+
198
+ Returns
199
+ -------
200
+ Number of images dumped from the Markdown file.
201
+ """
202
+
203
+ try:
204
+ images: Image.Image = getattr(rendered, "images")
205
+ except TypeError as exc: # pragma: no cover
206
+ raise AttributeError(
207
+ "Extracted images from rendered.images must be a mapping of str -> PIL.Image"
208
+ ) from exc
209
+
210
+ # Initialise variables
211
+ images_count = 0
212
+ img_path_list = []
213
+ ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
214
+ #for img_name, img_bytes in images.items():
215
+
216
+ src = Path(src_path) ##SMY: keep uniform with write_markdown. No need is exists anymore
217
+ for img_name, img in images.items():
218
+ # Resolve the full path and make sure any sub‑directories exist.
219
+ #img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
220
+ #img_path = create_outputdir(src_path) / img_name
221
+
222
+ if isinstance(output_dir, Path):
223
+ img_path = output_dir.stem / img_name
224
+ else:
225
+ # #img_path = Path(output_dir) / f"{src.stem}" / img_name ##SMY: create markdown file ##SMY concatenating Path with str
226
+ # #img_path = Path(output_dir) / img_name ##SMY: create markdown file ##SMY concatenating Path with str
227
+ #img_path = Path(src.parent) / f"{Path(output_dir).stem}" / f"{src.stem}" / img_name
228
+
229
+ #img_path = Path("data\\pdf") / "output_md" / f"{src.stem}" / img_name ##debug
230
+ img_path = Path("data") / output_dir / f"{src.stem}" / img_name ##debug
231
+ ##SMY: ensure the nested image folder exists before saving
232
+ img_path.parent.mkdir(parents=True, exist_ok=True)
233
+
234
+ img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
235
+ images_count += 1
236
+ #img_path_list = img_path_list.append(img_path)
237
+ img_path_list.append(img_path)
238
+
239
+ return images_count, img_path_list ##SMY: return number of images and path
240
+ #return images.items().count
241
+ #return len(images)
242
+
243
+ # Dump Markdown-extracted images ##SMY: marked for deprecation
244
+ '''
245
+ def dump_images(
246
+ src_path: Union[str, Path],
247
+ output_dir: Union[str, Path],
248
+ rendered: Any,
249
+ ) -> int:
250
+
251
+ """
252
+ Dump the images of the Markdown representation of a source file to an output directory.
253
+
254
+ Parameters
255
+ ----------
256
+ src_path : str | Path
257
+ Path to the original source file. Only its base name is used for naming
258
+ the resulting Markdown file.
259
+ output_dir : str | Path
260
+ Directory where the Markdown file will be written. It was created if it does not
261
+ exist with create_outputdir().
262
+ rendered : object
263
+ Object that provides a ``markdown`` attribute containing the text to write.
264
+
265
+ Returns
266
+ -------
267
+ Number of images dumped from the Markdown file.
268
+ """
269
+
270
+ try:
271
+ images: Mapping[str, bytes] = getattr(rendered, "images")
272
+ except TypeError as exc: # pragma: no cover
273
+ raise AttributeError(
274
+ "Extracted images from rendered.images must be a mapping of str -> bytes"
275
+ ) from exc
276
+
277
+ images_count = 0
278
+ ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
279
+ #for img_name, img_bytes in images.items():
280
+ for img_name, img in images.items():
281
+ # Resolve the full path and make sure any sub‑directories exist.
282
+ img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
283
+ img_path.parent.mkdir(parents=True, exist_ok=True)
284
+
285
+ #'' '
286
+ #with img_path.open("wb") as fp:
287
+ # fp.write(img_bytes) ##SMY: write images to markdown folder
288
+ #images_count += 1
289
+ #'' '
290
+ img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
291
+ images_count += 1
292
+
293
+ return images_count ##SMY: return number of images
294
+ #return images.items().count
295
+ #return len(images)
296
+ '''
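
A minimal usage sketch for the two helpers above, assuming both share the (src_path, output_dir, rendered) signature and that `rendered` is a marker-style object exposing a `markdown` string and an `images` mapping of name -> PIL.Image; the paths are illustrative:

    from pathlib import Path

    # `rendered` would come from a marker converter run elsewhere in this repo
    md_path = write_markdown("data/pdf/sample.pdf", Path("data/output_dir"), rendered)
    count, image_paths = dump_images("data/pdf/sample.pdf", Path("data/output_dir"), rendered)
    print(f"wrote {md_path} plus {count} image(s)")
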
llm/__init__.py ADDED
File without changes
llm/hf_client.py ADDED
@@ -0,0 +1,244 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Literal, Optional
4
+ import os
5
+ import time
6
+ import traceback
7
+ from huggingface_hub import InferenceClient, login, logout as hf_logout
8
+
9
+ from llm.llm_login import login_huggingface, is_login_huggingface
10
+
11
+ from utils.logger import get_logger
12
+
13
+ ## Get logger instance
14
+ logger = get_logger(__name__)
15
+
16
+
17
+ class HFChatClient:
18
+ """
19
+ Provider‐agnostic LLM client interface.
20
+ Encapsulate `huggingface_hub.InferenceClient` setup and chat calls.
21
+
22
+ Backends:
23
+ - model: plain HF model id (e.g., "HuggingFaceH4/zephyr-7b-beta")
24
+ - provider: provider-routed id (e.g., "openai/gpt-oss-120b:fireworks-ai")
25
+ - endpoint: full inference endpoint URL (e.g., "http://localhost:1234").
26
+ """
27
+
28
+ def __init__(self,
29
+ #api_token: str,
30
+ #model_id: str = "gpt2",
31
+ provider: str = "huggingface", ## "huggingface2", "openai"
32
+ model_id: str = "openai/gpt-oss-120b", ##default_model
33
+ hf_provider: str = "huggingface",
34
+ endpoint_url: Optional[str] = None,
35
+ #backend: Literal["model", "provider", "endpoint"] = [],
36
+ backend_choice: Optional[str] = None, #choices=["model-id", "provider", "endpoint"]
37
+ system_message: str = "",
38
+ max_tokens: int = 4096,
39
+ temperature: float = 0.0,
40
+ top_p: float = 0.1,
41
+ stream: bool = False,
42
+ api_token: Optional[str] = None
43
+ ) -> None:
44
+
45
+ try:
46
+ self.model_id = model_id
47
+ self.provider = provider.lower()
48
+ self.hf_provider = hf_provider.lower()
49
+ self.endpoint_url = endpoint_url
50
+ #self.backend = backend
51
+ #self.backend_literal: Literal["model", "provider", "endpoint"] = (
52
+ '''
53
+ self.backend: Literal["model", "provider", "endpoint"] = (
54
+ "model" if backend_choice == "Hugging Face Model ID" else (
55
+ "provider" if backend_choice == "HF Provider Route" else "endpoint")
56
+ ),
57
+ '''
58
+ self.backend: Literal["model", "provider", "endpoint"] = (
59
+ "model" if backend_choice == "model-id" else (
60
+ "provider" if backend_choice == "provider" else "endpoint")
61
+ ) ## see Gradio backend_choice dropdown
62
+ self.system_message = system_message
63
+ self.max_tokens = max_tokens
64
+ self.temperature = temperature
65
+ self.top_p = top_p
66
+ self.stream = stream
67
+ self.token = api_token or None ##SMY: empty-string tokens are invalid; normalise to None
68
+ #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
69
+
70
+ self.base_url = "https://router.huggingface.co/v1" ## HF API proxy
71
+ except Exception as exc:
72
+ #logger.error(f"client_init_failed", extra={"error": str(exc)}")
73
+ tb = traceback.format_exc()
74
+ logger.exception(f"✗ client_init_failed: {exc}\n{tb}", exc_info=True)
75
+ raise RuntimeError(f"✗ Failed to initialise client: {exc}\n{tb}")
76
+
77
+ ##SMY: //TOBE: Deprecated : Moved to llm.llm_login
78
+ '''
79
+ # # Disable implicit token propagation for determinism
80
+ # Explicitly disable implicit token propagation; we rely on explicit auth or env var
81
+ os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
82
+
83
+ # Privacy-first login: try interactive CLI first; fallback to provided/env token only if needed
84
+ try:
85
+ login()
86
+ time.sleep(15) ##SMY pause for login. Helpful: pool async opex
87
+ logger.info("hf_login", extra={"mode": "cli"})
88
+ except Exception as exc:
89
+ # Respect common env var names; prefer explicit token arg when provided
90
+ fallback_token = self.token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
91
+ if fallback_token:
92
+ try:
93
+ login(token=fallback_token)
94
+ self.token = fallback_token
95
+ logger.info("hf_login", extra={"mode": "token"})
96
+ except Exception as exc_token:
97
+ logger.warning("hf_login_failed", extra={"error": str(exc_token)})
98
+ else:
99
+ logger.warning("hf_login_failed", extra={"error": str(exc)})
100
+ # Silent fallback; client will still work if token is passed directly
101
+ #pass
102
+ '''
103
+ login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
104
+
105
+ @staticmethod
106
+ def _normalise_history(history: list, system_message: str, latest_user_message: str) -> list[dict]:
107
+ """
108
+ `prompt` prefixed by system_message if set
109
+ Normalise chat history to list of {"role": role, "content": content} dicts.
110
+ Supports both dict and tuple formats for history items.
111
+ """
112
+ messages: list[dict] = []
113
+ if system_message:
114
+ messages.append({"role": "system", "content": system_message})
115
+ for item in history or []:
116
+ if isinstance(item, dict) and "role" in item and "content" in item:
117
+ if item["role"] in ("user", "assistant"):
118
+ messages.append({"role": item["role"], "content": item["content"]})
119
+ elif isinstance(item, (list, tuple)) and len(item) == 2:
120
+ usr, asst = item
121
+ if usr:
122
+ messages.append({"role": "user", "content": usr})
123
+ if asst:
124
+ messages.append({"role": "assistant", "content": asst})
125
+ messages.append({"role": "user", "content": latest_user_message})
126
+ return messages
127
+
128
+ @staticmethod
129
+ def _initialise_client(self,
130
+ backend: Literal["model", "provider", "endpoint"],
131
+ model_id: Optional[str] = None,
132
+ hf_provider: Optional[str] = None,
133
+ endpoint_url: Optional[str] = None,
134
+ token: Optional[str] = None) -> InferenceClient:
135
+
136
+ try:
137
+ match backend:
138
+ case "endpoint" | "model":
139
+ logger.debug("_initialise_client: initialising with:", extra={"model":model_id}) ## debug
140
+ hf_client = InferenceClient(model=endpoint_url if backend == "endpoint" else model_id, token=token) ##SMY: the endpoint backend targets the URL; the model backend targets the model id
141
+ logger.log(20, "client: ", extra={"model":model_id}) ## debug
142
+ case "provider":
143
+ logger.info("_initialise_client: initialising with:", extra={"provider":hf_provider}) ## debug
144
+ hf_client = InferenceClient(provider=hf_provider, model=model_id, token=token) ##, token=api_token or self.token)
145
+ #client = client(model = model_id, provider=provider, token=token) ##target
146
+ logger.log(20, "client: ", extra={"backend":backend}) ## debug
147
+ case _:
148
+ raise ValueError("Invalid backend.")
149
+ return hf_client
150
+ except Exception as exc:
151
+ logger.log(40, "_initialise_client: client_init_failed", extra={"error": str(exc)}) ## debug
152
+ raise RuntimeError(f"_initialise_client: Failed to initialise client: {exc}")
153
+
154
+ ## wrap HF client for marker
155
+ def chat_fn(
156
+ self,
157
+ message: str,
158
+ history: list = [],
159
+ ) -> Iterable[str]:
160
+ """
161
+ messages = self._normalise_history(history, system_message, message)
162
+ token = api_token or self.token
163
+ """
164
+ ## set prompt and token
165
+ messages = self._normalise_history(history, self.system_message, message)
166
+ #token = api_token or self.token
167
+ #token = self.token ## redundant
168
+
169
+ logger.log(20,"chat: initialising client", extra={
170
+ "backend": self.backend, "model": self.model_id, "provider": self.hf_provider, "endpoint": self.endpoint_url,
171
+ "stream": self.stream, "max_tokens": self.max_tokens, "temperature": self.temperature, "top_p": self.top_p,
172
+ })
173
+
174
+ ## initialised client
175
+ try:
176
+ client = self._initialise_client(self.backend, self.model_id, self.hf_provider, self.endpoint_url, self.token) #api_token)
177
+ logger.log(20, "chat: client initialised") ## debug
178
+ except Exception as exc:
179
+ ##logger.error
180
+ logger.log(40,"chat client_init_failed", extra={"error": str(exc)})
181
+ raise RuntimeError(f"chat: Failed to initialise client: {exc}")
182
+
183
+ logger.log(20, "chat_start", extra={
184
+ "backend": self.backend, "model": self.model_id, "provider": self.hf_provider, "endpoint": self.endpoint_url,
185
+ "stream": self.stream, "max_tokens": self.max_tokens, "temperature": self.temperature, "top_p": self.top_p,
186
+ })
187
+
188
+ if self.stream:
189
+ acc = ""
190
+ for chunk in client.chat_completion(
191
+ messages=messages,
192
+ #model=client.model, ## moved back to client initialise
193
+ max_tokens=self.max_tokens,
194
+ stream=True,
195
+ temperature=self.temperature,
196
+ top_p=self.top_p,
197
+ ):
198
+ delta = getattr(chunk.choices[0].delta, "content", None) or ""
199
+ if delta:
200
+ acc += delta
201
+ yield acc
202
+ return
203
+
204
+ result = client.chat_completion(
205
+ messages=messages,
206
+ #model=client.model, ## moved back to client initialised
207
+ max_tokens=self.max_tokens,
208
+ stream=False,
209
+ temperature=self.temperature,
210
+ top_p=self.top_p,
211
+ )
212
+ yield result.choices[0].message.content
213
+
214
+ '''
215
+ ## future consideration
216
+ response = client.text_generation(
217
+ #model=model_name,
218
+ inputs=prompt,
219
+ parameters={
220
+ "max_new_tokens": max_new_tokens,
221
+ "temperature": temperature,
222
+ },
223
+ )
224
+ return response[0].generated_text
225
+ '''
226
+
227
+ def logout(self) -> bool:
228
+ """Logout from Hugging Face and clear in-process tokens.
229
+
230
+ Returns True on success, False otherwise.
231
+ """
232
+ try:
233
+ hf_logout()
234
+ except Exception as exc:
235
+ logger.error("hf_logout_failed", extra={"error": str(exc)})
236
+ return False
237
+ # Clear process environment tokens
238
+ for key in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
239
+ if key in os.environ:
240
+ os.environ.pop(key, None)
241
+ self.token = None
242
+ logger.info("hf_logout_success")
243
+ return True
244
+
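
A minimal sketch of driving HFChatClient directly, assuming a valid Hugging Face token; the model id and provider are the defaults used above, and `chat_fn` is a generator that yields the reply (accumulating chunks when stream=True):

    from llm.hf_client import HFChatClient

    client = HFChatClient(
        backend_choice="provider",        # route through an HF inference provider
        model_id="openai/gpt-oss-120b",
        hf_provider="fireworks-ai",
        api_token="hf_...",               # placeholder token
        stream=False,
    )
    for reply in client.chat_fn("Summarise the abstract in one sentence."):
        print(reply)
    client.logout()
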
llm/llm_login.py ADDED
@@ -0,0 +1,70 @@
1
+ from huggingface_hub import login, logout
2
+ import os
3
+ import traceback
4
+ from time import sleep
5
+ from typing import Optional
6
+
7
+ from utils.logger import get_logger
8
+
9
+ ## Get logger instance
10
+ logger = get_logger(__name__)
11
+
12
+ def login_huggingface(token: Optional[str] = None):
13
+ """
14
+ Login to Hugging Face account. Prioritize CLI login for privacy and determinism.
15
+
16
+ Attempts to log in to Hugging Face Hub.
17
+ First, it tries to log in interactively via the Hugging Face CLI.
18
+ If that fails, it falls back to using a token provided as an argument or
19
+ found in the environment variables HF_TOKEN or HUGGINGFACEHUB_API_TOKEN.
20
+
21
+ If both methods fail, it logs a warning and continues without logging in.
22
+ """
23
+
24
+ logger.info("Attempting Hugging Face login...")
25
+
26
+ # Disable implicit token propagation for determinism
27
+ # Explicitly disable implicit token propagation; we rely on explicit auth or env var
28
+ os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
29
+
30
+ ## explicit token argument takes precedence; env vars are the fallback below
31
+ # Privacy-first login: try interactive CLI first; fallback to provided/env token only if needed
32
+ try:
33
+ login()
34
+ sleep(5) ##SMY pause for login. Helpful: pool async opex
35
+ logger.info("✔️ hf_login already", extra={"mode": "cli"})
36
+ except Exception as exc:
37
+ # Respect common env var names; prefer explicit token arg when provided
38
+ fallback_token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
39
+ if fallback_token:
40
+ try:
41
+ login(token=fallback_token)
42
+ token = fallback_token
43
+ logger.info("✔️ hf_login through fallback", extra={"mode": "token"}) ##SMY: This only displays if token is provided
44
+ except Exception as exc_token:
45
+ logger.warning("❌ hf_login_failed", extra={"error": str(exc_token)})
46
+ else:
47
+ logger.warning("❌ hf_login_failed", extra={"error": str(exc)})
48
+ # Silent fallback; client will still work if token is passed directly
49
+ #pass
50
+
51
+ def is_login_huggingface():
52
+ from huggingface_hub import HfApi
53
+ from huggingface_hub.utils import HfHubHTTPError
54
+
55
+ try:
56
+ HfApi().whoami()
57
+ logger.log(level=20, msg=("✔️ You are logged in."), extra={"is_logged_in": True})
58
+ return True
59
+ except HfHubHTTPError as exc:
60
+ # A 401 status code indicates an authentication error.
61
+ if exc.response.status_code == 401:
62
+ print("⚠️ You are not logged in. You can still access public models.")
63
+ else:
64
+ # Handle other HTTP errors if necessary
65
+ #print(f"An unexpected HTTP error occurred: {exc}")
66
+ tb = traceback.format_exc()
67
+ logger.exception(f"✗ An unexpected HTTP error occurred: → {exc}\n{tb}", exc_info=True)
68
+ #raise RuntimeError(f"✗ An unexpected HTTP error occurred: → {exc}\n{tb}") from exc
69
+ return False
70
+
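
A minimal sketch of the login flow above; the env-var fallback only applies when the interactive CLI login fails:

    from llm.llm_login import login_huggingface, is_login_huggingface

    if not is_login_huggingface():
        # token=None relies on the CLI prompt, then HF_TOKEN / HUGGINGFACEHUB_API_TOKEN
        login_huggingface(token=None)
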
llm/openai_client.py ADDED
@@ -0,0 +1,91 @@
1
+
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional #Iterable, Literal
5
+ #import os
6
+ #import time
7
+ import traceback
8
+ #from huggingface_hub import InferenceClient, login, logout as hf_logout
9
+
10
+ from llm.llm_login import login_huggingface, is_login_huggingface
11
+
12
+ import dotenv
13
+ #dotenv.load_dotenv(".env")
14
+
15
+
16
+ from utils.logger import get_logger
17
+
18
+ ## Get logger instance
19
+ logger = get_logger(__name__)
20
+
21
+
22
+ class OpenAIChatClient:
23
+ """
24
+ Provider‐agnostic OpenAI-based LLM client interface.
25
+ Compatible with `huggingface_hub.InferenceClient` setup and chat calls.
26
+
27
+ - base_url="https://router.huggingface.co/v1",
28
+ - api_key=os.environ["HF_TOKEN"],
29
+ """
30
+
31
+ def __init__(self,
32
+ model_id: Optional[str] = None,
33
+ hf_provider: Optional[str] = None,
34
+ base_url: Optional[str] = "https://router.huggingface.co/v1", #None,
35
+ api_token: Optional[str] = None,
36
+ temperature: Optional[float] = 0.2,
37
+ top_p: Optional[float] = 0.2,
38
+ ) -> None:
39
+
40
+ try:
41
+ openai_api_key_env = dotenv.get_key(".env", "OPENAI_API_KEY")
42
+ self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to Hf provider
43
+ self.hf_provider = hf_provider
44
+ self.base_url = base_url #"https://router.huggingface.co/v1" #%22" #HF API proxy
45
+ #self.token = api_token if api_token else None ##debug
46
+ self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
47
+ #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
48
+ login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
49
+ #self.fake_token = api_token or "a1b2c3" #or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
50
+ self.openai_api_key = self.token #self.fake_token
51
+ self.temperature = temperature
52
+ self.top_p = top_p
53
+
54
+ logger.log(level=2, msg="initialised OpenAIChatClient:", extra={"base_url": self.base_url, "openai_api_key": self.openai_api_key})
55
+
56
+ except Exception as exc:
57
+ #logger.error(f"OpenAI client_init_failed", extra={"error": str(exc)}")
58
+ tb = traceback.format_exc()
59
+ logger.exception(f"✗ OpenAI client_init_failed: {exc}\n{tb}", exc_info=True)
60
+ raise RuntimeError(f"✗ Failed to initialise OpenAI client: {exc}\n{tb}")
61
+
62
+ #login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg=f"logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
63
+
64
+ ####IN PROGRESS
65
+ #
66
+ """
67
+ ## HuggingFace API-proxy Inference Provider - https://huggingface.co/docs/inference-providers/index?python-clients=openai
68
+ ## https://huggingface.co/openai/gpt-oss-20b?inference_api=true&inference_provider=fireworks-ai&language=python&client=openai
69
+
70
+ import os
71
+ from openai import OpenAI
72
+
73
+ client = OpenAI(
74
+ base_url="https://router.huggingface.co/v1",
75
+ api_key=os.environ["HF_TOKEN"],
76
+ )
77
+
78
+ stream = client.chat.completions.create(
79
+ model="openai/gpt-oss-20b:fireworks-ai",
80
+ messages=[
81
+ {
82
+ "role": "user",
83
+ "content": "What is the capital of France?"
84
+ }
85
+ ],
86
+ stream=True,
87
+ )
88
+
89
+ for chunk in stream:
90
+ print(chunk.choices[0].delta.content, end="")
91
+ """
llm/provider_validator.py ADDED
@@ -0,0 +1,116 @@
1
+ """Validate provider names against Hugging Face Inference Providers list.
2
+
3
+ Source: https://huggingface.co/docs/inference-providers/index
4
+
5
+ Functions:
6
+ - get_supported_providers() -> set[str]
7
+ - normalize_provider(text: str) -> str | None
8
+ - is_valid_provider(text: str) -> bool
9
+ - suggest_providers(text: str, limit: int = 3) -> list[str]
10
+
11
+ Supports common aliases (e.g., "together-ai" -> "together", "fireworks" -> "fireworks-ai").
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from difflib import get_close_matches
17
+ from typing import Iterable
18
+
19
+
20
+ # Canonical provider slugs from docs (table and provider URLs)
21
+ _CANONICAL: set[str] = {
22
+ "cerebras",
23
+ "cohere",
24
+ "fal-ai",
25
+ "featherless-ai",
26
+ "fireworks-ai",
27
+ "groq",
28
+ "hf-inference",
29
+ "hyperbolic",
30
+ "nebius",
31
+ "novita",
32
+ "nscale",
33
+ "replicate",
34
+ "sambanova",
35
+ "together",
36
+ }
37
+
38
+ # Common aliases users may type; maps to canonical slug
39
+ _ALIASES: dict[str, str] = {
40
+ "together-ai": "together",
41
+ "fireworks": "fireworks-ai",
42
+ "falai": "fal-ai",
43
+ "featherless": "featherless-ai",
44
+ "hf": "hf-inference",
45
+ "huggingface": "hf-inference",
46
+ }
47
+
48
+
49
+ def _to_key(text: str) -> str:
50
+ return (text or "").strip().lower()
51
+
52
+
53
+ def get_supported_providers(extra: Iterable[str] | None = None) -> set[str]:
54
+ """Return set of canonical provider slugs.
55
+
56
+ Optionally extend with additional slugs via `extra`.
57
+ """
58
+ return _CANONICAL | set(map(_to_key, (extra or [])))
59
+
60
+
61
+ def normalize_provider(text: str) -> str | None:
62
+ """Return canonical provider slug for `text`, if known; else None.
63
+
64
+ Accepts canonical slugs and common aliases.
65
+ """
66
+ key = _to_key(text)
67
+ if not key:
68
+ return None
69
+ if key in _CANONICAL:
70
+ return key
71
+ if key in _ALIASES:
72
+ return _ALIASES[key]
73
+ return None
74
+
75
+
76
+ def is_valid_provider(text: str) -> bool:
77
+ """True if `text` is a known provider or alias."""
78
+ return normalize_provider(text) is not None
79
+
80
+
81
+ def suggest_providers(text: str, limit: int = 3) -> list[str]:
82
+ """Suggest close canonical matches for `text`.
83
+
84
+ Uses difflib to match against canonical slugs; returns up to `limit` suggestions.
85
+ """
86
+ key = _to_key(text)
87
+ if not key:
88
+ return []
89
+ # Search both canonical and alias keys to be helpful, then map to canonical
90
+ candidates = list(_CANONICAL | set(_ALIASES))
91
+ suggestions = get_close_matches(key, candidates, n=limit, cutoff=0.6)
92
+ canon = []
93
+ for s in suggestions:
94
+ canon_slug = s if s in _CANONICAL else _ALIASES.get(s)
95
+ if canon_slug and canon_slug not in canon:
96
+ canon.append(canon_slug)
97
+ return canon[:limit]
98
+
99
+
100
+ if __name__ == "__main__":
101
+ import sys
102
+
103
+ query = " ".join(sys.argv[1:])
104
+ if not query:
105
+ print("Usage: python provider_validator.py <provider-name>")
106
+ raise SystemExit(2)
107
+
108
+ norm = normalize_provider(query)
109
+ if norm:
110
+ print(f"valid: {norm}")
111
+ else:
112
+ print("invalid")
113
+ suggestions = suggest_providers(query)
114
+ if suggestions:
115
+ print("did_you_mean:", ", ".join(suggestions))
116
+
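
A quick sketch of the validator helpers; the suggestion list depends on difflib's scoring, so the output shown is indicative:

    from llm.provider_validator import normalize_provider, is_valid_provider, suggest_providers

    assert normalize_provider("together-ai") == "together"   # alias -> canonical slug
    assert is_valid_provider("hf-inference")
    print(suggest_providers("firewoks"))                     # likely ['fireworks-ai']
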
main.py ADDED
@@ -0,0 +1,22 @@
1
+ # main.py
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from ui.gradio_ui import build_interface
6
+ from utils.logger import get_logger, setup_logging
7
+
8
+ setup_logging() ## set logging
9
+ #logger = get_logger("pypdfmd")
10
+ logger = get_logger("parserpdf")
11
+
12
+ if __name__ == "__main__":
13
+ # Ensure the working directory is the script's directory
14
+ #os.chdir(os.path.dirname(__file__))
15
+ ## script working dir absolute path
16
+ script_dir = Path(__file__).resolve().parent
17
+ ## change the cwd to the script's dir
18
+ os.chdir(script_dir) ##Path.cwd()
19
+
20
+ demo = build_interface()
21
+ #demo.launch(debug=True, show_error=True ,ssr_mode=True) #(share=True) # share=True for public link; remove in production
22
+ demo.launch(debug=True, show_error=True)
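
To run the app locally (Gradio serves on its default port, typically 7860):

    python main.py
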
pyproject.toml ADDED
@@ -0,0 +1,9 @@
1
+ [project]
2
+ #name = "parserpdf"
3
+ name = "parser2md"
4
+ version = "0.1.0"
5
+ description = "PDF & HTML parser to markdown"
6
+ readme = "README.md"
7
+ requires-python = ">=3.12"
8
+ dependencies = []
9
+ owner "research-semmyk"
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ gradio>=4.0
2
+ #marker==1.3.0 # pip install marker (GitHub: https://github.com/datalab-to/marker)
3
+ #pandoc==2.3 # for Markdown → PDF conversion
4
+ #weasyprint==59.0 # optional fallback if pandoc is not available
5
+ python-magic==0.4.27 # file‑type detection
tests/test_converters.py ADDED
@@ -0,0 +1,98 @@
1
+ # tests/test_converters.py
2
+ # run with pytest tests/.
3
+
4
+ import pytest
5
+ import os
6
+ from unittest.mock import patch, MagicMock
7
+ from pathlib import Path
8
+
9
+ from converters.pdf_to_md import PdfToMarkdownConverter
10
+ from converters.html_to_md import HtmlToMarkdownConverter
11
+ from converters.md_to_pdf import MarkdownToPdfConverter
12
+ from converters.extraction_converter import DocumentConverter
13
+
14
+ @pytest.fixture
15
+ def sample_pdf_path():
16
+ # Create a temporary PDF file for testing
17
+ pdf_path = Path("tests/sample.pdf")
18
+ pdf_path.write_bytes(b"%PDF-1.4\nSample PDF content")
19
+ yield pdf_path
20
+ if pdf_path.exists():
21
+ pdf_path.unlink()
22
+
23
+ @pytest.fixture
24
+ def sample_html_path():
25
+ html_path = Path("tests/sample.html")
26
+ html_path.write_text("<html><body><h1>Test</h1><p>Hello World</p></body></html>")
27
+ yield html_path
28
+ if html_path.exists():
29
+ html_path.unlink()
30
+
31
+ @pytest.fixture
32
+ def sample_md_path():
33
+ md_path = Path("tests/sample.md")
34
+ md_path.write_text("# Test\nHello World")
35
+ yield md_path
36
+ if md_path.exists():
37
+ md_path.unlink()
38
+
39
+ def test_pdf_to_markdown_converter_init():
40
+ converter = PdfToMarkdownConverter()
41
+ assert isinstance(converter, PdfToMarkdownConverter)
42
+ assert hasattr(converter, 'output_dir_string')
43
+
44
+ @patch('converters.pdf_to_md.Marker') # Assuming Marker is imported in pdf_to_md.py
45
+ def test_pdf_to_markdown_convert_file(mock_marker, sample_pdf_path):
46
+ mock_marker.convert_single.return_value = {"markdown": "# Converted\nContent", "images": []}
47
+
48
+ converter = PdfToMarkdownConverter()
49
+ result = converter.convert_file(sample_pdf_path)
50
+
51
+ assert isinstance(result, dict)
52
+ assert "markdown" in result
53
+ assert "filepath" in result
54
+ mock_marker.convert_single.assert_called_once_with(str(sample_pdf_path), prefer_latex=False)
55
+
56
+ def test_html_to_markdown_converter(sample_html_path):
57
+ converter = HtmlToMarkdownConverter()
58
+ result = converter.batch_convert([sample_html_path])
59
+
60
+ assert isinstance(result, dict)
61
+ assert Path(sample_html_path.name) in result
62
+ assert result[Path(sample_html_path.name)].startswith("# Test")
63
+
64
+ def test_markdown_to_pdf_converter(sample_md_path):
65
+ converter = MarkdownToPdfConverter()
66
+ output_dir = Path("tests/output_pdf")
67
+ output_dir.mkdir(exist_ok=True)
68
+
69
+ pdf_files = converter.batch_convert([sample_md_path], output_dir)
70
+
71
+ assert isinstance(pdf_files, list)
72
+ if pdf_files:
73
+ pdf_path = pdf_files[0]
74
+ assert pdf_path.exists()
75
+ assert pdf_path.suffix == ".pdf"
76
+ pdf_path.unlink()
77
+
78
+ output_dir.rmdir()
79
+
80
+ @patch('converters.extraction_converter.get_token')
81
+ def test_document_converter_login(mock_get_token):
82
+ mock_get_token.return_value = "test_token"
83
+ converter = DocumentConverter()
84
+ assert converter.client.token == "test_token"
85
+
86
+ def test_pdf_to_markdown_batch_convert(tmp_path):
87
+ # Test batch with multiple files
88
+ pdf1 = tmp_path / "test1.pdf"
89
+ pdf2 = tmp_path / "test2.pdf"
90
+ pdf1.write_bytes(b"%PDF-1.4")
91
+ pdf2.write_bytes(b"%PDF-1.4")
92
+
93
+ converter = PdfToMarkdownConverter()
94
+ with patch.object(converter, 'convert_file', return_value={"markdown": "test", "filepath": str(pdf1)}):
95
+ results = converter.batch_convert([pdf1, pdf2])
96
+
97
+ assert len(results) == 2
98
+ assert all("markdown" in res for res in results)
tests/test_file_handler.py ADDED
@@ -0,0 +1,115 @@
1
+ # tests/test_file_handler.py
2
+ # run with pytest tests/.
3
+
4
+ import pytest
5
+ from pathlib import Path
6
+ import tempfile
7
+ from unittest.mock import patch
8
+
9
+ from file_handler.file_utils import (
10
+ collect_pdf_paths, collect_html_paths, collect_markdown_paths,
11
+ process_dicts_data, create_outputdir
12
+ )
13
+
14
+ @pytest.fixture
15
+ def temp_dir_with_pdfs():
16
+ with tempfile.TemporaryDirectory() as tmpdirname:
17
+ tmpdir = Path(tmpdirname)
18
+ # Create sample PDF files
19
+ (tmpdir / "doc1.pdf").touch()
20
+ (tmpdir / "subfolder/doc2.pdf").mkdir(parents=True)
21
+ (tmpdir / "subfolder/doc2.pdf").touch()
22
+ (tmpdir / "not_pdf.txt").touch()
23
+ yield tmpdir
24
+
25
+ @pytest.fixture
26
+ def temp_dir_with_html():
27
+ with tempfile.TemporaryDirectory() as tmpdirname:
28
+ tmpdir = Path(tmpdirname)
29
+ (tmpdir / "page1.html").touch()
30
+ (tmpdir / "subfolder/page2.htm").mkdir(parents=True)
31
+ (tmpdir / "subfolder/page2.htm").touch()
32
+ (tmpdir / "not_html.md").touch()
33
+ yield tmpdir
34
+
35
+ @pytest.fixture
36
+ def temp_dir_with_md():
37
+ with tempfile.TemporaryDirectory() as tmpdirname:
38
+ tmpdir = Path(tmpdirname)
39
+ (tmpdir / "note1.md").touch()
40
+ (tmpdir / "subfolder/note2.md").mkdir(parents=True)
41
+ (tmpdir / "subfolder/note2.md").touch()
42
+ (tmpdir / "not_md.pdf").touch()
43
+ yield tmpdir
44
+
45
+ def test_collect_pdf_paths(temp_dir_with_pdfs):
46
+ paths = collect_pdf_paths(str(temp_dir_with_pdfs))
47
+ assert len(paths) == 2
48
+ assert all(p.suffix.lower() == '.pdf' for p in paths)
49
+ assert temp_dir_with_pdfs / "doc1.pdf" in paths
50
+ assert temp_dir_with_pdfs / "subfolder/doc2.pdf" in paths
51
+
52
+ def test_collect_pdf_paths_no_pdfs(temp_dir_with_html):
53
+ paths = collect_pdf_paths(str(temp_dir_with_html))
54
+ assert len(paths) == 0
55
+
56
+ def test_collect_html_paths(temp_dir_with_html):
57
+ paths = collect_html_paths(str(temp_dir_with_html))
58
+ assert len(paths) == 2
59
+ assert all(p.suffix.lower() in ['.html', '.htm'] for p in paths)
60
+ assert temp_dir_with_html / "page1.html" in paths
61
+ assert temp_dir_with_html / "subfolder/page2.htm" in paths
62
+
63
+ def test_collect_html_paths_no_html(temp_dir_with_pdfs):
64
+ paths = collect_html_paths(str(temp_dir_with_pdfs))
65
+ assert len(paths) == 0
66
+
67
+ def test_collect_markdown_paths(temp_dir_with_md):
68
+ paths = collect_markdown_paths(str(temp_dir_with_md))
69
+ assert len(paths) == 2
70
+ assert all(p.suffix.lower() == '.md' for p in paths)
71
+ assert temp_dir_with_md / "note1.md" in paths
72
+ assert temp_dir_with_md / "subfolder/note2.md" in paths
73
+
74
+ def test_collect_markdown_paths_no_md(temp_dir_with_pdfs):
75
+ paths = collect_markdown_paths(str(temp_dir_with_pdfs))
76
+ assert len(paths) == 0
77
+
78
+ def test_process_dicts_data():
79
+ sample_logs = [
80
+ {"filepath": Path("file1.md"), "markdown": "Content1", "image_path": ["img1.jpg"]},
81
+ {"filepath": Path("file2.md"), "markdown": "Content2", "image_path": []},
82
+ {"error": "Conversion failed for file3"}
83
+ ]
84
+ result = process_dicts_data(sample_logs)
85
+ assert "file1.md" in result
86
+ assert "Content1" in result
87
+ assert "img1.jpg" in result
88
+ assert "Conversion failed" in result
89
+
90
+ def test_process_dicts_data_empty():
91
+ result = process_dicts_data([])
92
+ assert result == ""
93
+
94
+ def test_process_dicts_data_invalid():
95
+ with pytest.raises(ValueError):
96
+ process_dicts_data([{"invalid": "data"}])
97
+
98
+ def test_create_outputdir(tmp_path):
99
+ output_dir = tmp_path / "test_output"
100
+ create_outputdir(str(output_dir))
101
+ assert output_dir.exists()
102
+ assert output_dir.is_dir()
103
+
104
+ def test_create_outputdir_existing(tmp_path):
105
+ output_dir = tmp_path / "test_output"
106
+ output_dir.mkdir()
107
+ create_outputdir(str(output_dir))
108
+ assert output_dir.exists()
109
+ assert output_dir.is_dir()
110
+
111
+ @patch('pathlib.Path.mkdir')
112
+ def test_create_outputdir_error(mock_mkdir):
113
+ mock_mkdir.side_effect = OSError("Permission denied")
114
+ with pytest.raises(OSError):
115
+ create_outputdir("protected_dir")
tests/test_llm.py ADDED
@@ -0,0 +1,115 @@
1
+ # tests/test_llm.py
2
+ # run with pytest tests/.
3
+ #
4
+ import pytest ## needed for pytest.raises below
5
+ from unittest.mock import patch, MagicMock
6
+ import huggingface_hub
7
+ from huggingface_hub import get_token
8
+
9
+ from llm.llm_login import login_huggingface
10
+ from llm.provider_validator import is_valid_provider, suggest_providers
11
+ from llm.hf_client import HFChatClient # Assuming this exists
12
+ from llm.openai_client import OpenAIClient # Assuming this exists
13
+
14
+ def test_login_huggingface_success():
15
+ with patch('huggingface_hub.login') as mock_login:
16
+ api_token = "hf_test_token"
17
+ login_huggingface(api_token)
18
+ mock_login.assert_called_once_with(token=api_token, add_to_git_credential=False)
19
+
20
+ def test_login_huggingface_no_token():
21
+ with patch('huggingface_hub.login') as mock_login:
22
+ with pytest.raises(ValueError, match="API token required"):
23
+ login_huggingface(None)
24
+
25
+ @patch('huggingface_hub.login')
26
+ def test_login_huggingface_error(mock_login):
27
+ mock_login.side_effect = Exception("Login failed")
28
+ with pytest.raises(Exception, match="Login failed"):
29
+ login_huggingface("invalid_token")
30
+
31
+ def test_is_valid_provider():
32
+ assert is_valid_provider("huggingface") is True
33
+ assert is_valid_provider("openai") is True
34
+ assert is_valid_provider("invalid_provider") is False
35
+ assert is_valid_provider("") is False
36
+ assert is_valid_provider(None) is False
37
+
38
+ def test_suggest_providers():
39
+ suggestions = suggest_providers("hugngface") # Typo example
40
+ assert isinstance(suggestions, list)
41
+ assert "huggingface" in suggestions
42
+
43
+ no_suggestions = suggest_providers("completely_unknown")
44
+ assert isinstance(no_suggestions, list)
45
+ assert len(no_suggestions) == 0
46
+
47
+ @patch('llm.hf_client.HFChatClient.__init__')
48
+ def test_hf_client_init(mock_init):
49
+ mock_init.return_value = None
50
+ client = HFChatClient(model_id="test-model", api_token="test_token")
51
+ mock_init.assert_called_once_with(
52
+ model_id="test-model",
53
+ api_token="test_token",
54
+ # Add other expected params based on actual __init__
55
+ )
56
+
57
+ @patch('llm.hf_client.login_huggingface')
58
+ @patch('llm.hf_client.get_token')
59
+ def test_hf_client_token(mock_get_token, mock_login):
60
+ mock_get_token.return_value = "cached_token"
61
+ mock_login.return_value = None
62
+
63
+ client = HFChatClient(model_id="test-model")
64
+ assert client.api_token == "cached_token"
65
+
66
+ @patch('openai.OpenAI')
67
+ def test_openai_client_init(mock_openai):
68
+ mock_client = MagicMock()
69
+ mock_openai.return_value = mock_client
70
+
71
+ client = OpenAIClient(api_key="sk_test_key", base_url="https://api.openai.com/v1")
72
+ mock_openai.assert_called_once_with(
73
+ api_key="sk_test_key",
74
+ base_url="https://api.openai.com/v1"
75
+ )
76
+ assert client.client == mock_client
77
+
78
+ @patch('openai.OpenAI')
79
+ def test_openai_client_chat(mock_openai):
80
+ mock_client = MagicMock()
81
+ mock_response = MagicMock()
82
+ mock_response.choices = [MagicMock(content="Hello!")]
83
+ mock_client.chat.completions.create.return_value = mock_response
84
+ mock_openai.return_value = mock_client
85
+
86
+ client = OpenAIClient(api_key="sk_test_key")
87
+ response = client.chat("Hello", model="gpt-3.5-turbo")
88
+
89
+ assert response == "Hello!"
90
+ mock_client.chat.completions.create.assert_called_once_with(
91
+ model="gpt-3.5-turbo",
92
+ messages=[{"role": "user", "content": "Hello"}]
93
+ )
94
+
95
+ def test_provider_validator_edge_cases():
96
+ # Test with non-string inputs
97
+ assert is_valid_provider(123) is False
98
+ assert suggest_providers(123) == []
99
+
100
+ # Test case insensitivity
101
+ assert is_valid_provider("HUGGINGFACE") is True
102
+ assert is_valid_provider("OpEnAi") is True
103
+
104
+ @patch('huggingface_hub.get_token')
105
+ def test_get_token_from_env(mock_get_token):
106
+ mock_get_token.return_value = None
107
+ with patch.dict('os.environ', {'HUGGINGFACE_HUB_TOKEN': 'env_token'}):
108
+ token = get_token()
109
+ assert token == 'env_token'
110
+
111
+ @patch('huggingface_hub.get_token')
112
+ def test_get_token_from_cache(mock_get_token):
113
+ mock_get_token.return_value = 'cached_token'
114
+ token = get_token()
115
+ assert token == 'cached_token'
tests/test_main_ui.py ADDED
@@ -0,0 +1,148 @@
1
+ # tests/test_main_ui.py
2
+ # run with pytest tests/.
3
+
4
+ import pytest
5
+ from unittest.mock import patch, MagicMock, ANY
6
+ from pathlib import Path
7
+
8
+ # build_interface is imported from ui.gradio_ui below; main only re-exports it
9
+ from ui.gradio_ui import convert_batch, build_interface, accumulate_files, clear_state, pdf_files_wrap
10
+ from utils.logger import get_logger
11
+
12
+ logger = get_logger("test_main_ui")
13
+
14
+ @pytest.fixture
15
+ def mock_gradio():
16
+ with patch('gradio.Blocks') as mock_blocks, \
17
+ patch('gradio.Markdown') as mock_md, \
18
+ patch('gradio.Accordion') as mock_accordion, \
19
+ patch('gradio.Dropdown') as mock_dropdown, \
20
+ patch('gradio.Textbox') as mock_textbox, \
21
+ patch('gradio.Slider') as mock_slider, \
22
+ patch('gradio.Checkbox') as mock_checkbox, \
23
+ patch('gradio.Button') as mock_button, \
24
+ patch('gradio.File') as mock_file, \
25
+ patch('gradio.UploadButton') as mock_upload, \
26
+ patch('gradio.State') as mock_state, \
27
+ patch('gradio.Tab') as mock_tab, \
28
+ patch('gradio.JSON') as mock_json, \
29
+ patch('gradio.Files') as mock_files, \
30
+ patch('gradio.Gallery') as mock_gallery:
31
+ yield {
32
+ 'Blocks': mock_blocks, 'Markdown': mock_md, 'Accordion': mock_accordion,
33
+ 'Dropdown': mock_dropdown, 'Textbox': mock_textbox, 'Slider': mock_slider,
34
+ 'Checkbox': mock_checkbox, 'Button': mock_button, 'File': mock_file,
35
+ 'UploadButton': mock_upload, 'State': mock_state, 'Tab': mock_tab,
36
+ 'JSON': mock_json, 'Files': mock_files, 'Gallery': mock_gallery
37
+ }
38
+
39
+ def test_build_interface(mock_gradio):
40
+ demo = build_interface()
41
+ assert demo is not None
42
+ # Verify UI components are created
43
+ mock_gradio['Blocks'].assert_called_once_with(title="parserPDF", css=ANY) ## a fresh MagicMock() never compares equal; use mock.ANY
44
+ mock_gradio['Markdown'].assert_called() # Title markdown
45
+ mock_gradio['Accordion'].assert_any_call("⚙️ LLM Model Settings", open=False)
46
+ mock_gradio['Tab'].assert_any_call(" 📄 PDF & HTML ➜ Markdown")
47
+
48
+ def test_convert_batch_no_files():
49
+ result = convert_batch([], 0, "huggingface", "test-model", "fireworks-ai", "", "model-id",
50
+ "system", 1024, 0.0, 0.1, False, "token",
51
+ "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
52
+ "output_dir", False, None)
53
+ assert "No files uploaded" in result[0]
54
+
55
+ @patch('ui.gradio_ui.login_huggingface')
56
+ @patch('ui.gradio_ui.ProcessPoolExecutor')
57
+ @patch('ui.gradio_ui.pdf2md_converter.convert_files')
58
+ def test_convert_batch_success(mock_convert, mock_pool, mock_login):
59
+ mock_result = MagicMock()
60
+ mock_convert.return_value = {"filepath": Path("test.md"), "image_path": ["img.jpg"], "markdown": "content"}
61
+ mock_pool.return_value.__enter__.return_value.map.return_value = [mock_result]
62
+ mock_login.return_value = None
63
+
64
+ pdf_files = [MagicMock(name="test.pdf")]
65
+ result = convert_batch(pdf_files, 1, "huggingface", "test-model", "fireworks-ai", "", "model-id",
66
+ "system", 1024, 0.0, 0.1, False, "token",
67
+ "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
68
+ "output_dir", False, None)
69
+
70
+ assert len(result) == 3
71
+ assert "test.md" in result[0]
72
+ assert "img.jpg" in result[2][0]
73
+ mock_pool.assert_called_once()
74
+ mock_convert.assert_called_once_with("test.pdf")
75
+
76
+ @patch('ui.gradio_ui.ProcessPoolExecutor')
77
+ def test_convert_batch_pool_error(mock_pool):
78
+ mock_pool.side_effect = Exception("Pool error")
79
+ pdf_files = [MagicMock(name="test.pdf")]
80
+ result = convert_batch(pdf_files, 1, "huggingface", "test-model", "fireworks-ai", "", "model-id",
81
+ "system", 1024, 0.0, 0.1, False, "token",
82
+ "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
83
+ "output_dir", False, None)
84
+ assert "Error during ProcessPoolExecutor" in result[0]
85
+
86
+ def test_accumulate_files():
87
+ # Test initial accumulation
88
+ new_files = [MagicMock(name="/tmp/file1.pdf"), MagicMock(name="/tmp/file2.html")]
89
+ state = []
90
+ updated_state, message = accumulate_files(new_files, state)
91
+ assert len(updated_state) == 2
92
+ assert "/tmp/file1.pdf" in updated_state
93
+ assert "Accumulated 2 file(s)" in message
94
+
95
+ # Test adding to existing state
96
+ new_files2 = [MagicMock(name="/tmp/file3.pdf")]
97
+ updated_state2, message2 = accumulate_files(new_files2, updated_state)
98
+ assert len(updated_state2) == 3
99
+ assert "Accumulated 3 file(s)" in message2
100
+
101
+ # Test no new files
102
+ _, message3 = accumulate_files([], updated_state2)
103
+ assert "No new files uploaded" in message3
104
+
105
+ def test_clear_state():
106
+ result = clear_state()
107
+ assert len(result) == 4
108
+ assert result[0] == [] # cleared file list
109
+ assert result[1] == "Files list cleared." # message
110
+ assert result[2] == [] # cleared file btn
111
+ assert result[3] == [] # cleared dir btn
112
+
113
+ def test_pdf_files_wrap():
114
+ # Single file
115
+ single_file = "single.pdf"
116
+ wrapped = pdf_files_wrap(single_file)
117
+ assert isinstance(wrapped, list)
118
+ assert len(wrapped) == 1
119
+ assert wrapped[0] == single_file
120
+
121
+ # List of files
122
+ files_list = ["file1.pdf", "file2.html"]
123
+ wrapped_list = pdf_files_wrap(files_list)
124
+ assert wrapped_list == files_list
125
+
126
+ # None input
127
+ assert pdf_files_wrap(None) == [None]
128
+
129
+ @patch('os.chdir')
130
+ def test_main_launch(mock_chdir):
131
+ mock_chdir.return_value = None
132
+
133
+ # Exercise the __main__ path by re-running main.py as a script
134
+ demo = MagicMock()
135
+ with patch('ui.gradio_ui.build_interface', return_value=demo):
136
+ import runpy
137
+ runpy.run_module('main', run_name='__main__')
138
+
139
+ mock_chdir.assert_called_once()
140
+ demo.launch.assert_called_once_with(debug=True, show_error=True)
tests/test_utils.py ADDED
@@ -0,0 +1,94 @@
1
+ # tests/test_utils.py
2
+ # run with pytest tests/.
3
+ #
4
+ # import pytest
5
+ import logging
6
+ from unittest.mock import patch, MagicMock
7
+ from pathlib import Path
8
+
9
+ from utils.logger import get_logger, setup_logging
10
+ from utils.utils import is_dict, is_list_of_dicts
11
+ from utils.config import TITLE, DESCRIPTION # Assuming these are defined
12
+ from utils.get_config import get_config_value # If separate module
13
+
14
+ def test_setup_logging(capsys):
15
+ setup_logging()
16
+ captured = capsys.readouterr()
17
+ assert "Logging configured" in captured.out or captured.err # Assuming it prints config message
18
+
19
+ def test_get_logger():
20
+ logger = get_logger("test_logger")
21
+ assert isinstance(logger, logging.Logger)
22
+ assert logger.name == "test_logger"
23
+
24
+ @patch('logging.getLogger')
25
+ def test_get_logger_custom(mock_get_logger):
26
+ mock_logger = MagicMock()
27
+ mock_get_logger.return_value = mock_logger
28
+ logger = get_logger("custom_test")
29
+ mock_get_logger.assert_called_once_with("custom_test")
30
+ assert logger == mock_logger
31
+
32
+ def test_is_dict():
33
+ assert is_dict({"key": "value"}) is True
34
+ assert is_dict({"key": [1, 2]}) is True
35
+ assert is_dict([]) is False
36
+ assert is_dict("string") is False
37
+ assert is_dict(123) is False
38
+ assert is_dict(None) is False
39
+
40
+ def test_is_list_of_dicts():
41
+ assert is_list_of_dicts([{"a": 1}, {"b": 2}]) is True
42
+ assert is_list_of_dicts([]) is False # Empty list not considered list of dicts
43
+ assert is_list_of_dicts([{"a": 1}, "string"]) is False
44
+ assert is_list_of_dicts("not_list") is False
45
+ assert is_list_of_dicts([1, 2]) is False
46
+ assert is_list_of_dicts(None) is False
47
+
48
+ def test_config_constants():
49
+ # Test if config values are as expected (update based on actual config.py)
50
+ assert TITLE == "parserPDF" # Or whatever the actual value is
51
+ assert DESCRIPTION.startswith("PDF parser") # Partial match for description
52
+
53
+ @patch('utils.get_config.configparser.ConfigParser')
54
+ def test_get_config_value(mock_configparser):
55
+ mock_config = MagicMock()
56
+ mock_config.get.return_value = "test_value"
57
+ mock_configparser.return_value = mock_config
58
+
59
+ value = get_config_value("SECTION", "KEY")
60
+ mock_config.get.assert_called_once_with("SECTION", "KEY")
61
+ assert value == "test_value"
62
+
63
+ @patch('utils.get_config.configparser.ConfigParser')
64
+ def test_get_config_value_default(mock_configparser):
65
+ mock_config = MagicMock()
66
+ mock_config.get.side_effect = KeyError("No such key")
67
+ mock_configparser.return_value = mock_config
68
+
69
+ value = get_config_value("SECTION", "NONEXISTENT", default="fallback")
70
+ assert value == "fallback"
71
+ mock_config.get.assert_called_once_with("SECTION", "NONEXISTENT")
72
+
73
+ def test_logger_levels(caplog):
74
+ # Test logging at different levels
75
+ logger = get_logger("level_test")
76
+
77
+ with caplog.at_level(logging.DEBUG):
78
+ logger.debug("Debug message")
79
+ assert "Debug message" in caplog.text
80
+
81
+ with caplog.at_level(logging.INFO):
82
+ logger.info("Info message")
83
+ assert "Info message" in caplog.text
84
+
85
+ with caplog.at_level(logging.ERROR):
86
+ logger.error("Error message")
87
+ assert "Error message" in caplog.text
88
+
89
+ def test_setup_logging_file(tmp_path):
90
+ log_file = tmp_path / "test.log"
91
+ with patch.dict('os.environ', {'LOG_FILE': str(log_file)}):
92
+ setup_logging()
93
+ assert log_file.exists()
94
+ log_file.unlink() # Cleanup
tests/tests_converter.py ADDED
@@ -0,0 +1,19 @@
1
+ # tests/tests_converter.py
2
+ # run with pytest tests/.
3
+
4
+ from pathlib import Path
5
+ from converters.pdf_to_md import PdfToMarkdownConverter
6
+ from converters.md_to_pdf import MarkdownToPdfConverter
7
+
8
+ def test_sample_pdf():
9
+ pdf = Path("tests/sample.pdf")
10
+ converter = PdfToMarkdownConverter()
11
+ md = converter.convert(pdf)
12
+ assert isinstance(md, str) and len(md) > 0
13
+
14
+ def test_markdown_to_pdf(tmp_path):
15
+ md_file = tmp_path / "test.md"
16
+ md_file.write_text("# Hello\nThis is a test.")
17
+ conv = MarkdownToPdfConverter()
18
+ pdf_path = conv.convert(md_file)
19
+ assert pdf_path.exists() and pdf_path.suffix == ".pdf"
ui/__init__.py ADDED
File without changes
ui/gradio_ui.py ADDED
@@ -0,0 +1,850 @@
1
+ # ui/gradio_ui.py
2
+ import gradio as gr
3
+
4
+ from pathlib import Path, WindowsPath
5
+ import traceback ## Extract, format and print information about Python stack traces.
6
+ from concurrent.futures import ProcessPoolExecutor, as_completed
7
+ from typing import Optional, Union #, Dict, List, Any, Tuple
8
+
9
+ from huggingface_hub import get_token
10
+ import file_handler
11
+ import file_handler.file_utils
12
+ from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
13
+ from utils.utils import is_dict, is_list_of_dicts
14
+ from file_handler.file_utils import process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
15
+ #from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
16
+ from llm.provider_validator import is_valid_provider, suggest_providers
17
+ from llm.llm_login import login_huggingface
18
+
19
+ from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
20
+ from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
21
+ from converters.md_to_pdf import MarkdownToPdfConverter
22
+ from converters.html_to_md import HtmlToMarkdownConverter
23
+
24
+ from utils.get_config import get_config_value
25
+ from utils.logger import get_logger
26
+
27
+ logger = get_logger(__name__) ##NB: setup_logging() ## set logging
28
+
29
+ # Instantiate converters class once – they are stateless
30
+ pdf2md_converter = PdfToMarkdownConverter()
31
+ #html2md_converter = HtmlToMarkdownConverter()
32
+ md2pdf_converter = MarkdownToPdfConverter()
33
+
34
+
35
+ # pool executor to convert files called by Gradio
36
+ def convert_batch(
37
+ pdf_files, #: list[str],
38
+ pdf_files_count: int,
39
+ provider: str,
40
+ model_id: str,
41
+ #base_url: str
42
+ hf_provider: str,
43
+ endpoint: str,
44
+ backend_choice: str,
45
+ system_message: str,
46
+ max_tokens: int,
47
+ temperature: float,
48
+ top_p: float,
49
+ stream: bool,
50
+ api_token: str,
51
+ #max_workers: int,
52
+ #max_retries: int,
53
+ openai_base_url: str = "https://router.huggingface.co/v1",
54
+ openai_image_format: Optional[str] = "webp",
55
+ max_workers: Optional[int] = 4,
56
+ max_retries: Optional[int] = 2,
57
+ output_format: str = "markdown",
58
+ #output_dir: Optional[Union[str, Path]] = "output_dir",
59
+ output_dir_string: str = "output_dir_default",
60
+ use_llm: bool = False, #Optional[bool] = False, #True,
61
+ page_range: str = None, #Optional[str] = None,
62
+ ) -> str:
63
+ """
64
+ Handles the conversion process using multiprocessing.
65
+ Spins up a pool and converts all uploaded files in parallel.
66
+ Aggregates per-file logs into one string.
67
+ Receives Gradio component values, starting with the list of uploaded file paths
68
+ """
69
+
70
+ # explicitly wrap file object in a list
71
+ #pdf_files = pdf_files_wrap(pdf_files) ##Flag: deprecation
72
+
73
+ ## debug
74
+ #logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
75
+
76
+ #if not files:
77
+ if not pdf_files: ## an empty list or None means no files were uploaded
78
+ logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
79
+ return "Initialising ProcessPool: No files uploaded."
80
+
81
+ # Get config values if not provided
82
+ model_id = get_config_value("MARKER_CAP", "MODEL_ID") if not model_id else model_id
83
+ openai_base_url = get_config_value( "MARKER_CAP", "OPENAI_BASE_URL") if not openai_base_url else openai_base_url
84
+ openai_image_format = get_config_value( "MARKER_CAP", "OPENAI_IMAGE_FORMAT") if not openai_image_format else openai_image_format
85
+ max_workers = get_config_value("MARKER_CAP", "MAX_WORKERS") if not max_workers else max_workers
86
+ max_retries = get_config_value("MARKER_CAP", "MAX_RETRIES") if not max_retries else max_retries
87
+ output_format = get_config_value("MARKER_CAP", "OUTPUT_FORMAT") if not output_format else output_format
88
+ output_dir_string = str(get_config_value("MARKER_CAP", "OUTPUT_DIR") if not output_dir_string else output_dir_string)
89
+ use_llm = get_config_value("MARKER_CAP", "USE_LLM") if not use_llm else use_llm
90
+ page_range = get_config_value("MARKER_CAP", "PAGE_RANGE") if not page_range else page_range
91
+
92
+ # Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
93
+ init_args = (
94
+ provider,
95
+ model_id,
96
+ #base_url,
97
+ hf_provider,
98
+ endpoint,
99
+ backend_choice,
100
+ system_message,
101
+ max_tokens,
102
+ temperature,
103
+ top_p,
104
+ stream,
105
+ api_token,
106
+ openai_base_url,
107
+ openai_image_format,
108
+ max_workers,
109
+ max_retries,
110
+ output_format,
111
+ output_dir_string,
112
+ use_llm,
113
+ page_range,
114
+ )
115
+
116
+ #global docextractor ##SMY: deprecated.
117
+ try:
118
+ login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
119
+ except Exception as exc: # Catch all exceptions
120
+ tb = traceback.format_exc()
121
+ logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
122
+ return f"✗ An error occurred during login_huggingface → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
123
+
124
+ try:
125
+ # Create a pool with init_worker initialiser
126
+ with ProcessPoolExecutor(
127
+ max_workers=max_workers,
128
+ initializer=init_worker,
129
+ initargs=init_args
130
+ ) as pool:
131
+ #global docextractor
132
+ logger.log(level=30, msg="Initialising ProcessPool: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
133
+
134
+ # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
135
+ # The 'docconverter' argument is implicitly handled by the initialiser
136
+
137
+ #futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
138
+ #logs = [f.result() for f in as_completed(futures)]
139
+ #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
140
+ #logs = [f.result() for f in futures]
141
+
142
+ try:
143
+ pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
144
+ #result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
145
+ results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
146
+ except Exception as exc:
147
+ # Raise the exception to stop the Gradio app
148
+ #raise # Re-raise the exception to halt execution
149
+ logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
150
+ traceback.print_exc() # Print the exception traceback
151
+ return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
152
+
153
+ #'''
154
+ logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
155
+ logs = []
156
+ logs_files_images = []
157
+ #logs.extend(results) ## performant pythonic
158
+ #logs = list[results] ##
159
+             logs = list(results) ## materialise the pool.map iterator
160
+ ## logs : [file , images , filepath, image_path]
161
+
162
+ #logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
163
+ for log in logs:
164
+ #logs_files_images.append(log.get("filepath", "Error or No filepath")) # if all(isinstance(log, dict) for item in logs))
165
+ #logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
166
+
167
+                 logs_files_images.append(log.get("filepath", "Error or no filepath") if is_dict(log) else "Error or no filepath")
168
+                 logs_files_images.extend(log.get("image_path", []) if is_dict(log) else []) ## avoid iterating a fallback string character by character
169
+
170
+
171
+ #logs_files_images.append(logs_filepath) ## to del
172
+ #logs_files_images.extend(logs_images) ## to del
173
+ #'''
174
+ except Exception as exc:
175
+ tb = traceback.format_exc()
176
+ logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
177
+ #traceback.print_exc() # Print the exception traceback
178
+ return f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
179
+
180
+ '''
181
+ logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
182
+ logs = []
183
+ #logs.extend(results) ## performant pythonic
184
+ #logs = list[results] ##
185
+ logs = [result for result in results] ## pythonic list comprehension
186
+ '''
187
+
188
+ try:
189
+ logs_return = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
190
+ #logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
191
+
192
+ ##convert the List of Path objects to List of string for gr.Files output
193
+ #logs_files_images_return = list(str(path) for path in logs_files_images)
194
+
195
+ ## # Convert any Path objects to strings, but leave strings as-is
196
+ logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
197
+ return logs_return, logs_return, logs_files_images_return
198
+ #return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
199
+ except Exception as exc:
200
+ tb = traceback.format_exc()
201
+ logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
202
+ #traceback.print_exc() # Print the exception traceback
203
+ return f"✗ An error occurred during returning result logs→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
204
+
205
+
206
+ #return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
207
+ #print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
208
+
209
+ # files wrapping into list ##SMY: Flagged for deprecation
210
+ def pdf_files_wrap(files: list[str]):
211
+ # explicitly wrap file object in a list
212
+ return [files] if not isinstance(files, list) else files
213
+ #return [files]
214
+
215
+ ##====================
216
+ ## SMY: moved to logic file: See pdf_to_md.py. Currently unused
217
+ def convert_pdfs_to_md(file: gr.File | None, folder: str | None) -> dict:
218
+ """
219
+ Gradio callback for PDF → Markdown.
220
+ Accepts either a single file or a folder path (recursively).
221
+ Leverages Marker, a pipeline of deep learning models, for conversion
222
+ Returns a dictionary of filename → Markdown string.
223
+ """
224
+ if not file and not folder:
225
+ return {"error": "Please provide a PDF file or a folder."}
226
+
227
+ pdf_paths = []
228
+
229
+ # Single file
230
+ if file:
231
+ pdf_path = Path(file.name)
232
+ pdf_paths.append(pdf_path)
233
+
234
+ # Folder (recursively)
235
+ if folder:
236
+ try:
237
+ pdf_paths.extend(collect_pdf_paths(folder))
238
+ except Exception as exc:
239
+ logger.exception("Folder traversal failed.")
240
+ return {"error": str(exc)}
241
+
242
+ if not pdf_paths:
243
+ return {"error": "No PDF files found."}
244
+
245
+ results = pdf2md_converter.batch_convert(pdf_paths)
246
+ # Gradio expects a dict of {filename: content}
247
+ return results
248
+
249
+ ## SMY: to refactor and moved to logic file. Currently unused
250
+ def convert_htmls_to_md(file: gr.File | None, folder: str | None) -> dict:
251
+ """
252
+ Gradio callback for HTML → Markdown.
253
+ Accepts either a single file or a folder path (recursively).
254
+ Returns a dictionary of filename → Markdown string.
255
+ """
256
+ if not file and not folder:
257
+ return {"error": "Please provide a HTML file or a folder."}
258
+
259
+ html_paths = []
260
+
261
+ # Single file
262
+ if file:
263
+ html_path = Path(file.name)
264
+ html_paths.append(html_path)
265
+
266
+ # Folder (recursively)
267
+ if folder:
268
+ try:
269
+ html_paths.extend(collect_html_paths(folder))
270
+ except Exception as exc:
271
+ logger.exception("Folder traversal failed.")
272
+ return {"error": str(exc)}
273
+
274
+ if not html_paths:
275
+ return {"error": "No HTML files found."}
276
+
277
+ results = html2md_converter.batch_convert(html_paths)
278
+ # Gradio expects a dict of {filename: content}
279
+ return results
280
+
281
+ ## SMY: to refactor and moved to logic file
282
+ def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
283
+ """
284
+ Gradio callback for Markdown → PDF.
285
+ Returns a list of generated PDF files (as Gradio File objects).
286
+ """
287
+ if not file and not folder:
288
+ return []
289
+
290
+ md_paths = []
291
+
292
+ # Single file
293
+ if file:
294
+ md_path = Path(file.name)
295
+ md_paths.append(md_path)
296
+
297
+ # Folder
298
+ if folder:
299
+ try:
300
+ md_paths.extend(collect_markdown_paths(folder))
301
+ except Exception as exc:
302
+ logger.exception("Folder traversal failed.")
303
+ return []
304
+
305
+ if not md_paths:
306
+ return []
307
+
308
+ output_dir = Path("./generated_pdfs")
309
+ output_dir.mkdir(exist_ok=True)
310
+
311
+ pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
312
+ # Convert to Gradio File objects
313
+ gr_files = [gr.File(path=str(p)) for p in pdf_files]
314
+ return gr_files
315
+ ##====================
316
+
317
+ def build_interface() -> gr.Blocks:
318
+ """
319
+ Assemble the Gradio Blocks UI.
320
+ """
321
+
322
+ # Use custom CSS to style the file component
323
+ custom_css = """
324
+ .file-or-directory-area {
325
+ border: 2px dashed #ccc;
326
+ padding: 20px;
327
+ text-align: center;
328
+ border-radius: 8px;
329
+ margin-bottom: 10px;
330
+ display: flex;
331
+ flex-direction: column;
332
+ align-items: center;
333
+ }
334
+ .file-or-directory-area:hover {
335
+ border-color: #007bff;
336
+ background-color: #f8f9fa;
337
+ }
338
+ .gradio-upload-btn {
339
+ margin-top: 10px;
340
+ }
341
+ """
342
+
343
+ def is_file_with_extension(path_obj: Path) -> bool:
344
+ """
345
+ Checks if a pathlib.Path object is a file and has a non-empty extension.
346
+ """
347
+         path_obj = Path(path_obj) if isinstance(path_obj, str) else path_obj
348
+         return isinstance(path_obj, Path) and path_obj.is_file() and bool(path_obj.suffix)
349
+
350
+ def accumulate_files(uploaded_files, current_state):
351
+ """
352
+ Accumulates newly uploaded files with the existing state.
353
+ """
354
+ # Initialize state if it's the first run
355
+ if current_state is None:
356
+ current_state = []
357
+
358
+ # If no files were uploaded in this interaction, return the current state unchanged
359
+ if not uploaded_files:
360
+ return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
361
+
362
+ # Get the temporary paths of the newly uploaded files
363
+ # call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
364
+ new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
365
+
366
+ # Concatenate the new files with the existing ones in the state
367
+ updated_files = current_state + new_file_paths
368
+ updated_filenames = [Path(f).name for f in updated_files]
369
+
370
+ # Return the updated state and a message to the user
371
+ #file_info = "\n".join(updated_files)
372
+ filename_info = "\n".join(updated_filenames)
373
+ #message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
374
+ message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
375
+
376
+ return updated_files, message
377
+
378
+ def clear_state():
379
+ """
380
+         Clears the accumulated state of uploaded file list, output textbox, files and directory upload.
381
+ """
382
+ return [], "Files list cleared.", [], []
383
+
384
+ # with gr.Blocks(title=TITLE) as demo
385
+ with gr.Blocks(title=TITLE, css=custom_css) as demo:
386
+ gr.Markdown(f"## {DESCRIPTION}")
387
+
388
+ # Clean UI: Model parameters hidden in collapsible accordion
389
+ with gr.Accordion("⚙️ LLM Model Settings", open=False):
390
+ gr.Markdown(f"#### **Backend Configuration**")
391
+ system_message = gr.Textbox(
392
+ label="System Message",
393
+ lines=2
394
+ )
395
+ with gr.Row():
396
+ provider_dd = gr.Dropdown(
397
+ choices=["huggingface", "openai"],
398
+ label="Provider",
399
+ value="huggingface",
400
+ #allow_custom_value=True
401
+ )
402
+ backend_choice = gr.Dropdown(
403
+ choices=["model-id", "provider", "endpoint"],
404
+ label="HF Backend Choice"
405
+ ) ## SMY: ensure HFClient maps correctly
406
+ model_tb = gr.Textbox(
407
+ label="Model ID",
408
+ value="meta-llama/Llama-4-Maverick-17B-128E-Instruct", #image-Text-to-Text #"openai/gpt-oss-120b", ##Text-to-Text
409
+ )
410
+ endpoint_tb = gr.Textbox(
411
+ label="Endpoint",
412
+ placeholder="Optional custom endpoint"
413
+ )
414
+ with gr.Row():
415
+ max_token_sl = gr.Slider(
416
+ label="Max Tokens",
417
+ minimum=1,
418
+                     maximum=131072, #2**17 #65536, #32768, #16384, #8192,
419
+ value=1024, #512,
420
+ step=1
421
+ )
422
+ temperature_sl = gr.Slider(
423
+ label="Temperature",
424
+ minimum=0.0,
425
+ maximum=1.0,
426
+ value=0.0,
427
+ step=0.1 #0.01
428
+ )
429
+ top_p_sl = gr.Slider(
430
+ label="Top-p",
431
+ minimum=0.0,
432
+ maximum=1.0,
433
+ value=0.1,
434
+ step=0.1 #0.01
435
+ )
436
+ stream_cb = gr.Checkbox(
437
+ label="LLM Streaming",
438
+ value=False
439
+ )
440
+ with gr.Row():
441
+ api_token_tb = gr.Textbox(
442
+ label="API Token [OPTIONAL]",
443
+ type="password",
444
+ placeholder="hf_xxx or openai key"
445
+ )
446
+ hf_provider_dd = gr.Dropdown(
447
+ choices=["fireworks-ai", "together-ai", "openrouter-ai", "hf-inference"],
448
+ value="fireworks-ai",
449
+ label="Provider",
450
+ allow_custom_value=True, # let users type new providers as they appear
451
+ )
452
+
453
+ # Validate provider on change; warn but allow continue
454
+ def on_provider_change(provider_value: str):
455
+ if not provider_value:
456
+ return
457
+ if not is_valid_provider(provider_value):
458
+ sug = suggest_providers(provider_value)
459
+ extra = f" Suggestions: {', '.join(sug)}." if sug else ""
460
+ gr.Warning(
461
+ f"Provider not on HF provider list. See https://huggingface.co/docs/inference-providers/index.{extra}"
462
+ )
463
+ hf_provider_dd.change(on_provider_change, inputs=hf_provider_dd, outputs=None)
464
+
465
+ # Clean UI: Model parameters hidden in collapsible accordion
466
+ with gr.Accordion("⚙️ Marker Settings", open=False):
467
+ gr.Markdown(f"#### **Marker Configuration**")
468
+ with gr.Row():
469
+ openai_base_url_tb = gr.Textbox(
470
+ label="OpenAI Base URL: Default HuggingFace",
471
+ value="https://router.huggingface.co/v1",
472
+ lines=1,
473
+ max_lines=1,
474
+ )
475
+ openai_image_format_dd = gr.Dropdown(
476
+ choices=["webp", "png", "jpeg"],
477
+ label="OpenAI Image Format",
478
+ value="webp",
479
+ )
480
+ output_format_dd = gr.Dropdown(
481
+ choices=["markdown", "html"], #, "json", "chunks"], ##SMY: To be enabled later
482
+ #choices=["markdown", "html", "json", "chunks"],
483
+ label="Output Format",
484
+ value="markdown",
485
+ )
486
+ output_dir_tb = gr.Textbox(
487
+ label="Output Directory",
488
+ value="output_dir", #"output_md",
489
+ lines=1,
490
+ max_lines=1,
491
+ )
492
+ with gr.Row():
493
+ max_workers_sl = gr.Slider(
494
+                     label="Max Workers",
495
+ minimum=1,
496
+ maximum=7,
497
+ value=4,
498
+ step=1
499
+ )
500
+ max_retries_sl = gr.Slider(
501
+                     label="Max Retries",
502
+ minimum=1,
503
+ maximum=3,
504
+ value=2,
505
+ step=1 #0.01
506
+ )
507
+ use_llm_cb = gr.Checkbox(
508
+ label="Use LLM for Marker conversion",
509
+ value=False
510
+ )
511
+ page_range_tb = gr.Textbox(
512
+ label="Page Range (Optional)",
513
+ placeholder="Example: 0,1-5,8,12-15",
514
+ lines=1,
515
+ max_lines=1,
516
+ )
517
+
518
+ # Initialise gr.State
519
+ state_max_workers = gr.State(4) #max_workers_sl,
520
+ state_max_retries = gr.State(2) #max_retries_sl,
521
+
522
+ def update_state_stored_value(new_component_input):
523
+ """ Updates stored state: use for max_workers and max_retries """
524
+ return new_component_input
525
+
526
+ # Update gr.State values on slider components change. NB: initial value of `gr.State` must be able to be deepcopied
527
+ max_workers_sl.change(update_state_stored_value, inputs=max_workers_sl, outputs=state_max_workers)
528
+ max_retries_sl.change(update_state_stored_value, inputs=max_retries_sl, outputs=state_max_retries)
529
+
530
+
531
+ with gr.Accordion("🤗 HuggingFace Logout", open=False):
532
+ # Logout controls
533
+ def do_logout():
534
+ #ok = docextractor.client.logout()
535
+ ok = docconverter.client.logout()
536
+ # Reset token textbox on successful logout
537
+ msg = "✅ Logged out of Hugging Face and cleared tokens." if ok else "⚠️ Logout failed."
538
+ return gr.update(value=""), gr.update(visible=True, value=msg)
539
+
540
+ logout_status = gr.Markdown(visible=False)
541
+ logout_btn = gr.Button("Logout from Hugging Face", variant="stop")
542
+
543
+ logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status])
544
+
545
+
546
+ # The gr.State component to hold the accumulated list of files
547
+ uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
548
+
549
+ # --- PDF & HTML → Markdown tab ---
550
+ with gr.Tab(" 📄 PDF & HTML ➜ Markdown"):
551
+ gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
552
+
553
+ ### flag4deprecation #earlier implementation
554
+ '''
555
+ pdf_files = gr.File(
556
+             label="Upload PDF, HTML or PDF and HTML files",
557
+ file_count="directory", ## handle directory and files upload #"multiple",
558
+ type="filepath",
559
+ file_types=["pdf", ".pdf"],
560
+ #size="small",
561
+ )
562
+ pdf_files_count = gr.TextArea(label="Files Count", interactive=False, lines=1)
563
+ with gr.Row():
564
+ btn_pdf_count = gr.Button("Count Files")
565
+ #btn_pdf_upload = gr.UploadButton("Upload files")
566
+ btn_pdf_convert = gr.Button("Convert PDF(s)")
567
+ '''
568
+
569
+ with gr.Column(elem_classes=["file-or-directory-area"]):
570
+ with gr.Row():
571
+ file_btn = gr.UploadButton(
572
+ #file_btn = gr.File(
573
+ label="Upload Multiple Files",
574
+ file_count="multiple",
575
+ file_types=["file"],
576
+ #height=25, #"sm",
577
+ size="sm",
578
+ elem_classes=["gradio-upload-btn"]
579
+ )
580
+ dir_btn = gr.UploadButton(
581
+ #dir_btn = gr.File(
582
+ label="Upload a Directory",
583
+ file_count="directory",
584
+ #file_types=["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
585
+ #height=25, #"0.5",
586
+ size="sm",
587
+ elem_classes=["gradio-upload-btn"]
588
+ )
589
+ with gr.Accordion("Display uploaded", open=True):
590
+ # Displays the accumulated file paths
591
+ output_textbox = gr.Textbox(label="Accumulated Files", lines=3) #, max_lines=4) #10
592
+
593
+ with gr.Row():
594
+ process_button = gr.Button("Process All Uploaded Files", variant="primary")
595
+ clear_button = gr.Button("Clear All Uploads", variant="secondary")
596
+
597
+ # Event handler for the multiple file upload button
598
+ file_btn.upload(
599
+ fn=accumulate_files,
600
+ inputs=[file_btn, uploaded_file_list],
601
+ outputs=[uploaded_file_list, output_textbox]
602
+ )
603
+
604
+ # Event handler for the directory upload button
605
+ dir_btn.upload(
606
+ fn=accumulate_files,
607
+ inputs=[dir_btn, uploaded_file_list],
608
+ outputs=[uploaded_file_list, output_textbox]
609
+ )
610
+
611
+ # Event handler for the "Clear" button
612
+ clear_button.click(
613
+ fn=clear_state,
614
+ inputs=None,
615
+ outputs=[uploaded_file_list, output_textbox, file_btn, dir_btn],
616
+ )
617
+
618
+ # --- PDF → Markdown tab ---
619
+ with gr.Tab(" 📄 PDF ➜ Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
620
+ gr.Markdown(f"#### {DESCRIPTION_PDF}")
621
+
622
+ files_upload_pdf = gr.File(
623
+ label="Upload PDF files",
624
+ file_count="directory", ## handle directory and files upload #"multiple",
625
+ type="filepath",
626
+ file_types=["pdf", ".pdf"],
627
+ #size="small",
628
+ )
629
+ files_count = gr.TextArea(label="Files Count", interactive=False, lines=1) #pdf_files_count
630
+ with gr.Row():
631
+ btn_pdf_count = gr.Button("Count Files")
632
+ #btn_pdf_upload = gr.UploadButton("Upload files")
633
+ btn_pdf_convert = gr.Button("Convert PDF(s)")
634
+
635
+ # --- 📃 HTML → Markdown tab ---
636
+ with gr.Tab("🕸️ HTML ➜ Markdown: (Flag for DEPRECATION)", interactive=False, visible=False):
637
+ gr.Markdown(f"#### {DESCRIPTION_HTML}")
638
+
639
+ files_upload_html = gr.File(
640
+ label="Upload HTML files",
641
+ file_count="multiple",
642
+ type="filepath",
643
+ file_types=["html", ".html", "htm", ".htm"]
644
+ )
645
+ #btn_html_convert = gr.Button("Convert HTML(s)")
646
+ html_files_count = gr.TextArea(label="Files Count", interactive=False, lines=1)
647
+ with gr.Row():
648
+ btn_html_count = gr.Button("Count Files")
649
+ #btn_pdf_upload = gr.UploadButton("Upload files")
650
+                     btn_html_convert = gr.Button("Convert HTML(s)")
651
+
652
+
653
+ # --- Markdown → PDF tab ---
654
+ with gr.Tab("PENDING: Markdown ➜ PDF", interactive=False):
655
+ gr.Markdown(f"#### {DESCRIPTION_MD}")
656
+
657
+ md_files = gr.File(
658
+ label="Upload Markdown files",
659
+ file_count="multiple",
660
+ type="filepath",
661
+ file_types=["md", ".md"]
662
+ )
663
+             btn_md_convert = gr.Button("Convert Markdown to PDF")
664
+ output_pdf = gr.Gallery(label="Generated PDFs", elem_id="pdf_gallery")
665
+
666
+ '''
667
+ md_input = gr.File(label="Upload a single Markdown file", file_count="single")
668
+ md_folder_input = gr.Textbox(
669
+ label="Or provide a folder path (recursively)",
670
+ placeholder="/path/to/folder",
671
+ )
672
+ convert_md_btn = gr.Button("Convert Markdown to PDF")
673
+ output_pdf = gr.Gallery(label="Generated PDFs", elem_id="pdf_gallery")
674
+
675
+ convert_md_btn.click(
676
+ fn=convert_md_to_pdf,
677
+ inputs=[md_input, md_folder_input],
678
+ outputs=output_pdf,
679
+ )
680
+ '''
681
+
682
+ # A Files component to display individual processed files as download links
683
+ with gr.Accordion("⏬ View and Download processed files", open=False):
684
+ with gr.Row():
685
+ files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250)
686
+ files_individual_downloads = gr.Files(label="Individual Processed Files")
687
+
688
+ ## Displays processed file paths
689
+ with gr.Accordion("View processing log", open=False):
690
+ log_output = gr.Textbox(
691
+ label="Conversion Logs",
692
+ lines=5,
693
+ #max_lines=25,
694
+ interactive=False
695
+ )
696
+
697
+ # file inputs
698
+         ## [weird] NB: inputs_arg is a list of Gradio component objects, not the values of those components.
699
+ ## inputs_arg variable captures the state of these components at the time the list is created.
700
+ ## When btn_convert.click() is called later, it uses the list as it was initially defined
701
+ ##
702
+ ## SMY: Gradio component values are not directly mutable.
703
+ ## Instead, you should pass the component values to a function,
704
+ ## and then use the return value of the function to update the component.
705
+ ## Discarding for now. #//TODO: investigate further.
706
+ ## SMY: Solved: using gr.State
707
+ inputs_arg = [
708
+ #pdf_files,
709
+ ##pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
710
+ uploaded_file_list,
711
+ files_count, #pdf_files_count,
712
+ provider_dd,
713
+ model_tb,
714
+ hf_provider_dd,
715
+ endpoint_tb,
716
+ backend_choice,
717
+ system_message,
718
+ max_token_sl,
719
+ temperature_sl,
720
+ top_p_sl,
721
+ stream_cb,
722
+ api_token_tb,
723
+ #gr.State(4), # max_workers
724
+ #gr.State(3), # max_retries
725
+ openai_base_url_tb,
726
+ openai_image_format_dd,
727
+ state_max_workers, #gr.State(4), #max_workers_sl,
728
+ state_max_retries, #gr.State(2), #max_retries_sl,
729
+ output_format_dd,
730
+ output_dir_tb,
731
+ use_llm_cb,
732
+ page_range_tb,
733
+ ]
734
+
735
+ ## debug
736
+ #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
737
+
738
+ try:
739
+ #logger.log(level=30, msg="input_arg[0]: {input_arg[0]}")
740
+ process_button.click(
741
+ #pdf_files.upload(
742
+ fn=convert_batch,
743
+ inputs=inputs_arg,
744
+ outputs=[log_output, files_individual_JSON, files_individual_downloads],
745
+ )
746
+ except Exception as exc:
747
+ tb = traceback.format_exc()
748
+ logger.exception(f"✗ Error during process_button.click → {exc}\n{tb}", exc_info=True)
749
+ return f"✗ An error occurred during process_button.click → {exc}\n{tb}"
750
+
751
+ ##gr.File .upload() event, fire only after a file has been uploaded
752
+ # Event handler for the pdf file upload button
753
+ files_upload_pdf.upload(
754
+ fn=accumulate_files,
755
+ inputs=[files_upload_pdf, uploaded_file_list],
756
+ outputs=[uploaded_file_list, log_output]
757
+ )
758
+ #inputs_arg[0] = files_upload
759
+ btn_pdf_convert.click(
760
+ #pdf_files.upload(
761
+ fn=convert_batch,
762
+ outputs=[log_output, files_individual_downloads],
763
+ inputs=inputs_arg,
764
+ )
765
+ '''
766
+ inputs = [
767
+ pdf_files,
768
+ #pdf_files_wrap(pdf_files), # wrap pdf_files in a list (if not already)
769
+ pdf_files_count,
770
+ provider_dd,
771
+ model_tb,
772
+ hf_provider_dd,
773
+ endpoint_tb,
774
+ backend_choice,
775
+ system_message,
776
+ max_token_sl,
777
+ temperature_sl,
778
+ top_p_sl,
779
+ stream_cb,
780
+ api_token_tb,
781
+ #gr.State(4), # max_workers
782
+ #gr.State(3), # max_retries
783
+ openai_base_url_tb,
784
+ openai_image_format_dd,
785
+ state_max_workers, #gr.State(max_workers_sl), #max_workers_sl,
786
+ state_max_retries, #gr.State(max_retries_sl), #max_retries_sl,
787
+ output_format_dd,
788
+ output_dir_tb,
789
+ use_llm_cb,
790
+ page_range_tb,
791
+ ],
792
+ '''
793
+ # )
794
+
795
+ # reuse the same business logic for HTML tab
796
+ # Event handler for the pdf file upload button
797
+ files_upload_html.upload(
798
+ fn=accumulate_files,
799
+ inputs=[files_upload_html, uploaded_file_list],
800
+ outputs=[uploaded_file_list, log_output]
801
+ )
802
+ #inputs_arg[0] = html_files
803
+ btn_html_convert.click(
804
+ fn=convert_batch,
805
+ inputs=inputs_arg,
806
+ outputs=[log_output, files_individual_downloads]
807
+ )
808
+
809
+ def get_file_count(file_list):
810
+ """
811
+ Counts the number of files in the list.
812
+
813
+ Args:
814
+ file_list (list): A list of temporary file objects.
815
+ Returns:
816
+ str: A message with the number of uploaded files.
817
+ """
818
+ if file_list:
819
+ return f"{len(file_list)}", f"Upload: {len(file_list)} files: \n {file_list}" #{[pdf_files.value]}"
820
+ else:
821
+ return "No files uploaded.", "No files uploaded." # Count files button
822
+
823
+ btn_pdf_count.click(
824
+ fn=get_file_count,
825
+ inputs=[files_upload_pdf],
826
+ outputs=[files_count, log_output]
827
+ )
828
+ btn_html_count.click(
829
+ fn=get_file_count,
830
+ inputs=[files_upload_html],
831
+ outputs=[html_files_count, log_output]
832
+ )
833
+
834
+ # Validate files upload on change; warn but allow continue
835
+ def on_pdf_files_change(pdf_files_value: list[str]):
836
+ # explicitly wrap file object in a list
837
+ pdf_files_value = pdf_files_wrap(pdf_files_value)
838
+ #if not isinstance(pdf_files_value, list):
839
+ # pdf_files_value = [pdf_files_value]
840
+
841
+ pdf_files_path = [file.name for file in pdf_files_value]
842
+ pdf_files_len = len(pdf_files_value) #len(pdf_files_path)
843
+ if pdf_files_value:
844
+ #return
845
+ return pdf_files_path, pdf_files_len
846
+ #pdf_files.change(on_pdf_files_change, inputs=pdf_files, outputs=[log_output, pdf_files_count]) #, postprocess=False) ##debug
847
+
848
+
849
+ return demo
850
+
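
For reference, a minimal self-contained sketch of the pool-initializer pattern convert_batch relies on: ProcessPoolExecutor's initializer runs once per worker process and stashes shared state, so pool.map only has to pickle the per-file argument. The names below are illustrative stand-ins, not the app's own init_worker/pdf2md_converter.convert_files.

from concurrent.futures import ProcessPoolExecutor

_worker_cfg = None  # per-process global, populated once by the initializer

def init_worker(cfg: dict) -> None:
    # Runs once in each worker process before any mapped call.
    global _worker_cfg
    _worker_cfg = cfg

def convert_one(path: str) -> dict:
    # Stand-in for the real converter; reads the per-process config.
    return {"filepath": path, "image_path": [], "model": _worker_cfg["model_id"]}

if __name__ == "__main__":
    files = ["a.pdf", "b.pdf"]
    with ProcessPoolExecutor(max_workers=2, initializer=init_worker,
                             initargs=({"model_id": "demo-model"},)) as pool:
        results = list(pool.map(convert_one, files))
    print(results)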
utils/__init__.py ADDED
File without changes
utils/config.ini ADDED
@@ -0,0 +1,158 @@
 
1
+ [marker]
2
+ provider=openai
3
+ #model_id=openai/gpt-oss-120b
4
+ ## Marker will return "LLM did not return a valid response" if model is not 'Image-Text-to-Text'
5
+ ## because of OpenAI inference failed: Errorcode: 400 ... "Unsupported ChatMessageContent type: image_url"
6
+ ## Note that Marker works pretty well using its own transformer-based model without LLM
7
+ model_id=meta-llama/Llama-4-Maverick-17B-128E-Instruct
8
+ hf_provider=fireworks-ai
9
+ endpoint_url=""
10
+ backend_choice=provider
11
+ system_message=""
12
+ max_tokens=8192
13
+ temperature=0.2
14
+ top_p=0.2
15
+ stream=True
16
+ api_token=a1b2c3
17
+ openai_model=openai/gpt-oss-120b
18
+ openai_api_key=a1b2c3
19
+ openai_base_url=https://router.huggingface.co/v1
20
+ openai_image_format=webp
21
+ #max_retries=3
22
+
23
+ #[Configuration]
24
+ use_llm=True
25
+ output_format=markdown
26
+ input_dir=inputs
27
+ output_dir=output_md
28
+ max_workers=4
29
+ max_retries=2
30
+ extract_images=True
31
+ output_image_format=png
32
+ output_encoding=utf-8
33
+ debug_data_folder=debug_data
34
+
35
+ [unsure]
36
+ image_output_dir="images"
37
+ image_output_format="png"
38
+ base_dir=Path(__file__).resolve().parent.parent
39
+ ###
40
+ # Create a Path object from the current file's location, resolve it to an absolute path,
41
+ # and then get its parent's parent using chained .parent calls or the parents[] attribute.
42
+ #grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
43
+ ###
44
+
45
+ [libraries]
46
+ libobject_path = C:\\Dat\\dev\\gtk3-runtime\\bin
47
+
48
+
49
+ # from config.ini ##SMY: future plan to merge
50
+ [MARKER_CAP]
51
+ #[marker]
52
+ PROVIDER = openai
53
+ #MODEL_ID = openai/gpt-oss-120b
54
+ ## Marker will return "LLM did not return a valid response" if model is not 'Image-Text-to-Text'
55
+ ## because of OpenAI inference failed: Errorcode: 400 ... "Unsupported ChatMessageContent type: image_url"
56
+ ## Note that Marker works pretty well using its own transformer-based model without LLM
57
+ MODEL_ID=meta-llama/Llama-4-Maverick-17B-128E-Instruct
58
+ HF_PROVIDER = fireworks-ai
59
+ ENDPOINT_URL = ""
60
+ BACKEND_CHOICE = provider
61
+ SYSTEM_MESSAGE = ""
62
+ MAX_TOKENS = 8192
63
+ TEMPERATURE = 0.2
64
+ TOP_P = 0.2
65
+ STREAM = True
66
+ API_TOKEN = a1b2c3
67
+ OPENAI_MODEL = openai/gpt-oss-120b
68
+ OPENAI_API_KEY = a1b2c3
69
+ OPENAI_BASE_URL = https://router.huggingface.co/v1
70
+ OPENAI_IMAGE_FORMAT = webp
71
+
72
+ #[CONFIGURATION]
73
+ MAX_WORKERS = 4
74
+ MAX_RETRIES = 2
75
+ OUTPUT_FORMAT = markdown
76
+ INPUT_DIR = inputs
77
+ OUTPUT_DIR = output_dir
78
+ USE_LLM = False
79
+ EXTRACT_IMAGES = True
80
+ OUTPUT_IMAGE_FORMAT = png
81
+ OUTPUT_ENCODING = utf-8
82
+ DEBUG_DATA_FOLDER = debug_data
83
+
84
+ [UNSURE_CAP]
85
+ IMAGE_OUTPUT_DIR = images
86
+ IMAGE_OUTPUT_FORMAT = png
87
+ BASE_DIR = Path(__file__).resolve().parent.parent
88
+ ###
89
+ # Create a Path object from the current file's location, resolve it to an absolute path
90
+ # Get its parent's parent using chained .parent calls or the parents[] attribute.
91
+ #grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
92
+ ###
93
+
94
+ [LIBRARIES_CAP]
95
+ LIBOBJECT_PATH = C:\\Dat\\dev\\gtk3-runtime\\bin
96
+ WEASYPRINT_DLL_DIRECTORIES = C:\\Dat\\dev\\gtk3-runtime\\bin
97
+
98
+ [GLOBAL_CAP]
99
+ # Globals within each worker process
100
+ HF_MODEL ="openai/gpt-oss-120b"
101
+ HF_TOKEN = ""
102
+ HF_CLIENT = None
103
+ ARTIFACT_DICT = None
104
+ PDF_CONVERTER = None
105
+ HTML_CONVERTER = None
106
+
107
+ [marker_dict]
108
+ ## "meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
109
+ provider:"openai" #provider,
110
+ model_id:"openai/gpt-oss-120b" #model_id, #"meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
111
+ hf_provider:"fireworks-ai" #hf_provider,
112
+ endpoint_url:"" #endpoint_url,
113
+ backend_choice:"provider" #backend_choice,
114
+ system_message:"" #system_message,
115
+ max_tokens:8192 #max_tokens,
116
+ temperature:0.2 #temperature,
117
+ top_p:0.2 #top_p,
118
+ stream:"stream"
119
+ api_token:"a1b2c3" #get_token,
120
+ output_format:"markdown" #output_format, #"markdown",
121
+ openai_model:"openai/gpt-oss-120b" #self.client.model_id, #"model_name"
122
+ openai_api_key:"a1b2c3" #self.client.openai_api_key, #self.api_token,
123
+ openai_base_url:"https://router.huggingface.co/v1" #self.client.base_url, #self.base_url,
124
+ #temperature=self.client.temperature,
125
+ #top_p=self.client.top_p,
126
+ openai_image_format:"webp" #"png" #better compatibility
127
+ max_retries:3 ## pass to __call__
128
+
129
+
130
+ [marker_nostrip]
131
+ provider="openai"
132
+ model_id="openai/gpt-oss-120b"
133
+ hf_provider="fireworks-ai"
134
+ endpoint_url=""
135
+ backend_choice="provider"
136
+ system_message=""
137
+ max_tokens=8192
138
+ temperature=0.2
139
+ top_p=0.2
140
+ stream=True
141
+ api_token="a1b2c3"
142
+ openai_model="openai/gpt-oss-120b"
143
+ openai_api_key="a1b2c3"
144
+ openai_base_url="https://router.huggingface.co/v1"
145
+ openai_image_format="webp"
146
+ #max_retries=3
147
+
148
+ #[Configuration]
149
+ use_llm=True
150
+ output_format="markdown"
151
+ input_dir="inputs"
152
+ output_dir="output_md"
153
+ max_workers=4
154
+ max_retries=2
155
+ extract_images=True
156
+ output_image_format="png"
157
+ output_encoding=utf-8
158
+ debug_data_folder="debug_data"
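
Worth noting when reading the sections above: ConfigParser returns every value as a string, so booleans and numbers need explicit parsing. A minimal sketch against this file, assuming it sits at utils/config.ini:

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("utils/config.ini")

use_llm_raw = cfg["MARKER_CAP"].get("USE_LLM")         # the string "False", which is truthy
use_llm = cfg["MARKER_CAP"].getboolean("USE_LLM")      # the bool False
max_workers = cfg["MARKER_CAP"].getint("MAX_WORKERS")  # the int 4
print(repr(use_llm_raw), use_llm, max_workers)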
utils/config.py ADDED
@@ -0,0 +1,83 @@
 
1
+ # utils/config.py
2
+
3
+ import os
4
+
5
+ """
6
+ Centralised configuration constants.
7
+ ##SMY: TODO: Create Class Settings(BaseSettings) leveraging from pydantic_settings import BaseSettings
8
+ """
9
+
10
+ # UI text
11
+ TITLE = "PyPDFMd – PDF & HTML ↔ Markdown Converter"
12
+ DESCRIPTION = (
13
+     "Drag‑and‑drop a single PDF/HTML file or a folder to convert to Markdown. "
14
+ "Or upload Markdown/LaTeX files and generate a polished PDF."
15
+ )
16
+ DESCRIPTION_PDF_HTML = (
17
+     "Upload one or more PDF or HTML files, a folder, or an entire directory tree "
18
+ "to convert to Markdown."
19
+ )
20
+ DESCRIPTION_PDF = (
21
+ "Drag‑and‑drop a single PDF, a folder of PDFs or an entire directory tree "
22
+ "to convert to Markdown."
23
+ )
24
+ DESCRIPTION_HTML = (
25
+ "Drag‑and‑drop a single HTML, a folder of HTMLs or an entire directory tree "
26
+ "to convert to Markdown."
27
+ )
28
+ DESCRIPTION_MD = (
29
+ "Upload Markdown/LaTeX files and generate a polished PDF."
30
+ )
31
+
32
+ # Conversion defaults
33
+ DEFAULT_MARKER_OPTIONS = {
34
+ "include_images": True,
35
+ "image_format": "png",
36
+ }
37
+
38
+ # Configuration
39
+ MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
40
+ MAX_RETRIES = int(os.getenv("MAX_RETRIES", "2")) #3
41
+ INPUT_DIR = os.getenv("INPUT_DIR", "inputs") # unused
42
+ OUTPUT_DIR = os.getenv("OUTPUT_DIR", "md_output")
43
+ USE_LLM = os.getenv("USE_LLM", "False").lower() in ("1", "true", "yes") ## bool() on any non-empty string is True, so parse explicitly
44
+ EXTRACT_IMAGES = os.getenv("EXTRACT_IMAGES", "True").lower() in ("1", "true", "yes")
45
+ OUTPUT_IMAGE_FORMAT = os.getenv("OUTPUT_IMAGE_FORMAT", "png") #png
46
+ OUTPUT_ENCODING = os.getenv("OUTPUT_ENCODING", "utf-8") #utf-8
47
+ DEBUG_DATA_FOLDER = os.getenv("DEBUG_DATA_FOLDER", "debug_data") #debug_data
48
+
49
+ # Global
50
+ HF_MODEL = os.getenv("HF_MODEL", "gpt2") # swap for a chat-capable model
51
+ HF_TOKEN = os.getenv("HF_TOKEN") # your Hugging Face token
52
+
53
+
54
+
55
+ ## //TODO:
56
+ # from config.ini ##SMY: future plan to merge
57
+ api_token="a1b2c3"
58
+ OUTPUT_FORMAT = "markdown" #output_format
59
+ OPENAI_MODEL = "openai/gpt-oss-120b" #openai_model
60
+ OPENAI_API_KEY = "a1b2c3" #openai_api_key
61
+ OPENAI_BASE_URL = "https://router.huggingface.co/v1" ##openai_base_url
62
+ OPENAI_IMAGE_FORMAT = "webp" #openai_image_format
63
+ OUTPUT_IMAGE_FORMAT = "png"
64
+ #max_retries=3
65
+
66
+ #[marker]
67
+ PROVIDER = "openai" #provider
68
+ MODEL_ID = "openai/gpt-oss-120b" #model_id
69
+ HF_PROVIDER = "fireworks-ai" #hf_provider
70
+ ENDPOINT_URL = "" #endpoint_url
71
+ BACKEND_CHOICE = "provider" #backend_choice
72
+ SYSTEM_MESSAGE = "" #system_message
73
+ MAX_TOKENS = 8192 #max_tokens
74
+ TEMPERATURE = 0.2 #temperature
75
+ TOP_P = 0.2 #top_p
76
+ STREAM = True #stream
77
+
78
+ # Globals within each worker process
79
+ hf_client = None
80
+ artifact_dict = None
81
+ pdf_converter = None
82
+ html_converter = None
83
+
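
A quick illustration of why the env-var flags above need string parsing rather than a bare bool(): bool() of any non-empty string is True, including "False".

import os

os.environ["USE_LLM"] = "False"
print(bool(os.environ["USE_LLM"]))                            # True -- misleading
print(os.environ["USE_LLM"].lower() in ("1", "true", "yes"))  # False -- intended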
utils/get_arg_name.py ADDED
@@ -0,0 +1,19 @@
 
1
+ import inspect
2
+
3
+ def get_arg_name_as_string(arg):
4
+ """
5
+ Returns the name of the argument passed to the function as a string.
6
+ This works by inspecting the calling frame's local variables.
7
+
8
+ example usage:
9
+         def my_function(arg_x):
10
+             arg_name = get_arg_name_as_string(arg_x)
11
+             print(f"The argument name is: {arg_name}") # Outputs: "The argument name is: arg_x"
12
+ """
13
+ frame = inspect.currentframe().f_back # Get the frame of the caller
14
+ arg_name = None
15
+ for name, value in frame.f_locals.items():
16
+ if value is arg:
17
+ arg_name = name
18
+ break
19
+ return arg_name
utils/get_config.py ADDED
@@ -0,0 +1,18 @@
 
1
+ from configparser import ConfigParser as config
2
+ from typing import Union
3
+ from pathlib import Path
4
+ #from utils.get_arg_name import get_arg_name_as_string
5
+ import traceback
6
+
7
+ def get_config_value(section:str, parameter:str, fallback:str=None, configfile: Union[str, Path]="utils/config.ini"): ## forward slashes work on Windows too
8
+ """ Load config file, locate section, read parameter and return value """
9
+
10
+ try:
11
+ cfg = config()
12
+ cfg.read(configfile)
13
+ param_value = cfg[section].get(option=parameter, fallback=fallback) #"C:\\Dat\\dev\\gtk3-runtime\\bin")
14
+ return param_value
15
+ except Exception as exc:
16
+ tb = traceback.format_exc()
17
+ raise RuntimeWarning(f"Error loading config: {exc}\n{tb}")
18
+ #pass
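
A minimal usage sketch (values come back as strings, so numeric options need an explicit cast; the section and option names are the ones defined in utils/config.ini above):

from utils.get_config import get_config_value

model_id = get_config_value("MARKER_CAP", "MODEL_ID")
max_workers = int(get_config_value("MARKER_CAP", "MAX_WORKERS", fallback="4"))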
utils/lib_loader.py ADDED
@@ -0,0 +1,130 @@
 
1
+ import os
2
+ from pathlib import Path
3
+ import sys
4
+ import ctypes
5
+ from typing import Union
6
+ from configparser import ConfigParser as config
7
+ #from venv import logger ## accidental auto-import; the module logger is configured below via utils.logger
8
+ from utils.get_arg_name import get_arg_name_as_string
9
+ from utils.get_config import get_config_value
10
+ import traceback
11
+
12
+ from utils.logger import get_logger
13
+
14
+ logger = get_logger(__name__)
15
+
16
+ def set_weasyprint_library(libpath: Union[str, Path] = None, config_file: Union[str, Path] = "utils\\config.ini"):
17
+ """ Loads Weasyprint backend dependency libraries to environment """
18
+ # Check if the current platform is Windows
19
+ if sys.platform == 'win32':
20
+
21
+         #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
22
+         # Resolve the DLL directory: an explicit libpath argument wins,
23
+         # otherwise fall back to the config file (with a hardcoded default
24
+         # for the dev machine's portable GTK3 runtime).
25
+         lib_path = libpath if libpath else get_config_value(
26
+             "LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES",
27
+             fallback="C:\\Dat\\dev\\gtk3-runtime\\bin")
28
+
29
+ # Check if the file exists before attempting to load it
30
+ #if not os.path.exists(libobject):
31
+ if not Path(lib_path).exists():
32
+ raise FileNotFoundError(f"The specified Weasyprint DLL Directory does not exist: {lib_path}. Follow Weasyprint installation guide or provide a valid GTK3-runtime path.")
33
+ #logger.exception(f"gobject library path: {libgobject_path}") ##debug
34
+
35
+ try:
36
+ # Set a new environment variable
37
+             ##SMY: on dev machine, using extracted 'portable' GTK3 rather than installing 'MSYS2'
38
+ os.environ["WEASYPRINT_DLL_DIRECTORIES"] = lib_path
39
+ #logger.info(f"sets Weasyprint DLL library path: {lib_path}") #debug
40
+
41
+ except Exception as exc:
42
+ tb = traceback.format_exc()
43
+ logger.exception(f"Error setting environ: weasyprint backend dependency → {exc}\n{tb}", exc_info=True) # Log the full traceback
44
+
45
+ raise RuntimeWarning(f"✗ error during setting environ: weasyprint backend dependency → {exc}\n{tb}")
46
+
47
+
48
+ def load_library(libobject_name: Union[str, Path]):
49
+ """
50
+ Loads Weasyprint backend dependency libraries
51
+ usage: list(map(load_library, library_list)) ##SMY: map the load_library function to each item in library_list
52
+ The library list was starting to grow excessively, opt to setting environ
53
+ """
54
+ # Check if the current platform is Windows
55
+ if sys.platform == 'win32':
56
+
57
+ #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
58
+ cfg = config()
59
+ cfg.read("utils\\config.ini")
60
+         lib_path = cfg["libraries"].get("libobject_path", "C:\\Dat\\dev\\gtk3-runtime\\bin")
61
+ lib_object_dll = get_arg_name_as_string(libobject_name) ## future use
62
+
63
+ # Construct the path to libgobject-2.0.dll
64
+ #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', 'C:\\glib'), 'bin', 'libgobject-2.0-0.dll')
65
+ libobject = f"{lib_path}\\{libobject_name}.dll" ##libgobject-2.0-0.dll"
66
+ #print(f"Loading gobject library: {libgobject}") #debug
67
+
68
+ # Check if the file exists before attempting to load it
69
+ #if not os.path.exists(libobject):
70
+ if not Path(libobject).exists():
71
+ raise FileNotFoundError(f"The specified library file does not exist: {libobject}")
72
+ #print(f"gobject library path: {libgobject_path}") ##debug
73
+
74
+ # Load the library using ctypes
75
+ try:
76
+ ctypes_libgobject = ctypes.CDLL(libobject)
77
+ #msg = f"libgobject-2.0-0.dll loaded successfully via ctypes. {str(ctypes_libgobject)}"
78
+ #print(msg) ##debug
79
+ except OSError as exc:
80
+ tb = traceback.format_exc()
81
+ raise RuntimeWarning(f"Failed to load library: {exc}\n{tb}") ##raise RuntimeError
82
+
83
+ ## Test
84
+ #load_library("libpango-1.0-0")
85
+ #load_library("libgobject-2.0-0")
86
+
87
+
88
+ ##SMY: Original implementation: TODO: for refactoring
89
+ def load_libgobject():
90
+ # Check if the current platform is Windows
91
+ if sys.platform == 'win32':
92
+
93
+ #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
94
+ cfg = config()
95
+ cfg.read("utils\\config.ini")
96
+ libgobject_path = cfg["libraries"].get("libgobject_path", "C:\\Dat\\dev\\gtk3-runtime\\bin")
97
+
98
+ # Construct the path to libgobject-2.0.dll
99
+ #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', 'C:\\glib'), 'bin', 'libgobject-2.0-0.dll')
100
+ libgobject = f"{libgobject_path}\\libgobject-2.0-0.dll"
101
+ #print(f"Loading gobject library: {libgobject}") #debug
102
+
103
+ # Check if the file exists before attempting to load it
104
+ if not os.path.exists(libgobject):
105
+ raise FileNotFoundError(f"The specified library file does not exist: {libgobject}")
106
+ #print(f"gobject library path: {libgobject_path}") ##debug
107
+
108
+ # Load the library using ctypes
109
+ try:
110
+ ctypes_libgobject = ctypes.CDLL(libgobject)
111
+ #msg = f"libgobject-2.0-0.dll loaded successfully via ctypes. {str(ctypes_libgobject)}"
112
+ #print(msg) ##debug
113
+
114
+ return ctypes_libgobject
115
+ except OSError as exc:
116
+ tb = traceback.format_exc()
117
+ raise RuntimeWarning(f"Failed to load library: {exc}\n{tb}") ##raise RuntimeError
118
+
119
+
120
+ # Load the library using ctypes (Linux/macOS)
121
+ # Construct the path to libgobject-2.0.so.0 in the custom GLib installation
122
+ #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', '/opt/glib'), 'lib', 'libgobject-2.0.so.0')
123
+ #print("This script is intended to run on Unix-like systems, not Windows.")
124
+ else:
125
+ # Load the library using ctypes (Linux/macOS)
126
+ # Construct the path to libgobject-2.0.so.0 in the custom GLib installation
127
+ libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', '/opt/glib'), 'lib', 'libgobject-2.0.so.0')
128
+ #print("This script is intended to run on Unix-like systems, not Windows.")
129
+
130
+ return libgobject_path
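
Import order matters here: WEASYPRINT_DLL_DIRECTORIES is only consulted when weasyprint is first imported on Windows, so the environment must be set beforehand. A sketch, assuming a GTK3 runtime at the configured path:

from utils.lib_loader import set_weasyprint_library

set_weasyprint_library()   # sets WEASYPRINT_DLL_DIRECTORIES; must run before the first weasyprint import
import weasyprint          # now resolves the GTK DLLs from that directory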
utils/logger.py ADDED
@@ -0,0 +1,81 @@
 
1
+ # utils/logger.py
2
+
3
+ import json
4
+ import logging
5
+ import os
+ import sys
6
+ from datetime import datetime, timezone
7
+
8
+ '''
9
+ def get_logger(name: str) -> logging.Logger:
10
+ """
11
+ Returns a logger configured with a console handler.
12
+ """
13
+ logger = logging.getLogger(name)
14
+ if not logger.handlers:
15
+ logger.setLevel(logging.INFO)
16
+ ch = logging.StreamHandler()
17
+ formatter = logging.Formatter(
18
+ "[%(asctime)s] %(levelname)s - %(name)s: %(message)s",
19
+ datefmt="%H:%M:%S",
20
+ )
21
+ ch.setFormatter(formatter)
22
+ logger.addHandler(ch)
23
+ return logger
24
+ '''
25
+
26
+ class JsonFormatter(logging.Formatter):
27
+ """Minimal JSON formatter for structured logs."""
28
+
29
+     def format(self, record: logging.LogRecord) -> str:
30
+ payload = {
31
+ #"ts": datetime.now(timezone.utc).isoformat(), ## default to 'YYYY-MM-DD HH:MM:SS.mmmmmm',
32
+ "ts": datetime.now(timezone.utc).strftime("%Y-%m-%d : %H:%M:%S"), ## SMY: interested in datefmt="%H:%M:%S",
33
+ "level": record.levelname,
34
+ "logger": record.name,
35
+ "message": record.getMessage(),
36
+ }
37
+ # Include extra attributes (fields not in default LogRecord)
38
+ for key, value in record.__dict__.items():
39
+ if key in ("args", "msg", "levelno", "levelname", "name", "pathname", "filename",
40
+ "module", "exc_info", "exc_text", "stack_info", "lineno", "funcName",
41
+ "created", "msecs", "relativeCreated", "thread", "threadName",
42
+ "processName", "process"):
43
+ continue
44
+ payload[key] = value
45
+ return json.dumps(payload, ensure_ascii=False)
46
+
47
+ #def setup_logging(level: int = logging.INFO) -> None: ## Causing non-stop logging on HF spaces
48
+ def setup_logging(level: int = None) -> None:
49
+ """Configure root logger with JSON output to both stdout and file.
50
+
51
+ Args:
52
+ level: Logging level. If None, uses WARNING for production (HF Spaces)
53
+ and INFO for local development.
54
+ """
55
+ if level is None:
56
+ # Auto-detect environment: WARNING for production, INFO for local dev
57
+ import os
58
+ is_production = os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID") or os.getenv("HUGGINGFACE_SPACE_ID")
59
+ level = logging.WARNING if is_production else logging.INFO
60
+
61
+ # Console handler
62
+ console_handler = logging.StreamHandler(stream=sys.stdout)
63
+ console_handler.setFormatter(JsonFormatter()) #, datefmt="%H:%M:%S",) ##explicit time format
64
+
65
+     # File handler (create the logs directory first; FileHandler does not create missing directories)
66
+     os.makedirs("logs", exist_ok=True)
67
+     file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
68
+ file_handler.setFormatter(JsonFormatter())
69
+
70
+ root = logging.getLogger()
71
+ root.handlers.clear()
72
+ root.addHandler(console_handler)
73
+ root.addHandler(file_handler)
74
+ root.setLevel(level)
75
+
76
+
77
+ def get_logger(name: str) -> logging.Logger:
78
+ """Return a module logger configured with console handler using defined JSON format."""
79
+ return logging.getLogger(name)
80
+
81
+
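
A minimal usage sketch: extra fields passed via logging's extra= end up as top-level keys in the JSON payload produced by JsonFormatter above.

from utils.logger import setup_logging, get_logger

setup_logging()                  # JSON logs to stdout and logs/app_logging.log
logger = get_logger(__name__)
logger.log(level=30, msg="conversion start", extra={"files_len": 3})
# -> {"ts": "...", "level": "WARNING", "logger": "__main__", "message": "conversion start", "files_len": 3}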
utils/utils.py ADDED
@@ -0,0 +1,15 @@
 
1
+ def is_dict(variable):
2
+     """Checks if a variable is a dict."""
3
+     # isinstance already returns a bool, so no if/else is needed
4
+     return isinstance(variable, dict)
7
+
8
+ def is_list_of_dicts(variable):
9
+ """Checks if a variable is a list containing only dicts."""
10
+
11
+ if isinstance(variable, list):
12
+ # Return True only if the list is empty or all elements are dicts.
13
+ return all(isinstance(item, dict) for item in variable)
14
+
15
+ return False
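
A short sketch of how these guards behave on the mixed per-file logs that convert_batch aggregates (one dict per successful file, a plain string on error):

from utils.utils import is_dict, is_list_of_dicts

logs = [{"filepath": "a.md", "image_path": ["img1.png"]}, "error: bad file"]
print(is_list_of_dicts(logs))           # False -- one entry is a plain string
safe = [log for log in logs if is_dict(log)]
print(safe)                             # [{'filepath': 'a.md', 'image_path': ['img1.png']}]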