Commit · 0fd441a
Parent(s): initial commit
Baseline initial commit
- .cursorindexingignore +3 -0
- .gitattributes +1 -0
- .gitignore +227 -0
- .python-version +1 -0
- .specstory/.gitignore +2 -0
- README.md +178 -0
- __init__.py +0 -0
- converters/__init__.py +1 -0
- converters/extraction_converter.py +264 -0
- converters/pdf_to_md.py +332 -0
- data/output_dir/.gitignore +4 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/.gitignore +4 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.md +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_0_Picture_1.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_11_Figure_9.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_18_Figure_1.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_1.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_9.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_6_Figure_1.jpeg +0 -0
- data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_8_Figure_1.jpeg +0 -0
- data/pdf/.gitignore +3 -0
- data/pdf/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.pdf +3 -0
- file_handler/__init__.py +1 -0
- file_handler/file_utils.py +296 -0
- llm/__init__.py +0 -0
- llm/hf_client.py +244 -0
- llm/llm_login.py +70 -0
- llm/openai_client.py +91 -0
- llm/provider_validator.py +116 -0
- main.py +22 -0
- pyproject.toml +9 -0
- requirements.txt +5 -0
- tests/test_converters.py +98 -0
- tests/test_file_handler.py +115 -0
- tests/test_llm.py +115 -0
- tests/test_main_ui.py +148 -0
- tests/test_utils.py +94 -0
- tests/tests_converter.py +19 -0
- ui/__init__.py +0 -0
- ui/gradio_ui.py +850 -0
- utils/__init__.py +0 -0
- utils/config.ini +158 -0
- utils/config.py +83 -0
- utils/get_arg_name.py +19 -0
- utils/get_config.py +18 -0
- utils/lib_loader.py +130 -0
- utils/logger.py +81 -0
- utils/utils.py +15 -0
.cursorindexingignore
ADDED
@@ -0,0 +1,3 @@

# Don't index SpecStory auto-save files, but allow explicit context inclusion via @ references
.specstory/**
.gitattributes
ADDED
@@ -0,0 +1 @@
*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,227 @@
# data
# data/
# !pdf/
# !output_dir/

md_to_pdf*
html_to_md*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Certificates
*.pem

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# Redis
*.rdb
*.aof
*.pid

# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/

# ActiveMQ
activemq-data/

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/

# Streamlit
.streamlit/secrets.toml
.python-version
ADDED
@@ -0,0 +1 @@
3.12
.specstory/.gitignore
ADDED
@@ -0,0 +1,2 @@
# SpecStory explanation file
/.what-is-this.md
README.md
ADDED
@@ -0,0 +1,178 @@
---
#[project]
#name: "parserpdf"
name: "parser2md"
#title: "parserPDF"
title: "parser2md"
emoji: 📝
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 5.0.1
app_file: main.py
pinned: false
license: mit
short_description: 'PDF & HTML parser to markdown'
version: "0.1.0"
readme: "README.md"
requires-python: ">=3.12"
dependencies: []
owner: "research-semmyk"
---
# parserPDF

[](https://www.gradio.app/)
[](https://www.python.org/)
[](LICENSE)

A Gradio-based web application for converting PDF and HTML documents to Markdown format. Powered by the Marker library (a pipeline of deep learning models for document parsing) and optional LLM integration for enhanced processing. Supports batch processing of files and directories via an intuitive UI.

## Features
- **PDF to Markdown**: Extract text, tables, and images from PDFs using Marker.
- **HTML to Markdown**: Convert HTML files to clean Markdown.
- **Batch Processing**: Upload multiple files or entire directories.
- **LLM Integration**: Optional use of Hugging Face or OpenAI models for advanced conversion (e.g., via Llama or GPT models).
- **Customizable Settings**: Adjust model parameters, output formats (Markdown/HTML), page ranges, and more via the UI.
- **Output Management**: Generated Markdown files are saved to a configurable output directory, with logs and download links.

## Project Structure
```
parserpdf/
├── README.md                       # Project documentation
├── requirements.txt                # Python dependencies
├── main.py                         # Entry point – launches the Gradio UI
├── pyproject.toml                  # Project configuration
├── .env                            # Environment variables (e.g., API tokens)
├── .gitignore                      # Git ignore rules
├── converters/                     # Conversion logic
│   ├── __init__.py
│   ├── extraction_converter.py     # Document extraction utilities
│   ├── pdf_to_md.py                # Marker-based PDF → Markdown
│   ├── html_to_md.py               # HTML → Markdown
│   └── md_to_pdf.py                # Markdown → PDF (pending full implementation)
├── file_handler/                   # File handling utilities
│   ├── __init__.py
│   └── file_utils.py               # Helpers for files, directories, and paths
├── llm/                            # LLM client integrations
│   ├── __init__.py
│   ├── hf_client.py                # Hugging Face client wrapper
│   ├── openai_client.py            # Marker OpenAI client
│   ├── llm_login.py                # Authentication handlers
│   └── provider_validator.py       # Provider validation
├── ui/                             # Gradio UI components
│   ├── __init__.py
│   └── gradio_ui.py                # UI layout and event handlers
├── utils/                          # Utility modules
│   ├── __init__.py
│   ├── config.py                   # Configuration constants
│   ├── config.ini                  # Config file for settings
│   ├── logger.py                   # Logging wrapper
│   ├── lib_loader.py               # Loads WeasyPrint lib dependencies into the environment
│   ├── get_config.py               # Helper for getting configurations
│   ├── get_arg_name.py             # Helper for getting argument names
│   └── utils.py                    # General utilities and helpers
├── data/                           # Sample data and outputs (gitignored)
│   ├── output_dir/                 # Output directory
│   └── pdf/                        # Sample PDFs
├── logs/                           # Log files (gitignored)
├── tests/                          # Unit tests
├── tests_converter.py              # Tests for converters
└── scrapyard/                      # Development scraps


[Projected]
├── transformers/
│   ├── __init__.py
│   ├── marker.py                   # Marker class
│   └── marker_utils.py             # Helpers for Marker class

```

## Installation
1. Clone the repository:
```
git clone <repo-url>
cd parserpdf
```

2. Create a virtual environment and install dependencies:
```
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```

3. Set up environment variables (optional for LLM features):
   - Create a `.env` file with your API tokens, e.g.:
```
HF_TOKEN=hf_xxx
OPENAI_API_KEY=sk-xxx
```

4. Install Marker (if not in requirements.txt):
```
pip install marker-pdf
```

## Usage
1. Run the application:
```
python main.py
```

2. Open the provided local URL (e.g., http://127.0.0.1:7860) in your browser.

3. In the UI:
   - Upload PDF/HTML files or directories via the "PDF & HTML ➜ Markdown" tab.
   - Configure LLM/Marker settings in the accordions (e.g., select provider, model, tokens).
   - Click "Process All Uploaded Files" to convert.
   - View logs, JSON output, and download generated Markdown files.

### Example Workflow
- Upload a PDF directory.
- Set model to `meta-llama/Llama-4-Maverick-17B-128E-Instruct` (Hugging Face).
- Enable LLM if needed, set a page range (e.g., "1-10").
- Process: outputs Markdown files with extracted text/images to `output_dir`.

## Configuration
- Edit `utils/config.py` or `utils/config.ini` for defaults (e.g., model ID, output dir).
- UI overrides: adjust sliders for max tokens, temperature, workers, etc.

## LLM Providers
- **Hugging Face**: Supports inference providers like Fireworks AI, Together AI.
- **OpenAI**: Compatible via router (default: https://router.huggingface.co/v1).
- Login via UI or CLI: `huggingface-cli login`.

## Output
- Markdown files saved to `output_dir` (default: `./output_dir`).
- Images extracted as JPEGs alongside Markdown.
- Logs in `logs/` and the UI textbox.

## Limitations & TODO
- Markdown → PDF is pending full implementation.
- HTML tab is deprecated; use the main tab for mixed uploads.
- Large files/directories may require an increased `max_workers`.
- No JSON/chunks output yet (flagged for future).

## Contributing
Fork the repo, create a branch, and submit a PR.

Ensure tests pass to verify the application's functionality:
```
pytest tests/
```
### Test Structure
- tests/test_converters.py: Tests PDF/HTML/Markdown converters, including init, conversion, batch processing, and error handling.
- tests/test_file_handler.py: Tests file collection utilities (PDF/HTML/MD paths), data processing, and output directory creation.
- tests/test_utils.py: Tests logging setup, config loading, utility functions like is_dict/is_list_of_dicts, and configuration access.
- tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
- tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.

## License
MIT License. See [LICENSE](LICENSE) for details.

## Acknowledgments
- Built with [Gradio](https://gradio.app/) for the UI.
- PDF parsing via [Marker](https://github.com/VikParuchuri/marker).
- LLM integrations using Hugging Face Transformers and OpenAI APIs.
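
For reviewers who want to sanity-check the conversion path outside the Gradio UI, here is a minimal sketch of driving Marker directly, mirroring how this commit's `converters/` modules call it (`rendered.markdown` and `rendered.images` are the rendering attributes the repo itself reads). The sample path is a placeholder, and the assumption that image values are PIL images with a `.save()` method reflects recent marker-pdf versions; this is not the project's own helper code (that lives in `file_handler/file_utils.py`).

```python
# Minimal sketch, assuming marker-pdf is installed; the file path is a placeholder.
from pathlib import Path

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

# Without LLM options, Marker runs its plain deep-learning pipeline.
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter("data/pdf/sample.pdf")  # placeholder path

out = Path("data/output_dir/sample")
out.mkdir(parents=True, exist_ok=True)
(out / "sample.md").write_text(rendered.markdown, encoding="utf-8")

# Extracted figures are keyed by filename; assumed to be PIL images here.
for name, image in rendered.images.items():
    image.save(out / name)
```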
__init__.py
ADDED
File without changes
converters/__init__.py
ADDED
@@ -0,0 +1 @@

converters/extraction_converter.py
ADDED
@@ -0,0 +1,264 @@
import os
from pathlib import Path
import traceback
#import time
from typing import Dict, Any, Type, Optional, Union
from pydantic import BaseModel

from marker.models import create_model_dict
#from marker.converters.extraction import ExtractionConverter as MarkerExtractor  ## structured pydantic extraction
from marker.converters.pdf import PdfConverter as MarkerConverter  ## full document conversion/extraction
from marker.config.parser import ConfigParser  ## process custom configuration
from marker.services.openai import OpenAIService as MarkerOpenAIService

#from llm.hf_client import HFChatClient
from llm.openai_client import OpenAIChatClient
from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir
from utils.lib_loader import load_library

from utils.logger import get_logger

logger = get_logger(__name__)

# Full document converter
class DocumentConverter:
    """
    Business-logic wrapper using Marker's OpenAI LLM service to
    convert documents (PDF, HTML files) into Markdown + assets.
    """

    def __init__(self,
                 model_id: str,
                 hf_provider: str,
                 temperature: float,
                 top_p: float,
                 api_token: str,
                 openai_base_url: str = "https://router.huggingface.co/v1",
                 openai_image_format: Optional[str] = "webp",
                 max_retries: Optional[int] = 2,
                 output_format: str = "markdown",
                 output_dir: Optional[Union[str, Path]] = "output_dir",
                 use_llm: Optional[bool] = None,
                 page_range: Optional[str] = None,
                 ):

        self.model_id = model_id
        self.openai_api_key = api_token  ## replaces the dependency on self.client.openai_api_key
        self.openai_base_url = openai_base_url
        self.temperature = temperature
        self.top_p = top_p
        self.llm_service = MarkerOpenAIService
        self.openai_image_format = openai_image_format  ## "png" offers better compatibility
        self.max_retries = max_retries  ## passed through to __call__
        self.output_dir = output_dir
        # The UI may hand these through as 1-tuples; unwrap defensively.
        # (A trailing comma here previously turned use_llm into a tuple — fixed.)
        self.use_llm = use_llm[0] if isinstance(use_llm, tuple) else use_llm
        self.page_range = page_range[0] if isinstance(page_range, tuple) else page_range  ## e.g. "0,4-8,16"; Marker parses it as List[int]

        # 0) Instantiate the LLM client (OpenAIChatClient): a provider-agnostic chat function.
        ##SMY: future — integrate into Marker directly. Marker ships its own LLM services (clients); as at 1.9.2 there is no Hugging Face client service.
        try:
            self.client = OpenAIChatClient(
                model_id=model_id,
                hf_provider=hf_provider,
                api_token=api_token,
                temperature=temperature,
                top_p=top_p,
            )
            logger.log(level=20, msg="✔️ OpenAIChatClient instantiated:", extra={"model_id": self.client.model_id, "chatclient": str(self.client)})

        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error initialising OpenAIChatClient: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error initialising OpenAIChatClient: {exc}\n{tb}")

        # 1) Define the custom configuration for the Hugging Face LLM.
        #    Use typing.Dict and typing.Any for flexible dictionary type hints.
        try:
            self.config_dict: Dict[str, Any] = self.get_config_dict(model_id=model_id, llm_service=str(self.llm_service), output_format=output_format)

            ##SMY: if "page_range" is falsy (empty tuple or None), drop the key; otherwise keep it as-is.
            if not self.config_dict.get("page_range"):
                self.config_dict.pop("page_range", None)

            logger.log(level=20, msg="✔️ config_dict custom configured:", extra={"service": "openai"})

        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}")

        # 2) Use Marker's ConfigParser to process the configuration.
        try:
            config_parser: ConfigParser = ConfigParser(self.config_dict)
            logger.log(level=20, msg="✔️ parsed/processed custom config_dict:", extra={"config": str(config_parser)})

        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error parsing/processing custom config_dict: {exc}\n{tb}")

        # 3) Create the artifact dictionary and retrieve the LLM service.
        try:
            self.artifact_dict: Dict[str, Type[BaseModel]] = create_model_dict()

        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error creating artifact_dict or retrieving LLM service: {exc}\n{tb}")

        # 4) Instantiate Marker's PdfConverter with the config managed by config_parser.
        try:
            ## e.g. "<class 'marker.services.openai.OpenAIService'>" → "marker.services.openai.OpenAIService"
            llm_service_str = str(self.llm_service).split("'")[1]

            # Marker asserts on OPENAI_API_KEY, so set it in the environment.
            os.environ["OPENAI_API_KEY"] = self.openai_api_key or api_token
            logger.log(level=20, msg="self.converter: instantiating MarkerConverter:", extra={"llm_service_str": llm_service_str})

            self.converter = MarkerConverter(
                artifact_dict=create_model_dict(),
                config=config_parser.generate_config_dict(),
                llm_service=llm_service_str,  ## MarkerConverter expects a dotted-path string, not the marker.services BaseService object
            )

            logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm": self.converter.use_llm})
            ##SMY: no return here — __init__() must return None, not 'PdfConverter'.
        except Exception as exc:
            tb = traceback.format_exc()  ## was missing the call parentheses — fixed
            logger.exception(f"✗ Error initialising MarkerConverter: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error initialising MarkerConverter: {exc}\n{tb}")

    # Define the custom configuration for the HF LLM.
    def get_config_dict(self, model_id: str, llm_service=MarkerOpenAIService, output_format: Optional[str] = "markdown") -> Dict[str, Any]:
        """Define the custom configuration for the Hugging Face LLM."""

        try:
            ## Enable higher-quality processing with LLMs. See MarkerOpenAIService.
            llm_service = str(llm_service).split("'")[1]  ## strip the "<class '...'>" wrapper via split and slicing

            config_dict = {
                "output_format":       output_format,
                "openai_model":        self.model_id,
                "openai_api_key":      self.client.openai_api_key,
                "openai_base_url":     self.openai_base_url,
                "temperature":         self.temperature,
                "top_p":               self.top_p,
                "openai_image_format": self.openai_image_format,
                "max_retries":         self.max_retries,
                "output_dir":          self.output_dir,
                "use_llm":             self.use_llm,
                "page_range":          self.page_range,
            }
            return config_dict
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error configuring custom config_dict: {exc}\n{tb}")
            raise RuntimeError(f"✗ Error configuring custom config_dict: {exc}\n{tb}")

    ##SMY: flagged for deprecation.
    ##SMY: Marker prefers the default artifact dictionary (marker.models.create_model_dict) instead of overriding it.
    def get_create_model_dict(self):
        """
        Wraps the LLM chat_fn into Marker's artifact_dict
        and returns an ExtractionConverter for PDFs & HTML.
        """
        return create_model_dict()
        #artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
        #return artifact_dict

## SMY: kept for future implementation (and historic reasoning). Kept as a separate class to avoid confusion with the original implementation.
'''
class DocumentExtractor:
    """
    Business-logic wrapper using HFChatClient and Marker to
    convert documents (PDF, HTML files) into Markdown + assets.
    Wrapper around the Marker extraction converter for PDFs & HTML.
    """

    def __init__(self,
                 provider: str,
                 model_id: str,
                 hf_provider: str,
                 endpoint_url: str,
                 backend_choice: str,
                 system_message: str,
                 max_tokens: int,
                 temperature: float,
                 top_p: float,
                 stream: bool,
                 api_token: str,
                 ):
        # 1) Instantiate the LLM client (HFChatClient): a provider-agnostic chat function
        try:
            self.client = HFChatClient(
                provider=provider,
                model_id=model_id,
                hf_provider=hf_provider,
                endpoint_url=endpoint_url,
                backend_choice=backend_choice,  # choices=["model-id", "provider", "endpoint"]
                system_message=system_message,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stream=stream,
                api_token=api_token,
            )
            logger.log(level=20, msg="✔️ HFChatClient instantiated:", extra={"model_id": model_id, "chatclient": str(self.client)})

        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ Error initialising HFChatClient: {exc}")
            raise RuntimeError(f"✗ Error initialising HFChatClient: {exc}").with_traceback(tb)

        # 2) Build Marker's artifact dict using the client's chat method
        self.artifact_dict = self.get_extraction_converter(self.client)

        # 3) Instantiate Marker's ExtractionConverter
        try:
            self.extractor = MarkerExtractor(artifact_dict=self.artifact_dict)
        except Exception as exc:
            logger.exception(f"✗ Error initialising MarkerExtractor: {exc}")
            raise RuntimeError(f"✗ Error initialising MarkerExtractor: {exc}")

    ##SMY: Marker prefers the default artifact dictionary (marker.models.create_model_dict) instead of overriding it
    def get_extraction_converter(self, chat_fn):
        """
        Wraps the LLM chat_fn into Marker's artifact_dict
        and returns an ExtractionConverter for PDFs & HTML.
        """
        artifact_dict = create_model_dict(inhouse_chat_model=chat_fn)
        return artifact_dict
'''
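
To make the constructor's step 0–4 flow concrete, a short usage sketch under stated assumptions: the model ID is the one from the README's example workflow, while `hf_provider`, the token, and the input path are placeholders, not project defaults.

```python
# Hypothetical usage sketch of DocumentConverter; values are placeholders.
from converters.extraction_converter import DocumentConverter

doc = DocumentConverter(
    model_id="meta-llama/Llama-4-Maverick-17B-128E-Instruct",  # README example model
    hf_provider="fireworks-ai",  # placeholder; any supported HF inference provider
    temperature=0.2,
    top_p=0.9,
    api_token="hf_xxx",          # placeholder token
    output_format="markdown",
    output_dir="output_dir",
    use_llm=True,
    page_range="1-10",
)

# Steps 0-4 have already run inside __init__: doc.converter is a configured PdfConverter.
rendered = doc.converter("data/pdf/sample.pdf")  # placeholder path
print(len(rendered.markdown), "markdown characters")
```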
converters/pdf_to_md.py
ADDED
@@ -0,0 +1,332 @@
# converters/pdf_to_md.py
import os
from pathlib import Path
from typing import List, Dict, Optional, Union
import traceback  ## extract, format and print information about Python stack traces
import time

#from llm.hf_client import HFChatClient
from converters.extraction_converter import DocumentConverter  #, DocumentExtractor  ## SMY: should disuse
from file_handler.file_utils import collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir, write_markdown, dump_images

from utils import config
from utils.lib_loader import set_weasyprint_library
from utils.logger import get_logger

logger = get_logger(__name__)

# Module-level globals, (re)initialised in each worker process.
docconverter: DocumentConverter = None
converter = None  # DocumentConverter.converter

# init_worker builds the converter once per pool worker; the instance is then
# reused by every task that worker handles.
# Note: DocumentConverter must be picklable.
def init_worker(
        provider: str,
        model_id: str,
        hf_provider: str,
        endpoint_url: str,
        backend_choice: str,
        system_message: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        stream: bool,
        api_token: str,
        openai_base_url: str,      # e.g. "https://router.huggingface.co/v1"
        openai_image_format: str,  # e.g. "webp"
        max_workers: int,
        max_retries: int,          # e.g. 2
        output_format: str,        # e.g. "markdown"
        output_dir: str,           # e.g. "output_dir"
        use_llm: bool,
        page_range: str,
):
    """
    Instantiate a DocumentConverter for use in each pool worker.
    """

    # Initialise the global `converter` in each worker.
    global docconverter
    global converter

    ##SMY: kept for future implementation. Replaced with DocumentConverter.
    '''
    # 1) Instantiate the DocumentExtractor
    logger.log(level=20, msg="initialising docextractor:", extra={"model_id": model_id, "hf_provider": hf_provider})
    try:
        docextractor = DocumentExtractor(
            provider=provider,
            model_id=model_id,
            hf_provider=hf_provider,
            endpoint_url=endpoint_url,
            backend_choice=backend_choice,
            system_message=system_message,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=stream,
            api_token=api_token,
        )
        logger.log(level=20, msg="✔️ docextractor initialised:", extra={"model_id": model_id, "hf_provider": hf_provider})
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"init_worker: Error initialising DocumentExtractor → {exc}\n{tb}", exc_info=True)
        return f"✗ init_worker: error initialising DocumentExtractor → {exc}\n{tb}"
    '''

    # 1) Instantiate the DocumentConverter
    logger.log(level=20, msg="initialising docconverter:", extra={"model_id": model_id, "hf_provider": hf_provider})
    try:
        docconverter = DocumentConverter(
            model_id,
            hf_provider,
            temperature,
            top_p,
            api_token,
            openai_base_url,
            openai_image_format,
            max_retries,
            output_format,
            output_dir,
            use_llm,
            page_range,
        )
        logger.log(level=20, msg="✔️ docconverter initialised:", extra={"docconverter model_id": docconverter.converter.config.get("openai_model"), "docconverter use_llm": docconverter.converter.use_llm, "docconverter output_dir": docconverter.output_dir})
    except Exception as exc:
        tb = traceback.format_exc()
        logger.exception(f"init_worker: Error initialising DocumentConverter → {exc}\n{tb}", exc_info=True)
        return f"✗ init_worker: error initialising DocumentConverter → {exc}\n{tb}"

    converter = docconverter.converter

class PdfToMarkdownConverter:
    """
    Wrapper around the Marker library that converts PDFs to Markdown.
    """

    def __init__(self, options: Dict | None = None):
        self.options = options or {}
        self.output_dir_string = ''

    # The global `converter` is set (re-initialised) in each worker after init_worker runs.

    ## moved from extraction_converter (to a standalone extract-to-md)
    def extract(self, src_path: str, output_dir: str) -> Dict:
        """
        Convert one file (PDF/HTML) to Markdown + images.
        Writes a `.md` file and any extracted images under `output_dir`.
        Returns a dict with metadata, e.g. {"file": <file.name>, "images": <count>, "filepath": <filepath>}.
        """

        try:
            ## SMY: TODO: convert HTML files to PDF. Marker defaults to WeasyPrint, which typically raises a 'libgobject-2' error on Windows.
            set_weasyprint_library()  ## utils.lib_loader.set_weasyprint_library()
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"Error loading weasyprint backend dependency → {exc}\n{tb}", exc_info=True)  # log the full traceback
            raise RuntimeWarning(f"✗ error during loading weasyprint backend dependency → {exc}\n{tb}")

        # Run the Marker conversion (with the LLM if use_llm is true).
        try:
            rendered = converter(src_path)
            logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True)  # log the full traceback
            return f"✗ error during extraction → {exc}\n{tb}"

        # Write the Markdown file.
        try:
            md_file = write_markdown(src_path=src_path, output_dir=output_dir, rendered=rendered)
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ error creating md_file → {exc}\n{tb}", exc_info=True)

        # Dump extracted images.
        try:
            images_count, image_path = dump_images(src_path, output_dir, rendered)
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ error counting and creating image_path → {exc}\n{tb}", exc_info=True)

        ##SMY: conceptually Dict[str, int, str]; dicts are not necessarily ordered.
        return {"file": md_file.name, "images": images_count, "filepath": md_file, "image_path": image_path}

    def convert_files(self, src_path: str, output_dir_string: str = None, max_retries: int = 2) -> Union[Dict, str]:
        """
        Worker task: use the converter to convert one file with retry/backoff.
        Returns the metadata dict, or a short log line on failure.
        """

        try:
            output_dir = create_outputdir(root=src_path, output_dir_string=self.output_dir_string)
            logger.info(f"✓ output_dir created: {output_dir}")
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ error creating output_dir → {exc}\n{tb}", exc_info=True)  ## was a plain string, not an f-string — fixed
            return f"✗ error creating output_dir → {exc}\n{tb}"

        try:
            if not Path(src_path).name.endswith((".pdf", ".html")):  # ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls"
                logger.log(level=20, msg=f"skipped {Path(src_path).name}", exc_info=True)
                return f"skipped {Path(src_path).name}"
        except Exception as exc:
            tb = traceback.format_exc()
            logger.exception(f"✗ error during suffix extraction → {exc}\n{tb}", exc_info=True)  ## was a plain string, not an f-string — fixed
            return f"✗ error during suffix extraction → {exc}"

        for attempt in range(1, max_retries + 1):
            try:
                info = self.extract(str(src_path), str(output_dir.stem))
                logger.log(level=20, msg=f"✓ : info about extracted {Path(src_path).name}: ", extra={"info": str(info)})
                return info  ##SMY: simply return the dict; formatting is left to the calling Gradio code
            except Exception as exc:
                if attempt == max_retries:
                    tb = traceback.format_exc()
                    ## use src_path here: `info` is unbound when extract() itself raised
                    return f"✗ {Path(src_path).name} → {exc}\n{tb}"

                # Exponential backoff before retry.
                logger.warning(f"Attempt {attempt} failed for {Path(src_path).name}: {exc}. Retrying in {2 ** attempt}s...")
                time.sleep(2 ** attempt)

## SMY: unused
#===================== discarded
'''
def convert(self, pdf_path: Path) -> str:
    """
    Convert a single PDF file to a Markdown string.

    Parameters
    ----------
    pdf_path : pathlib.Path
        Path to the source PDF.

    Returns
    -------
    str
        The extracted Markdown content.
    """
    logger.info(f"Converting {pdf_path} → Markdown")
    try:
        md_text = self.marker.extract_markdown(str(pdf_path))
        return md_text
    except Exception as exc:
        logger.exception("Marker failed to convert PDF.")
        raise RuntimeError(f"Failed to convert {pdf_path}") from exc


def batch_convert(self, pdf_paths: List[Path]) -> Dict[str, str]:
    """
    Convert multiple PDFs and return a mapping of filename → Markdown.

    Parameters
    ----------
    pdf_paths : list[pathlib.Path]
        List of PDF files to process.

    Returns
    -------
    dict
        Mapping from original file name (without extension) to Markdown string.
    """
    results = {}
    for p in pdf_paths:
        try:
            md = self.convert(p)
            key = p.stem  # filename without .pdf
            results[key] = md
        except Exception as exc:
            logger.warning(f"Skipping {p}: {exc}")
    return results

def convert_file(self, src_path: Path, extractor: DocumentConverter):
    """
    Converts one PDF or HTML file to Markdown + images
    with retry/backoff on errors.
    """
    path = src_path
    out_dir = path.parent / self.OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    for attempt in range(1, self.MAX_RETRIES + 1):
        try:
            rendered = extractor.converter(str(path), use_llm=True)

            # Write Markdown
            md_file = out_dir / f"{path.stem}.md"
            md_file.write_text(rendered.markdown, encoding="utf-8")

            # Dump images
            for name, content in rendered.images.items():
                (out_dir / name).write_bytes(content)

            print(f"[ok] {path.name}")
            return

        except Exception as e:
            if attempt == self.MAX_RETRIES:
                print(f"[fail] {path.name} after {attempt} attempts")
                traceback.print_exc()
            else:
                backoff = 2 ** attempt
                print(f"[retry] {path.name} in {backoff}s ({e})")
                time.sleep(backoff)
'''
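
The module-level `converter` global and `init_worker` only make sense together with a process pool, so here is a sketch of the intended wiring (the test suite references ProcessPoolExecutor integration). The `batch_convert` helper and the `init_args` tuple are illustrative, not part of this commit; `init_args` must match `init_worker`'s positional parameters in order.

```python
# Sketch of the worker-pool wiring this module is designed for; `init_args`
# is a placeholder tuple matching init_worker's positional signature.
from concurrent.futures import ProcessPoolExecutor

from converters.pdf_to_md import PdfToMarkdownConverter, init_worker

def batch_convert(paths, init_args, max_workers=4):
    pdf2md = PdfToMarkdownConverter()
    # Each worker runs init_worker once, building its own DocumentConverter
    # and binding the module-global `converter` that extract() relies on.
    with ProcessPoolExecutor(max_workers=max_workers,
                             initializer=init_worker,
                             initargs=init_args) as pool:
        return list(pool.map(pdf2md.convert_files, paths))
```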
data/output_dir/.gitignore
ADDED
@@ -0,0 +1,4 @@
**
# !*.md
!COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/
!.gitignore
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/.gitignore
ADDED
@@ -0,0 +1,4 @@
!*.md
!*.jpeg
# !COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/
!.gitignore
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.md
ADDED
The diff for this file is too large to render.
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_0_Picture_1.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_11_Figure_9.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_18_Figure_1.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_1.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_4_Figure_9.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_6_Figure_1.jpeg
ADDED
data/output_dir/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main/_page_8_Figure_1.jpeg
ADDED
data/pdf/.gitignore
ADDED
@@ -0,0 +1,3 @@
**
!*.pdf
!.gitignore
data/pdf/COSE_ITAssetsRedefinition_Adesemowo2021_1-s2.0-S0167404820304041-main.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e971eac65cce7be288302e2b1faf8c622b62bb9c8fedb60a3f88ff385c3104c
size 2137689
file_handler/__init__.py
ADDED
@@ -0,0 +1 @@

file_handler/file_utils.py
ADDED
@@ -0,0 +1,296 @@
# file_handler/file_utils.py
#import os
from pathlib import Path
from itertools import chain
from typing import List, Union, Any, Mapping
from PIL import Image

import utils.config as config

##SMY: Might be deprecated vis duplicated. See marker/marker/config/parser.py ~ https://github.com/datalab-to/marker/blob/master/marker/config/parser.py#L169
#def create_outputdir(root: Union[str, Path], out_dir: Union[str, Path] = None) -> Path: #List[Path]:
def create_outputdir(root: Union[str, Path], output_dir_string: str = None) -> Path:  #List[Path]:
    """ Create output dir under the input folder """

    ''' ##preserved for future implementation if needed again
    root = root if isinstance(root, Path) else Path(root)
    #root = Path(root)
    if not root.exists():
        raise FileNotFoundError(f"Root path {root} does not exist: cannot create output dir.")
    out_dir = out_dir if out_dir else "output_md" ## SMY: default to outputdir in config file = "output_md"
    output_dir = root.parent / out_dir #"md_output" ##SMY: concatenating output str with src Path
    '''

    ## map to img_path. Opt to putting output within the same output_md folder rather than individual source folders
    output_dir_string = output_dir_string if output_dir_string else "output_dir"  ##redundant SMY: default to outputdir in config file = "output_md"
    output_dir = Path("data") / output_dir_string  #"output_md" ##SMY: concatenating output str with src Path
    output_dir.mkdir(mode=0o2644, parents=True, exist_ok=True)
    return output_dir

def is_file_with_extension(path_obj: Path) -> bool:
    """
    Checks if a pathlib.Path object is a file and has a non-empty extension.
    """
    path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
    return path_obj.is_file() and bool(path_obj.suffix)

def process_dicts_data(data: Union[dict, list[dict]]):
    """ Returns a formatted JSON string for a single dictionary or a list of dictionaries """
    import json
    from pathlib import WindowsPath
    #from typing import dict, list

    # Serialise WindowsPath objects to strings using a custom json.JSONEncoder subclass
    class PathEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, WindowsPath):
                return str(obj)
            # Let the base class default method raise the TypeError for other types
            return json.JSONEncoder.default(self, obj)

    # Convert the list of dicts to a formatted JSON string
    formatted_string = json.dumps(data, indent=4, cls=PathEncoder)

    return formatted_string

##NB: Python >=3.10, X | Y is equivalent to Union[X, Y] for the type checker
def collect_pdf_html_paths(root: Union[str, Path]) -> List[Path]:
    """
    Recursively walk *root* and return a list of all PDF and HTML files.
    """
    root = Path(root)
    patterns = ["*.pdf", "*.html"] #, "*.htm*"]
    if not root.exists():
        raise FileNotFoundError(f"Root path {root} does not exist.")
    #pdfs_htmls = [p for p in root.rglob("*.pdf", "*.html", "*.htm*") if p.is_file()]
    #pdfs_htmls = [chain.from_iterable(root.rglob(pattern) for pattern in patterns)]
    # Use itertools.chain to combine the generators from multiple rglob calls
    pdfs_htmls = list(chain.from_iterable(root.rglob(pattern) for pattern in patterns))

    return pdfs_htmls

def collect_pdf_paths(root: Union[str, Path]) -> List[Path]:
    """
    Recursively walk *root* and return a list of all PDF files.
    """
    root = Path(root)
    if not root.exists():
        raise FileNotFoundError(f"Root path {root} does not exist.")
    pdfs = [p for p in root.rglob("*.pdf") if p.is_file()]
    return pdfs

def collect_html_paths(root: Union[str, Path]) -> List[Path]:
    """
    Recursively walk *root* and return a list of all HTML files.
    """
    root = Path(root)
    if not root.exists():
        raise FileNotFoundError(f"Root path {root} does not exist.")
    # rglob accepts a single pattern, so chain the two globs
    htmls = [p for p in chain(root.rglob("*.html"), root.rglob("*.htm")) if p.is_file()]

    ## SMY: TODO: convert htmls to PDF. Marker will by default attempt weasyprint, which typically raises a 'libgobject-2' error on Windows

    return htmls

def collect_markdown_paths(root: Union[str, Path]) -> List[Path]:
    """
    Recursively walk *root* and return a list of all Markdown files.
    """
    root = Path(root)
    md_files = [p for p in root.rglob("*.md") if p.is_file()]
    return md_files

#from __future__ import annotations
def write_markdown(
    src_path: Union[str, Path],
    output_dir: Union[str, Path],
    rendered: Any,
) -> Path:

    """
    Write the Markdown representation of a source file to an output directory.

    Parameters
    ----------
    src_path : str | Path
        Path to the original source file. Only its base name is used for naming
        the resulting Markdown file.
    output_dir : str | Path
        Directory where the Markdown file will be written. It is created with
        create_outputdir() if it does not exist.
    rendered : object
        Object that provides a ``markdown`` attribute containing the text to write.

    Returns
    -------
    pathlib.Path
        The full path of the written Markdown file.

    Raises
    ------
    FileNotFoundError
        If *src_path* does not point to an existing file.
    OSError
        If writing the file fails for any reason (e.g. permission denied).
    AttributeError
        If *rendered* does not expose a ``markdown`` attribute.

    Notes
    -----
    The function is intentionally lightweight: it only handles path resolution,
    directory creation, and file I/O. All rendering logic should be performed before
    calling this helper.
    """
    src = Path(src_path)
    if not src.is_file():
        raise FileNotFoundError(f"Source file does not exist: {src}")

    #out_dir = Path(output_dir)
    #out_dir.mkdir(parents=True, exist_ok=True)

    md_name = f"{src.stem}.md"
    if isinstance(output_dir, Path):
        md_path = output_dir / f"{src.stem}" / md_name
    else:
        #md_path = Path(src.parent) / f"{Path(output_dir).stem}" / f"{src.stem}" / md_name

        ## Opt to putting output within the same output_md folder rather than individual source folders
        #md_path = Path("data\\pdf") / "output_md" / f"{src.stem}" / md_name ##debug
        md_path = Path("data") / output_dir / f"{src.stem}" / md_name ##debug
    ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
    md_path.parent.mkdir(mode=0o2644, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
    md_path.parent.chmod(0)

    try:
        markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
    except AttributeError as exc:  # pragma: no cover
        raise AttributeError(
            "Extractor Rendered object must have a 'markdown' attribute"
        ) from exc

    with md_path.open(mode="w", encoding="utf-8") as md_f:
        md_f.write(markdown_text) ##SMY: write markdown content to markdown file

    return md_path ##SMY: return the markdown file #✓
    #return {"files": md_path} ##SMY: return dict of file with markdown filename.

# Dump Markdown extracted images
def dump_images(
    src_path: Union[str, Path],
    output_dir: Union[str, Path],
    rendered: Any,
) -> int:

    """
    Dump the images of the Markdown representation of a source file to an output directory.

    Parameters
    ----------
    src_path : str | Path
        Path to the original source file. Only its base name is used for naming
        the resulting Markdown file.
    output_dir : str | Path
        Directory where the Markdown file will be written. It is created with
        create_outputdir() if it does not exist.
    rendered : object
        Object that provides an ``images`` mapping of image names to PIL images.

    Returns
    -------
    The number of images dumped from the Markdown file and the list of their paths.
    """

    try:
        images: Mapping[str, Image.Image] = getattr(rendered, "images")
    except TypeError as exc:  # pragma: no cover
        raise AttributeError(
            "Extracted images from rendered.images must be a mapping of str -> PIL.Image"
        ) from exc

    # Initialise variables
    images_count = 0
    img_path_list = []
    ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
    #for img_name, img_bytes in images.items():

    src = Path(src_path) ##SMY: keep uniform with write_markdown. No existence check needed anymore
    for img_name, img in images.items():
        # Resolve the full path and make sure any sub-directories exist.
        #img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
        #img_path = create_outputdir(src_path) / img_name

        if isinstance(output_dir, Path):
            img_path = output_dir / f"{src.stem}" / img_name  ##was: output_dir.stem / img_name, which joins str / str and fails
        else:
            # #img_path = Path(output_dir) / f"{src.stem}" / img_name ##SMY: create markdown file ##SMY concatenating Path with str
            # #img_path = Path(output_dir) / img_name ##SMY: create markdown file ##SMY concatenating Path with str
            #img_path = Path(src.parent) / f"{Path(output_dir).stem}" / f"{src.stem}" / img_name

            #img_path = Path("data\\pdf") / "output_md" / f"{src.stem}" / img_name ##debug
            img_path = Path("data") / output_dir / f"{src.stem}" / img_name ##debug
            #img_path.mkdir(mode=0o777, parents=True, exist_ok=True) ##SMY: create nested img_path if not exists
            #img_path.parent.mkdir(parents=True, exist_ok=True)

        img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
        images_count += 1
        #img_path_list = img_path_list.append(img_path)
        img_path_list.append(img_path)

    return images_count, img_path_list ##SMY: return number of images and their paths
    #return images.items().count
    #return len(images)

# Dump Markdown extracted images ##SMY: Marked as deprecated
'''
def dump_images(
    src_path: Union[str, Path],
    output_dir: Union[str, Path],
    rendered: Any,
) -> int:

    """
    Dump the images of the Markdown representation of a source file to an output directory.

    Parameters
    ----------
    src_path : str | Path
        Path to the original source file. Only its base name is used for naming
        the resulting Markdown file.
    output_dir : str | Path
        Directory where the Markdown file will be written. It is created with
        create_outputdir() if it does not exist.
    rendered : object
        Object that provides a ``markdown`` attribute containing the text to write.

    Returns
    -------
    Number of images dumped from the Markdown file.
    """

    try:
        images: Mapping[str, bytes] = getattr(rendered, "images")
    except TypeError as exc:  # pragma: no cover
        raise AttributeError(
            "Extracted images from rendered.images must be a mapping of str -> bytes"
        ) from exc

    images_count = 0
    ##SMY: See marker.output.save_output() : https://github.com/datalab-to/marker/blob/master/marker/output.py
    #for img_name, img_bytes in images.items():
    for img_name, img in images.items():
        # Resolve the full path and make sure any sub-directories exist.
        img_path = Path(output_dir) / src_path / img_name ##SMY: image files ##concatenate Path + str
        img_path.parent.mkdir(parents=True, exist_ok=True)

        #with img_path.open("wb") as fp:
        #    fp.write(img_bytes) ##SMY: write images to markdown folder
        #images_count += 1
        img.save(img_path) ##SMY: save images (of type PIL.Image.Image) to markdown folder
        images_count += 1

    return images_count ##SMY: return number of images
    #return images.items().count
    #return len(images)
'''
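A minimal usage sketch for the helpers above (not from this commit), assuming a `rendered` object that exposes `markdown` and `images` attributes as the Marker converter produces; `convert` is a hypothetical placeholder for whatever produces it:

from file_handler.file_utils import collect_pdf_html_paths, create_outputdir, write_markdown, dump_images

sources = collect_pdf_html_paths("data/pdf")           # all *.pdf and *.html under data/pdf
out_dir = create_outputdir("data/pdf", "output_dir")   # -> data/output_dir
for src in sources:
    rendered = convert(src)  # hypothetical: yields an object with .markdown and .images
    md_path = write_markdown(src, "output_dir", rendered)
    n_images, image_paths = dump_images(src, "output_dir", rendered)
    print(md_path, n_images)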
llm/__init__.py
ADDED
File without changes
llm/hf_client.py
ADDED
@@ -0,0 +1,244 @@
from __future__ import annotations

from typing import Iterable, Literal, Optional
import os
import time
import traceback
from huggingface_hub import InferenceClient, login, logout as hf_logout

from llm.llm_login import login_huggingface, is_login_huggingface

from utils.logger import get_logger

## Get logger instance
logger = get_logger(__name__)


class HFChatClient:
    """
    Provider-agnostic LLM client interface.
    Encapsulates `huggingface_hub.InferenceClient` setup and chat calls.

    Backends:
    - model: plain HF model id (e.g., "HuggingFaceH4/zephyr-7b-beta")
    - provider: provider-routed id (e.g., "openai/gpt-oss-120b:fireworks-ai")
    - endpoint: full inference endpoint URL (e.g., "http://localhost:1234").
    """

    def __init__(self,
                 #api_token: str,
                 #model_id: str = "gpt2",
                 provider: str = "huggingface", ## "huggingface2", "openai"
                 model_id: str = "openai/gpt-oss-120b", ##default_model
                 hf_provider: str = "huggingface",
                 endpoint_url: Optional[str] = None,
                 #backend: Literal["model", "provider", "endpoint"] = [],
                 backend_choice: Optional[str] = None, #choices=["model-id", "provider", "endpoint"]
                 system_message: str = "",
                 max_tokens: int = 4096,
                 temperature: float = 0.0,
                 top_p: float = 0.1,
                 stream: bool = False,
                 api_token: Optional[str] = None
                 ) -> None:

        try:
            self.model_id = model_id
            self.provider = provider.lower()
            self.hf_provider = hf_provider.lower()
            self.endpoint_url = endpoint_url
            #self.backend = backend
            #self.backend_literal: Literal["model", "provider", "endpoint"] = (
            '''
            self.backend: Literal["model", "provider", "endpoint"] = (
                "model" if backend_choice == "Hugging Face Model ID" else (
                    "provider" if backend_choice == "HF Provider Route" else "endpoint")
            ),
            '''
            self.backend: Literal["model", "provider", "endpoint"] = (
                "model" if backend_choice == "model-id" else (
                    "provider" if backend_choice == "provider" else "endpoint")
            ) ## see Gradio backend_choice dropdown
            self.system_message = system_message
            self.max_tokens = max_tokens
            self.temperature = temperature
            self.top_p = top_p
            self.stream = stream
            self.token = api_token if api_token else None #"" # invalid; preserved
            #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred

            self.base_url = "https://router.huggingface.co/v1" #HF API proxy
        except Exception as exc:
            #logger.error("client_init_failed", extra={"error": str(exc)})
            tb = traceback.format_exc()
            logger.exception(f"✗ client_init_failed: {exc}\n{tb}", exc_info=True)
            raise RuntimeError(f"✗ Failed to initialise client: {exc}\n{tb}")

        ##SMY: //TOBE: Deprecated : Moved to llm.llm_login
        '''
        # Disable implicit token propagation for determinism
        # Explicitly disable implicit token propagation; we rely on explicit auth or env var
        os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"

        # Privacy-first login: try interactive CLI first; fall back to provided/env token only if needed
        try:
            login()
            time.sleep(15) ##SMY pause for login. Helpful: pool async opex
            logger.info("hf_login", extra={"mode": "cli"})
        except Exception as exc:
            # Respect common env var names; prefer explicit token arg when provided
            fallback_token = self.token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            if fallback_token:
                try:
                    login(token=fallback_token)
                    self.token = fallback_token
                    logger.info("hf_login", extra={"mode": "token"})
                except Exception as exc_token:
                    logger.warning("hf_login_failed", extra={"error": str(exc_token)})
            else:
                logger.warning("hf_login_failed", extra={"error": str(exc)})
                # Silent fallback; client will still work if token is passed directly
                #pass
        '''
        login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg="logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in a Process Worker.

    @staticmethod
    def _normalise_history(history: list, system_message: str, latest_user_message: str) -> list[dict]:
        """
        `prompt` prefixed by system_message if set.
        Normalise chat history to a list of {"role": role, "content": content} dicts.
        Supports both dict and tuple formats for history items.
        """
        messages: list[dict] = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        for item in history or []:
            if isinstance(item, dict) and "role" in item and "content" in item:
                if item["role"] in ("user", "assistant"):
                    messages.append({"role": item["role"], "content": item["content"]})
            elif isinstance(item, (list, tuple)) and len(item) == 2:
                usr, asst = item
                if usr:
                    messages.append({"role": "user", "content": usr})
                if asst:
                    messages.append({"role": "assistant", "content": asst})
        messages.append({"role": "user", "content": latest_user_message})
        return messages

    @staticmethod
    def _initialise_client(self,
                           backend: Literal["model", "provider", "endpoint"],
                           model_id: Optional[str] = None,
                           hf_provider: Optional[str] = None,
                           endpoint_url: Optional[str] = None,
                           token: Optional[str] = None) -> InferenceClient:

        try:
            match backend:
                case "endpoint" | "model":
                    logger.debug("_initialise_client: initialising with:", extra={"model": model_id}) ## debug
                    hf_client = InferenceClient(model=model_id or endpoint_url, token=token) #endpoint=target) ##, token=api_token or self.token)
                    logger.log(20, "client: ", extra={"model": model_id}) ## debug
                case "provider":
                    logger.info("_initialise_client: initialising with:", extra={"provider": hf_provider}) ## debug
                    hf_client = InferenceClient(provider=hf_provider, model=model_id, token=token) ##, token=api_token or self.token)
                    #client = client(model=model_id, provider=provider, token=token) ##target
                    logger.log(20, "client: ", extra={"backend": backend}) ## debug
                case _:
                    raise ValueError("Invalid backend.")
            return hf_client
        except Exception as exc:
            logger.log(40, "_initialise_client: client_init_failed", extra={"error": str(exc)}) ## debug
            raise RuntimeError(f"_initialise_client: Failed to initialise client: {exc}")

    ## wrap HF client for marker
    def chat_fn(
        self,
        message: str,
        history: list = [],
    ) -> Iterable[str]:
        """
        messages = self._normalise_history(history, system_message, message)
        token = api_token or self.token
        """
        ## set prompt and token; argument order matches _normalise_history(history, system_message, latest_user_message)
        messages = self._normalise_history(history, self.system_message, message)
        #token = api_token or self.token
        #token = self.token ## redundant

        logger.log(20, "chat: initialising client", extra={
            "backend": self.backend, "model": self.model_id, "provider": self.hf_provider, "endpoint": self.endpoint_url,
            "stream": self.stream, "max_tokens": self.max_tokens, "temperature": self.temperature, "top_p": self.top_p,
        })

        ## initialise client
        try:
            client = self._initialise_client(self, self.backend, self.model_id, self.hf_provider, self.endpoint_url, self.token) #api_token)
            logger.log(20, "chat: client initialised") ## debug
        except Exception as exc:
            ##logger.error
            logger.log(40, "chat client_init_failed", extra={"error": str(exc)})
            raise RuntimeError(f"chat: Failed to initialise client: {exc}")

        logger.log(20, "chat_start", extra={
            "backend": self.backend, "model": self.model_id, "provider": self.hf_provider, "endpoint": self.endpoint_url,
            "stream": self.stream, "max_tokens": self.max_tokens, "temperature": self.temperature, "top_p": self.top_p,
        })

        if self.stream:
            acc = ""
            for chunk in client.chat_completion(
                messages=messages,
                #model=client.model, ## moved back to client initialise
                max_tokens=self.max_tokens,
                stream=True,
                temperature=self.temperature,
                top_p=self.top_p,
            ):
                delta = getattr(chunk.choices[0].delta, "content", None) or ""
                if delta:
                    acc += delta
                    yield acc
            return

        result = client.chat_completion(
            messages=messages,
            #model=client.model, ## moved back to client initialised
            max_tokens=self.max_tokens,
            stream=False,
            temperature=self.temperature,
            top_p=self.top_p,
        )
        yield result.choices[0].message.content

        '''
        ## future consideration
        response = client.text_generation(
            #model=model_name,
            inputs=prompt,
            parameters={
                "max_new_tokens": max_new_tokens,
                "temperature": temperature,
            },
        )
        return response[0].generated_text
        '''

    def logout(self) -> bool:
        """Logout from Hugging Face and clear in-process tokens.

        Returns True on success, False otherwise.
        """
        try:
            hf_logout()
        except Exception as exc:
            logger.error("hf_logout_failed", extra={"error": str(exc)})
            return False
        # Clear process environment tokens
        for key in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
            if key in os.environ:
                os.environ.pop(key, None)
        self.token = None
        logger.info("hf_logout_success")
        return True
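A minimal sketch of driving HFChatClient (not from this commit), assuming a valid Hugging Face token; the model and provider values mirror the defaults above, and the token string is a placeholder:

from llm.hf_client import HFChatClient

client = HFChatClient(
    model_id="openai/gpt-oss-120b",
    backend_choice="provider",     # routes through an HF Inference Provider
    hf_provider="fireworks-ai",
    stream=True,
    api_token="hf_...",            # placeholder token
)
# chat_fn is a generator: with stream=True it yields the accumulated text so far
for partial in client.chat_fn("Summarise this PDF in one line.", history=[]):
    print(partial)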
llm/llm_login.py
ADDED
@@ -0,0 +1,70 @@
from huggingface_hub import login, logout
import os
import traceback
from time import sleep
from typing import Optional

from utils.logger import get_logger

## Get logger instance
logger = get_logger(__name__)

def login_huggingface(token: Optional[str] = None):
    """
    Login to Hugging Face account. Prioritize CLI login for privacy and determinism.

    Attempts to log in to Hugging Face Hub.
    First, it tries to log in interactively via the Hugging Face CLI.
    If that fails, it falls back to using a token provided as an argument or
    found in the environment variables HF_TOKEN or HUGGINGFACEHUB_API_TOKEN.

    If both methods fail, it logs a warning and continues without logging in.
    """

    logger.info("Attempting Hugging Face login...")

    # Disable implicit token propagation for determinism
    # Explicitly disable implicit token propagation; we rely on explicit auth or env var
    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"

    # Privacy-first login: try interactive CLI first; fall back to provided/env token only if needed
    try:
        login()
        sleep(5) ##SMY pause for login. Helpful: pool async opex
        logger.info("✔️ hf_login already", extra={"mode": "cli"})
    except Exception as exc:
        # Respect common env var names; prefer explicit token arg when provided
        fallback_token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if fallback_token:
            try:
                login(token=fallback_token)
                token = fallback_token
                logger.info("✔️ hf_login through fallback", extra={"mode": "token"}) ##SMY: This only displays if a token is provided
            except Exception as exc_token:
                logger.warning("❌ hf_login_failed", extra={"error": str(exc_token)})
        else:
            logger.warning("❌ hf_login_failed", extra={"error": str(exc)})
            # Silent fallback; client will still work if token is passed directly
            #pass

def is_login_huggingface():
    from huggingface_hub import HfApi
    from huggingface_hub.utils import HfHubHTTPError

    try:
        HfApi().whoami()
        logger.log(level=20, msg="✔️ You are logged in.", extra={"is_logged_in": True})
        return True
    except HfHubHTTPError as exc:
        # A 401 status code indicates an authentication error.
        if exc.response.status_code == 401:
            print("⚠️ You are not logged in. You can still access public models.")
        else:
            # Handle other HTTP errors if necessary
            #print(f"An unexpected HTTP error occurred: {exc}")
            tb = traceback.format_exc()
            logger.exception(f"✗ An unexpected HTTP error occurred: → {exc}\n{tb}", exc_info=True)
            #raise RuntimeError(f"✗ An unexpected HTTP error occurred: → {exc}\n{tb}") from exc
        return False
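A short sketch of the intended call pattern for the two helpers above (not from this commit); the token value is left to the environment:

from llm.llm_login import login_huggingface, is_login_huggingface

if not is_login_huggingface():
    # Falls back to HF_TOKEN / HUGGINGFACEHUB_API_TOKEN if CLI login fails
    login_huggingface(token=None)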
llm/openai_client.py
ADDED
@@ -0,0 +1,91 @@

from __future__ import annotations

from typing import Optional #Iterable, Literal
#import os
#import time
import traceback
#from huggingface_hub import InferenceClient, login, logout as hf_logout

from llm.llm_login import login_huggingface, is_login_huggingface

import dotenv
#dotenv.load_dotenv(".env")


from utils.logger import get_logger

## Get logger instance
logger = get_logger(__name__)


class OpenAIChatClient:
    """
    Provider-agnostic OpenAI-based LLM client interface.
    Compatible with `huggingface_hub.InferenceClient` setup and chat calls.

    - base_url="https://router.huggingface.co/v1",
    - api_key=os.environ["HF_TOKEN"],
    """

    def __init__(self,
                 model_id: Optional[str] = None,
                 hf_provider: Optional[str] = None,
                 base_url: Optional[str] = "https://router.huggingface.co/v1", #None,
                 api_token: Optional[str] = None,
                 temperature: Optional[float] = 0.2,
                 top_p: Optional[float] = 0.2,
                 ) -> None:

        try:
            openai_api_key_env = dotenv.get_key(".env", "OPENAI_API_KEY")
            self.model_id = f"{model_id}:{hf_provider}" if hf_provider is not None else model_id ##concatenate so HF can pipe to the HF provider
            self.hf_provider = hf_provider
            self.base_url = base_url #"https://router.huggingface.co/v1" #HF API proxy
            #self.token = api_token if api_token else None ##debug
            self.token = openai_api_key_env if openai_api_key_env else api_token #dotenv.get_key(".env", "OPENAI_API_KEY")
            #self.token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") ## not preferred
            login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg="logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in a Process Worker.
            #self.fake_token = api_token or "a1b2c3" #or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            self.openai_api_key = self.token #self.fake_token
            self.temperature = temperature
            self.top_p = top_p

            logger.log(level=2, msg="initialised OpenAIChatClient:", extra={"base_url": self.base_url, "openai_api_key": self.openai_api_key})

        except Exception as exc:
            #logger.error("OpenAI client_init_failed", extra={"error": str(exc)})
            tb = traceback.format_exc()
            logger.exception(f"✗ OpenAI client_init_failed: {exc}\n{tb}", exc_info=True)
            raise RuntimeError(f"✗ Failed to initialise OpenAI client: {exc}\n{tb}")

    #login_huggingface(self.token) if not is_login_huggingface() else logger.log(level=20, msg="logged in to HF Hub already") ## attempt login if not already logged in. NB: HF CLI login prompt would not display in a Process Worker.

    ####IN PROGRESS
    #
    """
    ## HuggingFace API-proxy Inference Provider - https://huggingface.co/docs/inference-providers/index?python-clients=openai
    ## https://huggingface.co/openai/gpt-oss-20b?inference_api=true&inference_provider=fireworks-ai&language=python&client=openai

    import os
    from openai import OpenAI

    client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"],
    )

    stream = client.chat.completions.create(
        model="openai/gpt-oss-20b:fireworks-ai",
        messages=[
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ],
        stream=True,
    )

    for chunk in stream:
        print(chunk.choices[0].delta.content, end="")
    """
llm/provider_validator.py
ADDED
@@ -0,0 +1,116 @@
"""Validate provider names against the Hugging Face Inference Providers list.

Source: https://huggingface.co/docs/inference-providers/index

Functions:
- get_supported_providers() -> set[str]
- normalize_provider(text: str) -> str | None
- is_valid_provider(text: str) -> bool
- suggest_providers(text: str, limit: int = 3) -> list[str]

Supports common aliases (e.g., "together-ai" -> "together", "fireworks" -> "fireworks-ai").
"""

from __future__ import annotations

from difflib import get_close_matches
from typing import Iterable


# Canonical provider slugs from docs (table and provider URLs)
_CANONICAL: set[str] = {
    "cerebras",
    "cohere",
    "fal-ai",
    "featherless-ai",
    "fireworks-ai",
    "groq",
    "hf-inference",
    "hyperbolic",
    "nebius",
    "novita",
    "nscale",
    "replicate",
    "sambanova",
    "together",
}

# Common aliases users may type; maps to canonical slug
_ALIASES: dict[str, str] = {
    "together-ai": "together",
    "fireworks": "fireworks-ai",
    "falai": "fal-ai",
    "featherless": "featherless-ai",
    "hf": "hf-inference",
    "huggingface": "hf-inference",
}


def _to_key(text: str) -> str:
    return (text or "").strip().lower()


def get_supported_providers(extra: Iterable[str] | None = None) -> set[str]:
    """Return the set of canonical provider slugs.

    Optionally extend with additional slugs via `extra`.
    """
    return _CANONICAL | set(map(_to_key, (extra or [])))


def normalize_provider(text: str) -> str | None:
    """Return the canonical provider slug for `text`, if known; else None.

    Accepts canonical slugs and common aliases.
    """
    key = _to_key(text)
    if not key:
        return None
    if key in _CANONICAL:
        return key
    if key in _ALIASES:
        return _ALIASES[key]
    return None


def is_valid_provider(text: str) -> bool:
    """True if `text` is a known provider or alias."""
    return normalize_provider(text) is not None


def suggest_providers(text: str, limit: int = 3) -> list[str]:
    """Suggest close canonical matches for `text`.

    Uses difflib to match against canonical slugs; returns up to `limit` suggestions.
    """
    key = _to_key(text)
    if not key:
        return []
    # Search both canonical and alias keys to be helpful, then map to canonical
    candidates = list(_CANONICAL | set(_ALIASES))
    suggestions = get_close_matches(key, candidates, n=limit, cutoff=0.6)
    canon = []
    for s in suggestions:
        canon_slug = s if s in _CANONICAL else _ALIASES.get(s)
        if canon_slug and canon_slug not in canon:
            canon.append(canon_slug)
    return canon[:limit]


if __name__ == "__main__":
    import sys

    query = " ".join(sys.argv[1:])
    if not query:
        print("Usage: python provider_validator.py <provider-name>")
        raise SystemExit(2)

    norm = normalize_provider(query)
    if norm:
        print(f"valid: {norm}")
    else:
        print("invalid")
        suggestions = suggest_providers(query)
        if suggestions:
            print("did_you_mean:", ", ".join(suggestions))
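An example round-trip through the validator, using only the functions defined above; the suggestion output is indicative, since difflib ranking depends on the input:

from llm.provider_validator import normalize_provider, is_valid_provider, suggest_providers

assert normalize_provider("fireworks") == "fireworks-ai"   # alias -> canonical slug
assert is_valid_provider("together-ai") is True            # alias accepted
print(suggest_providers("firewroks"))                      # close-match suggestions, e.g. ['fireworks-ai']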
main.py
ADDED
@@ -0,0 +1,22 @@
# main.py
import os
from pathlib import Path

from ui.gradio_ui import build_interface
from utils.logger import get_logger, setup_logging

setup_logging() ## set logging
#logger = get_logger("pypdfmd")
logger = get_logger("parserpdf")

if __name__ == "__main__":
    # Ensure the working directory is clean
    #os.chdir(os.path.dirname(__file__))
    ## script working dir absolute path
    script_dir = Path(__file__).resolve().parent
    ## change the cwd to the script's dir
    os.chdir(script_dir) ##Path.cwd()

    demo = build_interface()
    #demo.launch(debug=True, show_error=True, ssr_mode=True) #(share=True) # share=True for public link; remove in production
    demo.launch(debug=True, show_error=True)
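For container or Spaces-style deployments, a variant sketch (not from this commit) passes Gradio's standard server options so the app binds to all interfaces on the conventional port:

from ui.gradio_ui import build_interface

demo = build_interface()
# 0.0.0.0 / 7860 are illustrative deployment defaults, not values set by this repo
demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)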
pyproject.toml
ADDED
@@ -0,0 +1,9 @@
[project]
#name = "parserpdf"
name = "parser2md"
version = "0.1.0"
description = "PDF & HTML parser to markdown"
readme = "README.md"
requires-python = ">=3.12"
dependencies = []
# owner: research-semmyk
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio>=4.0
#marker==1.3.0        # pip install marker (GitHub: https://github.com/datalab-to/marker)
#pandoc==2.3          # for Markdown → PDF conversion
#weasyprint==59.0     # optional fallback if pandoc is not available
python-magic==0.4.27  # file-type detection
tests/test_converters.py
ADDED
@@ -0,0 +1,98 @@
# tests/test_converters.py
# run with pytest tests/.

import pytest
import os
from unittest.mock import patch, MagicMock
from pathlib import Path

from converters.pdf_to_md import PdfToMarkdownConverter
from converters.html_to_md import HtmlToMarkdownConverter
from converters.md_to_pdf import MarkdownToPdfConverter
from converters.extraction_converter import DocumentConverter

@pytest.fixture
def sample_pdf_path():
    # Create a temporary PDF file for testing
    pdf_path = Path("tests/sample.pdf")
    pdf_path.write_bytes(b"%PDF-1.4\nSample PDF content")
    yield pdf_path
    if pdf_path.exists():
        pdf_path.unlink()

@pytest.fixture
def sample_html_path():
    html_path = Path("tests/sample.html")
    html_path.write_text("<html><body><h1>Test</h1><p>Hello World</p></body></html>")
    yield html_path
    if html_path.exists():
        html_path.unlink()

@pytest.fixture
def sample_md_path():
    md_path = Path("tests/sample.md")
    md_path.write_text("# Test\nHello World")
    yield md_path
    if md_path.exists():
        md_path.unlink()

def test_pdf_to_markdown_converter_init():
    converter = PdfToMarkdownConverter()
    assert isinstance(converter, PdfToMarkdownConverter)
    assert hasattr(converter, 'output_dir_string')

@patch('converters.pdf_to_md.Marker')  # Assuming Marker is imported in pdf_to_md.py
def test_pdf_to_markdown_convert_file(mock_marker, sample_pdf_path):
    mock_marker.convert_single.return_value = {"markdown": "# Converted\nContent", "images": []}

    converter = PdfToMarkdownConverter()
    result = converter.convert_file(sample_pdf_path)

    assert isinstance(result, dict)
    assert "markdown" in result
    assert "filepath" in result
    mock_marker.convert_single.assert_called_once_with(str(sample_pdf_path), prefer_latex=False)

def test_html_to_markdown_converter(sample_html_path):
    converter = HtmlToMarkdownConverter()
    result = converter.batch_convert([sample_html_path])

    assert isinstance(result, dict)
    assert Path(sample_html_path.name) in result
    assert result[Path(sample_html_path.name)].startswith("# Test")

def test_markdown_to_pdf_converter(sample_md_path):
    converter = MarkdownToPdfConverter()
    output_dir = Path("tests/output_pdf")
    output_dir.mkdir(exist_ok=True)

    pdf_files = converter.batch_convert([sample_md_path], output_dir)

    assert isinstance(pdf_files, list)
    if pdf_files:
        pdf_path = pdf_files[0]
        assert pdf_path.exists()
        assert pdf_path.suffix == ".pdf"
        pdf_path.unlink()

    output_dir.rmdir()

@patch('converters.extraction_converter.get_token')
def test_document_converter_login(mock_get_token):
    mock_get_token.return_value = "test_token"
    converter = DocumentConverter()
    assert converter.client.token == "test_token"

def test_pdf_to_markdown_batch_convert(tmp_path):
    # Test batch with multiple files
    pdf1 = tmp_path / "test1.pdf"
    pdf2 = tmp_path / "test2.pdf"
    pdf1.write_bytes(b"%PDF-1.4")
    pdf2.write_bytes(b"%PDF-1.4")

    converter = PdfToMarkdownConverter()
    with patch.object(converter, 'convert_file', return_value={"markdown": "test", "filepath": str(pdf1)}):
        results = converter.batch_convert([pdf1, pdf2])

    assert len(results) == 2
    assert all("markdown" in res for res in results)
tests/test_file_handler.py
ADDED
@@ -0,0 +1,115 @@
# tests/test_file_handler.py
# run with pytest tests/.

import pytest
from pathlib import Path
import tempfile
from unittest.mock import patch

from file_handler.file_utils import (
    collect_pdf_paths, collect_html_paths, collect_markdown_paths,
    process_dicts_data, create_outputdir
)

@pytest.fixture
def temp_dir_with_pdfs():
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        # Create sample PDF files (create the subfolder first, then touch the file)
        (tmpdir / "doc1.pdf").touch()
        (tmpdir / "subfolder").mkdir(parents=True)
        (tmpdir / "subfolder/doc2.pdf").touch()
        (tmpdir / "not_pdf.txt").touch()
        yield tmpdir

@pytest.fixture
def temp_dir_with_html():
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        (tmpdir / "page1.html").touch()
        (tmpdir / "subfolder").mkdir(parents=True)
        (tmpdir / "subfolder/page2.htm").touch()
        (tmpdir / "not_html.md").touch()
        yield tmpdir

@pytest.fixture
def temp_dir_with_md():
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        (tmpdir / "note1.md").touch()
        (tmpdir / "subfolder").mkdir(parents=True)
        (tmpdir / "subfolder/note2.md").touch()
        (tmpdir / "not_md.pdf").touch()
        yield tmpdir

def test_collect_pdf_paths(temp_dir_with_pdfs):
    paths = collect_pdf_paths(str(temp_dir_with_pdfs))
    assert len(paths) == 2
    assert all(p.suffix.lower() == '.pdf' for p in paths)
    assert temp_dir_with_pdfs / "doc1.pdf" in paths
    assert temp_dir_with_pdfs / "subfolder/doc2.pdf" in paths

def test_collect_pdf_paths_no_pdfs(temp_dir_with_html):
    paths = collect_pdf_paths(str(temp_dir_with_html))
    assert len(paths) == 0

def test_collect_html_paths(temp_dir_with_html):
    paths = collect_html_paths(str(temp_dir_with_html))
    assert len(paths) == 2
    assert all(p.suffix.lower() in ['.html', '.htm'] for p in paths)
    assert temp_dir_with_html / "page1.html" in paths
    assert temp_dir_with_html / "subfolder/page2.htm" in paths

def test_collect_html_paths_no_html(temp_dir_with_pdfs):
    paths = collect_html_paths(str(temp_dir_with_pdfs))
    assert len(paths) == 0

def test_collect_markdown_paths(temp_dir_with_md):
    paths = collect_markdown_paths(str(temp_dir_with_md))
    assert len(paths) == 2
    assert all(p.suffix.lower() == '.md' for p in paths)
    assert temp_dir_with_md / "note1.md" in paths
    assert temp_dir_with_md / "subfolder/note2.md" in paths

def test_collect_markdown_paths_no_md(temp_dir_with_pdfs):
    paths = collect_markdown_paths(str(temp_dir_with_pdfs))
    assert len(paths) == 0

def test_process_dicts_data():
    sample_logs = [
        {"filepath": Path("file1.md"), "markdown": "Content1", "image_path": ["img1.jpg"]},
        {"filepath": Path("file2.md"), "markdown": "Content2", "image_path": []},
        {"error": "Conversion failed for file3"}
    ]
    result = process_dicts_data(sample_logs)
    assert "file1.md" in result
    assert "Content1" in result
    assert "img1.jpg" in result
    assert "Conversion failed" in result

def test_process_dicts_data_empty():
    result = process_dicts_data([])
    assert result == ""

def test_process_dicts_data_invalid():
    with pytest.raises(ValueError):
        process_dicts_data([{"invalid": "data"}])

def test_create_outputdir(tmp_path):
    output_dir = tmp_path / "test_output"
    create_outputdir(str(output_dir))
    assert output_dir.exists()
    assert output_dir.is_dir()

def test_create_outputdir_existing(tmp_path):
    output_dir = tmp_path / "test_output"
    output_dir.mkdir()
    create_outputdir(str(output_dir))
    assert output_dir.exists()
    assert output_dir.is_dir()

@patch('pathlib.Path.mkdir')
def test_create_outputdir_error(mock_mkdir):
    mock_mkdir.side_effect = OSError("Permission denied")
    with pytest.raises(OSError):
        create_outputdir("protected_dir")
tests/test_llm.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# tests/test_llm.py
# run with pytest tests/.
#
import pytest
from unittest.mock import patch, MagicMock
import huggingface_hub
from huggingface_hub import get_token

from llm.llm_login import login_huggingface
from llm.provider_validator import is_valid_provider, suggest_providers
from llm.hf_client import HFChatClient  # Assuming this exists
from llm.openai_client import OpenAIClient  # Assuming this exists

def test_login_huggingface_success():
    with patch('huggingface_hub.login') as mock_login:
        api_token = "hf_test_token"
        login_huggingface(api_token)
        mock_login.assert_called_once_with(token=api_token, add_to_git_credential=False)

def test_login_huggingface_no_token():
    with patch('huggingface_hub.login') as mock_login:
        with pytest.raises(ValueError, match="API token required"):
            login_huggingface(None)

@patch('huggingface_hub.login')
def test_login_huggingface_error(mock_login):
    mock_login.side_effect = Exception("Login failed")
    with pytest.raises(Exception, match="Login failed"):
        login_huggingface("invalid_token")

def test_is_valid_provider():
    assert is_valid_provider("huggingface") is True
    assert is_valid_provider("openai") is True
    assert is_valid_provider("invalid_provider") is False
    assert is_valid_provider("") is False
    assert is_valid_provider(None) is False

def test_suggest_providers():
    suggestions = suggest_providers("hugngface")  # typo example
    assert isinstance(suggestions, list)
    assert "huggingface" in suggestions

    no_suggestions = suggest_providers("completely_unknown")
    assert isinstance(no_suggestions, list)
    assert len(no_suggestions) == 0

@patch('llm.hf_client.HFChatClient.__init__')
def test_hf_client_init(mock_init):
    mock_init.return_value = None
    client = HFChatClient(model_id="test-model", api_token="test_token")
    mock_init.assert_called_once_with(
        model_id="test-model",
        api_token="test_token",
        # Add other expected params based on the actual __init__
    )

@patch('llm.hf_client.login_huggingface')
@patch('llm.hf_client.get_token')
def test_hf_client_token(mock_get_token, mock_login):
    mock_get_token.return_value = "cached_token"
    mock_login.return_value = None

    client = HFChatClient(model_id="test-model")
    assert client.api_token == "cached_token"

@patch('openai.OpenAI')
def test_openai_client_init(mock_openai):
    mock_client = MagicMock()
    mock_openai.return_value = mock_client

    client = OpenAIClient(api_key="sk_test_key", base_url="https://api.openai.com/v1")
    mock_openai.assert_called_once_with(
        api_key="sk_test_key",
        base_url="https://api.openai.com/v1"
    )
    assert client.client == mock_client

@patch('openai.OpenAI')
def test_openai_client_chat(mock_openai):
    mock_client = MagicMock()
    mock_response = MagicMock()
    mock_response.choices = [MagicMock(content="Hello!")]
    mock_client.chat.completions.create.return_value = mock_response
    mock_openai.return_value = mock_client

    client = OpenAIClient(api_key="sk_test_key")
    response = client.chat("Hello", model="gpt-3.5-turbo")

    assert response == "Hello!"
    mock_client.chat.completions.create.assert_called_once_with(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "Hello"}]
    )

def test_provider_validator_edge_cases():
    # Test with non-string inputs
    assert is_valid_provider(123) is False
    assert suggest_providers(123) == []

    # Test case insensitivity
    assert is_valid_provider("HUGGINGFACE") is True
    assert is_valid_provider("OpEnAi") is True

@patch('huggingface_hub.get_token')
def test_get_token_from_env(mock_get_token):
    mock_get_token.return_value = None
    with patch.dict('os.environ', {'HUGGINGFACE_HUB_TOKEN': 'env_token'}):
        token = get_token()
        assert token == 'env_token'

@patch('huggingface_hub.get_token')
def test_get_token_from_cache(mock_get_token):
    mock_get_token.return_value = 'cached_token'
    token = get_token()
    assert token == 'cached_token'
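The validator tests above fully specify the expected contract: case-insensitive matching, non-string inputs rejected, and typo suggestions. A minimal sketch that would satisfy them, assuming a difflib-based approach (the shipped llm/provider_validator.py may differ):

# Hypothetical sketch of llm/provider_validator.py, inferred from the tests above.
import difflib

_KNOWN_PROVIDERS = ["huggingface", "openai", "fireworks-ai", "together-ai", "hf-inference"]

def is_valid_provider(provider) -> bool:
    # Non-string inputs (None, ints) are never valid; matching is case-insensitive.
    return isinstance(provider, str) and provider.lower() in _KNOWN_PROVIDERS

def suggest_providers(provider) -> list:
    # Close matches for likely typos, e.g. "hugngface" -> ["huggingface"].
    if not isinstance(provider, str):
        return []
    return difflib.get_close_matches(provider.lower(), _KNOWN_PROVIDERS, n=3, cutoff=0.6)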
tests/test_main_ui.py
ADDED
@@ -0,0 +1,148 @@
# tests/test_main_ui.py
# run with pytest tests/.

import pytest
from unittest.mock import patch, MagicMock
from pathlib import Path

# NB: main.py only wires up the app; build_interface itself lives in ui.gradio_ui.
from ui.gradio_ui import convert_batch, build_interface, accumulate_files, clear_state, pdf_files_wrap
from utils.logger import get_logger

logger = get_logger("test_main_ui")

@pytest.fixture
def mock_gradio():
    with patch('gradio.Blocks') as mock_blocks, \
         patch('gradio.Markdown') as mock_md, \
         patch('gradio.Accordion') as mock_accordion, \
         patch('gradio.Dropdown') as mock_dropdown, \
         patch('gradio.Textbox') as mock_textbox, \
         patch('gradio.Slider') as mock_slider, \
         patch('gradio.Checkbox') as mock_checkbox, \
         patch('gradio.Button') as mock_button, \
         patch('gradio.File') as mock_file, \
         patch('gradio.UploadButton') as mock_upload, \
         patch('gradio.State') as mock_state, \
         patch('gradio.Tab') as mock_tab, \
         patch('gradio.JSON') as mock_json, \
         patch('gradio.Files') as mock_files, \
         patch('gradio.Gallery') as mock_gallery:
        yield {
            'Blocks': mock_blocks, 'Markdown': mock_md, 'Accordion': mock_accordion,
            'Dropdown': mock_dropdown, 'Textbox': mock_textbox, 'Slider': mock_slider,
            'Checkbox': mock_checkbox, 'Button': mock_button, 'File': mock_file,
            'UploadButton': mock_upload, 'State': mock_state, 'Tab': mock_tab,
            'JSON': mock_json, 'Files': mock_files, 'Gallery': mock_gallery
        }

def test_build_interface(mock_gradio):
    demo = build_interface()
    assert demo is not None
    # Verify UI components are created
    mock_gradio['Blocks'].assert_called_once_with(title="parserPDF", css=MagicMock())
    mock_gradio['Markdown'].assert_called()  # Title markdown
    mock_gradio['Accordion'].assert_any_call("⚙️ LLM Model Settings", open=False)
    mock_gradio['Tab'].assert_any_call(" 📄 PDF & HTML ➜ Markdown")

def test_convert_batch_no_files():
    result = convert_batch([], 0, "huggingface", "test-model", "fireworks-ai", "", "model-id",
                           "system", 1024, 0.0, 0.1, False, "token",
                           "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
                           "output_dir", False, None)
    assert "No files uploaded" in result[0]

@patch('ui.gradio_ui.login_huggingface')
@patch('ui.gradio_ui.ProcessPoolExecutor')
@patch('ui.gradio_ui.pdf2md_converter.convert_files')
def test_convert_batch_success(mock_convert, mock_pool, mock_login):
    mock_result = MagicMock()
    mock_convert.return_value = {"filepath": Path("test.md"), "image_path": ["img.jpg"], "markdown": "content"}
    mock_pool.return_value.__enter__.return_value.map.return_value = [mock_result]
    mock_login.return_value = None

    pdf_files = [MagicMock(name="test.pdf")]
    result = convert_batch(pdf_files, 1, "huggingface", "test-model", "fireworks-ai", "", "model-id",
                           "system", 1024, 0.0, 0.1, False, "token",
                           "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
                           "output_dir", False, None)

    assert len(result) == 3
    assert "test.md" in result[0]
    assert "img.jpg" in result[2][0]
    mock_pool.assert_called_once()
    mock_convert.assert_called_once_with("test.pdf")

@patch('ui.gradio_ui.ProcessPoolExecutor')
def test_convert_batch_pool_error(mock_pool):
    mock_pool.side_effect = Exception("Pool error")
    pdf_files = [MagicMock(name="test.pdf")]
    result = convert_batch(pdf_files, 1, "huggingface", "test-model", "fireworks-ai", "", "model-id",
                           "system", 1024, 0.0, 0.1, False, "token",
                           "https://router.huggingface.co/v1", "webp", 4, 2, "markdown",
                           "output_dir", False, None)
    assert "Error during ProcessPoolExecutor" in result[0]

def test_accumulate_files():
    # Test initial accumulation
    new_files = [MagicMock(name="/tmp/file1.pdf"), MagicMock(name="/tmp/file2.html")]
    state = []
    updated_state, message = accumulate_files(new_files, state)
    assert len(updated_state) == 2
    assert "/tmp/file1.pdf" in updated_state
    assert "Accumulated 2 file(s)" in message

    # Test adding to existing state
    new_files2 = [MagicMock(name="/tmp/file3.pdf")]
    updated_state2, message2 = accumulate_files(new_files2, updated_state)
    assert len(updated_state2) == 3
    assert "Accumulated 3 file(s)" in message2

    # Test no new files
    _, message3 = accumulate_files([], updated_state2)
    assert "No new files uploaded" in message3

def test_clear_state():
    result = clear_state()
    assert len(result) == 4
    assert result[0] == []  # cleared file list
    assert result[1] == "Files list cleared."  # message
    assert result[2] == []  # cleared file btn
    assert result[3] == []  # cleared dir btn

def test_pdf_files_wrap():
    # Single file
    single_file = "single.pdf"
    wrapped = pdf_files_wrap(single_file)
    assert isinstance(wrapped, list)
    assert len(wrapped) == 1
    assert wrapped[0] == single_file

    # List of files
    files_list = ["file1.pdf", "file2.html"]
    wrapped_list = pdf_files_wrap(files_list)
    assert wrapped_list == files_list

    # None input
    assert pdf_files_wrap(None) == [None]

@patch('ui.gradio_ui.os.chdir')
@patch('ui.gradio_ui.Path')
def test_main_launch(mock_path, mock_chdir):
    mock_script_dir = MagicMock()
    mock_path.return_value.resolve.return_value.parent = mock_script_dir
    mock_chdir.return_value = None

    # Test the main execution path
    with patch('builtins.__name__', '__main__'):
        from main import main  # assuming main() exists; otherwise the import itself runs the app
        # Since main.py is simple, test the key parts
        demo = MagicMock()
        with patch('ui.gradio_ui.build_interface', return_value=demo):
            with patch('gradio.Interface.launch') as mock_launch:
                # Execute the main logic
                import main
                main.main()  # if it has main(); otherwise the import runs it

    mock_chdir.assert_called_once_with(mock_script_dir)
    mock_launch.assert_called_once_with(debug=True, show_error=True)
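test_main_launch fixes the launch contract even though main.py itself is not shown here. A hypothetical reconstruction consistent with its assertions (chdir to the script directory, then launch with debug and show_error) might look like the sketch below; the real main.py in this commit may be organised differently.

# Hypothetical reconstruction of main.py implied by test_main_launch above.
import os
from pathlib import Path

from ui.gradio_ui import build_interface

def main():
    script_dir = Path(__file__).resolve().parent
    os.chdir(script_dir)  # run relative to the app directory
    demo = build_interface()
    demo.launch(debug=True, show_error=True)

if __name__ == "__main__":
    main()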
tests/test_utils.py
ADDED
@@ -0,0 +1,94 @@
# tests/test_utils.py
# run with pytest tests/.
#
# import pytest
import logging
from unittest.mock import patch, MagicMock
from pathlib import Path

from utils.logger import get_logger, setup_logging
from utils.utils import is_dict, is_list_of_dicts
from utils.config import TITLE, DESCRIPTION  # Assuming these are defined
from utils.get_config import get_config_value  # If separate module

def test_setup_logging(capsys):
    setup_logging()
    captured = capsys.readouterr()
    assert "Logging configured" in (captured.out + captured.err)  # Assuming it prints a config message

def test_get_logger():
    logger = get_logger("test_logger")
    assert isinstance(logger, logging.Logger)
    assert logger.name == "test_logger"

@patch('logging.getLogger')
def test_get_logger_custom(mock_get_logger):
    mock_logger = MagicMock()
    mock_get_logger.return_value = mock_logger
    logger = get_logger("custom_test")
    mock_get_logger.assert_called_once_with("custom_test")
    assert logger == mock_logger

def test_is_dict():
    assert is_dict({"key": "value"}) is True
    assert is_dict({"key": [1, 2]}) is True
    assert is_dict([]) is False
    assert is_dict("string") is False
    assert is_dict(123) is False
    assert is_dict(None) is False

def test_is_list_of_dicts():
    assert is_list_of_dicts([{"a": 1}, {"b": 2}]) is True
    assert is_list_of_dicts([]) is False  # Empty list not considered a list of dicts
    assert is_list_of_dicts([{"a": 1}, "string"]) is False
    assert is_list_of_dicts("not_list") is False
    assert is_list_of_dicts([1, 2]) is False
    assert is_list_of_dicts(None) is False

def test_config_constants():
    # Test if config values are as expected (update based on the actual config.py)
    assert TITLE == "parserPDF"  # Or whatever the actual value is
    assert DESCRIPTION.startswith("PDF parser")  # Partial match for description

@patch('utils.get_config.configparser.ConfigParser')
def test_get_config_value(mock_configparser):
    mock_config = MagicMock()
    mock_config.get.return_value = "test_value"
    mock_configparser.return_value = mock_config

    value = get_config_value("SECTION", "KEY")
    mock_config.get.assert_called_once_with("SECTION", "KEY")
    assert value == "test_value"

@patch('utils.get_config.configparser.ConfigParser')
def test_get_config_value_default(mock_configparser):
    mock_config = MagicMock()
    mock_config.get.side_effect = KeyError("No such key")
    mock_configparser.return_value = mock_config

    value = get_config_value("SECTION", "NONEXISTENT", default="fallback")
    assert value == "fallback"
    mock_config.get.assert_called_once_with("SECTION", "NONEXISTENT")

def test_logger_levels(caplog):
    # Test logging at different levels
    logger = get_logger("level_test")

    with caplog.at_level(logging.DEBUG):
        logger.debug("Debug message")
        assert "Debug message" in caplog.text

    with caplog.at_level(logging.INFO):
        logger.info("Info message")
        assert "Info message" in caplog.text

    with caplog.at_level(logging.ERROR):
        logger.error("Error message")
        assert "Error message" in caplog.text

def test_setup_logging_file(tmp_path):
    log_file = tmp_path / "test.log"
    with patch.dict('os.environ', {'LOG_FILE': str(log_file)}):
        setup_logging()
        assert log_file.exists()
    log_file.unlink()  # Cleanup
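test_is_dict and test_is_list_of_dicts define the helpers' behaviour precisely, including the edge case that an empty list does not count. A minimal sketch satisfying them (the shipped utils/utils.py may differ in detail):

# Hypothetical sketch of the two helpers in utils/utils.py, inferred from the tests above.
def is_dict(data) -> bool:
    return isinstance(data, dict)

def is_list_of_dicts(data) -> bool:
    # An empty list is deliberately not counted as a list of dicts (see test above).
    return isinstance(data, list) and len(data) > 0 and all(isinstance(item, dict) for item in data)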
tests/tests_converter.py
ADDED
@@ -0,0 +1,19 @@
# tests/tests_converter.py
# run with pytest tests/.

from pathlib import Path
from converters.pdf_to_md import PdfToMarkdownConverter
from converters.md_to_pdf import MarkdownToPdfConverter

def test_sample_pdf():
    pdf = Path("tests/sample.pdf")  # requires the sample fixture to be present
    converter = PdfToMarkdownConverter()
    md = converter.convert(pdf)
    assert isinstance(md, str) and len(md) > 0

def test_markdown_to_pdf(tmp_path):
    md_file = tmp_path / "test.md"
    md_file.write_text("# Hello\nThis is a test.")
    conv = MarkdownToPdfConverter()
    pdf_path = conv.convert(md_file)
    assert pdf_path.exists() and pdf_path.suffix == ".pdf"
ui/__init__.py
ADDED
File without changes
ui/gradio_ui.py
ADDED
@@ -0,0 +1,850 @@
| 1 |
+
# ui/gradio_ui.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
|
| 4 |
+
from pathlib import Path, WindowsPath
|
| 5 |
+
import traceback ## Extract, format and print information about Python stack traces.
|
| 6 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 7 |
+
from typing import Optional, Union #, Dict, List, Any, Tuple
|
| 8 |
+
|
| 9 |
+
from huggingface_hub import get_token
|
| 10 |
+
import file_handler
|
| 11 |
+
import file_handler.file_utils
|
| 12 |
+
from utils.config import TITLE, DESCRIPTION, DESCRIPTION_PDF_HTML, DESCRIPTION_PDF, DESCRIPTION_HTML, DESCRIPTION_MD
|
| 13 |
+
from utils.utils import is_dict, is_list_of_dicts
|
| 14 |
+
from file_handler.file_utils import process_dicts_data, collect_pdf_paths, collect_html_paths, collect_markdown_paths, create_outputdir ## should move to handling file
|
| 15 |
+
#from llm.hf_client import HFChatClient ## SMY: unused. See converters.extraction_converter
|
| 16 |
+
from llm.provider_validator import is_valid_provider, suggest_providers
|
| 17 |
+
from llm.llm_login import login_huggingface
|
| 18 |
+
|
| 19 |
+
from converters.extraction_converter import DocumentConverter as docconverter #DocumentExtractor #as docextractor
|
| 20 |
+
from converters.pdf_to_md import PdfToMarkdownConverter, init_worker
|
| 21 |
+
from converters.md_to_pdf import MarkdownToPdfConverter
|
| 22 |
+
from converters.html_to_md import HtmlToMarkdownConverter
|
| 23 |
+
|
| 24 |
+
from utils.get_config import get_config_value
|
| 25 |
+
from utils.logger import get_logger
|
| 26 |
+
|
| 27 |
+
logger = get_logger(__name__) ##NB: setup_logging() ## set logging
|
| 28 |
+
|
| 29 |
+
# Instantiate converters class once – they are stateless
|
| 30 |
+
pdf2md_converter = PdfToMarkdownConverter()
|
| 31 |
+
#html2md_converter = HtmlToMarkdownConverter()
|
| 32 |
+
md2pdf_converter = MarkdownToPdfConverter()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# pool executor to convert files called by Gradio
|
| 36 |
+
def convert_batch(
|
| 37 |
+
pdf_files, #: list[str],
|
| 38 |
+
pdf_files_count: int,
|
| 39 |
+
provider: str,
|
| 40 |
+
model_id: str,
|
| 41 |
+
#base_url: str
|
| 42 |
+
hf_provider: str,
|
| 43 |
+
endpoint: str,
|
| 44 |
+
backend_choice: str,
|
| 45 |
+
system_message: str,
|
| 46 |
+
max_tokens: int,
|
| 47 |
+
temperature: float,
|
| 48 |
+
top_p: float,
|
| 49 |
+
stream: bool,
|
| 50 |
+
api_token: str,
|
| 51 |
+
#max_workers: int,
|
| 52 |
+
#max_retries: int,
|
| 53 |
+
openai_base_url: str = "https://router.huggingface.co/v1",
|
| 54 |
+
openai_image_format: Optional[str] = "webp",
|
| 55 |
+
max_workers: Optional[int] = 4,
|
| 56 |
+
max_retries: Optional[int] = 2,
|
| 57 |
+
output_format: str = "markdown",
|
| 58 |
+
#output_dir: Optional[Union[str, Path]] = "output_dir",
|
| 59 |
+
output_dir_string: str = "output_dir_default",
|
| 60 |
+
use_llm: bool = False, #Optional[bool] = False, #True,
|
| 61 |
+
page_range: str = None, #Optional[str] = None,
|
| 62 |
+
) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Handles the conversion process using multiprocessing.
|
| 65 |
+
Spins up a pool and converts all uploaded files in parallel.
|
| 66 |
+
Aggregates per-file logs into one string.
|
| 67 |
+
Receives Gradio component values, starting with the list of uploaded file paths
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
# explicitly wrap file object in a list
|
| 71 |
+
#pdf_files = pdf_files_wrap(pdf_files) ##Flag: deprecation
|
| 72 |
+
|
| 73 |
+
## debug
|
| 74 |
+
#logger.log(level=30, msg="pdf_files_inputs", extra={"input_arg[0]:": pdf_files[0]})
|
| 75 |
+
|
| 76 |
+
#if not files:
|
| 77 |
+
if not pdf_files or pdf_files is None: ## Check if files is None. This handles the case where no files are uploaded.
|
| 78 |
+
logger.log(level=30, msg="Initialising ProcessPool: No files uploaded.", extra={"pdf_files": pdf_files, "files_len": pdf_files_count})
|
| 79 |
+
return "Initialising ProcessPool: No files uploaded."
|
| 80 |
+
|
| 81 |
+
# Get config values if not provided
|
| 82 |
+
model_id = get_config_value("MARKER_CAP", "MODEL_ID") if not model_id else model_id
|
| 83 |
+
openai_base_url = get_config_value( "MARKER_CAP", "OPENAI_BASE_URL") if not openai_base_url else openai_base_url
|
| 84 |
+
openai_image_format = get_config_value( "MARKER_CAP", "OPENAI_IMAGE_FORMAT") if not openai_image_format else openai_image_format
|
| 85 |
+
max_workers = get_config_value("MARKER_CAP", "MAX_WORKERS") if not max_workers else max_workers
|
| 86 |
+
max_retries = get_config_value("MARKER_CAP", "MAX_RETRIES") if not max_retries else max_retries
|
| 87 |
+
output_format = get_config_value("MARKER_CAP", "OUTPUT_FORMAT") if not output_format else output_format
|
| 88 |
+
output_dir_string = str(get_config_value("MARKER_CAP", "OUTPUT_DIR") if not output_dir_string else output_dir_string)
|
| 89 |
+
use_llm = get_config_value("MARKER_CAP", "USE_LLM") if not use_llm else use_llm
|
| 90 |
+
page_range = get_config_value("MARKER_CAP", "PAGE_RANGE") if not page_range else page_range
|
| 91 |
+
|
| 92 |
+
# Create the initargs tuple from the Gradio inputs: # 'files' is an iterable, and handled separately.
|
| 93 |
+
init_args = (
|
| 94 |
+
provider,
|
| 95 |
+
model_id,
|
| 96 |
+
#base_url,
|
| 97 |
+
hf_provider,
|
| 98 |
+
endpoint,
|
| 99 |
+
backend_choice,
|
| 100 |
+
system_message,
|
| 101 |
+
max_tokens,
|
| 102 |
+
temperature,
|
| 103 |
+
top_p,
|
| 104 |
+
stream,
|
| 105 |
+
api_token,
|
| 106 |
+
openai_base_url,
|
| 107 |
+
openai_image_format,
|
| 108 |
+
max_workers,
|
| 109 |
+
max_retries,
|
| 110 |
+
output_format,
|
| 111 |
+
output_dir_string,
|
| 112 |
+
use_llm,
|
| 113 |
+
page_range,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
#global docextractor ##SMY: deprecated.
|
| 117 |
+
try:
|
| 118 |
+
login_huggingface(api_token) ## attempt login if not already logged in. NB: HF CLI login prompt would not display in Process Worker.
|
| 119 |
+
except Exception as exc: # Catch all exceptions
|
| 120 |
+
tb = traceback.format_exc()
|
| 121 |
+
logger.exception(f"✗ Error during login_huggingface → {exc}\n{tb}", exc_info=True) # Log the full traceback
|
| 122 |
+
return f"✗ An error occurred during login_huggingface → {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
# Create a pool with init_worker initialiser
|
| 126 |
+
with ProcessPoolExecutor(
|
| 127 |
+
max_workers=max_workers,
|
| 128 |
+
initializer=init_worker,
|
| 129 |
+
initargs=init_args
|
| 130 |
+
) as pool:
|
| 131 |
+
#global docextractor
|
| 132 |
+
logger.log(level=30, msg="Initialising ProcessPool: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
|
| 133 |
+
|
| 134 |
+
# Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
|
| 135 |
+
# The 'docconverter' argument is implicitly handled by the initialiser
|
| 136 |
+
|
| 137 |
+
#futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
|
| 138 |
+
#logs = [f.result() for f in as_completed(futures)]
|
| 139 |
+
#futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
|
| 140 |
+
#logs = [f.result() for f in futures]
|
| 141 |
+
|
| 142 |
+
try:
|
| 143 |
+
pdf2md_converter.output_dir_string = output_dir_string ##SMY: attempt setting directly to resolve pool.map iterable
|
| 144 |
+
#result_convert = pool.map(pdf2md_converter.convert_files, pdf_files, max_retries)
|
| 145 |
+
results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #output_dir_string)
|
| 146 |
+
except Exception as exc:
|
| 147 |
+
# Raise the exception to stop the Gradio app
|
| 148 |
+
#raise # Re-raise the exception to halt execution
|
| 149 |
+
logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
|
| 150 |
+
traceback.print_exc() # Print the exception traceback
|
| 151 |
+
return f"An error occurred during pool.map: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
|
| 152 |
+
|
| 153 |
+
#'''
|
| 154 |
+
logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
|
| 155 |
+
logs = []
|
| 156 |
+
logs_files_images = []
|
| 157 |
+
#logs.extend(results) ## performant pythonic
|
| 158 |
+
#logs = list[results] ##
|
| 159 |
+
logs = [result for result in results] ## pythonic list comprehension
|
| 160 |
+
## logs : [file , images , filepath, image_path]
|
| 161 |
+
|
| 162 |
+
#logs_files_images = logs_files.extend(logs_images) #zip(logs_files, logs_images) ##SMY: in progress
|
| 163 |
+
for log in logs:
|
| 164 |
+
#logs_files_images.append(log.get("filepath", "Error or No filepath")) # if all(isinstance(log, dict) for item in logs))
|
| 165 |
+
#logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
|
| 166 |
+
|
| 167 |
+
logs_files_images.append(log.get("filepath") if is_dict(logs) or isinstance(log, Path) else "Error or no image_path") # isinstance(log, (dict, str))
|
| 168 |
+
logs_files_images.extend(list(image for image in log.get("image_path", "Error or no image_path")))
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
#logs_files_images.append(logs_filepath) ## to del
|
| 172 |
+
#logs_files_images.extend(logs_images) ## to del
|
| 173 |
+
#'''
|
| 174 |
+
except Exception as exc:
|
| 175 |
+
tb = traceback.format_exc()
|
| 176 |
+
logger.exception(f"✗ Error during ProcessPoolExecutor → {exc}\n{tb}" , exc_info=True) # Log the full traceback
|
| 177 |
+
#traceback.print_exc() # Print the exception traceback
|
| 178 |
+
return f"✗ An error occurred during ProcessPoolExecutor→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
|
| 179 |
+
|
| 180 |
+
'''
|
| 181 |
+
logger.log(level=20, msg="ProcessPoolExecutor pool result:", extra={"results": str(results)})
|
| 182 |
+
logs = []
|
| 183 |
+
#logs.extend(results) ## performant pythonic
|
| 184 |
+
#logs = list[results] ##
|
| 185 |
+
logs = [result for result in results] ## pythonic list comprehension
|
| 186 |
+
'''
|
| 187 |
+
|
| 188 |
+
try:
|
| 189 |
+
logs_return = file_handler.file_utils.process_dicts_data(logs) #"\n".join(log for log in logs) ##SMY outputs to gr.JSON component with no need for json.dumps(data, indent=)
|
| 190 |
+
#logs_files_images_return = "\n".join(path for path in logs_files_images) ##TypeError: sequence item 0: expected str instance, WindowsPath found
|
| 191 |
+
|
| 192 |
+
##convert the List of Path objects to List of string for gr.Files output
|
| 193 |
+
#logs_files_images_return = list(str(path) for path in logs_files_images)
|
| 194 |
+
|
| 195 |
+
## # Convert any Path objects to strings, but leave strings as-is
|
| 196 |
+
logs_files_images_return = list(str(path) if isinstance(path, Path) else path for path in logs_files_images)
|
| 197 |
+
return logs_return, logs_return, logs_files_images_return
|
| 198 |
+
#return "\n".join(logs), "\n".join(logs_files_images) #"\n".join(logs_files)
|
| 199 |
+
except Exception as exc:
|
| 200 |
+
tb = traceback.format_exc()
|
| 201 |
+
logger.exception(f"✗ Error during returning result logs → {exc}\n{tb}" , exc_info=True) # Log the full traceback
|
| 202 |
+
#traceback.print_exc() # Print the exception traceback
|
| 203 |
+
return f"✗ An error occurred during returning result logs→ {exc}\n{tb}", f"Error: {exc}", f"Error: {exc}" # return the exception message
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
#return "\n".join(log for log in logs), "\n".join(str(path) for path in logs_files_images)
|
| 207 |
+
#print(f'logs_files_images: {"\n".join(str(path) for path in logs_files_images)}')
|
| 208 |
+
|
| 209 |
+
# files wrapping into list ##SMY: Flagged for deprecation
|
| 210 |
+
def pdf_files_wrap(files: list[str]):
|
| 211 |
+
# explicitly wrap file object in a list
|
| 212 |
+
return [files] if not isinstance(files, list) else files
|
| 213 |
+
#return [files]
|
| 214 |
+
|
| 215 |
+
##====================
|
| 216 |
+
## SMY: moved to logic file: See pdf_to_md.py. Currently unused
|
| 217 |
+
def convert_pdfs_to_md(file: gr.File | None, folder: str | None) -> dict:
|
| 218 |
+
"""
|
| 219 |
+
Gradio callback for PDF → Markdown.
|
| 220 |
+
Accepts either a single file or a folder path (recursively).
|
| 221 |
+
Leverages Marker, a pipeline of deep learning models, for conversion
|
| 222 |
+
Returns a dictionary of filename → Markdown string.
|
| 223 |
+
"""
|
| 224 |
+
if not file and not folder:
|
| 225 |
+
return {"error": "Please provide a PDF file or a folder."}
|
| 226 |
+
|
| 227 |
+
pdf_paths = []
|
| 228 |
+
|
| 229 |
+
# Single file
|
| 230 |
+
if file:
|
| 231 |
+
pdf_path = Path(file.name)
|
| 232 |
+
pdf_paths.append(pdf_path)
|
| 233 |
+
|
| 234 |
+
# Folder (recursively)
|
| 235 |
+
if folder:
|
| 236 |
+
try:
|
| 237 |
+
pdf_paths.extend(collect_pdf_paths(folder))
|
| 238 |
+
except Exception as exc:
|
| 239 |
+
logger.exception("Folder traversal failed.")
|
| 240 |
+
return {"error": str(exc)}
|
| 241 |
+
|
| 242 |
+
if not pdf_paths:
|
| 243 |
+
return {"error": "No PDF files found."}
|
| 244 |
+
|
| 245 |
+
results = pdf2md_converter.batch_convert(pdf_paths)
|
| 246 |
+
# Gradio expects a dict of {filename: content}
|
| 247 |
+
return results
|
| 248 |
+
|
| 249 |
+
## SMY: to refactor and moved to logic file. Currently unused
|
| 250 |
+
def convert_htmls_to_md(file: gr.File | None, folder: str | None) -> dict:
|
| 251 |
+
"""
|
| 252 |
+
Gradio callback for HTML → Markdown.
|
| 253 |
+
Accepts either a single file or a folder path (recursively).
|
| 254 |
+
Returns a dictionary of filename → Markdown string.
|
| 255 |
+
"""
|
| 256 |
+
if not file and not folder:
|
| 257 |
+
return {"error": "Please provide a HTML file or a folder."}
|
| 258 |
+
|
| 259 |
+
html_paths = []
|
| 260 |
+
|
| 261 |
+
# Single file
|
| 262 |
+
if file:
|
| 263 |
+
html_path = Path(file.name)
|
| 264 |
+
html_paths.append(html_path)
|
| 265 |
+
|
| 266 |
+
# Folder (recursively)
|
| 267 |
+
if folder:
|
| 268 |
+
try:
|
| 269 |
+
html_paths.extend(collect_html_paths(folder))
|
| 270 |
+
except Exception as exc:
|
| 271 |
+
logger.exception("Folder traversal failed.")
|
| 272 |
+
return {"error": str(exc)}
|
| 273 |
+
|
| 274 |
+
if not html_paths:
|
| 275 |
+
return {"error": "No HTML files found."}
|
| 276 |
+
|
| 277 |
+
results = html2md_converter.batch_convert(html_paths)
|
| 278 |
+
# Gradio expects a dict of {filename: content}
|
| 279 |
+
return results
|
| 280 |
+
|
| 281 |
+
## SMY: to refactor and moved to logic file
|
| 282 |
+
def convert_md_to_pdf(file: gr.File | None, folder: str | None) -> list[gr.File]:
|
| 283 |
+
"""
|
| 284 |
+
Gradio callback for Markdown → PDF.
|
| 285 |
+
Returns a list of generated PDF files (as Gradio File objects).
|
| 286 |
+
"""
|
| 287 |
+
if not file and not folder:
|
| 288 |
+
return []
|
| 289 |
+
|
| 290 |
+
md_paths = []
|
| 291 |
+
|
| 292 |
+
# Single file
|
| 293 |
+
if file:
|
| 294 |
+
md_path = Path(file.name)
|
| 295 |
+
md_paths.append(md_path)
|
| 296 |
+
|
| 297 |
+
# Folder
|
| 298 |
+
if folder:
|
| 299 |
+
try:
|
| 300 |
+
md_paths.extend(collect_markdown_paths(folder))
|
| 301 |
+
except Exception as exc:
|
| 302 |
+
logger.exception("Folder traversal failed.")
|
| 303 |
+
return []
|
| 304 |
+
|
| 305 |
+
if not md_paths:
|
| 306 |
+
return []
|
| 307 |
+
|
| 308 |
+
output_dir = Path("./generated_pdfs")
|
| 309 |
+
output_dir.mkdir(exist_ok=True)
|
| 310 |
+
|
| 311 |
+
pdf_files = md2pdf_converter.batch_convert(md_paths, output_dir)
|
| 312 |
+
# Convert to Gradio File objects
|
| 313 |
+
gr_files = [gr.File(path=str(p)) for p in pdf_files]
|
| 314 |
+
return gr_files
|
| 315 |
+
##====================
|
| 316 |
+
|
| 317 |
+
def build_interface() -> gr.Blocks:
|
| 318 |
+
"""
|
| 319 |
+
Assemble the Gradio Blocks UI.
|
| 320 |
+
"""
|
| 321 |
+
|
| 322 |
+
# Use custom CSS to style the file component
|
| 323 |
+
custom_css = """
|
| 324 |
+
.file-or-directory-area {
|
| 325 |
+
border: 2px dashed #ccc;
|
| 326 |
+
padding: 20px;
|
| 327 |
+
text-align: center;
|
| 328 |
+
border-radius: 8px;
|
| 329 |
+
margin-bottom: 10px;
|
| 330 |
+
display: flex;
|
| 331 |
+
flex-direction: column;
|
| 332 |
+
align-items: center;
|
| 333 |
+
}
|
| 334 |
+
.file-or-directory-area:hover {
|
| 335 |
+
border-color: #007bff;
|
| 336 |
+
background-color: #f8f9fa;
|
| 337 |
+
}
|
| 338 |
+
.gradio-upload-btn {
|
| 339 |
+
margin-top: 10px;
|
| 340 |
+
}
|
| 341 |
+
"""
|
| 342 |
+
|
| 343 |
+
def is_file_with_extension(path_obj: Path) -> bool:
|
| 344 |
+
"""
|
| 345 |
+
Checks if a pathlib.Path object is a file and has a non-empty extension.
|
| 346 |
+
"""
|
| 347 |
+
path_obj = path_obj if isinstance(path_obj, Path) else Path(path_obj) if isinstance(path_obj, str) else None
|
| 348 |
+
return path_obj.is_file() and bool(path_obj.suffix)
|
| 349 |
+
|
| 350 |
+
def accumulate_files(uploaded_files, current_state):
|
| 351 |
+
"""
|
| 352 |
+
Accumulates newly uploaded files with the existing state.
|
| 353 |
+
"""
|
| 354 |
+
# Initialize state if it's the first run
|
| 355 |
+
if current_state is None:
|
| 356 |
+
current_state = []
|
| 357 |
+
|
| 358 |
+
# If no files were uploaded in this interaction, return the current state unchanged
|
| 359 |
+
if not uploaded_files:
|
| 360 |
+
return current_state, f"No new files uploaded. Still tracking {len(current_state)} file(s)."
|
| 361 |
+
|
| 362 |
+
# Get the temporary paths of the newly uploaded files
|
| 363 |
+
# call is_file_with_extension to check if pathlib.Path object is a file and has a non-empty extension
|
| 364 |
+
new_file_paths = [f.name for f in uploaded_files if is_file_with_extension(Path(f.name))] #Path(f.name) and Path(f.name).is_file() and bool(Path(f.name).suffix)] #Path(f.name).suffix.lower() !=""]
|
| 365 |
+
|
| 366 |
+
# Concatenate the new files with the existing ones in the state
|
| 367 |
+
updated_files = current_state + new_file_paths
|
| 368 |
+
updated_filenames = [Path(f).name for f in updated_files]
|
| 369 |
+
|
| 370 |
+
# Return the updated state and a message to the user
|
| 371 |
+
#file_info = "\n".join(updated_files)
|
| 372 |
+
filename_info = "\n".join(updated_filenames)
|
| 373 |
+
#message = f"Accumulated {len(updated_files)} file(s) total.\n\nAll file paths:\n{file_info}"
|
| 374 |
+
message = f"Accumulated {len(updated_files)} file(s) total: \n{filename_info}"
|
| 375 |
+
|
| 376 |
+
return updated_files, message
|
| 377 |
+
|
| 378 |
+
def clear_state():
|
| 379 |
+
"""
|
| 380 |
+
Clears the accumulated state of uloaded file list, output textbox, files and directory upload.
|
| 381 |
+
"""
|
| 382 |
+
return [], "Files list cleared.", [], []
|
| 383 |
+
|
| 384 |
+
# with gr.Blocks(title=TITLE) as demo
|
| 385 |
+
with gr.Blocks(title=TITLE, css=custom_css) as demo:
|
| 386 |
+
gr.Markdown(f"## {DESCRIPTION}")
|
| 387 |
+
|
| 388 |
+
# Clean UI: Model parameters hidden in collapsible accordion
|
| 389 |
+
with gr.Accordion("⚙️ LLM Model Settings", open=False):
|
| 390 |
+
gr.Markdown(f"#### **Backend Configuration**")
|
| 391 |
+
system_message = gr.Textbox(
|
| 392 |
+
label="System Message",
|
| 393 |
+
lines=2
|
| 394 |
+
)
|
| 395 |
+
with gr.Row():
|
| 396 |
+
provider_dd = gr.Dropdown(
|
| 397 |
+
choices=["huggingface", "openai"],
|
| 398 |
+
label="Provider",
|
| 399 |
+
value="huggingface",
|
| 400 |
+
#allow_custom_value=True
|
| 401 |
+
)
|
| 402 |
+
backend_choice = gr.Dropdown(
|
| 403 |
+
choices=["model-id", "provider", "endpoint"],
|
| 404 |
+
label="HF Backend Choice"
|
| 405 |
+
) ## SMY: ensure HFClient maps correctly
|
| 406 |
+
model_tb = gr.Textbox(
|
| 407 |
+
label="Model ID",
|
| 408 |
+
value="meta-llama/Llama-4-Maverick-17B-128E-Instruct", #image-Text-to-Text #"openai/gpt-oss-120b", ##Text-to-Text
|
| 409 |
+
)
|
| 410 |
+
endpoint_tb = gr.Textbox(
|
| 411 |
+
label="Endpoint",
|
| 412 |
+
placeholder="Optional custom endpoint"
|
| 413 |
+
)
|
| 414 |
+
with gr.Row():
|
| 415 |
+
max_token_sl = gr.Slider(
|
| 416 |
+
label="Max Tokens",
|
| 417 |
+
minimum=1,
|
| 418 |
+
maximum=131172, #65536, #32768, #16384, #8192,
|
| 419 |
+
value=1024, #512,
|
| 420 |
+
step=1
|
| 421 |
+
)
|
| 422 |
+
temperature_sl = gr.Slider(
|
| 423 |
+
label="Temperature",
|
| 424 |
+
minimum=0.0,
|
| 425 |
+
maximum=1.0,
|
| 426 |
+
value=0.0,
|
| 427 |
+
step=0.1 #0.01
|
| 428 |
+
)
|
| 429 |
+
top_p_sl = gr.Slider(
|
| 430 |
+
label="Top-p",
|
| 431 |
+
minimum=0.0,
|
| 432 |
+
maximum=1.0,
|
| 433 |
+
value=0.1,
|
| 434 |
+
step=0.1 #0.01
|
| 435 |
+
)
|
| 436 |
+
stream_cb = gr.Checkbox(
|
| 437 |
+
label="LLM Streaming",
|
| 438 |
+
value=False
|
| 439 |
+
)
|
| 440 |
+
with gr.Row():
|
| 441 |
+
api_token_tb = gr.Textbox(
|
| 442 |
+
label="API Token [OPTIONAL]",
|
| 443 |
+
type="password",
|
| 444 |
+
placeholder="hf_xxx or openai key"
|
| 445 |
+
)
|
| 446 |
+
hf_provider_dd = gr.Dropdown(
|
| 447 |
+
choices=["fireworks-ai", "together-ai", "openrouter-ai", "hf-inference"],
|
| 448 |
+
value="fireworks-ai",
|
| 449 |
+
label="Provider",
|
| 450 |
+
allow_custom_value=True, # let users type new providers as they appear
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
+
# Validate provider on change; warn but allow continue
|
| 454 |
+
def on_provider_change(provider_value: str):
|
| 455 |
+
if not provider_value:
|
| 456 |
+
return
|
| 457 |
+
if not is_valid_provider(provider_value):
|
| 458 |
+
sug = suggest_providers(provider_value)
|
| 459 |
+
extra = f" Suggestions: {', '.join(sug)}." if sug else ""
|
| 460 |
+
gr.Warning(
|
| 461 |
+
f"Provider not on HF provider list. See https://huggingface.co/docs/inference-providers/index.{extra}"
|
| 462 |
+
)
|
| 463 |
+
hf_provider_dd.change(on_provider_change, inputs=hf_provider_dd, outputs=None)
|
| 464 |
+
|
| 465 |
+
# Clean UI: Model parameters hidden in collapsible accordion
|
| 466 |
+
with gr.Accordion("⚙️ Marker Settings", open=False):
|
| 467 |
+
gr.Markdown(f"#### **Marker Configuration**")
|
| 468 |
+
with gr.Row():
|
| 469 |
+
openai_base_url_tb = gr.Textbox(
|
| 470 |
+
label="OpenAI Base URL: Default HuggingFace",
|
| 471 |
+
value="https://router.huggingface.co/v1",
|
| 472 |
+
lines=1,
|
| 473 |
+
max_lines=1,
|
| 474 |
+
)
|
| 475 |
+
openai_image_format_dd = gr.Dropdown(
|
| 476 |
+
choices=["webp", "png", "jpeg"],
|
| 477 |
+
label="OpenAI Image Format",
|
| 478 |
+
value="webp",
|
| 479 |
+
)
|
| 480 |
+
output_format_dd = gr.Dropdown(
|
| 481 |
+
choices=["markdown", "html"], #, "json", "chunks"], ##SMY: To be enabled later
|
| 482 |
+
#choices=["markdown", "html", "json", "chunks"],
|
| 483 |
+
label="Output Format",
|
| 484 |
+
value="markdown",
|
| 485 |
+
)
|
| 486 |
+
output_dir_tb = gr.Textbox(
|
| 487 |
+
label="Output Directory",
|
| 488 |
+
value="output_dir", #"output_md",
|
| 489 |
+
lines=1,
|
| 490 |
+
max_lines=1,
|
| 491 |
+
)
|
| 492 |
+
with gr.Row():
|
| 493 |
+
max_workers_sl = gr.Slider(
|
| 494 |
+
label="Max Worker",
|
| 495 |
+
minimum=1,
|
| 496 |
+
maximum=7,
|
| 497 |
+
value=4,
|
| 498 |
+
step=1
|
| 499 |
+
)
|
| 500 |
+
max_retries_sl = gr.Slider(
|
| 501 |
+
label="Max Retry",
|
| 502 |
+
minimum=1,
|
| 503 |
+
maximum=3,
|
| 504 |
+
value=2,
|
| 505 |
+
step=1 #0.01
|
| 506 |
+
)
|
| 507 |
+
use_llm_cb = gr.Checkbox(
|
| 508 |
+
label="Use LLM for Marker conversion",
|
| 509 |
+
value=False
|
| 510 |
+
)
|
| 511 |
+
page_range_tb = gr.Textbox(
|
| 512 |
+
label="Page Range (Optional)",
|
| 513 |
+
placeholder="Example: 0,1-5,8,12-15",
|
| 514 |
+
lines=1,
|
| 515 |
+
max_lines=1,
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
# Initialise gr.State
|
| 519 |
+
state_max_workers = gr.State(4) #max_workers_sl,
|
| 520 |
+
state_max_retries = gr.State(2) #max_retries_sl,
|
| 521 |
+
|
| 522 |
+
def update_state_stored_value(new_component_input):
|
| 523 |
+
""" Updates stored state: use for max_workers and max_retries """
|
| 524 |
+
return new_component_input
|
| 525 |
+
|
| 526 |
+
# Update gr.State values on slider components change. NB: initial value of `gr.State` must be able to be deepcopied
|
| 527 |
+
max_workers_sl.change(update_state_stored_value, inputs=max_workers_sl, outputs=state_max_workers)
|
| 528 |
+
max_retries_sl.change(update_state_stored_value, inputs=max_retries_sl, outputs=state_max_retries)
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
with gr.Accordion("🤗 HuggingFace Logout", open=False):
|
| 532 |
+
# Logout controls
|
| 533 |
+
def do_logout():
|
| 534 |
+
#ok = docextractor.client.logout()
|
| 535 |
+
ok = docconverter.client.logout()
|
| 536 |
+
# Reset token textbox on successful logout
|
| 537 |
+
msg = "✅ Logged out of Hugging Face and cleared tokens." if ok else "⚠️ Logout failed."
|
| 538 |
+
return gr.update(value=""), gr.update(visible=True, value=msg)
|
| 539 |
+
|
| 540 |
+
logout_status = gr.Markdown(visible=False)
|
| 541 |
+
logout_btn = gr.Button("Logout from Hugging Face", variant="stop")
|
| 542 |
+
|
| 543 |
+
logout_btn.click(fn=do_logout, inputs=None, outputs=[api_token_tb, logout_status])
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
# The gr.State component to hold the accumulated list of files
|
| 547 |
+
uploaded_file_list = gr.State([]) ##NB: initial value of `gr.State` must be able to be deepcopied
|
| 548 |
+
|
| 549 |
+
# --- PDF & HTML → Markdown tab ---
|
| 550 |
+
with gr.Tab(" 📄 PDF & HTML ➜ Markdown"):
|
| 551 |
+
gr.Markdown(f"#### {DESCRIPTION_PDF_HTML}")
|
| 552 |
+
|
| 553 |
+
### flag4deprecation #earlier implementation
|
| 554 |
+
'''
|
| 555 |
+
pdf_files = gr.File(
|
| 556 |
+
label="Upload PDF, HTML or PDF and HTMLfiles",
|
| 557 |
+
file_count="directory", ## handle directory and files upload #"multiple",
|
| 558 |
+
type="filepath",
|
| 559 |
+
file_types=["pdf", ".pdf"],
|
| 560 |
+
#size="small",
|
| 561 |
+
)
|
| 562 |
+
pdf_files_count = gr.TextArea(label="Files Count", interactive=False, lines=1)
|
| 563 |
+
with gr.Row():
|
| 564 |
+
btn_pdf_count = gr.Button("Count Files")
|
| 565 |
+
#btn_pdf_upload = gr.UploadButton("Upload files")
|
| 566 |
+
btn_pdf_convert = gr.Button("Convert PDF(s)")
|
| 567 |
+
'''
|
| 568 |
+
|
| 569 |
+
with gr.Column(elem_classes=["file-or-directory-area"]):
|
| 570 |
+
with gr.Row():
|
| 571 |
+
file_btn = gr.UploadButton(
|
| 572 |
+
#file_btn = gr.File(
|
| 573 |
+
label="Upload Multiple Files",
|
| 574 |
+
file_count="multiple",
|
| 575 |
+
file_types=["file"],
|
| 576 |
+
#height=25, #"sm",
|
| 577 |
+
size="sm",
|
| 578 |
+
elem_classes=["gradio-upload-btn"]
|
| 579 |
+
)
|
| 580 |
+
dir_btn = gr.UploadButton(
|
| 581 |
+
#dir_btn = gr.File(
|
| 582 |
+
label="Upload a Directory",
|
| 583 |
+
file_count="directory",
|
| 584 |
+
#file_types=["file"], #Warning: The `file_types` parameter is ignored when `file_count` is 'directory'
|
| 585 |
+
#height=25, #"0.5",
|
| 586 |
+
size="sm",
|
| 587 |
+
elem_classes=["gradio-upload-btn"]
|
| 588 |
+
)
|
| 589 |
+
with gr.Accordion("Display uploaded", open=True):
|
| 590 |
+
# Displays the accumulated file paths
|
| 591 |
+
output_textbox = gr.Textbox(label="Accumulated Files", lines=3) #, max_lines=4) #10
|
| 592 |
+
|
| 593 |
+
with gr.Row():
|
| 594 |
+
process_button = gr.Button("Process All Uploaded Files", variant="primary")
|
| 595 |
+
clear_button = gr.Button("Clear All Uploads", variant="secondary")
|
| 596 |
+
|
| 597 |
+
# Event handler for the multiple file upload button
|
| 598 |
+
file_btn.upload(
|
| 599 |
+
fn=accumulate_files,
|
| 600 |
+
inputs=[file_btn, uploaded_file_list],
|
| 601 |
+
outputs=[uploaded_file_list, output_textbox]
|
| 602 |
+
)
|
| 603 |
+
|
| 604 |
+
# Event handler for the directory upload button
|
| 605 |
+
dir_btn.upload(
|
| 606 |
+
fn=accumulate_files,
|
| 607 |
+
inputs=[dir_btn, uploaded_file_list],
|
| 608 |
+
outputs=[uploaded_file_list, output_textbox]
|
| 609 |
+
)
|
| 610 |
+
|
| 611 |
+
# Event handler for the "Clear" button
|
| 612 |
+
clear_button.click(
|
| 613 |
+
fn=clear_state,
|
| 614 |
+
inputs=None,
|
| 615 |
+
outputs=[uploaded_file_list, output_textbox, file_btn, dir_btn],
|
| 616 |
+
)
|
| 617 |
+
|
| 618 |
+
# --- PDF → Markdown tab ---
|
| 619 |
+
with gr.Tab(" 📄 PDF ➜ Markdown (Flag for DEPRECATION)", interactive=False, visible=True): #False
|
| 620 |
+
gr.Markdown(f"#### {DESCRIPTION_PDF}")
|
| 621 |
+
|
| 622 |
+
files_upload_pdf = gr.File(
|
| 623 |
+
label="Upload PDF files",
|
| 624 |
+
file_count="directory", ## handle directory and files upload #"multiple",
|
| 625 |
+
type="filepath",
|
| 626 |
+
file_types=["pdf", ".pdf"],
|
| 627 |
+
#size="small",
|
| 628 |
+
)
|
| 629 |
+
files_count = gr.TextArea(label="Files Count", interactive=False, lines=1) #pdf_files_count
|
| 630 |
+
with gr.Row():
|
| 631 |
+
btn_pdf_count = gr.Button("Count Files")
|
| 632 |
+
#btn_pdf_upload = gr.UploadButton("Upload files")
|
| 633 |
+
btn_pdf_convert = gr.Button("Convert PDF(s)")
|
| 634 |
+
|
| 635 |
+
# --- 📃 HTML → Markdown tab ---
|
| 636 |
+
with gr.Tab("🕸️ HTML ➜ Markdown: (Flag for DEPRECATION)", interactive=False, visible=False):
|
| 637 |
+
gr.Markdown(f"#### {DESCRIPTION_HTML}")
|
| 638 |
+
|
| 639 |
+
files_upload_html = gr.File(
|
| 640 |
+
label="Upload HTML files",
|
| 641 |
+
file_count="multiple",
|
| 642 |
+
type="filepath",
|
| 643 |
+
file_types=["html", ".html", "htm", ".htm"]
|
| 644 |
+
)
|
| 645 |
+
#btn_html_convert = gr.Button("Convert HTML(s)")
|
| 646 |
+
html_files_count = gr.TextArea(label="Files Count", interactive=False, lines=1)
|
| 647 |
+
with gr.Row():
|
| 648 |
+
btn_html_count = gr.Button("Count Files")
|
| 649 |
+
#btn_pdf_upload = gr.UploadButton("Upload files")
|
| 650 |
+
btn_html_convert = gr.Button("Convert PDF(s)")
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
# --- Markdown → PDF tab ---
|
| 654 |
+
with gr.Tab("PENDING: Markdown ➜ PDF", interactive=False):
|
| 655 |
+
gr.Markdown(f"#### {DESCRIPTION_MD}")
|
| 656 |
+
|
| 657 |
+
md_files = gr.File(
|
| 658 |
+
label="Upload Markdown files",
|
| 659 |
+
file_count="multiple",
|
| 660 |
+
type="filepath",
|
| 661 |
+
file_types=["md", ".md"]
|
| 662 |
+
)
|
| 663 |
+
btn_md_convert = gr.Button("Convert Markdown to PDF)")
|
| 664 |
+
output_pdf = gr.Gallery(label="Generated PDFs", elem_id="pdf_gallery")
|
| 665 |
+
|
| 666 |
+
'''
|
| 667 |
+
md_input = gr.File(label="Upload a single Markdown file", file_count="single")
|
| 668 |
+
md_folder_input = gr.Textbox(
|
| 669 |
+
label="Or provide a folder path (recursively)",
|
| 670 |
+
placeholder="/path/to/folder",
|
| 671 |
+
)
|
| 672 |
+
convert_md_btn = gr.Button("Convert Markdown to PDF")
|
| 673 |
+
output_pdf = gr.Gallery(label="Generated PDFs", elem_id="pdf_gallery")
|
| 674 |
+
|
| 675 |
+
convert_md_btn.click(
|
| 676 |
+
fn=convert_md_to_pdf,
|
| 677 |
+
inputs=[md_input, md_folder_input],
|
| 678 |
+
outputs=output_pdf,
|
| 679 |
+
)
|
| 680 |
+
'''
|
| 681 |
+
|
| 682 |
+
# A Files component to display individual processed files as download links
|
| 683 |
+
with gr.Accordion("⏬ View and Download processed files", open=False):
|
| 684 |
+
with gr.Row():
|
| 685 |
+
files_individual_JSON = gr.JSON(label="Serialised JSON list", max_height=250)
|
| 686 |
+
files_individual_downloads = gr.Files(label="Individual Processed Files")
|
| 687 |
+
|
| 688 |
+
## Displays processed file paths
|
| 689 |
+
with gr.Accordion("View processing log", open=False):
|
| 690 |
+
log_output = gr.Textbox(
|
| 691 |
+
label="Conversion Logs",
|
| 692 |
+
lines=5,
|
+            #max_lines=25,
+            interactive=False
+        )
+
+        # file inputs
+        ## [weird] NB: inputs_arg is a list of Gradio component objects, not the values of those components.
+        ## The inputs_arg variable captures the state of these components at the time the list is created.
+        ## When btn_convert.click() is called later, it uses the list as it was initially defined.
+        ##
+        ## SMY: Gradio component values are not directly mutable.
+        ## Instead, you should pass the component values to a function,
+        ## and then use the return value of the function to update the component.
+        ## Discarding for now. #//TODO: investigate further.
+        ## SMY: Solved: using gr.State
+        inputs_arg = [
+            #pdf_files,
+            ##pdf_files_wrap(pdf_files),  # wrap pdf_files in a list (if not already)
+            uploaded_file_list,
+            files_count,  #pdf_files_count,
+            provider_dd,
+            model_tb,
+            hf_provider_dd,
+            endpoint_tb,
+            backend_choice,
+            system_message,
+            max_token_sl,
+            temperature_sl,
+            top_p_sl,
+            stream_cb,
+            api_token_tb,
+            #gr.State(4),  # max_workers
+            #gr.State(3),  # max_retries
+            openai_base_url_tb,
+            openai_image_format_dd,
+            state_max_workers,  #gr.State(4), #max_workers_sl,
+            state_max_retries,  #gr.State(2), #max_retries_sl,
+            output_format_dd,
+            output_dir_tb,
+            use_llm_cb,
+            page_range_tb,
+        ]
+
+        ## debug
+        #logger.log(level=30, msg="About to execute btn_pdf_convert.click", extra={"files_len": pdf_files_count, "pdf_files": pdf_files})
+
+        try:
+            #logger.log(level=30, msg="input_arg[0]: {input_arg[0]}")
+            process_button.click(
+                #pdf_files.upload(
+                fn=convert_batch,
+                inputs=inputs_arg,
+                outputs=[log_output, files_individual_JSON, files_individual_downloads],
+            )
+        except Exception as exc:
+            tb = traceback.format_exc()
+            logger.exception(f"✗ Error during process_button.click → {exc}\n{tb}", exc_info=True)
+            return f"✗ An error occurred during process_button.click → {exc}\n{tb}"
+
+        ## gr.File .upload() event fires only after a file has been uploaded
+        # Event handler for the PDF file upload button
+        files_upload_pdf.upload(
+            fn=accumulate_files,
+            inputs=[files_upload_pdf, uploaded_file_list],
+            outputs=[uploaded_file_list, log_output]
+        )
+        #inputs_arg[0] = files_upload
+        btn_pdf_convert.click(
+            #pdf_files.upload(
+            fn=convert_batch,
+            outputs=[log_output, files_individual_downloads],
+            inputs=inputs_arg,
+        )
+        '''
+        inputs = [
+            pdf_files,
+            #pdf_files_wrap(pdf_files),  # wrap pdf_files in a list (if not already)
+            pdf_files_count,
+            provider_dd,
+            model_tb,
+            hf_provider_dd,
+            endpoint_tb,
+            backend_choice,
+            system_message,
+            max_token_sl,
+            temperature_sl,
+            top_p_sl,
+            stream_cb,
+            api_token_tb,
+            #gr.State(4),  # max_workers
+            #gr.State(3),  # max_retries
+            openai_base_url_tb,
+            openai_image_format_dd,
+            state_max_workers,  #gr.State(max_workers_sl), #max_workers_sl,
+            state_max_retries,  #gr.State(max_retries_sl), #max_retries_sl,
+            output_format_dd,
+            output_dir_tb,
+            use_llm_cb,
+            page_range_tb,
+        ],
+        '''
+        # )
+
+        # reuse the same business logic for the HTML tab
+        # Event handler for the HTML file upload button
+        files_upload_html.upload(
+            fn=accumulate_files,
+            inputs=[files_upload_html, uploaded_file_list],
+            outputs=[uploaded_file_list, log_output]
+        )
+        #inputs_arg[0] = html_files
+        btn_html_convert.click(
+            fn=convert_batch,
+            inputs=inputs_arg,
+            outputs=[log_output, files_individual_downloads]
+        )
+
+        def get_file_count(file_list):
+            """
+            Counts the number of files in the list.
+
+            Args:
+                file_list (list): A list of temporary file objects.
+            Returns:
+                tuple[str, str]: The file count and a log message listing the uploaded files.
+            """
+            if file_list:
+                return f"{len(file_list)}", f"Upload: {len(file_list)} files: \n {file_list}"  #{[pdf_files.value]}"
+            else:
+                return "No files uploaded.", "No files uploaded."  # Count files button
+
+        btn_pdf_count.click(
+            fn=get_file_count,
+            inputs=[files_upload_pdf],
+            outputs=[files_count, log_output]
+        )
+        btn_html_count.click(
+            fn=get_file_count,
+            inputs=[files_upload_html],
+            outputs=[html_files_count, log_output]
+        )
+
+        # Validate files upload on change; warn but allow continue
+        def on_pdf_files_change(pdf_files_value: list[str]):
+            # explicitly wrap the file object in a list
+            pdf_files_value = pdf_files_wrap(pdf_files_value)
+            #if not isinstance(pdf_files_value, list):
+            #    pdf_files_value = [pdf_files_value]
+
+            pdf_files_path = [file.name for file in pdf_files_value]
+            pdf_files_len = len(pdf_files_value)  #len(pdf_files_path)
+            if pdf_files_value:
+                #return
+                return pdf_files_path, pdf_files_len
+        #pdf_files.change(on_pdf_files_change, inputs=pdf_files, outputs=[log_output, pdf_files_count])  #, postprocess=False) ##debug
+
+    return demo
+
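The gr.State pattern the comments above settle on deserves a minimal illustration. This is a sketch with hypothetical component names (not taken from this repo): a value wrapped in gr.State travels through .click() like any other input and arrives in the handler as a plain Python object, which is how defaults such as max_workers are threaded into convert_batch here.

import gradio as gr

def run(n_workers, text):
    # State values arrive as plain Python objects, not as component objects
    return f"processing {text!r} with {n_workers} workers"

with gr.Blocks() as sketch:
    state_workers = gr.State(4)   # default carried between events
    inp = gr.Textbox(label="input")
    out = gr.Textbox(label="result", interactive=False)
    gr.Button("Go").click(fn=run, inputs=[state_workers, inp], outputs=out)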
utils/__init__.py ADDED
File without changes
utils/config.ini ADDED
@@ -0,0 +1,158 @@
+[marker]
+provider=openai
+#model_id=openai/gpt-oss-120b
+## Marker will return "LLM did not return a valid response" if the model is not 'Image-Text-to-Text'
+## because OpenAI inference fails with Error code: 400 ... "Unsupported ChatMessageContent type: image_url"
+## Note that Marker works pretty well using its own transformer-based model without an LLM
+model_id=meta-llama/Llama-4-Maverick-17B-128E-Instruct
+hf_provider=fireworks-ai
+endpoint_url=""
+backend_choice=provider
+system_message=""
+max_tokens=8192
+temperature=0.2
+top_p=0.2
+stream=True
+api_token=a1b2c3
+openai_model=openai/gpt-oss-120b
+openai_api_key=a1b2c3
+openai_base_url=https://router.huggingface.co/v1
+openai_image_format=webp
+#max_retries=3
+
+#[Configuration]
+use_llm=True
+output_format=markdown
+input_dir=inputs
+output_dir=output_md
+max_workers=4
+max_retries=2
+extract_images=True
+output_image_format=png
+output_encoding=utf-8
+debug_data_folder=debug_data
+
+[unsure]
+image_output_dir="images"
+image_output_format="png"
+base_dir=Path(__file__).resolve().parent.parent
+###
+# Create a Path object from the current file's location, resolve it to an absolute path,
+# and then get its parent's parent using chained .parent calls or the parents[] attribute.
+#grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+###
+
+[libraries]
+libobject_path = C:\\Dat\\dev\\gtk3-runtime\\bin
+
+
+# from config.ini ##SMY: future plan to merge
+[MARKER_CAP]
+#[marker]
+PROVIDER = openai
+#MODEL_ID = openai/gpt-oss-120b
+## Marker will return "LLM did not return a valid response" if the model is not 'Image-Text-to-Text'
+## because OpenAI inference fails with Error code: 400 ... "Unsupported ChatMessageContent type: image_url"
+## Note that Marker works pretty well using its own transformer-based model without an LLM
+MODEL_ID=meta-llama/Llama-4-Maverick-17B-128E-Instruct
+HF_PROVIDER = fireworks-ai
+ENDPOINT_URL = ""
+BACKEND_CHOICE = provider
+SYSTEM_MESSAGE = ""
+MAX_TOKENS = 8192
+TEMPERATURE = 0.2
+TOP_P = 0.2
+STREAM = True
+API_TOKEN = a1b2c3
+OPENAI_MODEL = openai/gpt-oss-120b
+OPENAI_API_KEY = a1b2c3
+OPENAI_BASE_URL = https://router.huggingface.co/v1
+OPENAI_IMAGE_FORMAT = webp
+
+#[CONFIGURATION]
+MAX_WORKERS = 4
+MAX_RETRIES = 2
+OUTPUT_FORMAT = markdown
+INPUT_DIR = inputs
+OUTPUT_DIR = output_dir
+USE_LLM = False
+EXTRACT_IMAGES = True
+OUTPUT_IMAGE_FORMAT = png
+OUTPUT_ENCODING = utf-8
+DEBUG_DATA_FOLDER = debug_data
+
+[UNSURE_CAP]
+IMAGE_OUTPUT_DIR = images
+IMAGE_OUTPUT_FORMAT = png
+BASE_DIR = Path(__file__).resolve().parent.parent
+###
+# Create a Path object from the current file's location, resolve it to an absolute path,
+# then get its parent's parent using chained .parent calls or the parents[] attribute.
+#grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+###
+
+[LIBRARIES_CAP]
+LIBOBJECT_PATH = C:\\Dat\\dev\\gtk3-runtime\\bin
+WEASYPRINT_DLL_DIRECTORIES = C:\\Dat\\dev\\gtk3-runtime\\bin
+
+[GLOBAL_CAP]
+# Globals within each worker process
+HF_MODEL = "openai/gpt-oss-120b"
+HF_TOKEN = ""
+HF_CLIENT = None
+ARTIFACT_DICT = None
+PDF_CONVERTER = None
+HTML_CONVERTER = None
+
+[marker_dict]
+## "meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
+provider:"openai" #provider,
+model_id:"openai/gpt-oss-120b" #model_id, #"meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
+hf_provider:"fireworks-ai" #hf_provider,
+endpoint_url:"" #endpoint_url,
+backend_choice:"provider" #backend_choice,
+system_message:"" #system_message,
+max_tokens:8192 #max_tokens,
+temperature:0.2 #temperature,
+top_p:0.2 #top_p,
+stream:"stream"
+api_token:"a1b2c3" #get_token,
+output_format:"markdown" #output_format, #"markdown",
+openai_model:"openai/gpt-oss-120b" #self.client.model_id, #"model_name"
+openai_api_key:"a1b2c3" #self.client.openai_api_key, #self.api_token,
+openai_base_url:"https://router.huggingface.co/v1" #self.client.base_url, #self.base_url,
+#temperature=self.client.temperature,
+#top_p=self.client.top_p,
+openai_image_format:"webp" #"png" #better compatibility
+max_retries:3 ## pass to __call__
+
+
+[marker_nostrip]
+provider="openai"
+model_id="openai/gpt-oss-120b"
+hf_provider="fireworks-ai"
+endpoint_url=""
+backend_choice="provider"
+system_message=""
+max_tokens=8192
+temperature=0.2
+top_p=0.2
+stream=True
+api_token="a1b2c3"
+openai_model="openai/gpt-oss-120b"
+openai_api_key="a1b2c3"
+openai_base_url="https://router.huggingface.co/v1"
+openai_image_format="webp"
+#max_retries=3
+
+#[Configuration]
+use_llm=True
+output_format="markdown"
+input_dir="inputs"
+output_dir="output_md"
+max_workers=4
+max_retries=2
+extract_images=True
+output_image_format="png"
+output_encoding=utf-8
+debug_data_folder="debug_data"
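One detail worth keeping in mind when reading this file back: configparser stores every value as a string, so stream=True comes back as the string "True" and quoted values like endpoint_url="" keep their literal quotes. A minimal sketch of the typed accessors (note that because the #[Configuration] header above is commented out, keys such as use_llm actually live in the [marker] section):

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("utils/config.ini")
use_llm = cfg["marker"].getboolean("use_llm")    # "True" -> True
max_tokens = cfg["marker"].getint("max_tokens")  # "8192" -> 8192
endpoint = cfg["marker"].get("endpoint_url")     # returns the literal string '""'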
utils/config.py ADDED
@@ -0,0 +1,83 @@
+# utils/config.py
+
+import os
+
+"""
+Centralised configuration constants.
+##SMY: TODO: Create Class Settings(BaseSettings) leveraging from pydantic_settings import BaseSettings
+"""
+
+# UI text
+TITLE = "PyPDFMd – PDF & HTML ↔ Markdown Converter"
+DESCRIPTION = (
+    "Drag‑and‑drop a single PDF/HTML file or a folder to convert to Markdown. "
+    "Or upload Markdown/LaTeX files and generate a polished PDF."
+)
+DESCRIPTION_PDF_HTML = (
+    "Upload one or more PDF or HTML files, a folder or an entire directory tree "
+    "to convert to Markdown."
+)
+DESCRIPTION_PDF = (
+    "Drag‑and‑drop a single PDF, a folder of PDFs or an entire directory tree "
+    "to convert to Markdown."
+)
+DESCRIPTION_HTML = (
+    "Drag‑and‑drop a single HTML file, a folder of HTML files or an entire directory tree "
+    "to convert to Markdown."
+)
+DESCRIPTION_MD = (
+    "Upload Markdown/LaTeX files and generate a polished PDF."
+)
+
+# Conversion defaults
+DEFAULT_MARKER_OPTIONS = {
+    "include_images": True,
+    "image_format": "png",
+}
+
+# Configuration
+MAX_WORKERS = int(os.getenv("MAX_WORKERS", "4"))
+MAX_RETRIES = int(os.getenv("MAX_RETRIES", "2"))  #3
+INPUT_DIR = os.getenv("INPUT_DIR", "inputs")  # unused
+OUTPUT_DIR = os.getenv("OUTPUT_DIR", "md_output")
+# bool(os.getenv(...)) is True for any non-empty string, so parse the value explicitly
+USE_LLM = os.getenv("USE_LLM", "False").lower() in ("1", "true", "yes")
+EXTRACT_IMAGES = os.getenv("EXTRACT_IMAGES", "True").lower() in ("1", "true", "yes")
+OUTPUT_IMAGE_FORMAT = os.getenv("OUTPUT_IMAGE_FORMAT", "png")  #png
+OUTPUT_ENCODING = os.getenv("OUTPUT_ENCODING", "utf-8")  #utf-8
+DEBUG_DATA_FOLDER = os.getenv("DEBUG_DATA_FOLDER", "debug_data")  #debug_data
+
+# Global
+HF_MODEL = os.getenv("HF_MODEL", "gpt2")  # swap for a chat-capable model
+HF_TOKEN = os.getenv("HF_TOKEN")  # your Hugging Face token
+
+
+## //TODO:
+# from config.ini ##SMY: future plan to merge
+api_token = "a1b2c3"
+OUTPUT_FORMAT = "markdown"  #output_format
+OPENAI_MODEL = "openai/gpt-oss-120b"  #openai_model
+OPENAI_API_KEY = "a1b2c3"  #openai_api_key
+OPENAI_BASE_URL = "https://router.huggingface.co/v1"  ##openai_base_url
+OPENAI_IMAGE_FORMAT = "webp"  #openai_image_format
+OUTPUT_IMAGE_FORMAT = "png"  ## NB: overrides the env-based setting above
+#max_retries=3
+
+#[marker]
+PROVIDER = "openai"  #provider
+MODEL_ID = "openai/gpt-oss-120b"  #model_id
+HF_PROVIDER = "fireworks-ai"  #hf_provider
+ENDPOINT_URL = ""  #endpoint_url
+BACKEND_CHOICE = "provider"  #backend_choice
+SYSTEM_MESSAGE = ""  #system_message
+MAX_TOKENS = 8192  #max_tokens
+TEMPERATURE = 0.2  #temperature
+TOP_P = 0.2  #top_p
+STREAM = True  #stream
+
+# Globals within each worker process
+hf_client = None
+artifact_dict = None
+pdf_converter = None
+html_converter = None
+
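The TODO at the top of this file points at pydantic-settings; a minimal sketch of that direction, with field names and defaults mirroring the constants above (the class itself is hypothetical, not part of this commit):

from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Env vars match field names case-insensitively: MAX_WORKERS=8 overrides the default
    max_workers: int = 4
    max_retries: int = 2
    output_dir: str = "md_output"
    use_llm: bool = False  # pydantic parses "true"/"1"/"yes" correctly, unlike bool(os.getenv(...))

settings = Settings()  # reads MAX_WORKERS, USE_LLM, ... from the environment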
utils/get_arg_name.py ADDED
@@ -0,0 +1,19 @@
+import inspect
+
+def get_arg_name_as_string(arg):
+    """
+    Returns the name of the argument passed to the function as a string.
+    This works by inspecting the calling frame's local variables.
+
+    example usage:
+        def my_function(arg_x):
+            arg_name = get_arg_name_as_string(arg_x)
+            print(f"The argument name is: {arg_name}")  # Outputs: "The argument name is: arg_x"
+    """
+    frame = inspect.currentframe().f_back  # Get the frame of the caller
+    arg_name = None
+    for name, value in frame.f_locals.items():
+        if value is arg:
+            arg_name = name
+            break
+    return arg_name
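Worth noting: this helper relies on CPython frame introspection and returns the first caller-local bound to the same object, so two names aliasing one value can yield either name. A quick check of the behaviour:

from utils.get_arg_name import get_arg_name_as_string

def demo(arg_x):
    return get_arg_name_as_string(arg_x)

print(demo(object()))  # -> "arg_x"; a unique object avoids alias ambiguity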
utils/get_config.py ADDED
@@ -0,0 +1,18 @@
+from configparser import ConfigParser as config
+from typing import Union
+from pathlib import Path
+#from utils.get_arg_name import get_arg_name_as_string
+import traceback
+
+def get_config_value(section: str, parameter: str, fallback: str = None, configfile: Union[str, Path] = "utils\\config.ini"):
+    """Load the config file, locate the section, read the parameter and return its value."""
+
+    try:
+        cfg = config()
+        cfg.read(configfile)
+        param_value = cfg[section].get(option=parameter, fallback=fallback)  #"C:\\Dat\\dev\\gtk3-runtime\\bin")
+        return param_value
+    except Exception as exc:
+        tb = traceback.format_exc()
+        raise RuntimeWarning(f"Error loading config: {exc}\n{tb}")
+        #pass
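The loader in utils/lib_loader.py uses this helper to look up the GTK runtime path; any caller can do the same, with the section and option names as defined in config.ini above:

from utils.get_config import get_config_value

dll_dir = get_config_value("LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES",
                           fallback="C:\\Dat\\dev\\gtk3-runtime\\bin")
model_id = get_config_value("marker", "model_id")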
utils/lib_loader.py ADDED
@@ -0,0 +1,130 @@
+import os
+from pathlib import Path
+import sys
+import ctypes
+from typing import Union
+from configparser import ConfigParser as config
+from utils.get_arg_name import get_arg_name_as_string
+from utils.get_config import get_config_value
+import traceback
+
+from utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+def set_weasyprint_library(libpath: Union[str, Path] = None, config_file: Union[str, Path] = "utils\\config.ini"):
+    """Loads WeasyPrint backend dependency libraries into the environment."""
+    # Check if the current platform is Windows
+    if sys.platform == 'win32':
+
+        #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
+        if not libpath:
+            '''cfg = config()
+            cfg.read(config_file)  #"utils\\config.ini")
+            lib_path = cfg["LIBRARIES_CAP"].get("WEASYPRINT_DLL_DIRECTORIES", "C:\\Dat\\dev\\gtk3-runtime\\bin")
+            '''
+            lib_path = get_config_value("LIBRARIES_CAP", "WEASYPRINT_DLL_DIRECTORIES", fallback="C:\\Dat\\dev\\gtk3-runtime\\bin")
+        else:
+            lib_path = libpath
+
+        # Check if the directory exists before attempting to use it
+        #if not os.path.exists(libobject):
+        if not Path(lib_path).exists():
+            raise FileNotFoundError(f"The specified WeasyPrint DLL directory does not exist: {lib_path}. Follow the WeasyPrint installation guide or provide a valid GTK3-runtime path.")
+        #logger.exception(f"gobject library path: {libgobject_path}")  ##debug
+
+        try:
+            # Set a new environment variable
+            ##SMY: on dev machine, using extracted 'portable' GTK3 rather than installing 'MSYS2'
+            os.environ["WEASYPRINT_DLL_DIRECTORIES"] = lib_path
+            #logger.info(f"sets Weasyprint DLL library path: {lib_path}")  #debug
+
+        except Exception as exc:
+            tb = traceback.format_exc()
+            logger.exception(f"Error setting environ: weasyprint backend dependency → {exc}\n{tb}", exc_info=True)  # Log the full traceback
+
+            raise RuntimeWarning(f"✗ error during setting environ: weasyprint backend dependency → {exc}\n{tb}")
+
+
+def load_library(libobject_name: Union[str, Path]):
+    """
+    Loads WeasyPrint backend dependency libraries.
+    usage: list(map(load_library, library_list))  ##SMY: map the load_library function to each item in library_list
+    The library list was starting to grow excessively, so opted for setting environ instead.
+    """
+    # Check if the current platform is Windows
+    if sys.platform == 'win32':
+
+        #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
+        cfg = config()
+        cfg.read("utils\\config.ini")
+        lib_path = cfg["libraries"].get("libobject_path", "C:\\Dat\\dev\\gtk3-runtime\\bin")
+        lib_object_dll = get_arg_name_as_string(libobject_name)  ## future use
+
+        # Construct the path to libgobject-2.0.dll
+        #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', 'C:\\glib'), 'bin', 'libgobject-2.0-0.dll')
+        libobject = f"{lib_path}\\{libobject_name}.dll"  ##libgobject-2.0-0.dll"
+        #print(f"Loading gobject library: {libgobject}")  #debug
+
+        # Check if the file exists before attempting to load it
+        #if not os.path.exists(libobject):
+        if not Path(libobject).exists():
+            raise FileNotFoundError(f"The specified library file does not exist: {libobject}")
+        #print(f"gobject library path: {libgobject_path}")  ##debug
+
+        # Load the library using ctypes
+        try:
+            ctypes_libgobject = ctypes.CDLL(libobject)
+            #msg = f"libgobject-2.0-0.dll loaded successfully via ctypes. {str(ctypes_libgobject)}"
+            #print(msg)  ##debug
+        except OSError as exc:
+            tb = traceback.format_exc()
+            raise RuntimeWarning(f"Failed to load library: {exc}\n{tb}")  ##raise RuntimeError
+
+## Test
+#load_library("libpango-1.0-0")
+#load_library("libgobject-2.0-0")
+
+
+##SMY: Original implementation: TODO: for refactoring
+def load_libgobject():
+    # Check if the current platform is Windows
+    if sys.platform == 'win32':
+
+        #libgobject_path = #"/path/to/your/custom/glib/install/lib/libgobject-2.0.so.0"
+        cfg = config()
+        cfg.read("utils\\config.ini")
+        libgobject_path = cfg["libraries"].get("libgobject_path", "C:\\Dat\\dev\\gtk3-runtime\\bin")
+
+        # Construct the path to libgobject-2.0.dll
+        #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', 'C:\\glib'), 'bin', 'libgobject-2.0-0.dll')
+        libgobject = f"{libgobject_path}\\libgobject-2.0-0.dll"
+        #print(f"Loading gobject library: {libgobject}")  #debug
+
+        # Check if the file exists before attempting to load it
+        if not os.path.exists(libgobject):
+            raise FileNotFoundError(f"The specified library file does not exist: {libgobject}")
+        #print(f"gobject library path: {libgobject_path}")  ##debug
+
+        # Load the library using ctypes
+        try:
+            ctypes_libgobject = ctypes.CDLL(libgobject)
+            #msg = f"libgobject-2.0-0.dll loaded successfully via ctypes. {str(ctypes_libgobject)}"
+            #print(msg)  ##debug
+
+            return ctypes_libgobject
+        except OSError as exc:
+            tb = traceback.format_exc()
+            raise RuntimeWarning(f"Failed to load library: {exc}\n{tb}")  ##raise RuntimeError
+
+        # Load the library using ctypes (Linux/macOS)
+        # Construct the path to libgobject-2.0.so.0 in the custom GLib installation
+        #libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', '/opt/glib'), 'lib', 'libgobject-2.0.so.0')
+        #print("This script is intended to run on Unix-like systems, not Windows.")
+    else:
+        # Load the library using ctypes (Linux/macOS)
+        # Construct the path to libgobject-2.0.so.0 in the custom GLib installation
+        libgobject_path = os.path.join(os.environ.get('GLIB_PREFIX', '/opt/glib'), 'lib', 'libgobject-2.0.so.0')
+        #print("This script is intended to run on Unix-like systems, not Windows.")
+
+    return libgobject_path
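On Windows, WeasyPrint resolves its GTK DLLs at import time via the WEASYPRINT_DLL_DIRECTORIES environment variable, so the setter above must run before the first weasyprint import. A sketch of the intended call order, assuming the config.ini path is in place:

from utils.lib_loader import set_weasyprint_library

set_weasyprint_library()   # or set_weasyprint_library(r"C:\path\to\gtk3\bin")
import weasyprint          # import only after the DLL directory is in the environment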
utils/logger.py ADDED
@@ -0,0 +1,81 @@
+# utils/logger.py
+
+import json
+import logging
+import os
+import sys
+from datetime import datetime, timezone
+
+'''
+def get_logger(name: str) -> logging.Logger:
+    """
+    Returns a logger configured with a console handler.
+    """
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        logger.setLevel(logging.INFO)
+        ch = logging.StreamHandler()
+        formatter = logging.Formatter(
+            "[%(asctime)s] %(levelname)s - %(name)s: %(message)s",
+            datefmt="%H:%M:%S",
+        )
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+    return logger
+'''
+
+class JsonFormatter(logging.Formatter):
+    """Minimal JSON formatter for structured logs."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        payload = {
+            #"ts": datetime.now(timezone.utc).isoformat(),  ## defaults to 'YYYY-MM-DD HH:MM:SS.mmmmmm'
+            "ts": datetime.now(timezone.utc).strftime("%Y-%m-%d : %H:%M:%S"),  ## SMY: interested in datefmt="%H:%M:%S"
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        # Include extra attributes (fields not in the default LogRecord)
+        for key, value in record.__dict__.items():
+            if key in ("args", "msg", "levelno", "levelname", "name", "pathname", "filename",
+                       "module", "exc_info", "exc_text", "stack_info", "lineno", "funcName",
+                       "created", "msecs", "relativeCreated", "thread", "threadName",
+                       "processName", "process"):
+                continue
+            payload[key] = value
+        return json.dumps(payload, ensure_ascii=False)
+
+#def setup_logging(level: int = logging.INFO) -> None:  ## Caused non-stop logging on HF Spaces
+def setup_logging(level: int = None) -> None:
+    """Configure the root logger with JSON output to both stdout and a file.
+
+    Args:
+        level: Logging level. If None, uses WARNING for production (HF Spaces)
+            and INFO for local development.
+    """
+    if level is None:
+        # Auto-detect environment: WARNING for production, INFO for local dev
+        is_production = os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID") or os.getenv("HUGGINGFACE_SPACE_ID")
+        level = logging.WARNING if is_production else logging.INFO
+
+    # Console handler
+    console_handler = logging.StreamHandler(stream=sys.stdout)
+    console_handler.setFormatter(JsonFormatter())  #, datefmt="%H:%M:%S",) ##explicit time format
+
+    # File handler (ensure the logs directory exists before opening the file)
+    os.makedirs("logs", exist_ok=True)
+    #file_handler = logging.FileHandler("logs/app_logging_scrap.log", mode="a", encoding="utf-8")
+    file_handler = logging.FileHandler("logs/app_logging.log", mode="a", encoding="utf-8")
+    file_handler.setFormatter(JsonFormatter())
+
+    root = logging.getLogger()
+    root.handlers.clear()
+    root.addHandler(console_handler)
+    root.addHandler(file_handler)
+    root.setLevel(level)
+
+
+def get_logger(name: str) -> logging.Logger:
+    """Return a module logger that uses the root handlers' JSON format."""
+    return logging.getLogger(name)
+
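Typical wiring, assuming setup_logging() is called once at application start: fields passed via extra flow straight into the JSON payload through the formatter's extra-attribute loop above.

from utils.logger import setup_logging, get_logger

setup_logging()   # once, at startup
logger = get_logger(__name__)
logger.info("conversion started", extra={"files_len": 3})
# -> {"ts": "...", "level": "INFO", "logger": "...", "message": "conversion started", "files_len": 3}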
utils/utils.py ADDED
@@ -0,0 +1,15 @@
+def is_dict(variable):
+    """Checks if a variable is a dict."""
+    if isinstance(variable, dict):
+        return True
+
+    return False
+
+def is_list_of_dicts(variable):
+    """Checks if a variable is a list containing only dicts."""
+
+    if isinstance(variable, list):
+        # Return True only if the list is empty or all elements are dicts.
+        return all(isinstance(item, dict) for item in variable)
+
+    return False
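A quick illustration of the distinction these two helpers draw:

from utils.utils import is_dict, is_list_of_dicts

is_dict({"a": 1})                  # True
is_list_of_dicts([{"a": 1}, {}])   # True
is_list_of_dicts([])               # True  (an empty list passes by design)
is_list_of_dicts([{"a": 1}, 2])    # False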