taprosoft
committed on
Commit
·
9adfc08
1
Parent(s):
d381432
fix: disable formula recognition and add env var to toggle
Browse files
- backends/docling.py +2 -2
- backends/marker.py +8 -1
- backends/mineru.py +2 -2
- backends/settings.py +1 -0
backends/docling.py
CHANGED
|
@@ -10,7 +10,7 @@ from docling.datamodel.settings import settings
|
|
| 10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 11 |
from docling_core.types.doc import ImageRefMode
|
| 12 |
|
| 13 |
-
from .settings import ENABLE_DEBUG_MODE
|
| 14 |
|
| 15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
| 16 |
|
|
@@ -20,7 +20,7 @@ pipeline_options = PdfPipelineOptions()
|
|
| 20 |
pipeline_options.accelerator_options = accelerator_options
|
| 21 |
pipeline_options.do_ocr = True
|
| 22 |
pipeline_options.do_table_structure = True
|
| 23 |
-
pipeline_options.do_formula_enrichment =
|
| 24 |
pipeline_options.generate_picture_images = True
|
| 25 |
pipeline_options.images_scale = 2.0
|
| 26 |
|
|
|
|
| 10 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 11 |
from docling_core.types.doc import ImageRefMode
|
| 12 |
|
| 13 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
| 14 |
|
| 15 |
DOCLING_DEBUG_PATH = Path("/tmp/docling")
|
| 16 |
|
|
|
|
| 20 |
pipeline_options.accelerator_options = accelerator_options
|
| 21 |
pipeline_options.do_ocr = True
|
| 22 |
pipeline_options.do_table_structure = True
|
| 23 |
+
pipeline_options.do_formula_enrichment = ENABLE_FORMULA
|
| 24 |
pipeline_options.generate_picture_images = True
|
| 25 |
pipeline_options.images_scale = 2.0
|
| 26 |
|
backends/marker.py
CHANGED
|
@@ -6,11 +6,18 @@ from pathlib import Path
|
|
| 6 |
from marker.converters.pdf import PdfConverter
|
| 7 |
from marker.models import create_model_dict
|
| 8 |
from marker.output import text_from_rendered
|
|
|
|
| 9 |
from marker.settings import settings
|
| 10 |
|
| 11 |
-
from .settings import ENABLE_DEBUG_MODE
|
| 12 |
|
| 13 |
# Marker init
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
marker_converter = PdfConverter(
|
| 15 |
artifact_dict=create_model_dict(),
|
| 16 |
config={
|
|
|
|
| 6 |
from marker.converters.pdf import PdfConverter
|
| 7 |
from marker.models import create_model_dict
|
| 8 |
from marker.output import text_from_rendered
|
| 9 |
+
from marker.processors.equation import EquationProcessor
|
| 10 |
from marker.settings import settings
|
| 11 |
|
| 12 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
| 13 |
|
| 14 |
# Marker init
|
| 15 |
+
if not ENABLE_FORMULA:
|
| 16 |
+
PdfConverter.default_processors = (
|
| 17 |
+
processor
|
| 18 |
+
for processor in PdfConverter.default_processors
|
| 19 |
+
if processor != EquationProcessor
|
| 20 |
+
)
|
| 21 |
marker_converter = PdfConverter(
|
| 22 |
artifact_dict=create_model_dict(),
|
| 23 |
config={
|
backends/mineru.py
CHANGED
|
@@ -7,7 +7,7 @@ import pymupdf
|
|
| 7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
| 8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 9 |
|
| 10 |
-
from .settings import ENABLE_DEBUG_MODE
|
| 11 |
|
| 12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
| 13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
|
@@ -52,7 +52,7 @@ def do_process_mineru(input_path, output_dir):
|
|
| 52 |
f_dump_orig_pdf=False,
|
| 53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
| 54 |
f_draw_char_bbox=False,
|
| 55 |
-
formula_enable=
|
| 56 |
table_enable=True,
|
| 57 |
)
|
| 58 |
return local_md_dir, file_name
|
|
|
|
| 7 |
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
| 8 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 9 |
|
| 10 |
+
from .settings import ENABLE_DEBUG_MODE, ENABLE_FORMULA
|
| 11 |
|
| 12 |
MINERU_DEBUG_PATH = Path("/tmp/mineru")
|
| 13 |
MINERU_DEBUG_PATH.mkdir(exist_ok=True)
|
|
|
|
| 52 |
f_dump_orig_pdf=False,
|
| 53 |
f_draw_layout_bbox=ENABLE_DEBUG_MODE,
|
| 54 |
f_draw_char_bbox=False,
|
| 55 |
+
formula_enable=ENABLE_FORMULA,
|
| 56 |
table_enable=True,
|
| 57 |
)
|
| 58 |
return local_md_dir, file_name
|
backends/settings.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
ENABLE_DEBUG_MODE = os.environ.get("ENABLE_DEBUG_MODE", "True").lower() == "true"
|
| 4 |
+
ENABLE_FORMULA = os.environ.get("ENABLE_FORMULA", "False").lower() == "true"
|