marker-io / marker /ocr /recognition.py
Ritvik19's picture
Add all files and directories
c8a32e7
raw
history blame
No virus
5.74 kB
from itertools import repeat
from typing import List, Optional, Dict
import pypdfium2 as pdfium
import io
from concurrent.futures import ThreadPoolExecutor
from surya.ocr import run_recognition
from marker.models import setup_recognition_model
from marker.ocr.heuristics import should_ocr_page, no_text_found, detect_bad_ocr
from marker.ocr.lang import langs_to_ids
from marker.pdf.images import render_image
from marker.schema.page import Page
from marker.schema.block import Block, Line, Span
from marker.settings import settings
from marker.pdf.extract_text import get_text_blocks
def get_batch_size():
if settings.RECOGNITION_BATCH_SIZE is not None:
return settings.RECOGNITION_BATCH_SIZE
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 32
elif settings.TORCH_DEVICE_MODEL == "mps":
return 32
return 32
def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplier=1) -> (List[Page], Dict):
ocr_pages = 0
ocr_success = 0
ocr_failed = 0
no_text = no_text_found(pages)
ocr_idxs = []
for pnum, page in enumerate(pages):
ocr_needed = should_ocr_page(page, no_text)
if ocr_needed:
ocr_idxs.append(pnum)
ocr_pages += 1
# No pages need OCR
if ocr_pages == 0:
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
ocr_method = settings.OCR_ENGINE
if ocr_method is None:
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
elif ocr_method == "surya":
# Load model just in time if we're not OCRing everything
del_rec_model = False
if rec_model is None:
lang_tokens = langs_to_ids(langs)
rec_model = setup_recognition_model(lang_tokens)
del_rec_model = True
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
if del_rec_model:
del rec_model
elif ocr_method == "ocrmypdf":
new_pages = tesseract_recognition(doc, ocr_idxs, langs)
else:
raise ValueError(f"Unknown OCR method {ocr_method}")
for orig_idx, page in zip(ocr_idxs, new_pages):
if detect_bad_ocr(page.prelim_text) or len(page.prelim_text) == 0:
ocr_failed += 1
else:
ocr_success += 1
pages[orig_idx] = page
return pages, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success, "ocr_engine": ocr_method}
def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page], batch_multiplier=1) -> List[Optional[Page]]:
images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
processor = rec_model.processor
selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]
surya_langs = [langs] * len(page_idxs)
detection_results = [p.text_lines.bboxes for p in selected_pages]
polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=get_batch_size() * batch_multiplier)
new_pages = []
for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):
text_lines = old_page.text_lines
ocr_results = result.text_lines
blocks = []
for i, line in enumerate(ocr_results):
block = Block(
bbox=line.bbox,
pnum=page_idx,
lines=[Line(
bbox=line.bbox,
spans=[Span(
text=line.text,
bbox=line.bbox,
span_id=f"{page_idx}_{i}",
font="",
font_weight=0,
font_size=0,
)
]
)]
)
blocks.append(block)
page = Page(
blocks=blocks,
pnum=page_idx,
bbox=result.image_bbox,
rotation=0,
text_lines=text_lines,
ocr_method="surya"
)
new_pages.append(page)
return new_pages
def tesseract_recognition(doc, page_idxs, langs: List[str]) -> List[Optional[Page]]:
pdf_pages = generate_single_page_pdfs(doc, page_idxs)
with ThreadPoolExecutor(max_workers=settings.OCR_PARALLEL_WORKERS) as executor:
pages = list(executor.map(_tesseract_recognition, pdf_pages, repeat(langs, len(pdf_pages))))
return pages
def generate_single_page_pdfs(doc, page_idxs) -> List[io.BytesIO]:
pdf_pages = []
for page_idx in page_idxs:
blank_doc = pdfium.PdfDocument.new()
blank_doc.import_pages(doc, pages=[page_idx])
assert len(blank_doc) == 1, "Failed to import page"
in_pdf = io.BytesIO()
blank_doc.save(in_pdf)
in_pdf.seek(0)
pdf_pages.append(in_pdf)
return pdf_pages
def _tesseract_recognition(in_pdf, langs: List[str]) -> Optional[Page]:
import ocrmypdf
out_pdf = io.BytesIO()
ocrmypdf.ocr(
in_pdf,
out_pdf,
language=langs[0],
output_type="pdf",
redo_ocr=None,
force_ocr=True,
progress_bar=False,
optimize=False,
fast_web_view=1e6,
skip_big=15, # skip images larger than 15 megapixels
tesseract_timeout=settings.TESSERACT_TIMEOUT,
tesseract_non_ocr_timeout=settings.TESSERACT_TIMEOUT,
)
new_doc = pdfium.PdfDocument(out_pdf.getvalue())
blocks, _ = get_text_blocks(new_doc, max_pages=1)
page = blocks[0]
page.ocr_method = "tesseract"
return page