Spaces:
Running
Add skip_inverted_ocr flag to opt out of the inverted Tesseract pass
Browse files
The inverted-binarization pass exists to recover white-on-coloured
callout text the normal pass turns into noise, but on slides with no
such callouts it just halves throughput and risks producing garbled
near-duplicates of body text (filtered as of the previous fix).
Add a per-job toggle that disables the inverted pass entirely:
- ocr_frame(skip_inverted=True) returns only normal-pass output
- PipelineInputs.skip_inverted_ocr threads it through the orchestrator
- /jobs and /process endpoints accept skip_inverted_ocr as a form field
- CLI exposes --skip-inverted-ocr on build and metadata
- Upload page advanced settings include a "Skip inverted OCR pass" toggle
- Job remembers the choice so the lazy /ocr endpoint stays consistent
when the reviewer picks an alt frame in the editor
- app/cli.py +4 -0
- app/jobs.py +5 -0
- app/main.py +8 -1
- app/pipeline/ocr.py +17 -9
- app/pipeline/orchestrator.py +8 -1
- app/templates/index.html +10 -0
- tests/test_ocr_dedup.py +35 -0
|
@@ -102,6 +102,7 @@ def build(
|
|
| 102 |
min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
|
| 103 |
max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
|
| 104 |
skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
|
|
|
|
| 105 |
face_threshold: float = typer.Option(0.12, "--face-threshold"),
|
| 106 |
lang: str = typer.Option("en", "--lang"),
|
| 107 |
fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
|
|
@@ -124,6 +125,7 @@ def build(
|
|
| 124 |
min_gap_seconds=min_gap,
|
| 125 |
max_frames=max_frames,
|
| 126 |
skip_ocr=skip_ocr,
|
|
|
|
| 127 |
face_threshold=face_threshold,
|
| 128 |
auto_transcribe=auto_transcribe,
|
| 129 |
whisper_model=whisper_model,
|
|
@@ -200,6 +202,7 @@ def export_metadata_cmd(
|
|
| 200 |
min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
|
| 201 |
max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
|
| 202 |
skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
|
|
|
|
| 203 |
face_threshold: float = typer.Option(0.12, "--face-threshold"),
|
| 204 |
lang: str = typer.Option("en", "--lang"),
|
| 205 |
verbose: bool = typer.Option(False, "--verbose", "-v"),
|
|
@@ -218,6 +221,7 @@ def export_metadata_cmd(
|
|
| 218 |
min_gap_seconds=min_gap,
|
| 219 |
max_frames=max_frames,
|
| 220 |
skip_ocr=skip_ocr,
|
|
|
|
| 221 |
face_threshold=face_threshold,
|
| 222 |
auto_transcribe=auto_transcribe,
|
| 223 |
whisper_model=whisper_model,
|
|
|
|
| 102 |
min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
|
| 103 |
max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
|
| 104 |
skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
|
| 105 |
+
skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
|
| 106 |
face_threshold: float = typer.Option(0.12, "--face-threshold"),
|
| 107 |
lang: str = typer.Option("en", "--lang"),
|
| 108 |
fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
|
|
|
|
| 125 |
min_gap_seconds=min_gap,
|
| 126 |
max_frames=max_frames,
|
| 127 |
skip_ocr=skip_ocr,
|
| 128 |
+
skip_inverted_ocr=skip_inverted_ocr,
|
| 129 |
face_threshold=face_threshold,
|
| 130 |
auto_transcribe=auto_transcribe,
|
| 131 |
whisper_model=whisper_model,
|
|
|
|
| 202 |
min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
|
| 203 |
max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
|
| 204 |
skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
|
| 205 |
+
skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
|
| 206 |
face_threshold: float = typer.Option(0.12, "--face-threshold"),
|
| 207 |
lang: str = typer.Option("en", "--lang"),
|
| 208 |
verbose: bool = typer.Option(False, "--verbose", "-v"),
|
|
|
|
| 221 |
min_gap_seconds=min_gap,
|
| 222 |
max_frames=max_frames,
|
| 223 |
skip_ocr=skip_ocr,
|
| 224 |
+
skip_inverted_ocr=skip_inverted_ocr,
|
| 225 |
face_threshold=face_threshold,
|
| 226 |
auto_transcribe=auto_transcribe,
|
| 227 |
whisper_model=whisper_model,
|
|
@@ -56,6 +56,11 @@ class Job:
|
|
| 56 |
# first request to /ocr/{id}/{fn} and kept here so subsequent picks
|
| 57 |
# of the same alt don't re-OCR.
|
| 58 |
alt_ocr_cache: dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
class JobRegistry:
|
|
|
|
| 56 |
# first request to /ocr/{id}/{fn} and kept here so subsequent picks
|
| 57 |
# of the same alt don't re-OCR.
|
| 58 |
alt_ocr_cache: dict[str, str] = field(default_factory=dict)
|
| 59 |
+
# Mirrors the per-job OCR options chosen at upload time so the lazy
|
| 60 |
+
# /ocr endpoint keeps the same behaviour when the reviewer picks an
|
| 61 |
+
# alt frame — otherwise an alt's OCR could re-introduce garbling
|
| 62 |
+
# the user opted out of for the main pipeline.
|
| 63 |
+
ocr_skip_inverted: bool = False
|
| 64 |
|
| 65 |
|
| 66 |
class JobRegistry:
|
|
@@ -250,6 +250,7 @@ async def start_job(
|
|
| 250 |
min_gap: float = Form(0.0),
|
| 251 |
max_frames: str | None = Form(None),
|
| 252 |
skip_ocr: bool = Form(False),
|
|
|
|
| 253 |
face_threshold: float = Form(0.12),
|
| 254 |
lang: str = Form("en"),
|
| 255 |
subtitle: str | None = Form(None),
|
|
@@ -293,6 +294,7 @@ async def start_job(
|
|
| 293 |
min_gap_seconds=min_gap,
|
| 294 |
max_frames=_parse_optional_int(max_frames),
|
| 295 |
skip_ocr=skip_ocr,
|
|
|
|
| 296 |
face_threshold=face_threshold,
|
| 297 |
inline_images=(format in ("single", "review")),
|
| 298 |
anthropic_api_key=(anthropic_api_key or None),
|
|
@@ -310,6 +312,7 @@ async def start_job(
|
|
| 310 |
job = registry.create()
|
| 311 |
except JobQuotaExceeded as exc:
|
| 312 |
raise HTTPException(429, str(exc))
|
|
|
|
| 313 |
threading.Thread(
|
| 314 |
target=_run_job_in_thread,
|
| 315 |
args=(job, inputs, workdir),
|
|
@@ -498,7 +501,9 @@ async def fetch_ocr(job_id: str, filename: str) -> Response:
|
|
| 498 |
tmp.write(data)
|
| 499 |
tmp_path = Path(tmp.name)
|
| 500 |
try:
|
| 501 |
-
text = await run_in_threadpool(
|
|
|
|
|
|
|
| 502 |
finally:
|
| 503 |
try:
|
| 504 |
tmp_path.unlink(missing_ok=True)
|
|
@@ -633,6 +638,7 @@ async def process(
|
|
| 633 |
min_gap: float = Form(0.0),
|
| 634 |
max_frames: str | None = Form(None),
|
| 635 |
skip_ocr: bool = Form(False),
|
|
|
|
| 636 |
face_threshold: float = Form(0.12),
|
| 637 |
lang: str = Form("en"),
|
| 638 |
subtitle: str | None = Form(None),
|
|
@@ -675,6 +681,7 @@ async def process(
|
|
| 675 |
min_gap_seconds=min_gap,
|
| 676 |
max_frames=_parse_optional_int(max_frames),
|
| 677 |
skip_ocr=skip_ocr,
|
|
|
|
| 678 |
face_threshold=face_threshold,
|
| 679 |
inline_images=(format in ("single", "review")),
|
| 680 |
anthropic_api_key=(anthropic_api_key or None),
|
|
|
|
| 250 |
min_gap: float = Form(0.0),
|
| 251 |
max_frames: str | None = Form(None),
|
| 252 |
skip_ocr: bool = Form(False),
|
| 253 |
+
skip_inverted_ocr: bool = Form(False),
|
| 254 |
face_threshold: float = Form(0.12),
|
| 255 |
lang: str = Form("en"),
|
| 256 |
subtitle: str | None = Form(None),
|
|
|
|
| 294 |
min_gap_seconds=min_gap,
|
| 295 |
max_frames=_parse_optional_int(max_frames),
|
| 296 |
skip_ocr=skip_ocr,
|
| 297 |
+
skip_inverted_ocr=skip_inverted_ocr,
|
| 298 |
face_threshold=face_threshold,
|
| 299 |
inline_images=(format in ("single", "review")),
|
| 300 |
anthropic_api_key=(anthropic_api_key or None),
|
|
|
|
| 312 |
job = registry.create()
|
| 313 |
except JobQuotaExceeded as exc:
|
| 314 |
raise HTTPException(429, str(exc))
|
| 315 |
+
job.ocr_skip_inverted = skip_inverted_ocr
|
| 316 |
threading.Thread(
|
| 317 |
target=_run_job_in_thread,
|
| 318 |
args=(job, inputs, workdir),
|
|
|
|
| 501 |
tmp.write(data)
|
| 502 |
tmp_path = Path(tmp.name)
|
| 503 |
try:
|
| 504 |
+
text = await run_in_threadpool(
|
| 505 |
+
lambda: ocr_frame(tmp_path, skip_inverted=job.ocr_skip_inverted)
|
| 506 |
+
)
|
| 507 |
finally:
|
| 508 |
try:
|
| 509 |
tmp_path.unlink(missing_ok=True)
|
|
|
|
| 638 |
min_gap: float = Form(0.0),
|
| 639 |
max_frames: str | None = Form(None),
|
| 640 |
skip_ocr: bool = Form(False),
|
| 641 |
+
skip_inverted_ocr: bool = Form(False),
|
| 642 |
face_threshold: float = Form(0.12),
|
| 643 |
lang: str = Form("en"),
|
| 644 |
subtitle: str | None = Form(None),
|
|
|
|
| 681 |
min_gap_seconds=min_gap,
|
| 682 |
max_frames=_parse_optional_int(max_frames),
|
| 683 |
skip_ocr=skip_ocr,
|
| 684 |
+
skip_inverted_ocr=skip_inverted_ocr,
|
| 685 |
face_threshold=face_threshold,
|
| 686 |
inline_images=(format in ("single", "review")),
|
| 687 |
anthropic_api_key=(anthropic_api_key or None),
|
|
@@ -76,17 +76,23 @@ def _preprocess_variants(image_path: Path) -> list[Image.Image]:
|
|
| 76 |
return [Image.fromarray(normal), Image.fromarray(inverted)]
|
| 77 |
|
| 78 |
|
| 79 |
-
def ocr_frame(
|
|
|
|
|
|
|
| 80 |
"""Return cleaned OCR text for a single frame; empty string on failure.
|
| 81 |
|
| 82 |
-
Runs Tesseract on
|
| 83 |
-
|
| 84 |
-
the inverted pass recovers white-on-coloured
|
| 85 |
-
normal pass turns into noise. Order from the normal
|
| 86 |
-
preserved; inverted-only lines are appended after a fuzzy-
|
| 87 |
-
against the normal pass — inverting a clean dark-on-light line
|
| 88 |
-
often produces a garbled near-duplicate, and we want only the
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"""
|
| 91 |
try:
|
| 92 |
variants = _preprocess_variants(image_path)
|
|
@@ -103,6 +109,8 @@ def ocr_frame(image_path: Path, lang: str = "eng") -> str:
|
|
| 103 |
return [s.strip() for s in text.splitlines() if s.strip()]
|
| 104 |
|
| 105 |
normal_lines = _ocr_lines(variants[0])
|
|
|
|
|
|
|
| 106 |
merged: list[str] = list(normal_lines)
|
| 107 |
seen_lower: set[str] = {ln.lower() for ln in normal_lines}
|
| 108 |
for img in variants[1:]:
|
|
|
|
| 76 |
return [Image.fromarray(normal), Image.fromarray(inverted)]
|
| 77 |
|
| 78 |
|
| 79 |
+
def ocr_frame(
|
| 80 |
+
image_path: Path, lang: str = "eng", *, skip_inverted: bool = False
|
| 81 |
+
) -> str:
|
| 82 |
"""Return cleaned OCR text for a single frame; empty string on failure.
|
| 83 |
|
| 84 |
+
Runs Tesseract on the normal binarization, and — unless `skip_inverted`
|
| 85 |
+
is True — on the inverted binarization too. The normal pass picks up
|
| 86 |
+
most slide content; the inverted pass recovers white-on-coloured
|
| 87 |
+
callout text the normal pass turns into noise. Order from the normal
|
| 88 |
+
pass is preserved; inverted-only lines are appended after a fuzzy-
|
| 89 |
+
dedupe against the normal pass — inverting a clean dark-on-light line
|
| 90 |
+
often produces a garbled near-duplicate, and we want only the callout
|
| 91 |
+
text the normal pass actually missed.
|
| 92 |
+
|
| 93 |
+
Pass `skip_inverted=True` for slides without coloured callouts to halve
|
| 94 |
+
OCR time and rule out any inverted-pass garbling at the cost of losing
|
| 95 |
+
callout-text recovery.
|
| 96 |
"""
|
| 97 |
try:
|
| 98 |
variants = _preprocess_variants(image_path)
|
|
|
|
| 109 |
return [s.strip() for s in text.splitlines() if s.strip()]
|
| 110 |
|
| 111 |
normal_lines = _ocr_lines(variants[0])
|
| 112 |
+
if skip_inverted:
|
| 113 |
+
return "\n".join(normal_lines)
|
| 114 |
merged: list[str] = list(normal_lines)
|
| 115 |
seen_lower: set[str] = {ln.lower() for ln in normal_lines}
|
| 116 |
for img in variants[1:]:
|
|
@@ -50,6 +50,11 @@ class PipelineInputs:
|
|
| 50 |
max_frames: int | None = None
|
| 51 |
face_threshold: float = 0.12
|
| 52 |
skip_ocr: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
auto_transcribe: bool = False
|
| 54 |
whisper_model: str = "small"
|
| 55 |
inline_images: bool = False
|
|
@@ -153,7 +158,9 @@ def run_pipeline(
|
|
| 153 |
_emit(65, "ocr", f"Running OCR on {n} frames...")
|
| 154 |
ocr_texts: list[str] = []
|
| 155 |
for i, f in enumerate(kept):
|
| 156 |
-
ocr_texts.append(
|
|
|
|
|
|
|
| 157 |
pct = 65 + ((i + 1) / max(n, 1)) * 30
|
| 158 |
_emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
|
| 159 |
# Alt-frame OCR is computed lazily by the /ocr endpoint when the
|
|
|
|
| 50 |
max_frames: int | None = None
|
| 51 |
face_threshold: float = 0.12
|
| 52 |
skip_ocr: bool = False
|
| 53 |
+
# When True, Tesseract runs only on the normal binarization. The
|
| 54 |
+
# inverted pass (callout-text recovery) is skipped — useful when a
|
| 55 |
+
# deck has no white-on-coloured callouts and the inverted pass is
|
| 56 |
+
# producing garbled near-duplicates of body text.
|
| 57 |
+
skip_inverted_ocr: bool = False
|
| 58 |
auto_transcribe: bool = False
|
| 59 |
whisper_model: str = "small"
|
| 60 |
inline_images: bool = False
|
|
|
|
| 158 |
_emit(65, "ocr", f"Running OCR on {n} frames...")
|
| 159 |
ocr_texts: list[str] = []
|
| 160 |
for i, f in enumerate(kept):
|
| 161 |
+
ocr_texts.append(
|
| 162 |
+
ocr_frame(f.image_path, skip_inverted=inputs.skip_inverted_ocr)
|
| 163 |
+
)
|
| 164 |
pct = 65 + ((i + 1) / max(n, 1)) * 30
|
| 165 |
_emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
|
| 166 |
# Alt-frame OCR is computed lazily by the /ocr endpoint when the
|
|
@@ -691,6 +691,16 @@
|
|
| 691 |
</small>
|
| 692 |
</div>
|
| 693 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
<div class="vgm-tune">
|
| 695 |
<div class="vgm-tune__head">
|
| 696 |
<label for="lang">Document language</label>
|
|
|
|
| 691 |
</small>
|
| 692 |
</div>
|
| 693 |
|
| 694 |
+
<div class="vgm-tune">
|
| 695 |
+
<label class="vgm-tune__toggle">
|
| 696 |
+
<input type="checkbox" name="skip_inverted_ocr" value="true"> Skip inverted OCR pass
|
| 697 |
+
</label>
|
| 698 |
+
<small class="vgm-tune__help">
|
| 699 |
+
Halves OCR time and avoids the inverted pass producing garbled near-duplicates of body text.
|
| 700 |
+
Trade-off: white-on-coloured callout text won't be recovered. Leave off for slides with coloured callouts.
|
| 701 |
+
</small>
|
| 702 |
+
</div>
|
| 703 |
+
|
| 704 |
<div class="vgm-tune">
|
| 705 |
<div class="vgm-tune__head">
|
| 706 |
<label for="lang">Document language</label>
|
|
@@ -49,3 +49,38 @@ def test_empty_candidate_or_existing_is_safe():
|
|
| 49 |
def test_norm_for_match_strips_non_alnum():
|
| 50 |
assert _norm_for_match("Hello, World! 123") == "helloworld123"
|
| 51 |
assert _norm_for_match(" ") == ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def test_norm_for_match_strips_non_alnum():
|
| 50 |
assert _norm_for_match("Hello, World! 123") == "helloworld123"
|
| 51 |
assert _norm_for_match(" ") == ""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_skip_inverted_only_runs_normal_pass(monkeypatch, tmp_path):
|
| 55 |
+
"""When skip_inverted=True, ocr_frame must not invoke Tesseract on
|
| 56 |
+
the inverted variant — that's the whole point of the flag (halves
|
| 57 |
+
OCR time and rules out inverted-pass garbling). Verify by counting
|
| 58 |
+
image_to_string calls and asserting they cover only the normal pass.
|
| 59 |
+
"""
|
| 60 |
+
from PIL import Image as _Image
|
| 61 |
+
|
| 62 |
+
from app.pipeline import ocr as ocr_mod
|
| 63 |
+
|
| 64 |
+
# Stub the preprocess to return two distinguishable PIL images so we
|
| 65 |
+
# don't need a real slide image for the test.
|
| 66 |
+
normal = _Image.new("L", (10, 10), color=255)
|
| 67 |
+
inverted = _Image.new("L", (10, 10), color=0)
|
| 68 |
+
monkeypatch.setattr(ocr_mod, "_preprocess_variants", lambda _p: [normal, inverted])
|
| 69 |
+
|
| 70 |
+
calls: list[int] = []
|
| 71 |
+
|
| 72 |
+
def fake_image_to_string(img, lang="eng"):
|
| 73 |
+
calls.append(id(img))
|
| 74 |
+
return "Body line one\nBody line two"
|
| 75 |
+
|
| 76 |
+
monkeypatch.setattr(ocr_mod.pytesseract, "image_to_string", fake_image_to_string)
|
| 77 |
+
|
| 78 |
+
text_skip = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=True)
|
| 79 |
+
assert len(calls) == 1, "skip_inverted must not invoke the inverted pass"
|
| 80 |
+
assert text_skip == "Body line one\nBody line two"
|
| 81 |
+
|
| 82 |
+
calls.clear()
|
| 83 |
+
text_full = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=False)
|
| 84 |
+
assert len(calls) == 2, "default behaviour must run both passes"
|
| 85 |
+
# With identical text from both variants, the dedupe collapses them.
|
| 86 |
+
assert text_full == "Body line one\nBody line two"
|