Spaces:

joemartis
/

Video2Guide

Running

Claude committed 7 days ago

Commit

355b428

1 Parent(s): 11a6e81

Add skip_inverted_ocr flag to opt out of the inverted Tesseract pass

The inverted-binarization pass exists to recover white-on-coloured
callout text the normal pass turns into noise, but on slides with no
such callouts it just halves throughput and risks producing garbled
near-duplicates of body text (filtered as of the previous fix).

Add a per-job toggle that disables the inverted pass entirely:
- ocr_frame(skip_inverted=True) returns only normal-pass output
- PipelineInputs.skip_inverted_ocr threads it through the orchestrator
- /jobs and /process endpoints accept skip_inverted_ocr as a form field
- CLI exposes --skip-inverted-ocr on build and metadata
- Upload page advanced settings include a "Skip inverted OCR pass" toggle
- Job remembers the choice so the lazy /ocr endpoint stays consistent
when the reviewer picks an alt frame in the editor

Files changed (7) hide show

app/cli.py +4 -0
app/jobs.py +5 -0
app/main.py +8 -1
app/pipeline/ocr.py +17 -9
app/pipeline/orchestrator.py +8 -1
app/templates/index.html +10 -0
tests/test_ocr_dedup.py +35 -0

app/cli.py CHANGED Viewed

@@ -102,6 +102,7 @@ def build(
     min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
     max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
     skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
     face_threshold: float = typer.Option(0.12, "--face-threshold"),
     lang: str = typer.Option("en", "--lang"),
     fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
@@ -124,6 +125,7 @@ def build(
         min_gap_seconds=min_gap,
         max_frames=max_frames,
         skip_ocr=skip_ocr,
         face_threshold=face_threshold,
         auto_transcribe=auto_transcribe,
         whisper_model=whisper_model,
@@ -200,6 +202,7 @@ def export_metadata_cmd(
     min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
     max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
     skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
     face_threshold: float = typer.Option(0.12, "--face-threshold"),
     lang: str = typer.Option("en", "--lang"),
     verbose: bool = typer.Option(False, "--verbose", "-v"),
@@ -218,6 +221,7 @@ def export_metadata_cmd(
         min_gap_seconds=min_gap,
         max_frames=max_frames,
         skip_ocr=skip_ocr,
         face_threshold=face_threshold,
         auto_transcribe=auto_transcribe,
         whisper_model=whisper_model,

     min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
     max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
     skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
+    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
     face_threshold: float = typer.Option(0.12, "--face-threshold"),
     lang: str = typer.Option("en", "--lang"),
     fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
         min_gap_seconds=min_gap,
         max_frames=max_frames,
         skip_ocr=skip_ocr,
+        skip_inverted_ocr=skip_inverted_ocr,
         face_threshold=face_threshold,
         auto_transcribe=auto_transcribe,
         whisper_model=whisper_model,
     min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
     max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
     skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
+    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
     face_threshold: float = typer.Option(0.12, "--face-threshold"),
     lang: str = typer.Option("en", "--lang"),
     verbose: bool = typer.Option(False, "--verbose", "-v"),
         min_gap_seconds=min_gap,
         max_frames=max_frames,
         skip_ocr=skip_ocr,
+        skip_inverted_ocr=skip_inverted_ocr,
         face_threshold=face_threshold,
         auto_transcribe=auto_transcribe,
         whisper_model=whisper_model,

app/jobs.py CHANGED Viewed

@@ -56,6 +56,11 @@ class Job:
     # first request to /ocr/{id}/{fn} and kept here so subsequent picks
     # of the same alt don't re-OCR.
     alt_ocr_cache: dict[str, str] = field(default_factory=dict)
 class JobRegistry:

     # first request to /ocr/{id}/{fn} and kept here so subsequent picks
     # of the same alt don't re-OCR.
     alt_ocr_cache: dict[str, str] = field(default_factory=dict)
+    # Mirrors the per-job OCR options chosen at upload time so the lazy
+    # /ocr endpoint keeps the same behaviour when the reviewer picks an
+    # alt frame — otherwise an alt's OCR could re-introduce garbling
+    # the user opted out of for the main pipeline.
+    ocr_skip_inverted: bool = False
 class JobRegistry:

app/main.py CHANGED Viewed

@@ -250,6 +250,7 @@ async def start_job(
     min_gap: float = Form(0.0),
     max_frames: str | None = Form(None),
     skip_ocr: bool = Form(False),
     face_threshold: float = Form(0.12),
     lang: str = Form("en"),
     subtitle: str | None = Form(None),
@@ -293,6 +294,7 @@ async def start_job(
             min_gap_seconds=min_gap,
             max_frames=_parse_optional_int(max_frames),
             skip_ocr=skip_ocr,
             face_threshold=face_threshold,
             inline_images=(format in ("single", "review")),
             anthropic_api_key=(anthropic_api_key or None),
@@ -310,6 +312,7 @@ async def start_job(
             job = registry.create()
         except JobQuotaExceeded as exc:
             raise HTTPException(429, str(exc))
         threading.Thread(
             target=_run_job_in_thread,
             args=(job, inputs, workdir),
@@ -498,7 +501,9 @@ async def fetch_ocr(job_id: str, filename: str) -> Response:
         tmp.write(data)
         tmp_path = Path(tmp.name)
     try:
-        text = await run_in_threadpool(ocr_frame, tmp_path)
     finally:
         try:
             tmp_path.unlink(missing_ok=True)
@@ -633,6 +638,7 @@ async def process(
     min_gap: float = Form(0.0),
     max_frames: str | None = Form(None),
     skip_ocr: bool = Form(False),
     face_threshold: float = Form(0.12),
     lang: str = Form("en"),
     subtitle: str | None = Form(None),
@@ -675,6 +681,7 @@ async def process(
             min_gap_seconds=min_gap,
             max_frames=_parse_optional_int(max_frames),
             skip_ocr=skip_ocr,
             face_threshold=face_threshold,
             inline_images=(format in ("single", "review")),
             anthropic_api_key=(anthropic_api_key or None),

     min_gap: float = Form(0.0),
     max_frames: str | None = Form(None),
     skip_ocr: bool = Form(False),
+    skip_inverted_ocr: bool = Form(False),
     face_threshold: float = Form(0.12),
     lang: str = Form("en"),
     subtitle: str | None = Form(None),
             min_gap_seconds=min_gap,
             max_frames=_parse_optional_int(max_frames),
             skip_ocr=skip_ocr,
+            skip_inverted_ocr=skip_inverted_ocr,
             face_threshold=face_threshold,
             inline_images=(format in ("single", "review")),
             anthropic_api_key=(anthropic_api_key or None),
             job = registry.create()
         except JobQuotaExceeded as exc:
             raise HTTPException(429, str(exc))
+        job.ocr_skip_inverted = skip_inverted_ocr
         threading.Thread(
             target=_run_job_in_thread,
             args=(job, inputs, workdir),
         tmp.write(data)
         tmp_path = Path(tmp.name)
     try:
+        text = await run_in_threadpool(
+            lambda: ocr_frame(tmp_path, skip_inverted=job.ocr_skip_inverted)
+        )
     finally:
         try:
             tmp_path.unlink(missing_ok=True)
     min_gap: float = Form(0.0),
     max_frames: str | None = Form(None),
     skip_ocr: bool = Form(False),
+    skip_inverted_ocr: bool = Form(False),
     face_threshold: float = Form(0.12),
     lang: str = Form("en"),
     subtitle: str | None = Form(None),
             min_gap_seconds=min_gap,
             max_frames=_parse_optional_int(max_frames),
             skip_ocr=skip_ocr,
+            skip_inverted_ocr=skip_inverted_ocr,
             face_threshold=face_threshold,
             inline_images=(format in ("single", "review")),
             anthropic_api_key=(anthropic_api_key or None),

app/pipeline/ocr.py CHANGED Viewed

@@ -76,17 +76,23 @@ def _preprocess_variants(image_path: Path) -> list[Image.Image]:
     return [Image.fromarray(normal), Image.fromarray(inverted)]
-def ocr_frame(image_path: Path, lang: str = "eng") -> str:
     """Return cleaned OCR text for a single frame; empty string on failure.
-    Runs Tesseract on both the normal and inverted binarizations and
-    merges unique lines. The normal pass picks up most slide content,
-    the inverted pass recovers white-on-coloured callout text the
-    normal pass turns into noise. Order from the normal pass is
-    preserved; inverted-only lines are appended after a fuzzy-dedupe
-    against the normal pass — inverting a clean dark-on-light line
-    often produces a garbled near-duplicate, and we want only the
-    callout text the normal pass actually missed.
     """
     try:
         variants = _preprocess_variants(image_path)
@@ -103,6 +109,8 @@ def ocr_frame(image_path: Path, lang: str = "eng") -> str:
         return [s.strip() for s in text.splitlines() if s.strip()]
     normal_lines = _ocr_lines(variants[0])
     merged: list[str] = list(normal_lines)
     seen_lower: set[str] = {ln.lower() for ln in normal_lines}
     for img in variants[1:]:

     return [Image.fromarray(normal), Image.fromarray(inverted)]
+def ocr_frame(
+    image_path: Path, lang: str = "eng", *, skip_inverted: bool = False
+) -> str:
     """Return cleaned OCR text for a single frame; empty string on failure.
+    Runs Tesseract on the normal binarization, and — unless `skip_inverted`
+    is True — on the inverted binarization too. The normal pass picks up
+    most slide content; the inverted pass recovers white-on-coloured
+    callout text the normal pass turns into noise. Order from the normal
+    pass is preserved; inverted-only lines are appended after a fuzzy-
+    dedupe against the normal pass — inverting a clean dark-on-light line
+    often produces a garbled near-duplicate, and we want only the callout
+    text the normal pass actually missed.
+    Pass `skip_inverted=True` for slides without coloured callouts to halve
+    OCR time and rule out any inverted-pass garbling at the cost of losing
+    callout-text recovery.
     """
     try:
         variants = _preprocess_variants(image_path)
         return [s.strip() for s in text.splitlines() if s.strip()]
     normal_lines = _ocr_lines(variants[0])
+    if skip_inverted:
+        return "\n".join(normal_lines)
     merged: list[str] = list(normal_lines)
     seen_lower: set[str] = {ln.lower() for ln in normal_lines}
     for img in variants[1:]:

app/pipeline/orchestrator.py CHANGED Viewed

@@ -50,6 +50,11 @@ class PipelineInputs:
     max_frames: int | None = None
     face_threshold: float = 0.12
     skip_ocr: bool = False
     auto_transcribe: bool = False
     whisper_model: str = "small"
     inline_images: bool = False
@@ -153,7 +158,9 @@ def run_pipeline(
         _emit(65, "ocr", f"Running OCR on {n} frames...")
         ocr_texts: list[str] = []
         for i, f in enumerate(kept):
-            ocr_texts.append(ocr_frame(f.image_path))
             pct = 65 + ((i + 1) / max(n, 1)) * 30
             _emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
     # Alt-frame OCR is computed lazily by the /ocr endpoint when the

     max_frames: int | None = None
     face_threshold: float = 0.12
     skip_ocr: bool = False
+    # When True, Tesseract runs only on the normal binarization. The
+    # inverted pass (callout-text recovery) is skipped — useful when a
+    # deck has no white-on-coloured callouts and the inverted pass is
+    # producing garbled near-duplicates of body text.
+    skip_inverted_ocr: bool = False
     auto_transcribe: bool = False
     whisper_model: str = "small"
     inline_images: bool = False
         _emit(65, "ocr", f"Running OCR on {n} frames...")
         ocr_texts: list[str] = []
         for i, f in enumerate(kept):
+            ocr_texts.append(
+                ocr_frame(f.image_path, skip_inverted=inputs.skip_inverted_ocr)
+            )
             pct = 65 + ((i + 1) / max(n, 1)) * 30
             _emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
     # Alt-frame OCR is computed lazily by the /ocr endpoint when the

app/templates/index.html CHANGED Viewed

@@ -691,6 +691,16 @@
                 </small>
               </div>
               <div class="vgm-tune">
                 <div class="vgm-tune__head">
                   <label for="lang">Document language</label>

                 </small>
               </div>
+              <div class="vgm-tune">
+                <label class="vgm-tune__toggle">
+                  <input type="checkbox" name="skip_inverted_ocr" value="true"> Skip inverted OCR pass
+                </label>
+                <small class="vgm-tune__help">
+                  Halves OCR time and avoids the inverted pass producing garbled near-duplicates of body text.
+                  Trade-off: white-on-coloured callout text won't be recovered. Leave off for slides with coloured callouts.
+                </small>
+              </div>
               <div class="vgm-tune">
                 <div class="vgm-tune__head">
                   <label for="lang">Document language</label>

tests/test_ocr_dedup.py CHANGED Viewed

@@ -49,3 +49,38 @@ def test_empty_candidate_or_existing_is_safe():
 def test_norm_for_match_strips_non_alnum():
     assert _norm_for_match("Hello, World! 123") == "helloworld123"
     assert _norm_for_match("  ") == ""

 def test_norm_for_match_strips_non_alnum():
     assert _norm_for_match("Hello, World! 123") == "helloworld123"
     assert _norm_for_match("  ") == ""
+def test_skip_inverted_only_runs_normal_pass(monkeypatch, tmp_path):
+    """When skip_inverted=True, ocr_frame must not invoke Tesseract on
+    the inverted variant — that's the whole point of the flag (halves
+    OCR time and rules out inverted-pass garbling). Verify by counting
+    image_to_string calls and asserting they cover only the normal pass.
+    """
+    from PIL import Image as _Image
+    from app.pipeline import ocr as ocr_mod
+    # Stub the preprocess to return two distinguishable PIL images so we
+    # don't need a real slide image for the test.
+    normal = _Image.new("L", (10, 10), color=255)
+    inverted = _Image.new("L", (10, 10), color=0)
+    monkeypatch.setattr(ocr_mod, "_preprocess_variants", lambda _p: [normal, inverted])
+    calls: list[int] = []
+    def fake_image_to_string(img, lang="eng"):
+        calls.append(id(img))
+        return "Body line one\nBody line two"
+    monkeypatch.setattr(ocr_mod.pytesseract, "image_to_string", fake_image_to_string)
+    text_skip = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=True)
+    assert len(calls) == 1, "skip_inverted must not invoke the inverted pass"
+    assert text_skip == "Body line one\nBody line two"
+    calls.clear()
+    text_full = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=False)
+    assert len(calls) == 2, "default behaviour must run both passes"
+    # With identical text from both variants, the dedupe collapses them.
+    assert text_full == "Body line one\nBody line two"