Claude committed on
Commit
355b428
·
1 Parent(s): 11a6e81

Add skip_inverted_ocr flag to opt out of the inverted Tesseract pass

Browse files

The inverted-binarization pass exists to recover white-on-coloured
callout text the normal pass turns into noise, but on slides with no
such callouts it just halves throughput and risks producing garbled
near-duplicates of body text (filtered as of the previous fix).

Add a per-job toggle that disables the inverted pass entirely:
- ocr_frame(skip_inverted=True) returns only normal-pass output
- PipelineInputs.skip_inverted_ocr threads it through the orchestrator
- /jobs and /process endpoints accept skip_inverted_ocr as a form field
- CLI exposes --skip-inverted-ocr on build and metadata
- Upload page advanced settings include a "Skip inverted OCR pass" toggle
- Job remembers the choice so the lazy /ocr endpoint stays consistent
when the reviewer picks an alt frame in the editor

app/cli.py CHANGED
@@ -102,6 +102,7 @@ def build(
102
  min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
103
  max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
104
  skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
 
105
  face_threshold: float = typer.Option(0.12, "--face-threshold"),
106
  lang: str = typer.Option("en", "--lang"),
107
  fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
@@ -124,6 +125,7 @@ def build(
124
  min_gap_seconds=min_gap,
125
  max_frames=max_frames,
126
  skip_ocr=skip_ocr,
 
127
  face_threshold=face_threshold,
128
  auto_transcribe=auto_transcribe,
129
  whisper_model=whisper_model,
@@ -200,6 +202,7 @@ def export_metadata_cmd(
200
  min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
201
  max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
202
  skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
 
203
  face_threshold: float = typer.Option(0.12, "--face-threshold"),
204
  lang: str = typer.Option("en", "--lang"),
205
  verbose: bool = typer.Option(False, "--verbose", "-v"),
@@ -218,6 +221,7 @@ def export_metadata_cmd(
218
  min_gap_seconds=min_gap,
219
  max_frames=max_frames,
220
  skip_ocr=skip_ocr,
 
221
  face_threshold=face_threshold,
222
  auto_transcribe=auto_transcribe,
223
  whisper_model=whisper_model,
 
102
  min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
103
  max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
104
  skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
105
+ skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
106
  face_threshold: float = typer.Option(0.12, "--face-threshold"),
107
  lang: str = typer.Option("en", "--lang"),
108
  fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
 
125
  min_gap_seconds=min_gap,
126
  max_frames=max_frames,
127
  skip_ocr=skip_ocr,
128
+ skip_inverted_ocr=skip_inverted_ocr,
129
  face_threshold=face_threshold,
130
  auto_transcribe=auto_transcribe,
131
  whisper_model=whisper_model,
 
202
  min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
203
  max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
204
  skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
205
+ skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
206
  face_threshold: float = typer.Option(0.12, "--face-threshold"),
207
  lang: str = typer.Option("en", "--lang"),
208
  verbose: bool = typer.Option(False, "--verbose", "-v"),
 
221
  min_gap_seconds=min_gap,
222
  max_frames=max_frames,
223
  skip_ocr=skip_ocr,
224
+ skip_inverted_ocr=skip_inverted_ocr,
225
  face_threshold=face_threshold,
226
  auto_transcribe=auto_transcribe,
227
  whisper_model=whisper_model,
app/jobs.py CHANGED
@@ -56,6 +56,11 @@ class Job:
56
  # first request to /ocr/{id}/{fn} and kept here so subsequent picks
57
  # of the same alt don't re-OCR.
58
  alt_ocr_cache: dict[str, str] = field(default_factory=dict)
 
 
 
 
 
59
 
60
 
61
  class JobRegistry:
 
56
  # first request to /ocr/{id}/{fn} and kept here so subsequent picks
57
  # of the same alt don't re-OCR.
58
  alt_ocr_cache: dict[str, str] = field(default_factory=dict)
59
+ # Mirrors the per-job OCR options chosen at upload time so the lazy
60
+ # /ocr endpoint keeps the same behaviour when the reviewer picks an
61
+ # alt frame — otherwise an alt's OCR could re-introduce garbling
62
+ # the user opted out of for the main pipeline.
63
+ ocr_skip_inverted: bool = False
64
 
65
 
66
  class JobRegistry:
app/main.py CHANGED
@@ -250,6 +250,7 @@ async def start_job(
250
  min_gap: float = Form(0.0),
251
  max_frames: str | None = Form(None),
252
  skip_ocr: bool = Form(False),
 
253
  face_threshold: float = Form(0.12),
254
  lang: str = Form("en"),
255
  subtitle: str | None = Form(None),
@@ -293,6 +294,7 @@ async def start_job(
293
  min_gap_seconds=min_gap,
294
  max_frames=_parse_optional_int(max_frames),
295
  skip_ocr=skip_ocr,
 
296
  face_threshold=face_threshold,
297
  inline_images=(format in ("single", "review")),
298
  anthropic_api_key=(anthropic_api_key or None),
@@ -310,6 +312,7 @@ async def start_job(
310
  job = registry.create()
311
  except JobQuotaExceeded as exc:
312
  raise HTTPException(429, str(exc))
 
313
  threading.Thread(
314
  target=_run_job_in_thread,
315
  args=(job, inputs, workdir),
@@ -498,7 +501,9 @@ async def fetch_ocr(job_id: str, filename: str) -> Response:
498
  tmp.write(data)
499
  tmp_path = Path(tmp.name)
500
  try:
501
- text = await run_in_threadpool(ocr_frame, tmp_path)
 
 
502
  finally:
503
  try:
504
  tmp_path.unlink(missing_ok=True)
@@ -633,6 +638,7 @@ async def process(
633
  min_gap: float = Form(0.0),
634
  max_frames: str | None = Form(None),
635
  skip_ocr: bool = Form(False),
 
636
  face_threshold: float = Form(0.12),
637
  lang: str = Form("en"),
638
  subtitle: str | None = Form(None),
@@ -675,6 +681,7 @@ async def process(
675
  min_gap_seconds=min_gap,
676
  max_frames=_parse_optional_int(max_frames),
677
  skip_ocr=skip_ocr,
 
678
  face_threshold=face_threshold,
679
  inline_images=(format in ("single", "review")),
680
  anthropic_api_key=(anthropic_api_key or None),
 
250
  min_gap: float = Form(0.0),
251
  max_frames: str | None = Form(None),
252
  skip_ocr: bool = Form(False),
253
+ skip_inverted_ocr: bool = Form(False),
254
  face_threshold: float = Form(0.12),
255
  lang: str = Form("en"),
256
  subtitle: str | None = Form(None),
 
294
  min_gap_seconds=min_gap,
295
  max_frames=_parse_optional_int(max_frames),
296
  skip_ocr=skip_ocr,
297
+ skip_inverted_ocr=skip_inverted_ocr,
298
  face_threshold=face_threshold,
299
  inline_images=(format in ("single", "review")),
300
  anthropic_api_key=(anthropic_api_key or None),
 
312
  job = registry.create()
313
  except JobQuotaExceeded as exc:
314
  raise HTTPException(429, str(exc))
315
+ job.ocr_skip_inverted = skip_inverted_ocr
316
  threading.Thread(
317
  target=_run_job_in_thread,
318
  args=(job, inputs, workdir),
 
501
  tmp.write(data)
502
  tmp_path = Path(tmp.name)
503
  try:
504
+ text = await run_in_threadpool(
505
+ lambda: ocr_frame(tmp_path, skip_inverted=job.ocr_skip_inverted)
506
+ )
507
  finally:
508
  try:
509
  tmp_path.unlink(missing_ok=True)
 
638
  min_gap: float = Form(0.0),
639
  max_frames: str | None = Form(None),
640
  skip_ocr: bool = Form(False),
641
+ skip_inverted_ocr: bool = Form(False),
642
  face_threshold: float = Form(0.12),
643
  lang: str = Form("en"),
644
  subtitle: str | None = Form(None),
 
681
  min_gap_seconds=min_gap,
682
  max_frames=_parse_optional_int(max_frames),
683
  skip_ocr=skip_ocr,
684
+ skip_inverted_ocr=skip_inverted_ocr,
685
  face_threshold=face_threshold,
686
  inline_images=(format in ("single", "review")),
687
  anthropic_api_key=(anthropic_api_key or None),
app/pipeline/ocr.py CHANGED
@@ -76,17 +76,23 @@ def _preprocess_variants(image_path: Path) -> list[Image.Image]:
76
  return [Image.fromarray(normal), Image.fromarray(inverted)]
77
 
78
 
79
- def ocr_frame(image_path: Path, lang: str = "eng") -> str:
 
 
80
  """Return cleaned OCR text for a single frame; empty string on failure.
81
 
82
- Runs Tesseract on both the normal and inverted binarizations and
83
- merges unique lines. The normal pass picks up most slide content,
84
- the inverted pass recovers white-on-coloured callout text the
85
- normal pass turns into noise. Order from the normal pass is
86
- preserved; inverted-only lines are appended after a fuzzy-dedupe
87
- against the normal pass — inverting a clean dark-on-light line
88
- often produces a garbled near-duplicate, and we want only the
89
- callout text the normal pass actually missed.
 
 
 
 
90
  """
91
  try:
92
  variants = _preprocess_variants(image_path)
@@ -103,6 +109,8 @@ def ocr_frame(image_path: Path, lang: str = "eng") -> str:
103
  return [s.strip() for s in text.splitlines() if s.strip()]
104
 
105
  normal_lines = _ocr_lines(variants[0])
 
 
106
  merged: list[str] = list(normal_lines)
107
  seen_lower: set[str] = {ln.lower() for ln in normal_lines}
108
  for img in variants[1:]:
 
76
  return [Image.fromarray(normal), Image.fromarray(inverted)]
77
 
78
 
79
+ def ocr_frame(
80
+ image_path: Path, lang: str = "eng", *, skip_inverted: bool = False
81
+ ) -> str:
82
  """Return cleaned OCR text for a single frame; empty string on failure.
83
 
84
+ Runs Tesseract on the normal binarization and — unless `skip_inverted`
85
+ is True — on the inverted binarization too. The normal pass picks up
86
+ most slide content; the inverted pass recovers white-on-coloured
87
+ callout text the normal pass turns into noise. Order from the normal
88
+ pass is preserved; inverted-only lines are appended after a fuzzy-
89
+ dedupe against the normal pass — inverting a clean dark-on-light line
90
+ often produces a garbled near-duplicate, and we want only the callout
91
+ text the normal pass actually missed.
92
+
93
+ Pass `skip_inverted=True` for slides without coloured callouts to halve
94
+ OCR time and rule out any inverted-pass garbling at the cost of losing
95
+ callout-text recovery.
96
  """
97
  try:
98
  variants = _preprocess_variants(image_path)
 
109
  return [s.strip() for s in text.splitlines() if s.strip()]
110
 
111
  normal_lines = _ocr_lines(variants[0])
112
+ if skip_inverted:
113
+ return "\n".join(normal_lines)
114
  merged: list[str] = list(normal_lines)
115
  seen_lower: set[str] = {ln.lower() for ln in normal_lines}
116
  for img in variants[1:]:
app/pipeline/orchestrator.py CHANGED
@@ -50,6 +50,11 @@ class PipelineInputs:
50
  max_frames: int | None = None
51
  face_threshold: float = 0.12
52
  skip_ocr: bool = False
 
 
 
 
 
53
  auto_transcribe: bool = False
54
  whisper_model: str = "small"
55
  inline_images: bool = False
@@ -153,7 +158,9 @@ def run_pipeline(
153
  _emit(65, "ocr", f"Running OCR on {n} frames...")
154
  ocr_texts: list[str] = []
155
  for i, f in enumerate(kept):
156
- ocr_texts.append(ocr_frame(f.image_path))
 
 
157
  pct = 65 + ((i + 1) / max(n, 1)) * 30
158
  _emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
159
  # Alt-frame OCR is computed lazily by the /ocr endpoint when the
 
50
  max_frames: int | None = None
51
  face_threshold: float = 0.12
52
  skip_ocr: bool = False
53
+ # When True, Tesseract runs only on the normal binarization. The
54
+ # inverted pass (callout-text recovery) is skipped — useful when a
55
+ # deck has no white-on-coloured callouts and the inverted pass is
56
+ # producing garbled near-duplicates of body text.
57
+ skip_inverted_ocr: bool = False
58
  auto_transcribe: bool = False
59
  whisper_model: str = "small"
60
  inline_images: bool = False
 
158
  _emit(65, "ocr", f"Running OCR on {n} frames...")
159
  ocr_texts: list[str] = []
160
  for i, f in enumerate(kept):
161
+ ocr_texts.append(
162
+ ocr_frame(f.image_path, skip_inverted=inputs.skip_inverted_ocr)
163
+ )
164
  pct = 65 + ((i + 1) / max(n, 1)) * 30
165
  _emit(pct, "ocr", f"OCR frame {i + 1}/{n}")
166
  # Alt-frame OCR is computed lazily by the /ocr endpoint when the
app/templates/index.html CHANGED
@@ -691,6 +691,16 @@
691
  </small>
692
  </div>
693
 
 
 
 
 
 
 
 
 
 
 
694
  <div class="vgm-tune">
695
  <div class="vgm-tune__head">
696
  <label for="lang">Document language</label>
 
691
  </small>
692
  </div>
693
 
694
+ <div class="vgm-tune">
695
+ <label class="vgm-tune__toggle">
696
+ <input type="checkbox" name="skip_inverted_ocr" value="true"> Skip inverted OCR pass
697
+ </label>
698
+ <small class="vgm-tune__help">
699
+ Halves OCR time and avoids the inverted pass producing garbled near-duplicates of body text.
700
+ Trade-off: white-on-coloured callout text won't be recovered. Leave off for slides with coloured callouts.
701
+ </small>
702
+ </div>
703
+
704
  <div class="vgm-tune">
705
  <div class="vgm-tune__head">
706
  <label for="lang">Document language</label>
tests/test_ocr_dedup.py CHANGED
@@ -49,3 +49,38 @@ def test_empty_candidate_or_existing_is_safe():
49
  def test_norm_for_match_strips_non_alnum():
50
  assert _norm_for_match("Hello, World! 123") == "helloworld123"
51
  assert _norm_for_match(" ") == ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def test_norm_for_match_strips_non_alnum():
50
  assert _norm_for_match("Hello, World! 123") == "helloworld123"
51
  assert _norm_for_match(" ") == ""
52
+
53
+
54
+ def test_skip_inverted_only_runs_normal_pass(monkeypatch, tmp_path):
55
+ """When skip_inverted=True, ocr_frame must not invoke Tesseract on
56
+ the inverted variant — that's the whole point of the flag (halves
57
+ OCR time and rules out inverted-pass garbling). Verify by counting
58
+ image_to_string calls and asserting they cover only the normal pass.
59
+ """
60
+ from PIL import Image as _Image
61
+
62
+ from app.pipeline import ocr as ocr_mod
63
+
64
+ # Stub the preprocess to return two distinguishable PIL images so we
65
+ # don't need a real slide image for the test.
66
+ normal = _Image.new("L", (10, 10), color=255)
67
+ inverted = _Image.new("L", (10, 10), color=0)
68
+ monkeypatch.setattr(ocr_mod, "_preprocess_variants", lambda _p: [normal, inverted])
69
+
70
+ calls: list[int] = []
71
+
72
+ def fake_image_to_string(img, lang="eng"):
73
+ calls.append(id(img))
74
+ return "Body line one\nBody line two"
75
+
76
+ monkeypatch.setattr(ocr_mod.pytesseract, "image_to_string", fake_image_to_string)
77
+
78
+ text_skip = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=True)
79
+ assert len(calls) == 1, "skip_inverted must not invoke the inverted pass"
80
+ assert text_skip == "Body line one\nBody line two"
81
+
82
+ calls.clear()
83
+ text_full = ocr_mod.ocr_frame(tmp_path / "fake.jpg", skip_inverted=False)
84
+ assert len(calls) == 2, "default behaviour must run both passes"
85
+ # With identical text from both variants, the dedupe collapses them.
86
+ assert text_full == "Body line one\nBody line two"