# Video2Guide / app/cli.py
# Claude
# Add skip_inverted_ocr flag to opt out of the inverted Tesseract pass
# 355b428
"""`vgm` command-line interface.
Three subcommands:
- `build` — full pipeline: video (+ optional transcript) → HTML / zip / review HTML.
- `export-metadata` — run the pipeline and dump the unified metadata JSON.
- `render-from-metadata` — re-render HTML from a previously exported metadata JSON.
"""
from __future__ import annotations
import contextlib
import logging
from pathlib import Path
from typing import Optional
import typer
from rich.progress import (
BarColumn,
Progress,
TaskProgressColumn,
TextColumn,
TimeElapsedColumn,
)
from .pipeline import bundle_zip, dump_metadata, load_metadata, render_guide, render_review
from .pipeline.html_gen import metadata_to_segment, render_from_metadata
from .pipeline.orchestrator import (
PipelineError,
PipelineInputs,
ProgressEvent,
run_pipeline,
)
@contextlib.contextmanager
def _progress_bar():
    """Run a rich progress display and yield the callback that drives it.

    Yields a single callable (not a pair): it takes one ``ProgressEvent``
    and copies its ``percent``, ``stage`` and ``message`` onto the one
    progress task (total=100). The callable is intended to match
    orchestrator.ProgressCallback.

    The live display is started on entry and always stopped on exit,
    including when the ``with`` body raises.
    """
    progress = Progress(
        TextColumn("[bold]{task.fields[stage]:<13}", justify="left"),
        BarColumn(bar_width=None),
        TaskProgressColumn(),
        TextColumn("{task.fields[msg]}"),
        TimeElapsedColumn(),
        transient=False,
    )
    # Entering the Progress context starts the display and guarantees it is
    # stopped again, even if add_task() or the caller's body fails.
    with progress:
        task_id = progress.add_task("vgm", total=100, stage="starting", msg="")

        def cb(ev: ProgressEvent) -> None:
            progress.update(task_id, completed=ev.percent, stage=ev.stage, msg=ev.message)

        yield cb
# Module-level logger for CLI diagnostics.
log = logging.getLogger("videoguidemaker.cli")

# Root Typer application; the subcommands in this module register on it
# through the @app.command decorators.
app = typer.Typer(
    help="VideoGuideMaker — generate WCAG-ready study guides from video + transcript.",
    no_args_is_help=True,
    add_completion=False,
)
def _setup_logging(verbose: bool) -> None:
# Default to WARNING so the rich progress bar isn't disrupted by INFO logs.
# `--verbose` opts into DEBUG output.
logging.basicConfig(
level=logging.DEBUG if verbose else logging.WARNING,
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
def _resolve_format(fmt: str, output: Path) -> str:
fmt = fmt.lower()
if fmt not in ("review", "single", "zip", "guide"):
raise typer.BadParameter("format must be one of: review, single, zip, guide")
return fmt
def _safe_filename(title: str) -> str:
safe = "".join(c if c.isalnum() or c in "-_ " else "-" for c in title).strip()
safe = safe.replace(" ", "-")
return safe or "study-guide"
@app.command()
def build(
    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
    title: str = typer.Option("Untitled Study Guide", "--title"),
    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
    module: Optional[str] = typer.Option(None, "--module"),
    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
    whisper_model: str = typer.Option("small", "--whisper-model"),
    threshold: float = typer.Option(27.0, "--threshold"),
    min_gap: float = typer.Option(
        0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."
    ),
    max_frames: Optional[int] = typer.Option(
        None,
        "--max-frames",
        help="Cap total frames; uniformly downsamples preserving first + last.",
    ),
    skip_ocr: bool = typer.Option(
        False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."
    ),
    skip_inverted_ocr: bool = typer.Option(
        False,
        "--skip-inverted-ocr",
        help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts.",
    ),
    face_threshold: float = typer.Option(0.12, "--face-threshold"),
    lang: str = typer.Option("en", "--lang"),
    fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
    export_metadata: Optional[Path] = typer.Option(None, "--export-metadata"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Run the full pipeline: video + transcript → HTML."""
    # The docstring above doubles as the command's --help text, so the
    # longer notes live in comments.
    #
    # Flow: validate --format, run the pipeline behind a progress bar,
    # optionally dump metadata, render HTML (review or guide), then write
    # either the HTML file or a zip bundle. Exits with status 2 when the
    # pipeline raises PipelineError.
    _setup_logging(verbose)
    fmt = _resolve_format(fmt, output)

    # The transcript argument is optional and declared with exists=False, so a
    # mistyped path reaches this point. Keep going without it, but say so
    # instead of silently producing a transcript-less guide.
    transcript_path = transcript if transcript and transcript.exists() else None
    if transcript is not None and transcript_path is None:
        typer.secho(
            f"warning: transcript not found, continuing without it: {transcript}",
            fg=typer.colors.YELLOW,
            err=True,
        )

    inputs = PipelineInputs(
        video_path=video,
        transcript_path=transcript_path,
        frames_dir=frames_dir,
        title=title,
        subtitle=subtitle,
        module=module,
        lang=lang,
        threshold=threshold,
        min_gap_seconds=min_gap,
        max_frames=max_frames,
        skip_ocr=skip_ocr,
        skip_inverted_ocr=skip_inverted_ocr,
        face_threshold=face_threshold,
        auto_transcribe=auto_transcribe,
        whisper_model=whisper_model,
        inline_images=(fmt in ("single", "review")),
    )
    try:
        with _progress_bar() as cb:
            result = run_pipeline(inputs, progress=cb)
    except PipelineError as exc:
        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(2) from exc

    # Dump metadata BEFORE rendering: a render failure (template bug,
    # disk full mid-write) would otherwise discard the LLM/OCR work
    # the user just paid for.
    if export_metadata:
        dump_metadata(export_metadata, result.page)
        typer.echo(f"wrote {export_metadata}")

    # Keyword arguments shared by both renderers.
    common = dict(
        title=title,
        segments=result.segments,
        lang=lang,
        subtitle=subtitle,
        module=module,
        meta_lines=result.page.meta_lines or None,
        eyebrow=result.page.eyebrow,
    )
    if fmt == "review":
        html = render_review(**common)
    else:
        inline = fmt == "single"
        if inline:
            # Inline audio data URIs alongside images so the single HTML
            # stays self-contained (no broken audio/foo.mp3 references).
            import base64

            for seg, ap in zip(result.segments, result.audio_paths):
                if ap and ap.exists():
                    seg.audio_data_uri = (
                        "data:audio/mpeg;base64,"
                        + base64.b64encode(ap.read_bytes()).decode("ascii")
                    )
        html = render_guide(inline_images=inline, **common)

    if fmt == "zip":
        audio_disk_paths = [p for p in result.audio_paths if p is not None]
        zip_bytes = bundle_zip(
            html,
            [f.image_path for f in result.kept_frames],
            audio_paths=audio_disk_paths,
        )
        # A zip bundle is always written with a .zip suffix, whatever -o said.
        if output.suffix.lower() != ".zip":
            output = output.with_suffix(".zip")
        output.write_bytes(zip_bytes)
    else:
        output.write_text(html, encoding="utf-8")
    typer.echo(f"wrote {output}")
@app.command("export-metadata")
def export_metadata_cmd(
    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
    title: str = typer.Option("Untitled Study Guide", "--title"),
    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
    module: Optional[str] = typer.Option(None, "--module"),
    output: Path = typer.Option(Path("study_guide_metadata.json"), "--output", "-o"),
    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
    whisper_model: str = typer.Option("small", "--whisper-model"),
    threshold: float = typer.Option(27.0, "--threshold"),
    min_gap: float = typer.Option(
        0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."
    ),
    max_frames: Optional[int] = typer.Option(
        None,
        "--max-frames",
        help="Cap total frames; uniformly downsamples preserving first + last.",
    ),
    skip_ocr: bool = typer.Option(
        False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."
    ),
    skip_inverted_ocr: bool = typer.Option(
        False,
        "--skip-inverted-ocr",
        help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts.",
    ),
    face_threshold: float = typer.Option(0.12, "--face-threshold"),
    lang: str = typer.Option("en", "--lang"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Run the pipeline and dump the metadata JSON only (no HTML)."""
    # The docstring above doubles as the command's --help text, so the
    # longer notes live in comments.
    #
    # Runs the pipeline behind a progress bar with inline_images disabled,
    # then writes result.page to `output`. Exits with status 2 when the
    # pipeline raises PipelineError.
    _setup_logging(verbose)

    # The transcript argument is optional and declared with exists=False, so a
    # mistyped path reaches this point. Keep going without it, but say so
    # instead of silently exporting transcript-less metadata.
    transcript_path = transcript if transcript and transcript.exists() else None
    if transcript is not None and transcript_path is None:
        typer.secho(
            f"warning: transcript not found, continuing without it: {transcript}",
            fg=typer.colors.YELLOW,
            err=True,
        )

    inputs = PipelineInputs(
        video_path=video,
        transcript_path=transcript_path,
        frames_dir=frames_dir,
        title=title,
        subtitle=subtitle,
        module=module,
        lang=lang,
        threshold=threshold,
        min_gap_seconds=min_gap,
        max_frames=max_frames,
        skip_ocr=skip_ocr,
        skip_inverted_ocr=skip_inverted_ocr,
        face_threshold=face_threshold,
        auto_transcribe=auto_transcribe,
        whisper_model=whisper_model,
        inline_images=False,
    )
    try:
        with _progress_bar() as cb:
            result = run_pipeline(inputs, progress=cb)
    except PipelineError as exc:
        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(2) from exc

    dump_metadata(output, result.page)
    typer.echo(f"wrote {output} ({len(result.page.segments)} segments, frames in {frames_dir})")
@app.command("render-from-metadata")
def render_from_metadata_cmd(
    metadata_json: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
    frames_dir: Optional[Path] = typer.Option(
        None,
        "--frames-dir",
        help="Override the frames_dir recorded in the metadata JSON.",
    ),
    fmt: str = typer.Option("single", "--format", help="review | single | guide"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Re-render HTML from a previously exported metadata JSON."""
    # The docstring above doubles as the command's --help text, so the
    # longer notes live in comments.
    _setup_logging(verbose)

    mode = _resolve_format(fmt, output)
    # "zip" passes the shared format check but is rejected for this command.
    if mode == "zip":
        raise typer.BadParameter("zip format requires source frames; use 'build' instead.")

    page = load_metadata(metadata_json)

    # An explicit --frames-dir wins; otherwise the directory recorded in the
    # metadata is resolved relative to the JSON file's own location.
    if frames_dir is None:
        frames_root = (metadata_json.parent / page.frames_dir).resolve()
    else:
        frames_root = frames_dir

    rendered = render_from_metadata(page, Path(frames_root), mode=mode)
    output.write_text(rendered, encoding="utf-8")
    typer.echo(f"wrote {output}")
# Script entry point: run the Typer app when this module is executed directly.
if __name__ == "__main__": # pragma: no cover
    app()