Spaces:

joemartis
/

Video2Guide

Running

Claude

Add skip_inverted_ocr flag to opt out of the inverted Tesseract pass

355b428 2 days ago

10.6 kB

	"""`vgm` command-line interface.

	Three subcommands:
	- `build` — full pipeline: video (+ optional transcript) → HTML / zip / review HTML.
	- `export-metadata` — run the pipeline and dump the unified metadata JSON.
	- `render-from-metadata` — re-render HTML from a previously exported metadata JSON.
	"""
	from __future__ import annotations

	import contextlib
	import logging
	from pathlib import Path
	from typing import Optional

	import typer
	from rich.progress import (
	BarColumn,
	Progress,
	TaskProgressColumn,
	TextColumn,
	TimeElapsedColumn,
	)

	from .pipeline import bundle_zip, dump_metadata, load_metadata, render_guide, render_review
	from .pipeline.html_gen import metadata_to_segment, render_from_metadata
	from .pipeline.orchestrator import (
	PipelineError,
	PipelineInputs,
	ProgressEvent,
	run_pipeline,
	)


	@contextlib.contextmanager
	def _progress_bar():
	    """Run a rich progress display and yield a callback that updates it.

	    Yields a single callable that accepts one ``ProgressEvent`` and copies
	    its ``percent``, ``stage`` and ``message`` onto the one task shown by
	    the bar. The callback signature matches orchestrator.ProgressCallback.

	    The display is stopped when the ``with`` block exits, whether it exits
	    normally or through an exception.
	    """
	    progress = Progress(
	        TextColumn("[bold]{task.fields[stage]:<13}", justify="left"),
	        BarColumn(bar_width=None),
	        TaskProgressColumn(),
	        TextColumn("{task.fields[msg]}"),
	        TimeElapsedColumn(),
	        transient=False,
	    )
	    progress.start()
	    try:
	        # Registered inside the try so a failure here still stops the
	        # display that was just started.
	        task_id = progress.add_task("vgm", total=100, stage="starting", msg="")

	        def cb(ev: ProgressEvent) -> None:
	            progress.update(task_id, completed=ev.percent, stage=ev.stage, msg=ev.message)

	        yield cb
	    finally:
	        progress.stop()

	# Typer application object; the @app.command() functions in this module
	# register themselves on it.
	app = typer.Typer(
	    help="VideoGuideMaker — generate WCAG-ready study guides from video + transcript.",
	    no_args_is_help=True,
	    add_completion=False,
	)

	# Module-level logger for the CLI.
	log = logging.getLogger("videoguidemaker.cli")


	def _setup_logging(verbose: bool) -> None:
	# Default to WARNING so the rich progress bar isn't disrupted by INFO logs.
	# `--verbose` opts into DEBUG output.
	logging.basicConfig(
	level=logging.DEBUG if verbose else logging.WARNING,
	format="%(asctime)s %(levelname)s %(name)s: %(message)s",
	)


	def _resolve_format(fmt: str, output: Path) -> str:
	fmt = fmt.lower()
	if fmt not in ("review", "single", "zip", "guide"):
	raise typer.BadParameter("format must be one of: review, single, zip, guide")
	return fmt


	def _safe_filename(title: str) -> str:
	safe = "".join(c if c.isalnum() or c in "-_ " else "-" for c in title).strip()
	safe = safe.replace(" ", "-")
	return safe or "study-guide"


	@app.command()
	def build(
	    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
	    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
	    title: str = typer.Option("Untitled Study Guide", "--title"),
	    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
	    module: Optional[str] = typer.Option(None, "--module"),
	    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
	    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
	    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
	    whisper_model: str = typer.Option("small", "--whisper-model"),
	    threshold: float = typer.Option(27.0, "--threshold"),
	    min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
	    max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
	    skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
	    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
	    face_threshold: float = typer.Option(0.12, "--face-threshold"),
	    lang: str = typer.Option("en", "--lang"),
	    fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
	    export_metadata: Optional[Path] = typer.Option(None, "--export-metadata"),
	    verbose: bool = typer.Option(False, "--verbose", "-v"),
	) -> None:
	    """Run the full pipeline: video + transcript → HTML."""
	    # NOTE: Typer uses the docstring above as the command's help text, so
	    # implementation notes live in comments instead.
	    #
	    # Flow: validate --format, run the pipeline behind a progress bar,
	    # optionally dump the metadata JSON, render the HTML for the chosen
	    # format, then write either the HTML file or a zip bundle.
	    # A PipelineError is reported on stderr and exits with status 2.
	    _setup_logging(verbose)
	    fmt = _resolve_format(fmt, output)

	    # A transcript path that does not exist is tolerated (the argument is
	    # declared with exists=False), but tell the user rather than dropping
	    # it silently.
	    transcript_path = transcript if transcript and transcript.exists() else None
	    if transcript is not None and transcript_path is None:
	        typer.secho(
	            f"warning: transcript not found, continuing without it: {transcript}",
	            fg=typer.colors.YELLOW,
	            err=True,
	        )

	    inputs = PipelineInputs(
	        video_path=video,
	        transcript_path=transcript_path,
	        frames_dir=frames_dir,
	        title=title,
	        subtitle=subtitle,
	        module=module,
	        lang=lang,
	        threshold=threshold,
	        min_gap_seconds=min_gap,
	        max_frames=max_frames,
	        skip_ocr=skip_ocr,
	        skip_inverted_ocr=skip_inverted_ocr,
	        face_threshold=face_threshold,
	        auto_transcribe=auto_transcribe,
	        whisper_model=whisper_model,
	        inline_images=(fmt in ("single", "review")),
	    )
	    try:
	        with _progress_bar() as cb:
	            result = run_pipeline(inputs, progress=cb)
	    except PipelineError as exc:
	        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
	        # Chain the cause so it stays visible when debugging.
	        raise typer.Exit(2) from exc

	    # Dump metadata BEFORE rendering: a render failure (template bug,
	    # disk full mid-write) would otherwise discard the LLM/OCR work
	    # the user just paid for.
	    if export_metadata:
	        dump_metadata(export_metadata, result.page)
	        typer.echo(f"wrote {export_metadata}")

	    # Keyword arguments shared by both renderers.
	    common = dict(
	        title=title,
	        segments=result.segments,
	        lang=lang,
	        subtitle=subtitle,
	        module=module,
	        meta_lines=result.page.meta_lines or None,
	        eyebrow=result.page.eyebrow,
	    )

	    if fmt == "review":
	        html = render_review(**common)
	    else:
	        inline = fmt == "single"
	        if inline:
	            # Inline audio data URIs alongside images so the single HTML
	            # stays self-contained (no broken audio/foo.mp3 references).
	            import base64
	            for seg, ap in zip(result.segments, result.audio_paths):
	                if ap and ap.exists():
	                    seg.audio_data_uri = (
	                        "data:audio/mpeg;base64,"
	                        + base64.b64encode(ap.read_bytes()).decode("ascii")
	                    )
	        html = render_guide(inline_images=inline, **common)

	    if fmt == "zip":
	        audio_disk_paths = [p for p in result.audio_paths if p is not None]
	        zip_bytes = bundle_zip(
	            html,
	            [f.image_path for f in result.kept_frames],
	            audio_paths=audio_disk_paths,
	        )
	        # The bundle is always written with a .zip suffix, whatever
	        # --output says.
	        if output.suffix.lower() != ".zip":
	            output = output.with_suffix(".zip")
	        output.write_bytes(zip_bytes)
	    else:
	        output.write_text(html, encoding="utf-8")

	    typer.echo(f"wrote {output}")


	@app.command("export-metadata")
	def export_metadata_cmd(
	    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
	    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
	    title: str = typer.Option("Untitled Study Guide", "--title"),
	    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
	    module: Optional[str] = typer.Option(None, "--module"),
	    output: Path = typer.Option(Path("study_guide_metadata.json"), "--output", "-o"),
	    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
	    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
	    whisper_model: str = typer.Option("small", "--whisper-model"),
	    threshold: float = typer.Option(27.0, "--threshold"),
	    min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
	    max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
	    skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
	    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
	    face_threshold: float = typer.Option(0.12, "--face-threshold"),
	    lang: str = typer.Option("en", "--lang"),
	    verbose: bool = typer.Option(False, "--verbose", "-v"),
	) -> None:
	    """Run the pipeline and dump the metadata JSON only (no HTML)."""
	    # NOTE: Typer uses the docstring above as the command's help text, so
	    # implementation notes live in comments instead.
	    #
	    # Runs the same pipeline as `build` with inline_images=False, then
	    # writes result.page to `output` via dump_metadata.
	    # A PipelineError is reported on stderr and exits with status 2.
	    _setup_logging(verbose)

	    # A transcript path that does not exist is tolerated (the argument is
	    # declared with exists=False), but tell the user rather than dropping
	    # it silently.
	    transcript_path = transcript if transcript and transcript.exists() else None
	    if transcript is not None and transcript_path is None:
	        typer.secho(
	            f"warning: transcript not found, continuing without it: {transcript}",
	            fg=typer.colors.YELLOW,
	            err=True,
	        )

	    inputs = PipelineInputs(
	        video_path=video,
	        transcript_path=transcript_path,
	        frames_dir=frames_dir,
	        title=title,
	        subtitle=subtitle,
	        module=module,
	        lang=lang,
	        threshold=threshold,
	        min_gap_seconds=min_gap,
	        max_frames=max_frames,
	        skip_ocr=skip_ocr,
	        skip_inverted_ocr=skip_inverted_ocr,
	        face_threshold=face_threshold,
	        auto_transcribe=auto_transcribe,
	        whisper_model=whisper_model,
	        inline_images=False,
	    )
	    try:
	        with _progress_bar() as cb:
	            result = run_pipeline(inputs, progress=cb)
	    except PipelineError as exc:
	        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
	        # Chain the cause so it stays visible when debugging.
	        raise typer.Exit(2) from exc
	    dump_metadata(output, result.page)
	    typer.echo(f"wrote {output} ({len(result.page.segments)} segments, frames in {frames_dir})")


	@app.command("render-from-metadata")
	def render_from_metadata_cmd(
	    metadata_json: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
	    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
	    frames_dir: Optional[Path] = typer.Option(
	        None, "--frames-dir",
	        help="Override the frames_dir recorded in the metadata JSON.",
	    ),
	    fmt: str = typer.Option("single", "--format", help="review | single | guide"),
	    verbose: bool = typer.Option(False, "--verbose", "-v"),
	) -> None:
	    """Re-render HTML from a previously exported metadata JSON."""
	    # NOTE: Typer uses the docstring above as the command's help text, so
	    # implementation notes live in comments instead.
	    _setup_logging(verbose)
	    fmt = _resolve_format(fmt, output)
	    # _resolve_format accepts "zip" because `build` supports it; this
	    # command rejects it explicitly.
	    if fmt == "zip":
	        raise typer.BadParameter("zip format requires source frames; use 'build' instead.")
	    page = load_metadata(metadata_json)

	    # Without an explicit --frames-dir, resolve the directory recorded in
	    # the metadata relative to the JSON file's own location.
	    resolved_frames_dir = (
	        frames_dir
	        if frames_dir is not None
	        else (metadata_json.parent / page.frames_dir).resolve()
	    )
	    html = render_from_metadata(page, Path(resolved_frames_dir), mode=fmt)
	    output.write_text(html, encoding="utf-8")
	    typer.echo(f"wrote {output}")


	if __name__ == "__main__": # pragma: no cover
	    # Executed as a script: hand control to the Typer application.
	    app()