Spaces:

NextNeural
/

LUNA

Sleeping

LUNA / frames.py

Malina

LUNA: real video understanding (frames+VLM), generous relevance + fallback, clarify endpoint, console job logs, 1h cache TTL

24ef38b 10 days ago

raw

history blame contribute delete

2.97 kB

	# Keyframe sampling for real video understanding (NINA-style): pull a handful of
	# evenly-spaced frames from a video and return them as base64 JPEG data URLs, ready
	# to hand to the vision model alongside any speech transcript. Uses PyAV (pulled in
	# transitively by faster-whisper) so there's no extra dependency. Best-effort: returns
	# [] on any failure rather than raising.
	import base64
	import io

	from config import VIDEO_FRAME_MAX_PX, VIDEO_MAX_FRAMES


	def _to_data_url(image) -> str:
	    """Encode a PIL image as a ``data:image/jpeg;base64,...`` URL.

	    The image is shrunk (never enlarged) so its longer side does not exceed
	    ``VIDEO_FRAME_MAX_PX``, converted to RGB when it is in any other mode,
	    and serialised as JPEG at quality 85.
	    """
	    from PIL import Image  # noqa: F401 (image is already a PIL.Image)

	    width, height = image.size
	    ratio = VIDEO_FRAME_MAX_PX / max(width, height)
	    if ratio < 1:
	        # Clamp each side to at least 1 px so extreme aspect ratios stay valid.
	        shrunk_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
	        image = image.resize(shrunk_size)

	    rgb = image if image.mode == "RGB" else image.convert("RGB")

	    with io.BytesIO() as jpeg:
	        rgb.save(jpeg, format="JPEG", quality=85)
	        encoded = base64.b64encode(jpeg.getvalue()).decode()
	    return f"data:image/jpeg;base64,{encoded}"


	def sample_frames(path: str, max_frames: int = VIDEO_MAX_FRAMES) -> list[tuple[float, str]]:
	    """Return [(timestamp_seconds, data_url), …] sampled evenly across the video.

	    Best-effort: this function does not raise. If PyAV cannot be imported it
	    returns ``[]``; if opening or decoding fails part-way it logs the error and
	    returns whatever frames were collected so far.

	    Args:
	        path: Location of the video, passed straight to ``av.open``.
	        max_frames: Upper bound on the number of frames returned. Values
	            ``<= 0`` yield an empty list.

	    Returns:
	        A list of ``(timestamp, data_url)`` tuples. When the video has duration
	        metadata, ``timestamp`` is the *requested* sample time rounded to 0.1 s
	        (the decoded frame is the first one produced after seeking backward to
	        that time). Without duration metadata, every 30th decoded frame is kept
	        and ``timestamp`` is derived from the frame's own pts (0.0 if absent).
	    """
	    try:
	        import av
	    except Exception as e:
	        print(f"[LUNA] frames: PyAV unavailable ({e}) — video frames skipped.")
	        return []

	    # Nothing to sample; also keeps the sequential fallback below from
	    # emitting one frame before it checks the cap.
	    if max_frames <= 0:
	        return []

	    out: list[tuple[float, str]] = []
	    try:
	        # Context manager guarantees the container is closed even when stream
	        # lookup or decoding raises.
	        with av.open(path) as container:
	            vstream = container.streams.video[0]
	            vstream.thread_type = "AUTO"
	            tb = vstream.time_base
	            duration = float(vstream.duration * tb) if vstream.duration else None
	            if not duration and container.duration:
	                duration = float(container.duration) / 1_000_000.0  # AV_TIME_BASE

	            if duration and duration > 0:
	                # Evenly spaced timestamps strictly inside (0, duration); seek to
	                # each and grab the next decoded frame.
	                # NOTE(review): the seek offset ignores vstream.start_time —
	                # confirm this is fine for containers whose streams do not
	                # start at pts 0.
	                for i in range(max_frames):
	                    t = duration * (i + 1) / (max_frames + 1)
	                    try:
	                        container.seek(
	                            int(t / tb), stream=vstream, any_frame=False, backward=True
	                        )
	                        for frame in container.decode(vstream):
	                            out.append((round(t, 1), _to_data_url(frame.to_image())))
	                            break
	                    except Exception as e:
	                        # One bad seek/decode should not abort the whole sample,
	                        # but it should not vanish silently either.
	                        print(
	                            f"[LUNA] frames: skipped frame at {t:.1f}s "
	                            f"({type(e).__name__}: {e})."
	                        )
	            else:
	                # No duration metadata: decode sequentially and keep every Nth frame.
	                step = 30
	                for idx, frame in enumerate(container.decode(vstream)):
	                    if idx % step:
	                        continue
	                    seconds = float(frame.pts * tb) if frame.pts is not None else 0.0
	                    out.append((round(seconds, 1), _to_data_url(frame.to_image())))
	                    if len(out) >= max_frames:
	                        break
	    except Exception as e:
	        print(f"[LUNA] frames: sampling failed for {path} ({type(e).__name__}: {e}).")
	        return out
	    print(f"[LUNA] frames: sampled {len(out)} keyframe(s).")
	    return out