# Keyframe sampling for real video understanding (NINA-style): pull a handful of
# evenly-spaced frames from a video and return them as base64 JPEG data URLs, ready
# to hand to the vision model alongside any speech transcript. Uses PyAV (pulled in
# transitively by faster-whisper) so there's no extra dependency. Best-effort: never
# raises; on failure it returns whatever frames were collected so far (possibly []).
import base64
import io

from config import VIDEO_FRAME_MAX_PX, VIDEO_MAX_FRAMES


def _to_data_url(image) -> str:
    """Encode an image as a ``data:image/jpeg;base64,...`` URL.

    ``image`` is used as a PIL.Image (``size``, ``mode``, ``resize``, ``convert``
    and ``save`` are called on it). It is downscaled so its longest side is at
    most VIDEO_FRAME_MAX_PX (never upscaled), converted to RGB if needed, and
    saved as JPEG at quality 85.
    """
    width, height = image.size
    scale = VIDEO_FRAME_MAX_PX / max(width, height)
    if scale < 1:
        # Clamp each side to >= 1 px so extreme aspect ratios stay valid.
        new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
        image = image.resize(new_size)
    if image.mode != "RGB":
        # JPEG cannot store alpha/palette modes.
        image = image.convert("RGB")
    buf = io.BytesIO()
    image.save(buf, format="JPEG", quality=85)
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()


def sample_frames(path: str, max_frames: int = VIDEO_MAX_FRAMES) -> list[tuple[float, str]]:
    """Return [(timestamp_seconds, data_url), …] sampled evenly across the video.

    Args:
        path: Location of the video, passed straight to ``av.open``.
        max_frames: Upper bound on the number of frames returned. Values <= 0
            yield an empty list.

    Returns:
        A list of ``(timestamp, data_url)`` tuples. When the duration is known,
        ``timestamp`` is the *requested* sample time rounded to 0.1 s (the frame
        is the first one decoded after a backward keyframe seek, so it may be
        slightly earlier). Without duration metadata, every 30th decoded frame
        is kept and ``timestamp`` comes from the frame's own PTS.

    Never raises: if PyAV is missing the result is ``[]``; if decoding fails
    midway, the frames gathered so far are returned.
    """
    if max_frames <= 0:
        # Nothing requested; also stops the sequential fallback from emitting
        # one frame before it checks the limit.
        return []

    try:
        import av
    except Exception as e:
        print(f"[LUNA] frames: PyAV unavailable ({e}) — video frames skipped.")
        return []

    out: list[tuple[float, str]] = []
    try:
        # The context manager closes the container even if decoding raises.
        with av.open(path) as container:
            vstream = container.streams.video[0]
            vstream.thread_type = "AUTO"
            tb = vstream.time_base

            duration = float(vstream.duration * tb) if vstream.duration else None
            if not duration and container.duration:
                duration = float(container.duration) / 1_000_000.0  # AV_TIME_BASE

            if duration and duration > 0:
                # Seek offsets are absolute PTS values in the stream's time base,
                # so account for streams whose first frame is not at PTS 0.
                start_pts = vstream.start_time or 0
                # Evenly spaced timestamps; seek to each and grab the next decoded frame.
                for i in range(max_frames):
                    t = duration * (i + 1) / (max_frames + 1)
                    try:
                        container.seek(
                            start_pts + int(t / tb),
                            stream=vstream,
                            any_frame=False,
                            backward=True,
                        )
                        for frame in container.decode(vstream):
                            out.append((round(t, 1), _to_data_url(frame.to_image())))
                            break
                    except Exception as seek_err:
                        # Best-effort: one bad timestamp must not abort the rest.
                        print(
                            f"[LUNA] frames: skipped frame at {t:.1f}s "
                            f"({type(seek_err).__name__}: {seek_err})."
                        )
                        continue
            else:
                # No duration metadata: decode sequentially and keep every Nth frame.
                step = 30
                for idx, frame in enumerate(container.decode(vstream)):
                    if idx % step != 0:
                        continue
                    timestamp = float(frame.pts * tb) if frame.pts else 0.0
                    out.append((round(timestamp, 1), _to_data_url(frame.to_image())))
                    if len(out) >= max_frames:
                        break
    except Exception as e:
        print(f"[LUNA] frames: sampling failed for {path} ({type(e).__name__}: {e}).")
        return out

    print(f"[LUNA] frames: sampled {len(out)} keyframe(s).")
    return out