LUNA / frames.py
Malina
LUNA: real video understanding (frames+VLM), generous relevance + fallback, clarify endpoint, console job logs, 1h cache TTL
24ef38b
# Keyframe sampling for real video understanding (NINA-style): pull a handful of
# evenly-spaced frames from a video and return them as base64 JPEG data URLs, ready
# to hand to the vision model alongside any speech transcript. Uses PyAV (pulled in
# transitively by faster-whisper) so there's no extra dependency. Best-effort: returns
# [] on any failure rather than raising.
import base64
import io
from config import VIDEO_FRAME_MAX_PX, VIDEO_MAX_FRAMES
def _to_data_url(image) -> str:
    """Encode a PIL image as a base64 JPEG ``data:`` URL.

    The image is shrunk (never enlarged) so that its longer side does not
    exceed ``VIDEO_FRAME_MAX_PX``, converted to RGB when it is in any other
    mode, and then saved as JPEG at quality 85.
    """
    from PIL import Image  # noqa: F401 (image is already a PIL.Image)

    width, height = image.size
    ratio = VIDEO_FRAME_MAX_PX / max(width, height)
    if ratio < 1:
        # Downscale only; each side is clamped to at least one pixel.
        target_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
        image = image.resize(target_size)
    if image.mode != "RGB":
        image = image.convert("RGB")

    jpeg_bytes = io.BytesIO()
    image.save(jpeg_bytes, format="JPEG", quality=85)
    encoded = base64.b64encode(jpeg_bytes.getvalue()).decode()
    return "data:image/jpeg;base64," + encoded
def sample_frames(path: str, max_frames: int = VIDEO_MAX_FRAMES) -> list[tuple[float, str]]:
    """Return [(timestamp_seconds, data_url), …] sampled evenly across the video.

    Best-effort: this function does not raise for unreadable input. If PyAV
    cannot be imported, or ``max_frames`` is not positive, it returns ``[]``.
    If opening or decoding fails part-way, it logs the error and returns
    whatever frames were collected up to that point.

    Args:
        path: Location of the video, passed straight to ``av.open``.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of ``(timestamp, data_url)`` tuples. Timestamps are seconds
        rounded to one decimal; data URLs come from ``_to_data_url``.
    """
    try:
        import av
    except Exception as e:
        print(f"[LUNA] frames: PyAV unavailable ({e}) — video frames skipped.")
        return []

    # Nothing to sample. Without this guard the no-duration fallback below
    # would still emit one frame, because it appends before checking the cap.
    if max_frames <= 0:
        return []

    out: list[tuple[float, str]] = []
    try:
        # The context manager closes the container on every exit path,
        # including a missing video stream or a decode error.
        with av.open(path) as container:
            vstream = container.streams.video[0]
            vstream.thread_type = "AUTO"
            tb = vstream.time_base
            duration = float(vstream.duration * tb) if vstream.duration else None
            if not duration and container.duration:
                duration = float(container.duration) / 1_000_000.0  # AV_TIME_BASE
            if duration and duration > 0:
                # Evenly spaced timestamps that exclude both endpoints; seek to
                # each and grab the next decoded frame.
                times = [duration * (i + 1) / (max_frames + 1) for i in range(max_frames)]
                for t in times:
                    try:
                        container.seek(int(t / tb), stream=vstream, any_frame=False, backward=True)
                        frame = next(container.decode(vstream), None)
                        if frame is not None:
                            out.append((round(t, 1), _to_data_url(frame.to_image())))
                    except Exception as e:
                        # One bad seek/decode should not lose the other frames,
                        # but say so instead of dropping it silently.
                        print(f"[LUNA] frames: skipped frame at {t:.1f}s ({type(e).__name__}: {e}).")
                        continue
            else:
                # No duration metadata: decode sequentially and keep every Nth frame.
                step = 30
                for idx, frame in enumerate(container.decode(vstream)):
                    if idx % step:
                        continue
                    # A missing (or zero) pts is reported as 0.0 seconds.
                    stamp = float(frame.pts * tb) if frame.pts else 0.0
                    out.append((round(stamp, 1), _to_data_url(frame.to_image())))
                    if len(out) >= max_frames:
                        break
    except Exception as e:
        print(f"[LUNA] frames: sampling failed for {path} ({type(e).__name__}: {e}).")
        return out
    print(f"[LUNA] frames: sampled {len(out)} keyframe(s).")
    return out