Spaces:
Sleeping
Sleeping
Malina
LUNA: real video understanding (frames+VLM), generous relevance + fallback, clarify endpoint, console job logs, 1h cache TTL
24ef38b | # Keyframe sampling for real video understanding (NINA-style): pull a handful of | |
| # evenly-spaced frames from a video and return them as base64 JPEG data URLs, ready | |
| # to hand to the vision model alongside any speech transcript. Uses PyAV (pulled in | |
| # transitively by faster-whisper) so there's no extra dependency. Best-effort: returns | |
| # [] on any failure rather than raising. | |
| import base64 | |
| import io | |
| from config import VIDEO_FRAME_MAX_PX, VIDEO_MAX_FRAMES | |
def _to_data_url(image) -> str:
    """Encode a PIL image as a base64 JPEG ``data:`` URL.

    The image is shrunk (never enlarged) so that its longer side does not
    exceed ``VIDEO_FRAME_MAX_PX``, converted to RGB when it is in any other
    mode, and saved as JPEG at quality 85.
    """
    from PIL import Image  # noqa: F401 (image is already a PIL.Image)

    width, height = image.size
    ratio = VIDEO_FRAME_MAX_PX / max(width, height)
    if ratio < 1:
        # Downscale only; clamp each side to at least one pixel.
        target = (max(1, int(width * ratio)), max(1, int(height * ratio)))
        image = image.resize(target)
    if image.mode != "RGB":
        image = image.convert("RGB")

    jpeg = io.BytesIO()
    image.save(jpeg, format="JPEG", quality=85)
    payload = base64.b64encode(jpeg.getvalue()).decode()
    return "data:image/jpeg;base64," + payload
def sample_frames(path: str, max_frames: int = VIDEO_MAX_FRAMES) -> list[tuple[float, str]]:
    """Return [(timestamp_seconds, data_url), …] sampled evenly across the video.

    Best-effort: never raises. If PyAV cannot be imported, or ``max_frames``
    is not positive, an empty list is returned. If sampling fails part-way,
    whatever frames were collected so far are returned.

    Two strategies are used:

    * Duration known (from the video stream, else from the container):
      ``max_frames`` evenly spaced timestamps are computed, excluding the very
      start and end. For each one the container is seeked and the first frame
      decoded after the seek is kept. The timestamp stored is the *requested*
      time rounded to 0.1 s, not the decoded frame's own pts.
    * Duration unknown: frames are decoded sequentially and every 30th one is
      kept until ``max_frames`` frames have been collected.

    Args:
        path: Path (or anything ``av.open`` accepts) of the video to sample.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of ``(timestamp_seconds, data_url)`` tuples, where ``data_url``
        is the value produced by ``_to_data_url`` for that frame.
    """
    try:
        import av
    except Exception as e:
        print(f"[LUNA] frames: PyAV unavailable ({e}) — video frames skipped.")
        return []

    out: list[tuple[float, str]] = []
    if max_frames <= 0:
        # Nothing requested; without this guard the sequential branch below
        # would still emit one frame before checking the limit.
        return out

    try:
        # The context manager closes the container even when decoding raises,
        # so a failing video does not leak the open file/demuxer.
        with av.open(path) as container:
            vstream = container.streams.video[0]
            vstream.thread_type = "AUTO"
            tb = vstream.time_base

            duration = float(vstream.duration * tb) if vstream.duration else None
            if not duration and container.duration:
                duration = float(container.duration) / 1_000_000.0  # AV_TIME_BASE

            if duration and duration > 0:
                # Evenly spaced timestamps; seek to each and grab the next decoded frame.
                times = [duration * (i + 1) / (max_frames + 1) for i in range(max_frames)]
                for t in times:
                    try:
                        container.seek(int(t / tb), stream=vstream, any_frame=False, backward=True)
                        for frame in container.decode(vstream):
                            out.append((round(t, 1), _to_data_url(frame.to_image())))
                            break
                    except Exception as e:
                        # One bad seek/decode should not abort the whole sample;
                        # report it and move on to the next timestamp.
                        print(f"[LUNA] frames: skipped frame at {t:.1f}s ({type(e).__name__}: {e}).")
                        continue
            else:
                # No duration metadata: decode sequentially and keep every Nth frame.
                step = 30
                for idx, frame in enumerate(container.decode(vstream)):
                    if idx % step != 0:
                        continue
                    stamp = float(frame.pts * tb) if frame.pts is not None else 0.0
                    out.append((round(stamp, 1), _to_data_url(frame.to_image())))
                    if len(out) >= max_frames:
                        break
    except Exception as e:
        print(f"[LUNA] frames: sampling failed for {path} ({type(e).__name__}: {e}).")
        return out

    print(f"[LUNA] frames: sampled {len(out)} keyframe(s).")
    return out