# Keyframe sampling for real video understanding (NINA-style): pull a handful of
# evenly-spaced frames from a video and return them as base64 JPEG data URLs, ready
# to hand to the vision model alongside any speech transcript. Uses PyAV (pulled in
# transitively by faster-whisper) so there's no extra dependency. Best-effort: returns
# [] on any failure rather than raising.
import base64
import io

from config import VIDEO_FRAME_MAX_PX, VIDEO_MAX_FRAMES


def _to_data_url(image) -> str:
    """Encode a PIL image as a ``data:image/jpeg;base64,...`` URL.

    The image is shrunk so that its longest side is at most
    ``VIDEO_FRAME_MAX_PX`` (it is never enlarged), converted to RGB when it
    is in any other mode, and then written as a quality-85 JPEG.
    """
    from PIL import Image  # noqa: F401  (image is already a PIL.Image)

    width, height = image.size
    ratio = VIDEO_FRAME_MAX_PX / max(width, height)
    if ratio < 1:
        # Clamp each side to at least 1px so extreme aspect ratios stay valid.
        target_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
        image = image.resize(target_size)

    if image.mode != "RGB":
        image = image.convert("RGB")

    jpeg_bytes = io.BytesIO()
    image.save(jpeg_bytes, format="JPEG", quality=85)
    encoded = base64.b64encode(jpeg_bytes.getvalue()).decode()
    return "data:image/jpeg;base64," + encoded


def sample_frames(path: str, max_frames: int = VIDEO_MAX_FRAMES) -> list[tuple[float, str]]:
    """Return [(timestamp_seconds, data_url), …] sampled evenly across the video.

    Best-effort: never raises. Returns ``[]`` when PyAV cannot be imported or
    when ``max_frames`` is not positive, and returns whatever was collected so
    far if opening or decoding the file fails part-way through.

    Args:
        path: Location of the video, passed straight to ``av.open``.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        A list of ``(timestamp_seconds, data_url)`` tuples. When the duration
        is known, the timestamp is the evenly spaced target time rounded to
        0.1 s; the image is the first frame decoded after seeking backwards to
        a keyframe, so it may be slightly earlier than the label. When no
        duration is available, every 30th decoded frame is kept and labelled
        with its own presentation time.
    """
    try:
        import av
    except Exception as e:
        print(f"[LUNA] frames: PyAV unavailable ({e}) — video frames skipped.")
        return []

    # Without this guard the sequential fallback below would still emit one
    # frame, because it appends before checking the limit.
    if max_frames <= 0:
        return []

    out: list[tuple[float, str]] = []
    try:
        # The context manager closes the container even if decoding raises.
        with av.open(path) as container:
            vstream = container.streams.video[0]
            vstream.thread_type = "AUTO"
            tb = vstream.time_base
            duration = float(vstream.duration * tb) if vstream.duration else None
            if not duration and container.duration:
                duration = float(container.duration) / 1_000_000.0  # AV_TIME_BASE

            if duration and duration > 0:
                # Stream timestamps start at start_time, not necessarily 0, so
                # seek targets must be offset by it.
                first_pts = vstream.start_time or 0
                # Evenly spaced timestamps; seek to each and grab the next decoded frame.
                for i in range(max_frames):
                    t = duration * (i + 1) / (max_frames + 1)
                    try:
                        container.seek(first_pts + int(t / tb), stream=vstream,
                                       any_frame=False, backward=True)
                        for frame in container.decode(vstream):
                            out.append((round(t, 1), _to_data_url(frame.to_image())))
                            break
                    except Exception as err:
                        # One bad seek/decode must not abort the remaining samples.
                        print(f"[LUNA] frames: skipped frame at {t:.1f}s "
                              f"({type(err).__name__}: {err}).")
                        continue
            else:
                # No duration metadata: decode sequentially and keep every Nth frame.
                step = 30
                for idx, frame in enumerate(container.decode(vstream)):
                    if idx % step:
                        continue
                    seconds = float(frame.pts * tb) if frame.pts is not None else 0.0
                    out.append((round(seconds, 1), _to_data_url(frame.to_image())))
                    if len(out) >= max_frames:
                        break
    except Exception as e:
        print(f"[LUNA] frames: sampling failed for {path} ({type(e).__name__}: {e}).")
        return out
    print(f"[LUNA] frames: sampled {len(out)} keyframe(s).")
    return out