| | """ |
| | Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints |
| | Model: nvidia/Eagle2.5-8B |
| | |
| | For ProofPath video assessment - long video understanding with up to 512 frames. |
| | Ideal for full rubric-based video grading in a single call. |
| | |
| | REQUIREMENTS: |
| | 1. Set HF_TOKEN environment variable (model is gated) |
| | 2. Accept license at https://huggingface.co/nvidia/Eagle2.5-8B |
| | """ |

from typing import Dict, List, Any, Optional, Union
import torch
import numpy as np
import base64
import io
import tempfile
import os
import re

class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the Eagle 2.5 model for video understanding.

        Args:
            path: Path to the model directory (ignored - the model is always loaded from the HF hub)
        """
        model_id = "nvidia/Eagle2.5-8B"

        # The model is gated, so a token must be available in the environment.
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration

        self.processor = Qwen2VLProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            token=hf_token,
        )

        # Left padding keeps batched generation aligned with the prompt.
        if hasattr(self.processor, 'tokenizer'):
            self.processor.tokenizer.padding_side = "left"

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
            device_map="auto" if torch.cuda.is_available() else None,
            token=hf_token,
        )

        if not torch.cuda.is_available():
            self.model = self.model.to(self.device)

        self.model.eval()

        # Frame sampling defaults: 256 frames unless the caller asks for more, hard-capped at 512.
        self.default_max_frames = 256
        self.max_frames_limit = 512

    def _load_video_frames(
        self,
        video_data: Any,
        max_frames: int = 256,
        fps: float = 2.0
    ) -> tuple:
        """
        Load video frames from various input formats.

        Supports:
        - URL to a video file
        - Base64-encoded video (with or without a data: URI prefix)
        - Raw bytes
        """
        import cv2
        from PIL import Image

        # Write the video to a temporary file so OpenCV can read it.
        if isinstance(video_data, str):
            if video_data.startswith(('http://', 'https://')):
                import requests
                response = requests.get(video_data, stream=True)
                response.raise_for_status()
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                    video_path = f.name
            elif video_data.startswith('data:'):
                header, encoded = video_data.split(',', 1)
                video_bytes = base64.b64decode(encoded)
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    f.write(video_bytes)
                    video_path = f.name
            else:
                video_bytes = base64.b64decode(video_data)
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    f.write(video_bytes)
                    video_path = f.name
        elif isinstance(video_data, bytes):
            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                f.write(video_data)
                video_path = f.name
        else:
            raise ValueError(f"Unsupported video input type: {type(video_data)}")

        try:
            cap = cv2.VideoCapture(video_path)
            video_fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / video_fps if video_fps > 0 else 0

            # Sample at the requested fps, but never more than max_frames
            # or the number of frames actually in the video.
            target_frames = min(max_frames, int(duration * fps), total_frames)
            if target_frames <= 0:
                target_frames = min(max_frames, total_frames)

            frame_indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)

            frames = []
            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    # OpenCV decodes frames as BGR; convert to RGB for PIL.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(frame_rgb)
                    frames.append(pil_image)

            cap.release()

            return frames, {
                "duration": duration,
                "total_frames": total_frames,
                "sampled_frames": len(frames),
                "video_fps": video_fps
            }

        finally:
            # Clean up the temporary file whether or not decoding succeeded.
            if os.path.exists(video_path):
                os.unlink(video_path)
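
    # Illustrative call and return shape for _load_video_frames (the numbers below
    # are placeholders, not measured values):
    #   frames, meta = self._load_video_frames(video_url, max_frames=128, fps=1.0)
    #   meta == {"duration": 60.0, "total_frames": 1800, "sampled_frames": 120, "video_fps": 30.0}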

    def _load_image(self, image_data: Any):
        """Load a single image from various input formats."""
        from PIL import Image
        import requests

        if isinstance(image_data, Image.Image):
            return image_data
        elif isinstance(image_data, str):
            if image_data.startswith(('http://', 'https://')):
                response = requests.get(image_data)
                response.raise_for_status()
                return Image.open(io.BytesIO(response.content)).convert('RGB')
            elif image_data.startswith('data:'):
                header, encoded = image_data.split(',', 1)
                image_bytes = base64.b64decode(encoded)
                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
            else:
                image_bytes = base64.b64decode(image_data)
                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
        elif isinstance(image_data, bytes):
            return Image.open(io.BytesIO(image_data)).convert('RGB')
        else:
            raise ValueError(f"Unsupported image input type: {type(image_data)}")
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process video or images with Eagle 2.5.

        Expected input formats:

        1. Video analysis:
           {
               "inputs": <video_url_or_base64>,
               "parameters": {
                   "prompt": "Describe what happens in this video.",
                   "max_frames": 256,
                   "fps": 2.0,
                   "max_new_tokens": 2048
               }
           }

        2. Image analysis:
           {
               "inputs": <image_url_or_base64>,
               "parameters": {
                   "prompt": "Describe this image.",
                   "max_new_tokens": 512
               }
           }

        3. Multi-image analysis:
           {
               "inputs": [<image1>, <image2>, ...],
               "parameters": {
                   "prompt": "Compare these images.",
                   "max_new_tokens": 1024
               }
           }

        4. ProofPath rubric grading:
           {
               "inputs": <video_url>,
               "parameters": {
                   "mode": "rubric",
                   "rubric": [
                       {"step": 1, "description": "Click cell B2"},
                       {"step": 2, "description": "Type 123"},
                       {"step": 3, "description": "Press Enter"}
                   ],
                   "max_frames": 512,
                   "output_format": "json"
               }
           }

        Returns:
            {
                "generated_text": "...",
                "video_metadata": {...},  # present for video inputs
            }
        """
| | inputs = data.get("inputs") |
| | if inputs is None: |
| | inputs = data.get("video") or data.get("image") or data.get("images") |
| | if inputs is None: |
| | raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.") |
| | |
| | params = data.get("parameters", {}) |
| | mode = params.get("mode", "default") |
| | prompt = params.get("prompt", "Describe this content in detail.") |
| | max_new_tokens = params.get("max_new_tokens", 2048) |
| | |
| | try: |
| | if mode == "rubric": |
| | return self._grade_rubric(inputs, params) |
| | elif isinstance(inputs, list): |
| | return self._process_multi_image(inputs, prompt, max_new_tokens) |
| | elif self._is_video(inputs, params): |
| | return self._process_video(inputs, prompt, params, max_new_tokens) |
| | else: |
| | return self._process_image(inputs, prompt, max_new_tokens) |
| | |
| | except Exception as e: |
| | import traceback |
| | return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()} |
| | |
    def _is_video(self, inputs: Any, params: Dict) -> bool:
        """Determine whether the input is a video, based on params or file extension."""
        if params.get("input_type") == "video":
            return True
        if params.get("input_type") == "image":
            return False

        if isinstance(inputs, str):
            lower = inputs.lower()
            video_exts = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']
            # Substring match (rather than endswith) so URLs with query strings still match.
            return any(ext in lower for ext in video_exts)

        return False
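
    # Illustrative routing check (the URL is a made-up example):
    #   self._is_video("https://cdn.example.com/run.mp4?sig=abc", {})  -> True
    # Base64 video payloads carry no extension, so callers should pass
    # {"input_type": "video"} in parameters to force the video path.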

    def _process_video(
        self,
        video_data: Any,
        prompt: str,
        params: Dict,
        max_new_tokens: int
    ) -> Dict[str, Any]:
        """Process a video input."""
        from qwen_vl_utils import process_vision_info

        max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
        fps = params.get("fps", 2.0)

        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # Drop the prompt tokens so only the newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "video_metadata": video_metadata
        }

    def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process a single image."""
        from qwen_vl_utils import process_vision_info

        image = self._load_image(image_data)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "image_size": {"width": image.width, "height": image.height}
        }

    def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process multiple images."""
        from qwen_vl_utils import process_vision_info

        images = [self._load_image(img) for img in images_data]

        content = []
        for image in images:
            content.append({"type": "image", "image": image})
        content.append({"type": "text", "text": prompt})

        messages = [{"role": "user", "content": content}]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "num_images": len(images)
        }

    def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
        """
        Grade a video against a rubric - ProofPath-specific mode.
        """
        from qwen_vl_utils import process_vision_info

        rubric = params.get("rubric", [])
        if not rubric:
            raise ValueError("Rubric required for rubric mode")

        max_frames = min(params.get("max_frames", 512), self.max_frames_limit)
        fps = params.get("fps", 2.0)
        output_format = params.get("output_format", "json")

        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        # Flatten the rubric into numbered steps for the prompt.
        rubric_text = "\n".join([
            f"Step {item.get('step', i + 1)}: {item.get('description', '')}"
            for i, item in enumerate(rubric)
        ])

        if output_format == "json":
            prompt = f"""Analyze this video against the following rubric and grade each step.

RUBRIC:
{rubric_text}

For EACH step, determine:
1. Whether it was completed (true/false)
2. The approximate timestamp where it occurs (if completed)
3. Any issues or partial completion notes

Respond ONLY with a JSON array in this exact format:
[
{{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"}},
{{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"}},
...
]"""
        else:
            prompt = f"""Analyze this video against the following rubric:

RUBRIC:
{rubric_text}

For each step, describe whether it was completed, when it occurred, and any issues observed."""

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=params.get("max_new_tokens", 2048),
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        result = {
            "generated_text": generated_text,
            "video_metadata": video_metadata,
            "rubric": rubric
        }

        # Best-effort extraction of the JSON array from the model output.
        if output_format == "json":
            import json

            try:
                json_match = re.search(r'\[[\s\S]*\]', generated_text)
                if json_match:
                    result["grading_results"] = json.loads(json_match.group())
            except json.JSONDecodeError:
                # Leave only the raw text if the model did not return valid JSON.
                pass

        return result
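

if __name__ == "__main__":
    # Local smoke-test sketch: assumes a short test clip at the path below, which
    # is a placeholder file name and not part of this repo.
    handler = EndpointHandler()
    with open("sample_clip.mp4", "rb") as f:
        video_bytes = f.read()
    result = handler({
        "inputs": video_bytes,
        "parameters": {
            "input_type": "video",  # raw bytes carry no extension, so route explicitly
            "prompt": "Describe this video.",
            "max_frames": 64,
        },
    })
    print(result)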