eagle / handler.py

Update handler.py

bd33527 verified about 1 month ago

19.3 kB

	"""
	Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
	Model: nvidia/Eagle2.5-8B

	For ProofPath video assessment - long video understanding with up to 512 frames.
	Ideal for full rubric-based video grading in a single call.

	REQUIREMENTS:
	1. Set HF_TOKEN environment variable (model is gated)
	2. Accept license at https://huggingface.co/nvidia/Eagle2.5-8B
	"""

	from typing import Dict, List, Any, Optional, Union
	import torch
	import numpy as np
	import base64
	import io
	import tempfile
	import os
	import re


	class EndpointHandler:
	def __init__(self, path: str = ""):
	"""
	Initialize Eagle 2.5 model for video understanding.

	Args:
	path: Path to the model directory (ignored - we always load from HF hub)
	"""
	# IMPORTANT: Eagle 2.5 must be loaded from HF hub, not the repository path
	# The repository only contains handler.py and requirements.txt
	model_id = "nvidia/Eagle2.5-8B"

	# Get HF token from environment for gated model access
	hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

	# Determine device
	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Eagle 2.5 uses Qwen2VLProcessor - import and load directly
	from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration

	self.processor = Qwen2VLProcessor.from_pretrained(
	model_id,
	trust_remote_code=True,
	token=hf_token,
	)

	# Set padding side for batch processing
	if hasattr(self.processor, 'tokenizer'):
	self.processor.tokenizer.padding_side = "left"

	self.model = Qwen2VLForConditionalGeneration.from_pretrained(
	model_id,
	trust_remote_code=True,
	torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
	attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
	device_map="auto" if torch.cuda.is_available() else None,
	token=hf_token,
	)

	if not torch.cuda.is_available():
	self.model = self.model.to(self.device)

	self.model.eval()

	# Default config - Eagle 2.5 supports up to 512 frames
	self.default_max_frames = 256 # Conservative default
	self.max_frames_limit = 512

	def _load_video_frames(
	self,
	video_data: Any,
	max_frames: int = 256,
	fps: float = 2.0
	) -> tuple:
	"""
	Load video frames from various input formats.

	Supports:
	- URL to video file
	- Base64 encoded video
	- Raw bytes
	"""
	import cv2
	from PIL import Image

	# Decode video to temp file if needed
	if isinstance(video_data, str):
	if video_data.startswith(('http://', 'https://')):
	# URL - download to temp file
	import requests
	response = requests.get(video_data, stream=True)
	with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
	for chunk in response.iter_content(chunk_size=8192):
	f.write(chunk)
	video_path = f.name
	elif video_data.startswith('data:'):
	# Data URL format
	header, encoded = video_data.split(',', 1)
	video_bytes = base64.b64decode(encoded)
	with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
	f.write(video_bytes)
	video_path = f.name
	else:
	# Assume base64 encoded
	video_bytes = base64.b64decode(video_data)
	with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
	f.write(video_bytes)
	video_path = f.name
	elif isinstance(video_data, bytes):
	with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
	f.write(video_data)
	video_path = f.name
	else:
	raise ValueError(f"Unsupported video input type: {type(video_data)}")

	try:
	# Open video with OpenCV
	cap = cv2.VideoCapture(video_path)
	video_fps = cap.get(cv2.CAP_PROP_FPS)
	total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	duration = total_frames / video_fps if video_fps > 0 else 0

	# Calculate frame indices to sample
	target_frames = min(max_frames, int(duration * fps), total_frames)
	if target_frames <= 0:
	target_frames = min(max_frames, total_frames)

	frame_indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)

	frames = []
	for idx in frame_indices:
	cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
	ret, frame = cap.read()
	if ret:
	# Convert BGR to RGB
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	pil_image = Image.fromarray(frame_rgb)
	frames.append(pil_image)

	cap.release()

	return frames, {
	"duration": duration,
	"total_frames": total_frames,
	"sampled_frames": len(frames),
	"video_fps": video_fps
	}

	finally:
	# Clean up temp file
	if os.path.exists(video_path):
	os.unlink(video_path)

	def _load_image(self, image_data: Any):
	"""Load a single image from various formats."""
	from PIL import Image
	import requests

	if isinstance(image_data, Image.Image):
	return image_data
	elif isinstance(image_data, str):
	if image_data.startswith(('http://', 'https://')):
	response = requests.get(image_data, stream=True)
	return Image.open(response.raw).convert('RGB')
	elif image_data.startswith('data:'):
	header, encoded = image_data.split(',', 1)
	image_bytes = base64.b64decode(encoded)
	return Image.open(io.BytesIO(image_bytes)).convert('RGB')
	else:
	image_bytes = base64.b64decode(image_data)
	return Image.open(io.BytesIO(image_bytes)).convert('RGB')
	elif isinstance(image_data, bytes):
	return Image.open(io.BytesIO(image_data)).convert('RGB')
	else:
	raise ValueError(f"Unsupported image input type: {type(image_data)}")

	def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	"""
	Process video or images with Eagle 2.5.

	Expected input formats:

	1. Video analysis:
	{
	"inputs": <video_url_or_base64>,
	"parameters": {
	"prompt": "Describe what happens in this video.",
	"max_frames": 256,
	"fps": 2.0,
	"max_new_tokens": 2048
	}
	}

	2. Image analysis:
	{
	"inputs": <image_url_or_base64>,
	"parameters": {
	"prompt": "Describe this image.",
	"max_new_tokens": 512
	}
	}

	3. Multi-image analysis:
	{
	"inputs": [<image1>, <image2>, ...],
	"parameters": {
	"prompt": "Compare these images.",
	"max_new_tokens": 1024
	}
	}

	4. ProofPath rubric grading:
	{
	"inputs": <video_url>,
	"parameters": {
	"mode": "rubric",
	"rubric": [
	{"step": 1, "description": "Click cell B2"},
	{"step": 2, "description": "Type 123"},
	{"step": 3, "description": "Press Enter"}
	],
	"max_frames": 512,
	"output_format": "json"
	}
	}

	Returns:
	{
	"generated_text": "...",
	"video_metadata": {...}, # If video input
	}
	"""
	inputs = data.get("inputs")
	if inputs is None:
	inputs = data.get("video") or data.get("image") or data.get("images")
	if inputs is None:
	raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")

	params = data.get("parameters", {})
	mode = params.get("mode", "default")
	prompt = params.get("prompt", "Describe this content in detail.")
	max_new_tokens = params.get("max_new_tokens", 2048)

	try:
	if mode == "rubric":
	return self._grade_rubric(inputs, params)
	elif isinstance(inputs, list):
	return self._process_multi_image(inputs, prompt, max_new_tokens)
	elif self._is_video(inputs, params):
	return self._process_video(inputs, prompt, params, max_new_tokens)
	else:
	return self._process_image(inputs, prompt, max_new_tokens)

	except Exception as e:
	import traceback
	return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}

	def _is_video(self, inputs: Any, params: Dict) -> bool:
	"""Determine if input is video based on params or file extension."""
	if params.get("input_type") == "video":
	return True
	if params.get("input_type") == "image":
	return False

	if isinstance(inputs, str):
	lower = inputs.lower()
	video_exts = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']
	return any(ext in lower for ext in video_exts)

	return False

	def _process_video(
	self,
	video_data: Any,
	prompt: str,
	params: Dict,
	max_new_tokens: int
	) -> Dict[str, Any]:
	"""Process a video input."""
	from qwen_vl_utils import process_vision_info

	max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
	fps = params.get("fps", 2.0)

	# Load video frames
	frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

	# Build message for Eagle 2.5 / Qwen2-VL format
	messages = [
	{
	"role": "user",
	"content": [
	{"type": "video", "video": frames, "fps": fps},
	{"type": "text", "text": prompt},
	],
	}
	]

	# Apply chat template
	text = self.processor.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True
	)

	# Process vision info
	image_inputs, video_inputs = process_vision_info(messages)

	inputs = self.processor(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	inputs = inputs.to(self.model.device)

	# Generate
	with torch.inference_mode():
	generated_ids = self.model.generate(
	**inputs,
	max_new_tokens=max_new_tokens,
	do_sample=False,
	)

	# Decode - only the new tokens
	generated_ids_trimmed = [
	out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]
	generated_text = self.processor.batch_decode(
	generated_ids_trimmed,
	skip_special_tokens=True,
	clean_up_tokenization_spaces=False
	)[0]

	return {
	"generated_text": generated_text,
	"video_metadata": video_metadata
	}

	def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
	"""Process a single image."""
	from qwen_vl_utils import process_vision_info

	image = self._load_image(image_data)

	messages = [
	{
	"role": "user",
	"content": [
	{"type": "image", "image": image},
	{"type": "text", "text": prompt},
	],
	}
	]

	text = self.processor.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True
	)

	image_inputs, video_inputs = process_vision_info(messages)

	inputs = self.processor(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	inputs = inputs.to(self.model.device)

	with torch.inference_mode():
	generated_ids = self.model.generate(
	**inputs,
	max_new_tokens=max_new_tokens,
	do_sample=False,
	)

	generated_ids_trimmed = [
	out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]
	generated_text = self.processor.batch_decode(
	generated_ids_trimmed,
	skip_special_tokens=True,
	clean_up_tokenization_spaces=False
	)[0]

	return {
	"generated_text": generated_text,
	"image_size": {"width": image.width, "height": image.height}
	}

	def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
	"""Process multiple images."""
	from qwen_vl_utils import process_vision_info

	images = [self._load_image(img) for img in images_data]

	# Build content with all images
	content = []
	for image in images:
	content.append({"type": "image", "image": image})
	content.append({"type": "text", "text": prompt})

	messages = [{"role": "user", "content": content}]

	text = self.processor.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True
	)

	image_inputs, video_inputs = process_vision_info(messages)

	inputs = self.processor(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	inputs = inputs.to(self.model.device)

	with torch.inference_mode():
	generated_ids = self.model.generate(
	**inputs,
	max_new_tokens=max_new_tokens,
	do_sample=False,
	)

	generated_ids_trimmed = [
	out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]
	generated_text = self.processor.batch_decode(
	generated_ids_trimmed,
	skip_special_tokens=True,
	clean_up_tokenization_spaces=False
	)[0]

	return {
	"generated_text": generated_text,
	"num_images": len(images)
	}

	def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
	"""
	Grade a video against a rubric - ProofPath specific mode.
	"""
	from qwen_vl_utils import process_vision_info

	rubric = params.get("rubric", [])
	if not rubric:
	raise ValueError("Rubric required for rubric mode")

	max_frames = min(params.get("max_frames", 512), self.max_frames_limit)
	fps = params.get("fps", 2.0)
	output_format = params.get("output_format", "json")

	# Load video
	frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

	# Build rubric prompt
	rubric_text = "\n".join([
	f"Step {item.get('step', i+1)}: {item.get('description', '')}"
	for i, item in enumerate(rubric)
	])

	if output_format == "json":
	prompt = f"""Analyze this video against the following rubric and grade each step.

	RUBRIC:
	{rubric_text}

	For EACH step, determine:
	1. Whether it was completed (true/false)
	2. The approximate timestamp where it occurs (if completed)
	3. Any issues or partial completion notes

	Respond ONLY with a JSON array in this exact format:
	[
	{{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"}},
	{{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"}},
	...
	]"""
	else:
	prompt = f"""Analyze this video against the following rubric:

	RUBRIC:
	{rubric_text}

	For each step, describe whether it was completed, when it occurred, and any issues observed."""

	messages = [
	{
	"role": "user",
	"content": [
	{"type": "video", "video": frames, "fps": fps},
	{"type": "text", "text": prompt},
	],
	}
	]

	text = self.processor.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True
	)

	image_inputs, video_inputs = process_vision_info(messages)

	inputs = self.processor(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	)
	inputs = inputs.to(self.model.device)

	with torch.inference_mode():
	generated_ids = self.model.generate(
	**inputs,
	max_new_tokens=params.get("max_new_tokens", 2048),
	do_sample=False,
	)

	generated_ids_trimmed = [
	out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	]
	generated_text = self.processor.batch_decode(
	generated_ids_trimmed,
	skip_special_tokens=True,
	clean_up_tokenization_spaces=False
	)[0]

	result = {
	"generated_text": generated_text,
	"video_metadata": video_metadata,
	"rubric": rubric
	}

	# Try to parse JSON if requested
	if output_format == "json":
	try:
	import json
	# Extract JSON array from response
	json_match = re.search(r'\[[\s\S]*\]', generated_text)
	if json_match:
	result["grading_results"] = json.loads(json_match.group())
	except json.JSONDecodeError:
	pass # Keep raw text if JSON parsing fails

	return result