Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import os | |
| import json | |
| import numpy as np | |
| import cv2 | |
| import base64 | |
| import requests | |
| import time | |
| from typing import List, Tuple | |
| from gradio_client.utils import handle_file | |
| from pathlib import Path | |
# Backend Space URL - replace with your actual backend space URL
BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Replace with actual backend space URL
# Token is read from the environment so it is never hard-coded in the source.
hf_token = os.getenv("HF_TOKEN")  # Replace with your actual Hugging Face token
# Debug information printed at import time to aid Space-deployment troubleshooting.
print(f"🔧 Environment Debug Info:")
print(f" - Backend URL: {BACKEND_SPACE_URL}")
print(f" - HF Token available: {'Yes' if hf_token else 'No'}")
print(f" - HF Token length: {len(hf_token) if hf_token else 0}")
# Flag to track if backend is available; both are set by initialize_backend().
BACKEND_AVAILABLE = False
backend_client = None
def check_user_permissions():
    """Check if user has necessary permissions.

    Returns True only when an HF token is present, validates against the
    whoami endpoint, and grants access to the backend Space. Every failure
    branch prints actionable troubleshooting hints and returns False.
    """
    print("🔐 Checking user permissions...")
    # No token at all: nothing to validate, bail out with setup instructions.
    if not hf_token:
        print("❌ No HF Token found")
        print("🔧 To get a token:")
        print(" 1. Go to https://huggingface.co/settings/tokens")
        print(" 2. Create a new token with 'read' permissions")
        print(" 3. Set it as environment variable: export HF_TOKEN='your_token'")
        return False
    # Try to access user info
    try:
        headers = {'Authorization': f'Bearer {hf_token}'}
        # Short timeout: this is a best-effort startup check, not a hard gate.
        response = requests.get('https://huggingface.co/api/whoami', headers=headers, timeout=5)
        if response.status_code == 200:
            user_info = response.json()
            username = user_info.get('name', 'Unknown')
            print(f"✅ Authenticated as: {username}")
            # Check if user has access to the specific space
            space_url = f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}"
            space_response = requests.get(space_url, headers=headers, timeout=5)
            if space_response.status_code == 200:
                print("✅ You have access to the backend Space")
                return True
            elif space_response.status_code == 401:
                # Token is valid but not authorized for this (private) Space.
                print("❌ You don't have access to the backend Space")
                print("🔧 Solutions:")
                print(" 1. Contact the Space owner to add you as collaborator")
                print(" 2. Ask the owner to make the Space public")
                return False
            elif space_response.status_code == 404:
                print("❌ Backend Space not found")
                print("🔧 Please check if the Space URL is correct")
                return False
            else:
                print(f"⚠️ Unexpected response checking Space access: {space_response.status_code}")
                return False
        else:
            print(f"❌ Token validation failed: {response.status_code}")
            print("🔧 Your token might be invalid or expired")
            return False
    except Exception as e:
        # Network/JSON errors all degrade to "no permission".
        print(f"❌ Error checking permissions: {e}")
        return False
def check_backend_space_status():
    """Check if backend space is running via HTTP request.

    Fetches the Space's public page and scrapes it for status keywords
    ("runtime error" / "building" / "sleeping"). Returns True only when
    the page is reachable and none of those indicators are present.
    """
    try:
        backend_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"
        print(f"🔍 Checking backend space status: {backend_url}")
        # Prepare headers with authentication if token is available
        headers = {}
        if hf_token:
            headers['Authorization'] = f'Bearer {hf_token}'
            print(f"🔐 Using HF Token for authentication")
        # Try to access the space page
        response = requests.get(backend_url, headers=headers, timeout=10)
        if response.status_code == 200:
            print("✅ Backend space page is accessible")
            # Check if space is running (look for common indicators)
            # NOTE(review): this keyword scrape of the HTML is heuristic and may
            # false-positive if the page text legitimately contains these words.
            page_content = response.text.lower()
            if "runtime error" in page_content:
                print("❌ Backend space has runtime error")
                return False
            elif "building" in page_content:
                print("🔄 Backend space is building...")
                return False
            elif "sleeping" in page_content:
                print("😴 Backend space is sleeping")
                return False
            else:
                print("✅ Backend space appears to be running")
                return True
        elif response.status_code == 401:
            print("❌ Authentication failed (HTTP 401)")
            print("🔧 This means:")
            print(" - The backend Space is private")
            print(" - Your HF Token doesn't have access to this Space")
            print(" - You need to be added as a collaborator to the Space")
            print(" - Or the Space owner needs to make it public")
            return False
        elif response.status_code == 404:
            print("❌ Backend space not found (HTTP 404)")
            print("🔧 Please check if the Space URL is correct:")
            print(f" Current URL: {BACKEND_SPACE_URL}")
            return False
        else:
            print(f"❌ Backend space not accessible (HTTP {response.status_code})")
            print(f"🔧 Response: {response.text[:200]}...")
            return False
    except requests.RequestException as e:
        # Connection/timeout errors are expected when the Space is down.
        print(f"❌ Failed to check backend space status: {e}")
        return False
    except Exception as e:
        print(f"❌ Unexpected error checking backend: {e}")
        return False
def initialize_backend():
    """Initialize backend connection using gradio_client.

    Side effects: sets the module-level ``backend_client`` and
    ``BACKEND_AVAILABLE``. Returns True on success, False otherwise.
    """
    global backend_client, BACKEND_AVAILABLE
    try:
        from gradio_client import Client
        # Connect to HF Space; pass the token when available so private
        # backend Spaces can be reached.
        if hf_token:
            backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)
        else:
            backend_client = Client(BACKEND_SPACE_URL)
        # Test the connection — view_api() raises if the Space API is unreachable.
        backend_client.view_api()
        BACKEND_AVAILABLE = True
        return True
    except Exception as e:
        print(f"❌ Backend connection failed: {e}")
        BACKEND_AVAILABLE = False
        return False
def numpy_to_base64(arr):
    """Serialize a numpy array's raw buffer as a base64 ASCII string."""
    raw_bytes = arr.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def base64_to_numpy(b64_str, shape, dtype):
    """Decode a base64 string (as produced by numpy_to_base64) into an array
    of the given shape and dtype."""
    raw_bytes = base64.b64decode(b64_str)
    flat = np.frombuffer(raw_bytes, dtype=dtype)
    return flat.reshape(shape)
def base64_to_image(b64_str):
    """Convert a base64-encoded image (compressed PNG/JPEG bytes) to an RGB array.

    Args:
        b64_str: base64 string of an encoded image, or empty/None.

    Returns:
        An RGB numpy array, or None for empty input or undecodable data.
    """
    if not b64_str:
        return None
    try:
        # Decode base64 to bytes
        img_bytes = base64.b64decode(b64_str)
        # Convert bytes to numpy array
        nparr = np.frombuffer(img_bytes, np.uint8)
        # Decode image. cv2.imdecode returns None (it does NOT raise) on
        # corrupt or unsupported data, so check explicitly instead of
        # letting cvtColor fail on a None input.
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            print("Error converting base64 to image: cv2.imdecode returned None")
            return None
        # Convert BGR to RGB (OpenCV decodes as BGR; the UI expects RGB)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img
    except Exception as e:
        print(f"Error converting base64 to image: {e}")
        return None
def get_video_name(video_path):
    """Return the video's base filename with its extension removed."""
    filename = os.path.basename(video_path)
    stem, _extension = os.path.splitext(filename)
    return stem
def extract_first_frame(video_path):
    """Read the first frame of a video file and return it as an RGB array.

    Returns None when the file cannot be opened or no frame can be read.
    """
    try:
        capture = cv2.VideoCapture(video_path)
        success, frame_bgr = capture.read()
        capture.release()
        if not success:
            return None
        # OpenCV decodes frames as BGR; the UI expects RGB.
        return cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
def handle_video_upload(video):
    """Handle video upload and extract first frame.

    Preferred path: delegate to the backend Space's unified API. If the
    backend is unavailable or fails, fall back to local frame extraction
    and build a JSON state blob compatible with what the backend expects.

    Returns a 6-tuple: (original_image_state, display_image,
    selected_points, grid_size update, vo_points update, fps update).
    """
    if video is None:
        # Nothing uploaded: clear state and restore default slider values.
        return (None, None, [],
                gr.update(value=50),
                gr.update(value=756),
                gr.update(value=3))
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print("🔧 Calling backend API for video upload...")
                # Call the unified API with upload_video function type - fix: use handle_file wrapper
                result = backend_client.predict(
                    "upload_video",  # function_type
                    handle_file(video),  # video file - wrapped with handle_file
                    "",  # original_image_state (not used for upload)
                    [],  # selected_points (not used for upload)
                    "positive_point",  # point_type (not used for upload)
                    0,  # point_x (not used for upload)
                    0,  # point_y (not used for upload)
                    50,  # grid_size (not used for upload)
                    756,  # vo_points (not used for upload)
                    3,  # fps (not used for upload)
                    api_name="/unified_api"
                )
                print(f"✅ Backend video upload API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # Parse the result - expect a dict with success status
                if isinstance(result, dict) and result.get("success"):
                    # Extract data from backend response
                    original_image_state = result.get("original_image_state", "")
                    display_image = result.get("display_image", None)
                    selected_points = result.get("selected_points", [])
                    # Fix: Convert display_image from list back to numpy array if needed
                    # (JSON transport turns arrays into nested lists).
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    # Get video settings based on video name
                    video_name = get_video_name(video)
                    print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
                    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
                    print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
                    return (original_image_state, display_image, selected_points,
                            gr.update(value=grid_size_val),
                            gr.update(value=vo_points_val),
                            gr.update(value=fps_val))
                else:
                    print("Backend processing failed, using local fallback")
                    # Fallback to local processing
                    pass
            except Exception as e:
                print(f"Backend API call failed: {e}")
                # Fallback to local processing
                pass
        # Fallback: local processing
        print("Using local video processing...")
        display_image = extract_first_frame(video)
        if display_image is not None:
            # Create a state format compatible with backend
            import tempfile
            import shutil
            # Create a temporary directory for this session
            session_id = str(int(time.time() * 1000))  # Use timestamp as session ID
            temp_dir = os.path.join("temp_frontend", f"session_{session_id}")
            os.makedirs(temp_dir, exist_ok=True)
            # Copy video to temp directory with standardized name
            video_name = get_video_name(video)
            temp_video_path = os.path.join(temp_dir, f"{video_name}.mp4")
            shutil.copy(video, temp_video_path)
            # Create state format compatible with backend: the frame is
            # serialized as raw base64 plus shape/dtype for reconstruction.
            frame_data = {
                'data': numpy_to_base64(display_image),
                'shape': display_image.shape,
                'dtype': str(display_image.dtype),
                'temp_dir': temp_dir,
                'video_name': video_name,
                'video_path': temp_video_path  # Keep for backward compatibility
            }
            original_image_state = json.dumps(frame_data)
        else:
            # Fallback to simple state if frame extraction fails
            original_image_state = json.dumps({
                "video_path": video,
                "frame": "local_processing_failed"
            })
        # Get video settings
        video_name = get_video_name(video)
        print(f"🎬 Local fallback - Video path: '{video}' -> Video name: '{video_name}'")
        grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
        print(f"🎬 Local fallback - Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
        return (original_image_state, display_image, [],
                gr.update(value=grid_size_val),
                gr.update(value=vo_points_val),
                gr.update(value=fps_val))
    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
        return (None, None, [],
                gr.update(value=50),
                gr.update(value=756),
                gr.update(value=3))
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Handle point selection for SAM.

    Forwards the clicked coordinate to the backend when available;
    otherwise draws the point locally on the re-extracted first frame.
    Returns (display_image, updated_selected_points).
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
                # Call the unified API with select_point function type
                result = backend_client.predict(
                    "select_point",  # function_type
                    None,  # video file (not used for select_point)
                    original_img,  # original_image_state
                    sel_pix,  # selected_points
                    point_type,  # point_type
                    evt.index[0],  # point_x
                    evt.index[1],  # point_y
                    50,  # grid_size (not used for select_point)
                    756,  # vo_points (not used for select_point)
                    3,  # fps (not used for select_point)
                    api_name="/unified_api"
                )
                print(f"✅ Backend select point API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # Parse the result - expect a dict with success status
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", sel_pix)
                    # Fix: Convert display_image from list back to numpy array if needed
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    return display_image, new_sel_pix
                else:
                    print("Backend processing failed, using local fallback")
                    # Fallback to local processing
                    pass
            except Exception as e:
                print(f"Backend API call failed: {e}")
                # Check for specific gradio_client errors to give targeted hints.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")
                print("🔄 Showing error message instead of visualization...")
                # Fallback to local processing
                pass
        # Fallback: local processing with improved visualization
        print("Using local point selection with enhanced visualization...")
        # Parse original image state
        try:
            state_data = json.loads(original_img)
            video_path = state_data.get("video_path")
        except:
            video_path = None
        if video_path:
            # Re-extract frame and add point with mask visualization
            display_image = extract_first_frame(video_path)
            if display_image is not None:
                # Add point to the image with enhanced visualization
                x, y = evt.index[0], evt.index[1]
                # Green marks positive (include) points, red marks negative ones.
                color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
                # Draw a larger, more visible point (filled dot + white ring)
                cv2.circle(display_image, (x, y), 8, color, -1)
                cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)
                # Add point to selected points list - fix logic to match local version
                new_sel_pix = sel_pix.copy() if sel_pix else []
                new_sel_pix.append([x, y, point_type])
                return display_image, new_sel_pix
        return None, []
    except Exception as e:
        print(f"Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Reset points and restore original image.

    Asks the backend to clear the selection when available; otherwise
    re-extracts the first frame locally. Returns (display_image, []).
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print("🔧 Calling backend reset points API...")
                # Call the unified API with reset_points function type
                result = backend_client.predict(
                    "reset_points",  # function_type
                    None,  # video file (not used for reset_points)
                    original_img,  # original_image_state
                    sel_pix,  # selected_points
                    "positive_point",  # point_type (not used for reset_points)
                    0,  # point_x (not used for reset_points)
                    0,  # point_y (not used for reset_points)
                    50,  # grid_size (not used for reset_points)
                    756,  # vo_points (not used for reset_points)
                    3,  # fps (not used for reset_points)
                    api_name="/unified_api"
                )
                print(f"✅ Backend reset points API call successful!")
                print(f"🔧 Result: {result}")
                # Parse the result
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", [])
                    # Fix: Convert display_image from list back to numpy array if needed
                    if isinstance(display_image, list):
                        display_image = np.array(display_image, dtype=np.uint8)
                        print(f"🔧 Converted display_image from list to numpy array: {display_image.shape}")
                    return display_image, new_sel_pix
                else:
                    print("Backend processing failed, using local fallback")
                    # Fallback to local processing
                    pass
            except Exception as e:
                print(f"Backend API call failed: {e}")
                # Fallback to local processing
                pass
        # Fallback: local processing
        print("Using local reset points...")
        # Parse original image state
        try:
            state_data = json.loads(original_img)
            video_path = state_data.get("video_path")
        except:
            video_path = None
        if video_path:
            # Re-extract original frame (discards any drawn markers)
            display_image = extract_first_frame(video_path)
            return display_image, []
        return None, []
    except Exception as e:
        print(f"Error in reset_points: {e}")
        return None, []
# Expose the ./_viz output directory as static files so the generated
# visualization HTML can be served to the iframe in launch_viz().
gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
def launch_viz(grid_size, vo_points, fps, original_image_state):
    """Launch visualization with user-specific temp directory.

    Runs the tracker on the backend and renders its HTML output inside an
    iframe; the tracked video (returned as base64) is saved locally for
    download. Falls back to an informative HTML error panel when the
    backend is unavailable or the state came from local processing.

    Returns (iframe_or_error_html, local_video_path_or_None,
    html_file_path_or_None).
    """
    if original_image_state is None:
        return None, None, None
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                print(f"🔧 Original image state type: {type(original_image_state)}")
                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
                # Validate and potentially fix the original_image_state format
                state_to_send = original_image_state
                # Check if this is a local processing state that needs to be converted
                try:
                    if isinstance(original_image_state, str):
                        parsed_state = json.loads(original_image_state)
                        if "video_path" in parsed_state and "frame" in parsed_state:
                            # This is a local processing state, we need to handle differently
                            print("🔧 Detected local processing state, cannot use backend for tracking")
                            print("🔧 Backend requires proper video upload state from backend API")
                            # Fall through to local processing
                            raise ValueError("Local state cannot be processed by backend")
                except json.JSONDecodeError:
                    print("🔧 Invalid JSON state, cannot send to backend")
                    raise ValueError("Invalid state format")
                # Call the unified API with run_tracker function type
                result = backend_client.predict(
                    "run_tracker",  # function_type
                    None,  # video file (not used for run_tracker)
                    state_to_send,  # original_image_state
                    [],  # selected_points (not used for run_tracker)
                    "positive_point",  # point_type (not used for run_tracker)
                    0,  # point_x (not used for run_tracker)
                    0,  # point_y (not used for run_tracker)
                    grid_size,  # grid_size
                    vo_points,  # vo_points
                    fps,  # fps
                    api_name="/unified_api"
                )
                print(f"✅ Backend API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # Parse the result
                if isinstance(result, dict) and result.get("success"):
                    viz_html = result.get("viz_html", "")
                    track_video_path = result.get("track_video_path", "")
                    track_video_content = result.get("track_video_content", None)
                    track_video_filename = result.get("track_video_filename", "tracked_video.mp4")
                    # Save HTML to _viz directory (like local version); the
                    # timestamped name keeps concurrent sessions from clashing.
                    viz_dir = './_viz'
                    os.makedirs(viz_dir, exist_ok=True)
                    random_path = f'./_viz/_{time.time()}.html'
                    with open(random_path, 'w', encoding='utf-8') as f:
                        f.write(viz_html)
                    # Create iframe HTML (served via the static path registered
                    # with gr.set_static_paths)
                    iframe_html = f"""
                    <div style='border: 3px solid #667eea; border-radius: 10px;
                                background: #f8f9ff; height: 650px; width: 100%;
                                box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
                                margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
                        <iframe id="viz_iframe" src="/gradio_api/file={random_path}"
                                width="100%" height="650" frameborder="0"
                                style="border: none; display: block; width: 100%; height: 650px;
                                       margin: 0; padding: 0; border-radius: 7px;">
                        </iframe>
                    </div>
                    """
                    print(f"💾 HTML saved to: {random_path}")
                    print(f"📊 HTML content preview: {viz_html[:200]}...")
                    # If we have base64 encoded video content, save it as a temporary file
                    local_video_path = None
                    if track_video_content:
                        try:
                            # Create a temporary file for the video
                            temp_video_dir = "temp_frontend_videos"
                            os.makedirs(temp_video_dir, exist_ok=True)
                            # Generate unique filename to avoid conflicts
                            timestamp = str(int(time.time() * 1000))
                            local_video_path = os.path.join(temp_video_dir, f"{timestamp}_{track_video_filename}")
                            # Decode base64 and save as video file
                            video_bytes = base64.b64decode(track_video_content)
                            with open(local_video_path, 'wb') as f:
                                f.write(video_bytes)
                            print(f"✅ Successfully saved tracking video to: {local_video_path}")
                            print(f"🔧 Video file size: {len(video_bytes)} bytes")
                        except Exception as e:
                            print(f"❌ Failed to process tracking video: {e}")
                            local_video_path = None
                    else:
                        print("⚠️ No tracking video content received from backend")
                    # Return the iframe HTML, the video path, and the HTML file path (for download)
                    return iframe_html, local_video_path, random_path
                else:
                    error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
                    print(f"❌ Backend processing failed: {error_msg}")
                    # Fall through to error message
                    pass
            except Exception as e:
                print(f"❌ Backend API call failed: {e}")
                print(f"🔧 Error type: {type(e)}")
                print(f"🔧 Error details: {str(e)}")
                # Check for specific gradio_client errors to give targeted hints.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                elif "Expecting value" in str(e):
                    print("🔧 JSON parsing error in backend - state format mismatch")
                    print("🔧 This happens when using local processing state with backend API")
                    print("🔧 Please upload video again to use backend processing")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")
                print("🔄 Showing error message instead of visualization...")
                # Fall through to error message
                pass
        # Create an informative error message based on the state
        state_info = ""
        try:
            if isinstance(original_image_state, str):
                parsed_state = json.loads(original_image_state)
                if "video_path" in parsed_state:
                    video_name = os.path.basename(parsed_state["video_path"])
                    state_info = f"Video: {video_name}"
        except:
            state_info = "State format unknown"
        # Fallback: show message that backend is required
        error_message = f"""
        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
            <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Processing Required</h3>
            <p style='color: #2d3436; line-height: 1.6;'>
                The tracking and visualization features require backend processing. The current setup is using local processing which is incompatible with the backend API.
            </p>
            <h4 style='color: #d63031; margin: 15px 0 10px 0;'>Solutions:</h4>
            <ul style='color: #2d3436; line-height: 1.6;'>
                <li><strong>Upload video again:</strong> This will properly initialize the backend state</li>
                <li><strong>Select points on the frame:</strong> Ensure you've clicked on the object to track</li>
                <li><strong>Check backend connection:</strong> Ensure the backend Space is running</li>
                <li><strong>Use compatible state:</strong> Avoid local processing mode</li>
            </ul>
            <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 15px;'>
                <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
            </div>
            <div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
                <p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
                <p style='color: #1976d2; font-size: 13px; margin: 0;'>
                    Try uploading your video again - this should properly initialize the backend state for tracking.
                </p>
            </div>
        </div>
        """
        return error_message, None, None
    except Exception as e:
        print(f"Error in launch_viz: {e}")
        return None, None, None
def clear_all():
    """Clear all buffers and temporary files.

    Returns cleared state/image/points plus slider updates restoring the
    default grid_size/vo_points/fps values.
    """
    default_grid, default_vo, default_fps = 50, 756, 3
    return (None, None, [],
            gr.update(value=default_grid),
            gr.update(value=default_vo),
            gr.update(value=default_fps))
def clear_all_with_download():
    """Clear all buffers including both download components.

    Same as clear_all() but also clears the tracking-video and HTML
    download outputs (the two trailing Nones).
    """
    default_grid, default_vo, default_fps = 50, 756, 3
    cleared = (None, None, [],
               gr.update(value=default_grid),
               gr.update(value=default_vo),
               gr.update(value=default_fps))
    # Append cleared values for tracking_video_download and the HTML download.
    return cleared + (None, None)
def update_tracker_model(model_name):
    """Placeholder hook for switching tracker models; currently a no-op."""
    return None
def get_video_settings(video_name):
    """Look up per-video (grid_size, vo_points, fps) presets by video name.

    Unknown names fall back to the defaults (50, 756, 3).
    """
    default_settings = (50, 756, 3)
    # Hand-tuned presets for the bundled example videos.
    presets = {
        "kiss": (45, 700, 10),
        "backpack": (40, 600, 2),
        "kitchen": (60, 800, 3),
        "pillow": (35, 500, 2),
        "handwave": (35, 500, 8),
        "hockey": (45, 700, 2),
        "drifting": (35, 1000, 6),
        "basketball": (45, 1500, 5),
        "ken_block_0": (45, 700, 2),
        "ego_kc1": (45, 500, 4),
        "vertical_place": (45, 500, 3),
        "ego_teaser": (45, 1200, 10),
        "robot_unitree": (45, 500, 4),
        "robot_3": (35, 400, 5),
        "teleop2": (45, 256, 7),
        "pusht": (45, 256, 10),
        "cinema_0": (45, 356, 5),
        "cinema_1": (45, 756, 3),
    }
    return presets.get(video_name, default_settings)
def test_backend_connection():
    """Test if backend is actually working.

    Inspects the gradio_client's registered function table instead of
    invoking an endpoint, so the check is cheap and side-effect free.
    Returns True when API functions are visible, False otherwise.
    """
    # NOTE(review): BACKEND_AVAILABLE is declared global here but never
    # reassigned in this function — confirm whether it was meant to be updated.
    global BACKEND_AVAILABLE
    if not backend_client:
        return False
    try:
        print("Testing backend connection with a simple call...")
        # Check if we have fns available
        if hasattr(backend_client, 'fns') and backend_client.fns:
            print("✅ Backend API functions are available")
            print(f"🔧 Available function indices: {list(backend_client.fns.keys())}")
            return True
        else:
            print("❌ Backend API functions not found")
            return False
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False
def test_backend_api():
    """Test specific backend API functions.

    Verbose variant of test_backend_connection(): lists every function
    index exposed by the backend client. Returns True when at least one
    function is registered, False otherwise.
    """
    if not BACKEND_AVAILABLE or not backend_client:
        print("❌ Backend not available for testing")
        return False
    try:
        print("🧪 Testing backend API functions...")
        # Test if fns exist and show available indices
        if hasattr(backend_client, 'fns') and backend_client.fns:
            print(f"✅ Backend has {len(backend_client.fns)} functions available")
            for idx in backend_client.fns.keys():
                print(f"✅ Function {idx} is available")
        else:
            print("❌ No functions found in backend API")
            return False
        return True
    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False
# Initialize the backend connection at import time so the UI handlers know
# whether to route work to the backend Space or use local fallbacks.
print("🚀 Initializing frontend application...")
result = initialize_backend()
# Test backend connection if available
if result and BACKEND_AVAILABLE:
    print("✅ Backend connection successful!")
else:
    print("❌ Backend connection failed!")
# Create the Gradio interface
print("🎨 Creating Gradio interface...")
| with gr.Blocks( | |
| theme=gr.themes.Soft(), | |
| title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2) - Frontend Interface", | |
| css=""" | |
| .gradio-container { | |
| max-width: 1200px !important; | |
| margin: auto !important; | |
| } | |
| .gr-button { | |
| margin: 5px; | |
| } | |
| .gr-form { | |
| background: white; | |
| border-radius: 10px; | |
| padding: 20px; | |
| box-shadow: 0 2px 10px rgba(0,0,0,0.1); | |
| } | |
| /* 固定3D可视化器尺寸 */ | |
| #viz_container { | |
| height: 650px !important; | |
| min-height: 650px !important; | |
| max-height: 650px !important; | |
| width: 100% !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| overflow: hidden !important; | |
| } | |
| #viz_container > div { | |
| height: 650px !important; | |
| min-height: 650px !important; | |
| max-height: 650px !important; | |
| width: 100% !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| box-sizing: border-box !important; | |
| } | |
| #viz_container iframe { | |
| height: 650px !important; | |
| min-height: 650px !important; | |
| max-height: 650px !important; | |
| width: 100% !important; | |
| border: none !important; | |
| display: block !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| box-sizing: border-box !important; | |
| } | |
| /* 固定视频上传组件高度 */ | |
| .gr-video { | |
| height: 300px !important; | |
| min-height: 300px !important; | |
| max-height: 300px !important; | |
| } | |
| .gr-video video { | |
| height: 260px !important; | |
| max-height: 260px !important; | |
| object-fit: contain !important; | |
| background: #f8f9fa; | |
| } | |
| .gr-video .gr-video-player { | |
| height: 260px !important; | |
| max-height: 260px !important; | |
| } | |
| /* 水平滚动的示例视频样式 */ | |
| .example-videos .gr-examples { | |
| overflow: visible !important; | |
| } | |
| .example-videos .gr-examples .gr-table-wrapper { | |
| overflow-x: auto !important; | |
| overflow-y: hidden !important; | |
| scrollbar-width: thin; | |
| scrollbar-color: #667eea #f1f1f1; | |
| } | |
| .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar { | |
| height: 8px; | |
| } | |
| .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-track { | |
| background: #f1f1f1; | |
| border-radius: 4px; | |
| } | |
| .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb { | |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
| border-radius: 4px; | |
| } | |
| .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover { | |
| background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%); | |
| } | |
| .example-videos .gr-examples .gr-table { | |
| display: flex !important; | |
| flex-wrap: nowrap !important; | |
| min-width: max-content !important; | |
| gap: 10px !important; | |
| } | |
| .example-videos .gr-examples .gr-table tbody { | |
| display: flex !important; | |
| flex-direction: row !important; | |
| flex-wrap: nowrap !important; | |
| gap: 10px !important; | |
| } | |
| .example-videos .gr-examples .gr-table tbody tr { | |
| display: flex !important; | |
| flex-direction: column !important; | |
| min-width: 120px !important; | |
| max-width: 120px !important; | |
| margin: 0 !important; | |
| background: white; | |
| border-radius: 8px; | |
| box-shadow: 0 2px 8px rgba(0,0,0,0.1); | |
| transition: all 0.3s ease; | |
| cursor: pointer; | |
| } | |
| .example-videos .gr-examples .gr-table tbody tr:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2); | |
| } | |
| .example-videos .gr-examples .gr-table tbody tr td { | |
| text-align: center !important; | |
| padding: 8px !important; | |
| border: none !important; | |
| } | |
| .example-videos .gr-examples .gr-table tbody tr td video { | |
| border-radius: 6px !important; | |
| width: 100% !important; | |
| height: auto !important; | |
| } | |
| .example-videos .gr-examples .gr-table tbody tr td:last-child { | |
| font-size: 12px !important; | |
| font-weight: 500 !important; | |
| color: #333 !important; | |
| padding-top: 4px !important; | |
| } | |
| /* 新的水平滚动示例视频样式 */ | |
| .horizontal-examples .gr-examples { | |
| overflow: visible !important; | |
| } | |
| .horizontal-examples .gr-examples .gr-table-wrapper { | |
| overflow-x: auto !important; | |
| overflow-y: hidden !important; | |
| scrollbar-width: thin; | |
| scrollbar-color: #667eea #f1f1f1; | |
| padding: 10px 0; | |
| } | |
| .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar { | |
| height: 8px; | |
| } | |
| .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track { | |
| background: #f1f1f1; | |
| border-radius: 4px; | |
| } | |
| .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb { | |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
| border-radius: 4px; | |
| } | |
| .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover { | |
| background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%); | |
| } | |
| .horizontal-examples .gr-examples .gr-table { | |
| display: flex !important; | |
| flex-wrap: nowrap !important; | |
| min-width: max-content !important; | |
| gap: 15px !important; | |
| padding-bottom: 10px; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody { | |
| display: flex !important; | |
| flex-direction: row !important; | |
| flex-wrap: nowrap !important; | |
| gap: 15px !important; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr { | |
| display: flex !important; | |
| flex-direction: column !important; | |
| min-width: 160px !important; | |
| max-width: 160px !important; | |
| margin: 0 !important; | |
| background: white; | |
| border-radius: 12px; | |
| box-shadow: 0 3px 12px rgba(0,0,0,0.12); | |
| transition: all 0.3s ease; | |
| cursor: pointer; | |
| overflow: hidden; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr:hover { | |
| transform: translateY(-4px); | |
| box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25); | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr td { | |
| text-align: center !important; | |
| padding: 0 !important; | |
| border: none !important; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr td:first-child { | |
| padding: 0 !important; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr td video { | |
| border-radius: 8px 8px 0 0 !important; | |
| width: 100% !important; | |
| height: 90px !important; | |
| object-fit: cover !important; | |
| } | |
| .horizontal-examples .gr-examples .gr-table tbody tr td:last-child { | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| color: #333 !important; | |
| padding: 8px 12px !important; | |
| background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%); | |
| border-radius: 0 0 8px 8px; | |
| } | |
| """ | |
| ) as demo: | |
    # -----------------------------------------------------------------
    # Body of the `with gr.Blocks(...) as demo:` context opened above.
    # Builds the whole UI, then wires component events to the handler
    # functions (handle_video_upload, select_point, reset_points,
    # clear_all_with_download, launch_viz) defined elsewhere in this file.
    # -----------------------------------------------------------------

    # Intro / usage instructions shown at the top of the page.
    gr.Markdown("""
Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model.
**⚡ Quick Start:** Upload video → Click "Start Tracking Now!"
**🔬 Advanced Usage with SAM:**
1. Upload a video file or select from examples below
2. Expand "Manual Point Selection" to click on specific objects for SAM-guided tracking
3. Adjust tracking parameters for optimal performance
4. Click "Start Tracking Now!" to begin 3D tracking with SAM guidance
""")
    # Status indicator - more compact.  BACKEND_AVAILABLE is set during the
    # backend connectivity check near the top of this file.
    status_info = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Standalone Mode"
    gr.Markdown(f"**Status:** {status_info} | Backend: {BACKEND_SPACE_URL}")

    # Main content area - video upload left, 3D visualization right
    with gr.Row():
        with gr.Column(scale=1):
            # Video upload section
            with gr.Group():
                gr.Markdown("### 📂 Select Video")
                # Define video_input here so it can be referenced in examples
                video_input = gr.Video(
                    label="Upload Video or Select Example",
                    format="mp4",
                    height=250  # Matched height with 3D viz
                )
                # Horizontal video examples with slider
                gr.Markdown("**Examples:** (scroll horizontally to see all videos)")
                # Traditional examples but with horizontal scroll styling.
                # The "horizontal-examples" class is targeted by the custom CSS
                # passed to gr.Blocks above.
                with gr.Row(elem_classes=["horizontal-examples"]):
                    # fn=None: clicking an example only fills video_input;
                    # no processing function runs until the user starts tracking.
                    gr.Examples(
                        examples=[
                            ["./examples/kiss.mp4"],
                            ["./examples/backpack.mp4"],
                            ["./examples/kitchen.mp4"],
                            ["./examples/pillow.mp4"],
                            ["./examples/handwave.mp4"],
                            ["./examples/hockey.mp4"],
                            ["./examples/drifting.mp4"],
                            ["./examples/basketball.mp4"],
                            ["./examples/ken_block_0.mp4"],
                            ["./examples/ego_kc1.mp4"],
                            ["./examples/vertical_place.mp4"],
                            ["./examples/ego_teaser.mp4"],
                            ["./examples/robot_unitree.mp4"],
                            ["./examples/robot_3.mp4"],
                            ["./examples/teleop2.mp4"],
                            ["./examples/pusht.mp4"],
                            ["./examples/cinema_0.mp4"],
                            ["./examples/cinema_1.mp4"],
                        ],
                        inputs=[video_input],
                        outputs=[video_input],
                        fn=None,
                        cache_examples=False,
                        label="",
                        examples_per_page=6  # Show 6 examples per page so they can wrap to multiple rows
                    )
        with gr.Column(scale=2):
            # 3D Visualization - wider and taller to match left side
            with gr.Group():
                gr.Markdown("### 🌐 3D Trajectory Visualization")
                # Placeholder HTML shown until launch_viz replaces it with the
                # actual visualizer.  elem_id="viz_container" is sized by the
                # custom CSS (fixed 650px height).
                viz_html = gr.HTML(
                    label="3D Trajectory Visualization",
                    value="""
<div style='border: 3px solid #667eea; border-radius: 10px;
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
text-align: center; height: 650px; display: flex;
flex-direction: column; justify-content: center; align-items: center;
box-shadow: 0 4px 16px rgba(102, 126, 234, 0.15);
margin: 0; padding: 20px; box-sizing: border-box;'>
<div style='font-size: 56px; margin-bottom: 25px;'>🌐</div>
<h3 style='color: #667eea; margin-bottom: 18px; font-size: 28px; font-weight: 600;'>
3D Trajectory Visualization
</h3>
<p style='color: #666; font-size: 18px; line-height: 1.6; max-width: 550px; margin-bottom: 30px;'>
Track any pixels in 3D space with camera motion
</p>
<div style='background: rgba(102, 126, 234, 0.1); border-radius: 30px;
padding: 15px 30px; border: 1px solid rgba(102, 126, 234, 0.2);'>
<span style='color: #667eea; font-weight: 600; font-size: 16px;'>
⚡ Powered by SpatialTracker V2
</span>
</div>
</div>
""",
                    elem_id="viz_container"
                )

    # Start button section - below video area
    with gr.Row():
        with gr.Column(scale=3):
            launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
        with gr.Column(scale=1):
            clear_all_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm")

    # Tracking parameters section
    with gr.Row():
        gr.Markdown("### ⚙️ Tracking Parameters")
    with gr.Row():
        # NOTE(review): default values here may be overwritten per-video by
        # handle_video_upload (these sliders are among its outputs).
        grid_size = gr.Slider(
            minimum=10, maximum=100, step=10, value=50,
            label="Grid Size", info="Tracking detail level"
        )
        vo_points = gr.Slider(
            minimum=100, maximum=2000, step=50, value=756,
            label="VO Points", info="Motion accuracy"
        )
        fps = gr.Slider(
            minimum=1, maximum=30, step=1, value=3,
            label="FPS", info="Processing speed"
        )

    # Advanced Point Selection with SAM - Collapsed by default
    with gr.Row():
        gr.Markdown("### 🎯 Advanced: Manual Point Selection with SAM")
    with gr.Accordion("🔬 SAM Point Selection Controls", open=False):
        gr.HTML("""
<div style='margin-bottom: 15px;'>
<ul style='color: #4a5568; font-size: 14px; line-height: 1.6; margin: 0; padding-left: 20px;'>
<li>Click on target objects in the image for SAM-guided segmentation</li>
<li>Positive points: include these areas | Negative points: exclude these areas</li>
<li>Get more accurate 3D tracking results with SAM's powerful segmentation</li>
</ul>
</div>
""")
        with gr.Row():
            with gr.Column():
                # Shows the first video frame; clicks are routed to select_point
                # via the .select() wiring below.
                interactive_frame = gr.Image(
                    label="Click to select tracking points with SAM guidance",
                    type="numpy",
                    interactive=True,
                    height=300
                )
                with gr.Row():
                    point_type = gr.Radio(
                        choices=["positive_point", "negative_point"],
                        value="positive_point",
                        label="Point Type",
                        info="Positive: track these areas | Negative: avoid these areas"
                    )
                with gr.Row():
                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")

    # Downloads section - hidden but still functional for backend processing.
    # These File components receive launch_viz outputs even though the row is
    # never shown to the user.
    with gr.Row(visible=False):
        with gr.Column(scale=1):
            tracking_video_download = gr.File(
                label="📹 Download 2D Tracking Video",
                interactive=False,
                visible=False
            )
        with gr.Column(scale=1):
            html_download = gr.File(
                label="📄 Download 3D Visualization HTML",
                interactive=False,
                visible=False
            )

    # GitHub Star Section
    gr.HTML("""
<div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
border-radius: 8px; padding: 20px; margin: 15px 0;
box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
border: 1px solid rgba(102, 126, 234, 0.15);'>
<div style='text-align: center;'>
<h3 style='color: #4a5568; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
⭐ Love SpatialTracker? Give us a Star! ⭐
</h3>
<p style='color: #666; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
Help us grow by starring our repository on GitHub! Your support means a lot to the community. 🚀
</p>
<a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank"
style='display: inline-flex; align-items: center; gap: 8px;
background: rgba(102, 126, 234, 0.1); color: #4a5568;
padding: 10px 20px; border-radius: 25px; text-decoration: none;
font-weight: bold; font-size: 14px; border: 1px solid rgba(102, 126, 234, 0.2);
transition: all 0.3s ease;'
onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-2px)'"
onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
<span style='font-size: 16px;'>⭐</span>
Star SpatialTracker V2 on GitHub
</a>
</div>
</div>
""")

    # Acknowledgments Section
    gr.HTML("""
<div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
border-radius: 8px; padding: 20px; margin: 15px 0;
box-shadow: 0 2px 8px rgba(255, 193, 7, 0.1);
border: 1px solid rgba(255, 193, 7, 0.2);'>
<div style='text-align: center;'>
<h3 style='color: #5d4037; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
📚 Acknowledgments
</h3>
<p style='color: #5d4037; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work and contribution to the computer vision community!
</p>
<a href="https://github.com/zbw001/TAPIP3D" target="_blank"
style='display: inline-flex; align-items: center; gap: 8px;
background: rgba(255, 193, 7, 0.15); color: #5d4037;
padding: 10px 20px; border-radius: 25px; text-decoration: none;
font-weight: bold; font-size: 14px; border: 1px solid rgba(255, 193, 7, 0.3);
transition: all 0.3s ease;'
onmouseover="this.style.background='rgba(255, 193, 7, 0.25)'; this.style.transform='translateY(-2px)'"
onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'; this.style.transform='translateY(0)'">
📚 Visit TAPIP3D Repository
</a>
</div>
</div>
""")

    # Footer
    gr.HTML("""
<div style='text-align: center; margin: 20px 0 10px 0;'>
<span style='font-size: 12px; color: #888; font-style: italic;'>
Powered by SpatialTracker V2 | Built with ❤️ for the Computer Vision Community
</span>
</div>
""")

    # Hidden state variables shared across event handlers:
    # - original_image_state: first frame of the uploaded video (set by
    #   handle_video_upload), used as the clean base image for point overlays.
    # - selected_points: accumulated SAM click points for the current video.
    original_image_state = gr.State(None)
    selected_points = gr.State([])

    # Event handlers (all handler functions are defined elsewhere in this file).
    # New upload: extract first frame, reset points, and set per-video defaults
    # for the three parameter sliders.
    video_input.change(
        fn=handle_video_upload,
        inputs=[video_input],
        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
    )
    # Click on the frame: record a positive/negative point and redraw overlay.
    interactive_frame.select(
        fn=select_point,
        inputs=[original_image_state, selected_points, point_type],
        outputs=[interactive_frame, selected_points]
    )
    # Clear the accumulated points but keep the current frame.
    reset_points_btn.click(
        fn=reset_points,
        inputs=[original_image_state, selected_points],
        outputs=[interactive_frame, selected_points]
    )
    # Full reset: video, frame, points, sliders, and both download slots.
    clear_all_btn.click(
        fn=clear_all_with_download,
        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, tracking_video_download, html_download]
    )
    # Run tracking and populate the 3D viewer plus the hidden download files.
    launch_btn.click(
        fn=launch_viz,
        inputs=[grid_size, vo_points, fps, original_image_state],
        outputs=[viz_html, tracking_video_download, html_download]
    )
| # Launch the interface | |
# Launch the interface
if __name__ == "__main__":
    print("🌟 Launching SpatialTracker V2 Frontend...")
    print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")
    # `share=True` is not supported inside a Hugging Face Space (the Space
    # already serves a public URL) and Gradio ignores it with a warning there,
    # so only request a tunnel for local runs.  SPACE_ID is set by the Spaces
    # runtime and absent locally.
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces so the container can route traffic
        server_port=7860,       # standard Gradio / Spaces port
        share=not os.getenv("SPACE_ID"),
        debug=True,             # keep the process attached; surfaces tracebacks
        show_error=True,        # show errors in the UI instead of a generic message
    )