"""SHARP MCP Server for programmatic access to 3D Gaussian prediction.
Run standalone:
uv run python mcp_server.py
Or integrate with MCP clients via stdio transport.
"""

from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Literal

import torch
from mcp.server.fastmcp import FastMCP

from model_utils import (
    DEFAULT_OUTPUTS_DIR,
    ModelWrapper,
    TrajectoryType,
    get_global_model,
)

# Port for non-stdio transports; unused by the stdio entrypoint below.
MCP_PORT: int = int(os.getenv("SHARP_MCP_PORT", "49201"))

mcp = FastMCP(
    "sharp",
    description="SHARP: Single-image 3D Gaussian scene prediction",
)

# -----------------------------------------------------------------------------
# Tools
# -----------------------------------------------------------------------------

@mcp.tool()
def sharp_predict(
    image_path: str,
    render_video: bool = True,
    trajectory_type: TrajectoryType = "rotate_forward",
    num_frames: int = 60,
    fps: int = 30,
    output_long_side: int | None = None,
) -> dict:
    """Predict 3D Gaussians from a single image.

    Args:
        image_path: Absolute path to input image (jpg/png/webp).
        render_video: Whether to render a camera trajectory video (requires CUDA).
        trajectory_type: Camera trajectory type (swipe/shake/rotate/rotate_forward).
        num_frames: Number of frames for video rendering.
        fps: Frames per second for video.
        output_long_side: Output resolution (longest side). None = match input.

    Returns:
        dict with keys:
        - ply_path: Path to exported PLY file
        - video_path: Path to rendered MP4 (or null if not rendered)
        - cuda_available: Whether CUDA was available
    """
    image_path_obj = Path(image_path)
    if not image_path_obj.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    model = get_global_model()
    video_path, ply_path = model.predict_and_maybe_render(
        image_path_obj,
        trajectory_type=trajectory_type,
        num_frames=num_frames,
        fps=fps,
        output_long_side=output_long_side,
        render_video=render_video,
    )
    return {
        "ply_path": str(ply_path),
        "video_path": str(video_path) if video_path else None,
        "cuda_available": torch.cuda.is_available(),
    }
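
# Illustrative shape of the dict returned by sharp_predict (placeholder paths;
# actual files are written under DEFAULT_OUTPUTS_DIR):
#
#   {
#       "ply_path": "/abs/path/to/outputs/example.ply",
#       "video_path": "/abs/path/to/outputs/example.mp4",
#       "cuda_available": True,
#   }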

@mcp.tool()
def sharp_render(
    ply_path: str,
    trajectory_type: TrajectoryType = "rotate_forward",
    num_frames: int = 60,
    fps: int = 30,
    output_long_side: int | None = None,
) -> dict:
    """Render a video from an existing PLY file.

    Note: This requires re-predicting from the original image, since Gaussians
    are not stored in standard PLY format. For now, this tool returns an error;
    future versions may support loading Gaussians from PLY.

    Args:
        ply_path: Path to PLY file (from previous prediction).
        trajectory_type: Camera trajectory type.
        num_frames: Number of frames.
        fps: Frames per second.
        output_long_side: Output resolution.

    Returns:
        dict with error message (feature not yet implemented).
    """
    return {
        "error": (
            "Rendering from PLY not yet implemented. "
            "Use sharp_predict with render_video=True."
        ),
        "hint": (
            "PLY files store only point data, not the full Gaussian parameters "
            "needed for rendering."
        ),
    }

@mcp.tool()
def list_outputs() -> dict:
    """List all generated output files (PLY and MP4).

    Returns:
        dict with keys:
        - outputs_dir: Path to outputs directory
        - ply_files: List of PLY file paths
        - video_files: List of MP4 file paths
    """
    outputs_dir = DEFAULT_OUTPUTS_DIR
    ply_files = sorted(outputs_dir.glob("*.ply"))
    video_files = sorted(outputs_dir.glob("*.mp4"))
    return {
        "outputs_dir": str(outputs_dir),
        "ply_files": [str(f) for f in ply_files],
        "video_files": [str(f) for f in video_files],
    }
# -----------------------------------------------------------------------------
# Resources
# -----------------------------------------------------------------------------

@mcp.resource("sharp://info")
def get_info() -> str:
    """Get SHARP server info including GPU status and configuration."""
    cuda_available = torch.cuda.is_available()
    gpu_info = []
    if cuda_available:
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            gpu_info.append({
                "index": i,
                "name": props.name,
                "total_memory_gb": round(props.total_memory / (1024**3), 2),
                "compute_capability": f"{props.major}.{props.minor}",
            })
    info = {
        "model": "SHARP (Apple ml-sharp)",
        "description": "Single-image 3D Gaussian scene prediction",
        "cuda_available": cuda_available,
        "cuda_device_count": torch.cuda.device_count() if cuda_available else 0,
        "gpus": gpu_info,
        "outputs_dir": str(DEFAULT_OUTPUTS_DIR),
        "checkpoint_sources": [
            "SHARP_CHECKPOINT_PATH env var",
            "HuggingFace Hub (apple/Sharp)",
            "Upstream CDN (torch.hub)",
        ],
        "env_vars": {
            "SHARP_CHECKPOINT_PATH": os.getenv("SHARP_CHECKPOINT_PATH", "(not set)"),
            "SHARP_KEEP_MODEL_ON_DEVICE": os.getenv("SHARP_KEEP_MODEL_ON_DEVICE", "1"),
            "CUDA_VISIBLE_DEVICES": os.getenv("CUDA_VISIBLE_DEVICES", "(not set)"),
        },
    }
    return json.dumps(info, indent=2)

@mcp.resource("sharp://help")
def get_help() -> str:
    """Get usage help for the SHARP MCP server."""
    help_text = """
# SHARP MCP Server

## Tools

### sharp_predict
Predict 3D Gaussians from a single image.

Parameters:
- image_path (required): Absolute path to input image
- render_video: Whether to render MP4 (default: true, requires CUDA)
- trajectory_type: swipe | shake | rotate | rotate_forward (default: rotate_forward)
- num_frames: Number of video frames (default: 60)
- fps: Video frame rate (default: 30)
- output_long_side: Output resolution, null = match input

### sharp_render
Render a video from an existing PLY file. Not yet implemented; use sharp_predict
with render_video=true instead.

### list_outputs
List all generated PLY and MP4 files.

## Resources

### sharp://info
Server info, GPU status, configuration.

### sharp://help
This help text.

## Environment Variables
- SHARP_MCP_PORT: MCP server port (default: 49201)
- SHARP_CHECKPOINT_PATH: Local checkpoint path override
- SHARP_KEEP_MODEL_ON_DEVICE: Keep model on GPU (default: 1)
- CUDA_VISIBLE_DEVICES: GPU selection (e.g., "0" or "0,1")
"""
    return help_text.strip()
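
# -----------------------------------------------------------------------------
# Example client usage (illustrative)
# -----------------------------------------------------------------------------
# A minimal sketch of calling this server over stdio with the official ``mcp``
# Python SDK. It is not executed by this module, and the image path below is a
# placeholder.
#
#   import asyncio
#
#   from mcp import ClientSession, StdioServerParameters
#   from mcp.client.stdio import stdio_client
#
#   async def demo() -> None:
#       params = StdioServerParameters(
#           command="uv", args=["run", "python", "mcp_server.py"]
#       )
#       async with stdio_client(params) as (read, write):
#           async with ClientSession(read, write) as session:
#               await session.initialize()
#               result = await session.call_tool(
#                   "sharp_predict",
#                   {"image_path": "/abs/path/to/image.jpg", "render_video": True},
#               )
#               print(result.content)
#
#   asyncio.run(demo())
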
# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------

if __name__ == "__main__":
    # Run over the stdio transport for MCP clients.
    mcp.run()
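
# Example MCP client registration (sketch). The exact configuration schema
# depends on the client; this follows the common "mcpServers" convention used
# by many MCP clients. All paths are placeholders.
#
#   {
#     "mcpServers": {
#       "sharp": {
#         "command": "uv",
#         "args": ["run", "python", "/abs/path/to/ml-sharp/mcp_server.py"],
#         "env": {
#           "CUDA_VISIBLE_DEVICES": "0",
#           "SHARP_CHECKPOINT_PATH": "/abs/path/to/sharp_checkpoint.pt"
#         }
#       }
#     }
#   }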