Justin Wood committed
Commit c401d3e · 0 parents

Initial backend
Files changed (6)
  1. README.md +20 -0
  2. app.py +161 -0
  3. depth.py +52 -0
  4. reconstruction.py +123 -0
  5. requirements.txt +13 -0
  6. segmentation.py +91 -0
README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ title: Jiggle Physics Backend
+ emoji: 🎯
+ colorFrom: purple
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: "4.44.0"
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ REST API for the Jiggle Physics Simulator. The REST endpoints and the Gradio UI share a single FastAPI app, so the `@spaces.GPU` decorator keeps working on ZeroGPU hardware.
+
+ | Endpoint | Method | What it does |
+ |---|---|---|
+ | `/health` | GET | Liveness check |
+ | `/segment` | POST | SAM2 body region masks |
+ | `/depth` | POST | Apple Depth Pro metric depth |
+ | `/reconstruct` | POST | TripoSR → GLB mesh |
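+
+ ## Example client
+
+ A minimal Python sketch of the request/response flow. The `BASE` URL is a
+ placeholder for your Space's hostname, and `photo.jpg` is any test image:
+
+ ```python
+ import base64
+
+ import numpy as np
+ import requests
+
+ BASE = "https://YOUR-SPACE.hf.space"  # placeholder — substitute your Space URL
+
+ # Liveness check
+ assert requests.get(f"{BASE}/health").json()["status"] == "ok"
+
+ # /segment: multipart image upload plus a comma-separated region list.
+ # Masks come back run-length encoded: `rle` holds alternating run lengths,
+ # `rle_start` gives the boolean value of the first run.
+ with open("photo.jpg", "rb") as f:
+     seg = requests.post(
+         f"{BASE}/segment",
+         files={"image": f},
+         data={"regions": "breast_left,breast_right"},
+     ).json()
+
+ # Decode one RLE mask back into a 2-D boolean array
+ region = seg["regions"]["breast_left"]
+ flat = np.zeros(int(np.prod(region["shape"])), dtype=bool)
+ pos, val = 0, region["rle_start"]
+ for run in region["rle"]:
+     flat[pos:pos + run] = val
+     pos, val = pos + run, not val
+ mask = flat.reshape(region["shape"])
+
+ # /depth: decode the base64 float32 buffer into an (H, W) array
+ with open("photo.jpg", "rb") as f:
+     dep = requests.post(f"{BASE}/depth", files={"image": f}).json()
+ depth = np.frombuffer(
+     base64.b64decode(dep["depth_b64"]), dtype=np.float32
+ ).reshape(dep["height"], dep["width"])
+ ```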
app.py ADDED
@@ -0,0 +1,161 @@
+ """
+ Jiggle Physics Simulator — HuggingFace Space backend
+ ZeroGPU pattern: REST routes are registered on a FastAPI app and the Gradio
+ UI is mounted onto it, so `@spaces.GPU` functions run inside the Space.
+ Deploy with sdk: gradio and ZeroGPU hardware selected in Space settings.
+ """
+ import base64
+ import io
+ import json
+ from typing import Optional
+
+ import gradio as gr
+ import spaces
+ import numpy as np
+ from fastapi import FastAPI, File, Form, UploadFile, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import Response
+ from PIL import Image
+
+ # ── Minimal Gradio UI (required for ZeroGPU Spaces) ──────────────────────────
+ with gr.Blocks(title="Jiggle Physics API") as demo:
+     gr.Markdown("## Jiggle Physics ML API\nREST endpoints: `/segment` `/depth` `/reconstruct`")
+
+ # `demo.app` only exists after launch(), so build our own FastAPI app here,
+ # register the REST routes on it, and mount the Gradio UI at the bottom of
+ # this module with gr.mount_gradio_app().
+ app = FastAPI()
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["POST", "GET"],
+     allow_headers=["*"],
+ )
+
+
+ def _load_image(upload: UploadFile) -> Image.Image:
+     data = upload.file.read()
+     img = Image.open(io.BytesIO(data)).convert("RGB")
+     max_dim = 1024
+     if max(img.size) > max_dim:
+         ratio = max_dim / max(img.size)
+         img = img.resize(
+             (int(img.width * ratio), int(img.height * ratio)), Image.LANCZOS
+         )
+     return img
+
+
+ @app.get("/health")
+ def health():
+     return {"status": "ok"}
+
+
+ # Endpoints are plain `def` (not async): FastAPI runs sync handlers in a
+ # worker thread, which is the safer choice under the `spaces.GPU` wrapper.
+ @app.post("/segment")
+ @spaces.GPU
+ def segment(
+     image: UploadFile = File(...),
+     regions: str = Form("breast_left,breast_right,buttocks"),
+     click_points: Optional[str] = Form(None),
+ ):
+     """SAM2 body region segmentation. Returns RLE-encoded masks + bounding boxes."""
+     from segmentation import segment_regions
+
+     img = _load_image(image)
+     region_list = [r.strip() for r in regions.split(",") if r.strip()]
+     clicks = json.loads(click_points) if click_points else None
+
+     try:
+         result = segment_regions(img, region_list, clicks)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+     # Wire format: run-length encoding over the flattened mask. `rle` holds
+     # alternating run lengths; `rle_start` is the value of the first run.
+     encoded = {}
+     for region, data in result.items():
+         mask_arr = np.array(data["mask"], dtype=bool)
+         flat = mask_arr.flatten()
+         # Vectorized RLE: indices where the value flips delimit the runs
+         changes = np.flatnonzero(flat[1:] != flat[:-1]) + 1
+         bounds = np.concatenate(([0], changes, [flat.size]))
+         rle = np.diff(bounds).tolist()
+         encoded[region] = {
+             "rle": rle,
+             "rle_start": bool(flat[0]),
+             "shape": list(mask_arr.shape),
+             "bbox": data["bbox"],
+         }
+
+     return {"regions": encoded, "image_size": [img.width, img.height]}
+
+
+ @app.post("/depth")
+ @spaces.GPU
+ def depth(image: UploadFile = File(...)):
+     """Apple Depth Pro depth estimation. Returns base64-encoded float32 depth map."""
+     from depth import estimate_depth
+
+     img = _load_image(image)
+     try:
+         result = estimate_depth(img)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+     # Ship the raw float32 buffer as base64; clients rebuild it with
+     # np.frombuffer(..., dtype=np.float32).reshape(height, width)
+     arr = np.array(result["depth"], dtype=np.float32)
+     b64 = base64.b64encode(arr.tobytes()).decode("ascii")
+
+     return {
+         "depth_b64": b64,
+         "width": result["width"],
+         "height": result["height"],
+         "min": result["min"],
+         "max": result["max"],
+         "dtype": "float32",
+     }
+
+
+ @app.post("/reconstruct")
+ @spaces.GPU
+ def reconstruct(
+     image: UploadFile = File(...),
+     mask_rle: str = Form(...),
+     mask_shape: str = Form(...),
+     mask_rle_start: str = Form("false"),
+     bbox: str = Form(...),
+     use_triposr: str = Form("true"),
+ ):
+     """TripoSR single-image 3D reconstruction. Returns GLB binary."""
+     from reconstruction import reconstruct_region, depth_to_mesh
+     from depth import estimate_depth
+
+     img = _load_image(image)
+
+     # Decode the RLE produced by /segment back into a 2-D boolean mask
+     rle = json.loads(mask_rle)
+     shape = json.loads(mask_shape)
+     current = mask_rle_start.lower() == "true"
+     flat: list[bool] = []
+     for run in rle:
+         flat.extend([current] * run)
+         current = not current
+     mask = np.array(flat, dtype=bool).reshape(shape)
+
+     bbox_list = json.loads(bbox)
+
+     try:
+         if use_triposr.lower() == "true":
+             glb_bytes = reconstruct_region(img, mask.tolist(), bbox_list)
+         else:
+             depth_result = estimate_depth(img)
+             glb_bytes = depth_to_mesh(depth_result["depth"], mask.tolist(), img)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+     return Response(content=glb_bytes, media_type="model/gltf-binary")
+
+
+ # ── Entry point ───────────────────────────────────────────────────────────────
+ # Mount the Gradio UI onto the API app; the REST routes registered above take
+ # precedence over the mounted Gradio routes.
+ app = gr.mount_gradio_app(app, demo, path="/")
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=7860)
depth.py ADDED
@@ -0,0 +1,52 @@
+ import numpy as np
+ import torch
+ from PIL import Image
+
+
+ _depth_cache = None
+
+
+ def get_depth_model():
+     global _depth_cache
+     if _depth_cache is None:
+         from transformers import pipeline as hf_pipeline
+         # Apple's Depth Pro — metric depth from a single image
+         _depth_cache = hf_pipeline(
+             "depth-estimation",
+             model="apple/DepthPro-hf",
+             device=0 if torch.cuda.is_available() else -1,
+         )
+     return _depth_cache
+
+
+ def estimate_depth(image: Image.Image) -> dict:
+     """
+     Returns {"depth": [[float]], "width": int, "height": int, "min": float, "max": float}
+     Depth values are metric (meters) when Depth Pro is used.
+     """
+     pipe = get_depth_model()
+     result = pipe(image)
+
+     # Prefer the raw tensor: the pipeline's "depth" entry is a PIL image
+     # rescaled for visualization, which would destroy the metric values.
+     pred = result.get("predicted_depth")
+     if pred is not None:
+         arr = pred.squeeze().detach().cpu().numpy().astype(np.float32)
+     else:
+         arr = np.array(result["depth"], dtype=np.float32)
+
+     # Resize to match source image if needed (mode "F" keeps float32)
+     if arr.shape[:2] != (image.height, image.width):
+         depth_pil = Image.fromarray(arr).resize(
+             (image.width, image.height), Image.BILINEAR
+         )
+         arr = np.array(depth_pil)
+
+     dmin = float(arr.min())
+     dmax = float(arr.max())
+
+     return {
+         "depth": arr.tolist(),
+         "width": image.width,
+         "height": image.height,
+         "min": dmin,
+         "max": dmax,
+     }
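+
+
+ if __name__ == "__main__":
+     # Local smoke test — a sketch assuming a sample image at ./test.jpg.
+     # The first run downloads the Depth Pro checkpoint from the Hub.
+     img = Image.open("test.jpg").convert("RGB")
+     out = estimate_depth(img)
+     print(f"{out['width']}x{out['height']} depth in [{out['min']:.2f}, {out['max']:.2f}] m")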
reconstruction.py ADDED
@@ -0,0 +1,123 @@
+ import io
+
+ import numpy as np
+ import torch
+ import trimesh
+ from PIL import Image
+
+
+ _triposr_cache = None
+
+
+ def get_triposr():
+     # TripoSR is not shipped in `transformers`. This uses the `tsr` package
+     # from the open-source TripoSR repository
+     # (github.com/VAST-AI-Research/TripoSR), vendored into the Space.
+     global _triposr_cache
+     if _triposr_cache is None:
+         from tsr.system import TSR
+         model = TSR.from_pretrained(
+             "stabilityai/TripoSR",
+             config_name="config.yaml",
+             weight_name="model.ckpt",
+         )
+         model.eval()
+         _triposr_cache = model
+     return _triposr_cache
+
+
+ def reconstruct_region(image: Image.Image, mask: list[list[bool]], bbox: list[int]) -> bytes:
+     """
+     Crop the masked region from the image, run TripoSR, return GLB bytes.
+     """
+     model = get_triposr()
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = model.to(device)
+
+     # Crop to bounding box with 20% padding
+     x, y, w, h = bbox
+     pad_x = int(w * 0.20)
+     pad_y = int(h * 0.20)
+     W, H = image.size
+     x0 = max(0, x - pad_x)
+     y0 = max(0, y - pad_y)
+     x1 = min(W, x + w + pad_x)
+     y1 = min(H, y + h + pad_y)
+     cropped = image.crop((x0, y0, x1, y1)).resize((512, 512), Image.LANCZOS)
+
+     # Apply the mask so TripoSR focuses on the region: composite masked-out
+     # pixels onto neutral gray, mirroring TripoSR's own preprocessing
+     mask_arr = np.array(mask, dtype=np.uint8)[y0:y1, x0:x1]
+     alpha = np.array(
+         Image.fromarray(mask_arr * 255).resize((512, 512), Image.NEAREST)
+     ).astype(np.float32)[:, :, None] / 255.0
+     rgb = np.array(cropped, dtype=np.float32) / 255.0
+     composited = rgb * alpha + 0.5 * (1.0 - alpha)
+     input_img = Image.fromarray((composited * 255).astype(np.uint8))
+
+     with torch.no_grad():
+         scene_codes = model([input_img], device=device)
+         # Signature per the current TripoSR repo: (scene_codes, has_vertex_color,
+         # resolution); returns a list of trimesh meshes
+         mesh = model.extract_mesh(scene_codes, True, resolution=256)[0]
+
+     # trimesh returns raw bytes when exporting GLB
+     glb_bytes = mesh.export(file_type="glb")
+     return glb_bytes
+
+
+ def depth_to_mesh(depth: list[list[float]], mask: list[list[bool]], image: Image.Image) -> bytes:
72
+ """
73
+ Fallback when TripoSR isn't available: lift depth map into a 3D mesh
74
+ constrained to the masked region, textured with the source image.
75
+ """
76
+ depth_arr = np.array(depth, dtype=np.float32)
77
+ mask_arr = np.array(mask, dtype=bool)
78
+ H, W = depth_arr.shape
79
+
80
+ # Normalize depth to [0, 1] then scale to reasonable Z range
81
+ dmin, dmax = depth_arr.min(), depth_arr.max()
82
+ if dmax > dmin:
83
+ depth_norm = (depth_arr - dmin) / (dmax - dmin)
84
+ else:
85
+ depth_norm = np.zeros_like(depth_arr)
86
+ depth_scaled = depth_norm * 0.5 # 0.5 units of Z range
87
+
88
+ # Build vertex grid only for masked pixels
89
+ ys, xs = np.where(mask_arr)
90
+ if len(xs) == 0:
91
+ # Empty mask — return a flat quad
92
+ verts = np.array([[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0]], dtype=np.float32)
93
+ faces = np.array([[0, 1, 2], [0, 2, 3]])
94
+ mesh = trimesh.Trimesh(vertices=verts, faces=faces)
95
+ buf = io.BytesIO()
96
+ mesh.export(buf, file_type="glb")
97
+ return buf.getvalue()
98
+
99
+ # Normalize to [-0.5, 0.5] XY space
100
+ x_norm = (xs / W) - 0.5
101
+ y_norm = 0.5 - (ys / H)
102
+ z_vals = depth_scaled[ys, xs]
103
+ vertices = np.stack([x_norm, y_norm, z_vals], axis=1).astype(np.float32)
104
+
105
+ # UV = source pixel position
106
+ uvs = np.stack([xs / W, 1.0 - ys / H], axis=1).astype(np.float32)
107
+
108
+ # Triangulate the masked grid using Delaunay
109
+ from scipy.spatial import Delaunay
110
+ points_2d = np.stack([x_norm, y_norm], axis=1)
111
+ tri = Delaunay(points_2d)
112
+ faces = tri.simplices.astype(np.int32)
113
+
114
+ # Build mesh with texture
115
+ img_arr = np.array(image.convert("RGB"))
116
+ texture = trimesh.visual.texture.TextureVisuals(
117
+ uv=uvs,
118
+ image=Image.fromarray(img_arr),
119
+ )
120
+ mesh = trimesh.Trimesh(vertices=vertices, faces=faces, visual=texture, process=False)
121
+ buf = io.BytesIO()
122
+ mesh.export(buf, file_type="glb")
123
+ return buf.getvalue()
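+
+
+ if __name__ == "__main__":
+     # Synthetic smoke test for the depth fallback — no model downloads needed.
+     # Builds a radial "bump" depth map with a circular mask and exports a GLB.
+     H, W = 64, 64
+     yy, xx = np.mgrid[0:H, 0:W]
+     r = np.hypot(yy - H / 2, xx - W / 2)
+     depth_map = (r / r.max()).tolist()
+     circle = (r < H / 3).tolist()
+     tex = Image.new("RGB", (W, H), (210, 170, 160))
+     glb = depth_to_mesh(depth_map, circle, tex)
+     print(f"exported {len(glb)} bytes of GLB")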
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ gradio>=4.44.0
+ fastapi
+ uvicorn
+ python-multipart
+ torch
+ torchvision
+ transformers
+ Pillow
+ numpy
+ scipy
+ trimesh
+ pygltflib
+ opencv-python-headless
+ spaces
+ # TripoSR itself is not on PyPI; vendor the `tsr` package from
+ # https://github.com/VAST-AI-Research/TripoSR into the Space.
segmentation.py ADDED
@@ -0,0 +1,91 @@
+ import numpy as np
+ from PIL import Image
+ import torch
+
+
+ def load_sam2():
+     from transformers import Sam2Model, Sam2Processor
+     processor = Sam2Processor.from_pretrained("facebook/sam2-hiera-large")
+     model = Sam2Model.from_pretrained("facebook/sam2-hiera-large")
+     model.eval()
+     return model, processor
+
+
+ _sam2_cache = None
+
+
+ def get_sam2():
+     global _sam2_cache
+     if _sam2_cache is None:
+         _sam2_cache = load_sam2()
+     return _sam2_cache
+
+
+ # Region labels understood by the frontend
+ REGION_LABELS = ["breast_left", "breast_right", "buttocks", "ponytail", "hair"]
+
+ # Approximate point prompts on a normalized [0,1] image for each region,
+ # (x, y) from top-left. Used when the user hasn't provided click points.
+ DEFAULT_PROMPTS = {
+     "breast_left": (0.38, 0.38),
+     "breast_right": (0.62, 0.38),
+     "buttocks": (0.50, 0.72),
+     "ponytail": (0.50, 0.05),
+     "hair": (0.50, 0.08),
+ }
+
+
+ def segment_regions(image: Image.Image, requested: list[str], click_points: dict | None = None) -> dict:
39
+ """
40
+ Returns a dict of {region_label: {"mask": [[bool]], "bbox": [x,y,w,h]}}
41
+ """
42
+ model, processor = get_sam2()
43
+ device = "cuda" if torch.cuda.is_available() else "cpu"
44
+ model = model.to(device)
45
+
46
+ W, H = image.size
47
+ results = {}
48
+
49
+ for region in requested:
50
+ if region not in DEFAULT_PROMPTS:
51
+ continue
52
+
53
+ # Use user-supplied click or fall back to default
54
+ if click_points and region in click_points:
55
+ px, py = click_points[region]
56
+ else:
57
+ nx, ny = DEFAULT_PROMPTS[region]
58
+ px, py = nx * W, ny * H
59
+
60
+ inputs = processor(
61
+ images=image,
62
+ input_points=[[[px, py]]],
63
+ return_tensors="pt",
64
+ ).to(device)
65
+
66
+ with torch.no_grad():
67
+ outputs = model(**inputs)
68
+
69
+ masks = processor.post_process_masks(
70
+ outputs.pred_masks.cpu(),
71
+ inputs["original_sizes"].cpu(),
72
+ inputs["reshaped_input_sizes"].cpu(),
73
+ )[0] # shape: [1, num_masks, H, W]
74
+
75
+ # Pick highest-score mask
76
+ scores = outputs.iou_scores[0].cpu().numpy()
77
+ best = int(np.argmax(scores))
78
+ mask = masks[0, best].numpy().astype(bool) # [H, W]
79
+
80
+ # Compute bounding box
81
+ rows = np.any(mask, axis=1)
82
+ cols = np.any(mask, axis=0)
83
+ rmin, rmax = np.where(rows)[0][[0, -1]]
84
+ cmin, cmax = np.where(cols)[0][[0, -1]]
85
+
86
+ results[region] = {
87
+ "mask": mask.tolist(),
88
+ "bbox": [int(cmin), int(rmin), int(cmax - cmin), int(rmax - rmin)],
89
+ }
90
+
91
+ return results
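+
+
+ if __name__ == "__main__":
+     # Smoke test — a sketch assuming a sample image at ./test.jpg;
+     # the first run downloads the SAM2 checkpoint from the Hub.
+     img = Image.open("test.jpg").convert("RGB")
+     out = segment_regions(img, ["buttocks"])
+     for name, data in out.items():
+         print(name, "bbox:", data["bbox"])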