Spaces:

Daankular
/

Image2Model

Running on Zero

App Files Files Community

Daankular commited on 6 days ago

Commit

14c3d13

0 Parent(s):

Initial local files

Browse files

Files changed (11) hide show

Dockerfile +35 -0
README.md +27 -0
app.py +728 -0
packages.txt +10 -0
pipeline/__init__.py +0 -0
pipeline/enhance_surface.py +234 -0
pipeline/face_enhance.py +361 -0
pipeline/rig_stage.py +1282 -0
pipeline/rig_yolo.py +679 -0
pipeline/tpose_smpl.py +413 -0
requirements.txt +74 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,35 @@

+FROM python:3.10-slim
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git wget curl build-essential cmake ninja-build pkg-config \
+    libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 libxrender-dev ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+# HF user setup
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Upgrade pip first
+RUN pip install --user --upgrade pip setuptools wheel
+# chumpy must be installed with --no-build-isolation BEFORE everything else
+# (its setup.py does `import pip` which fails in pip's default isolated build env)
+RUN pip install --user --no-build-isolation \
+    "chumpy @ git+https://github.com/mattloper/chumpy.git@580566eafc9ac68b2614b64d6f7aaa84eebb70da"
+# Copy app files
+COPY --chown=user . $HOME/app
+# Install remaining requirements (chumpy already satisfied above)
+RUN pip install --user --no-cache-dir -r requirements.txt \
+    "torch<=2.9.1" \
+    "gradio[oauth,mcp]==6.11.0" \
+    "uvicorn>=0.14.0" \
+    "websockets>=10.4" \
+    "spaces==0.48.1"
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md ADDED Viewed

	@@ -0,0 +1,27 @@

+---
+title: Image2Model
+emoji: 🎭
+colorFrom: purple
+colorTo: blue
+sdk: docker
+app_port: 7860
+pinned: false
+license: apache-2.0
+hardware: zero-a10g
+---
+# Image2Model
+Portrait-to-mesh pipeline on HuggingFace ZeroGPU.
+Upload a photo → rigged, textured, animation-ready GLB in minutes.
+**Pipeline stages**
+1. Background removal — RMBG-2.0
+2. 3D shape generation — TripoSG (diffusion SDF)
+3. Multiview texturing — MV-Adapter + SDXL
+4. Face enhancement — HyperSwap 1A 256 + RealESRGAN x4plus
+5. Rigging — YOLO-pose → 3D joints → LBS weights
+6. SKEL anatomy layer — anatomical bone mesh
+7. MDM animation — text-to-motion
+8. Surface enhancement — StableNormal normal maps + Depth-Anything displacement

app.py ADDED Viewed

	@@ -0,0 +1,728 @@

+import sys
+import os
+import tempfile
+import shutil
+import traceback
+import json
+import random
+from pathlib import Path
+import cv2
+import gradio as gr
+import spaces
+import torch
+import numpy as np
+from PIL import Image
+# ── Paths ─────────────────────────────────────────────────────────────────────
+HERE        = Path(__file__).parent
+PIPELINE_DIR = HERE / "pipeline"
+CKPT_DIR     = Path(os.environ.get("CKPT_DIR", "/tmp/checkpoints"))
+CKPT_DIR.mkdir(parents=True, exist_ok=True)
+# Add pipeline dir so local overrides (patched files) take priority
+sys.path.insert(0, str(HERE))
+sys.path.insert(0, str(PIPELINE_DIR))
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Lazy-loaded models (persist between ZeroGPU calls when Space is warm)
+_triposg_pipe  = None
+_rmbg_net      = None
+_rmbg_version  = None
+_last_glb_path = None
+_init_seed     = random.randint(0, 2**31 - 1)
+ARCFACE_256 = (np.array([[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
+                          [41.5493, 92.3655], [70.7299, 92.2041]], dtype=np.float32)
+               * (256 / 112) + (256 - 112 * (256 / 112)) / 2)
+VIEW_NAMES = ["front", "3q_front", "side", "back", "3q_back"]
+VIEW_PATHS = [f"/tmp/render_{n}.png" for n in VIEW_NAMES]
+# ── Weight download helpers ────────────────────────────────────────────────────
+def _ensure_weight(url: str, dest: Path) -> Path:
+    """Download a file if not already cached."""
+    if not dest.exists():
+        import urllib.request
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        print(f"[weights] Downloading {dest.name} ...")
+        urllib.request.urlretrieve(url, dest)
+        print(f"[weights] Saved → {dest}")
+    return dest
+def _ensure_ckpts():
+    """Download all face-enhancement checkpoints to CKPT_DIR."""
+    weights = {
+        "hyperswap_1a_256.onnx": "https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/hyperswap_1a_256.onnx",
+        "inswapper_128.onnx":    "https://huggingface.co/ezioruan/inswapper_128.onnx/resolve/main/inswapper_128.onnx",
+        "RealESRGAN_x4plus.pth": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x4plus.pth",
+        "GFPGANv1.4.pth":        "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth",
+    }
+    for name, url in weights.items():
+        _ensure_weight(url, CKPT_DIR / name)
+# ── Model loaders ─────────────────────────────────────────────────────────────
+def load_triposg():
+    global _triposg_pipe, _rmbg_net, _rmbg_version
+    if _triposg_pipe is not None:
+        _triposg_pipe.to(DEVICE)
+        if _rmbg_net is not None:
+            _rmbg_net.to(DEVICE)
+        return _triposg_pipe, _rmbg_net
+    print("[load_triposg] Loading TripoSG pipeline...")
+    from huggingface_hub import snapshot_download
+    weights_path = snapshot_download("VAST-AI/TripoSG")
+    # TripoSG ships its own pipeline — add to path
+    triposg_pkg = Path(weights_path)
+    if (triposg_pkg / "triposg").exists():
+        sys.path.insert(0, str(triposg_pkg))
+    else:
+        # Try installed package from the cloned repo (if installed with pip -e)
+        import importlib.util
+        if importlib.util.find_spec("triposg") is None:
+            import subprocess
+            subprocess.run([sys.executable, "-m", "pip", "install", "-e", str(triposg_pkg), "-q"], check=False)
+    from triposg.pipelines.pipeline_triposg import TripoSGPipeline
+    _triposg_pipe = TripoSGPipeline.from_pretrained(
+        weights_path, torch_dtype=torch.float16
+    ).to(DEVICE)
+    try:
+        from transformers import AutoModelForImageSegmentation
+        _rmbg_net = AutoModelForImageSegmentation.from_pretrained(
+            "1038lab/RMBG-2.0", trust_remote_code=True, low_cpu_mem_usage=False
+        ).to(DEVICE)
+        _rmbg_net.eval()
+        _rmbg_version = "2.0"
+        print("[load_triposg] TripoSG + RMBG-2.0 loaded.")
+    except Exception as e:
+        print(f"[load_triposg] RMBG-2.0 failed ({e}). BG removal disabled.")
+        _rmbg_net = None
+    return _triposg_pipe, _rmbg_net
+# ── Background removal helper ─────────────────────────────────────────────────
+def _remove_bg_rmbg(img_pil, threshold=0.5, erode_px=2):
+    if _rmbg_net is None:
+        return img_pil
+    import torchvision.transforms.functional as TF
+    from torchvision import transforms
+    img_tensor = transforms.ToTensor()(img_pil.resize((1024, 1024)))
+    if _rmbg_version == "2.0":
+        img_tensor = TF.normalize(img_tensor, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]).unsqueeze(0)
+    else:
+        img_tensor = TF.normalize(img_tensor, [0.5, 0.5, 0.5], [1.0, 1.0, 1.0]).unsqueeze(0)
+    with torch.no_grad():
+        result = _rmbg_net(img_tensor)
+    if isinstance(result, (list, tuple)):
+        candidate = result[-1] if _rmbg_version == "2.0" else result[0]
+        if isinstance(candidate, (list, tuple)):
+            candidate = candidate[0]
+    else:
+        candidate = result
+    mask_tensor = candidate.sigmoid()[0, 0].cpu()
+    mask = np.array(transforms.ToPILImage()(mask_tensor).resize(img_pil.size, Image.BILINEAR),
+                    dtype=np.float32) / 255.0
+    mask = (mask >= threshold).astype(np.float32) * mask
+    if erode_px > 0:
+        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (erode_px * 2 + 1,) * 2)
+        mask = cv2.erode((mask * 255).astype(np.uint8), kernel).astype(np.float32) / 255.0
+    rgb   = np.array(img_pil.convert("RGB"), dtype=np.float32) / 255.0
+    alpha = mask[:, :, np.newaxis]
+    comp  = (rgb * alpha + 0.5 * (1.0 - alpha) * 255).clip(0, 255).astype(np.uint8)
+    return Image.fromarray(comp)
+def preview_rembg(input_image, do_remove_bg, threshold, erode_px):
+    if input_image is None or not do_remove_bg or _rmbg_net is None:
+        return input_image
+    try:
+        return np.array(_remove_bg_rmbg(Image.fromarray(input_image).convert("RGB"),
+                                         threshold=float(threshold), erode_px=int(erode_px)))
+    except Exception:
+        return input_image
+# ── Stage 1: Shape generation ─────────────────────────────────────────────────
+@spaces.GPU(duration=180)
+def generate_shape(input_image, remove_background, num_steps, guidance_scale,
+                   seed, face_count, progress=gr.Progress()):
+    if input_image is None:
+        return None, "Please upload an image."
+    try:
+        progress(0.1, desc="Loading TripoSG...")
+        # Add TripoSG scripts to path after model download
+        from huggingface_hub import snapshot_download
+        weights_path = snapshot_download("VAST-AI/TripoSG")
+        sys.path.insert(0, weights_path)
+        pipe, rmbg_net = load_triposg()
+        img = Image.fromarray(input_image).convert("RGB")
+        img_path = "/tmp/triposg_input.png"
+        img.save(img_path)
+        progress(0.5, desc="Generating shape (SDF diffusion)...")
+        from scripts.inference_triposg import run_triposg
+        mesh = run_triposg(
+            pipe=pipe,
+            image_input=img_path,
+            rmbg_net=rmbg_net if remove_background else None,
+            seed=int(seed),
+            num_inference_steps=int(num_steps),
+            guidance_scale=float(guidance_scale),
+            faces=int(face_count) if int(face_count) > 0 else -1,
+        )
+        out_path = "/tmp/triposg_shape.glb"
+        mesh.export(out_path)
+        # Offload to CPU before next stage
+        _triposg_pipe.to("cpu")
+        if _rmbg_net is not None:
+            _rmbg_net.to("cpu")
+        torch.cuda.empty_cache()
+        return out_path, "Shape generated!"
+    except Exception:
+        return None, f"Error:\n{traceback.format_exc()}"
+# ── Stage 2: Texture ──────────────────────────────────────────────────────────
+@spaces.GPU(duration=300)
+def apply_texture(glb_path, input_image, remove_background, variant, tex_seed,
+                  enhance_face, rembg_threshold=0.5, rembg_erode=2,
+                  progress=gr.Progress()):
+    if glb_path is None:
+        glb_path = "/tmp/triposg_shape.glb"
+    if not os.path.exists(glb_path):
+        return None, None, "Generate a shape first."
+    if input_image is None:
+        return None, None, "Please upload an image."
+    try:
+        progress(0.1, desc="Preprocessing image...")
+        img = Image.fromarray(input_image).convert("RGB")
+        face_ref_path = "/tmp/triposg_face_ref.png"
+        img.save(face_ref_path)
+        if remove_background and _rmbg_net is not None:
+            img = _remove_bg_rmbg(img, threshold=float(rembg_threshold), erode_px=int(rembg_erode))
+        img = img.resize((768, 768), Image.LANCZOS)
+        img_path = "/tmp/tex_input_768.png"
+        img.save(img_path)
+        out_dir = "/tmp/tex_out"
+        os.makedirs(out_dir, exist_ok=True)
+        # ── Run MV-Adapter in-process ─────────────────────────────────────
+        progress(0.3, desc="Loading MV-Adapter pipeline...")
+        import importlib
+        from huggingface_hub import snapshot_download
+        mvadapter_weights = snapshot_download("huanngzh/mv-adapter")
+        # Resolve SD pipeline
+        if variant == "sdxl":
+            from diffusers import StableDiffusionXLPipeline
+            sd_id = "stabilityai/stable-diffusion-xl-base-1.0"
+        else:
+            from diffusers import StableDiffusionPipeline
+            sd_id = "stabilityai/stable-diffusion-2-1-base"
+        from mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline
+        from mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler
+        from mvadapter.utils import get_orthogonal_camera, get_ipadapter_image
+        import torchvision.transforms.functional as TF
+        progress(0.4, desc=f"Running MV-Adapter ({variant})...")
+        pipe = MVAdapterI2MVSDXLPipeline.from_pretrained(
+            sd_id,
+            torch_dtype=torch.float16,
+        ).to(DEVICE)
+        pipe.init_adapter(
+            image_encoder_path="openai/clip-vit-large-patch14",
+            ipa_weight_path=os.path.join(mvadapter_weights, "mvadapter_i2mv_sdxl.safetensors"),
+            adapter_tokens=256,
+        )
+        ref_pil = Image.open(img_path).convert("RGB")
+        cameras = get_orthogonal_camera(
+            elevation_deg=[0, 0, 0, 0, 0, 0],
+            distance=[1.8] * 6,
+            left=-0.55, right=0.55, bottom=-0.55, top=0.55,
+            azimuth_deg=[x - 90 for x in [0, 45, 90, 135, 180, 270]],
+            device=DEVICE,
+        )
+        with torch.autocast(DEVICE):
+            out = pipe(
+                image=ref_pil,
+                height=768, width=768,
+                num_images_per_prompt=6,
+                guidance_scale=3.0,
+                num_inference_steps=30,
+                generator=torch.Generator(device=DEVICE).manual_seed(int(tex_seed)),
+                cameras=cameras,
+            )
+        mv_grid = out.images  # list of 6 PIL images
+        grid_w  = mv_grid[0].width * len(mv_grid)
+        mv_pil  = Image.new("RGB", (grid_w, mv_grid[0].height))
+        for i, v in enumerate(mv_grid):
+            mv_pil.paste(v, (i * mv_grid[0].width, 0))
+        mv_path = os.path.join(out_dir, "multiview.png")
+        mv_pil.save(mv_path)
+        # Offload before face-enhance (saves VRAM)
+        del pipe
+        torch.cuda.empty_cache()
+        # ── Face enhancement ─────────────────────────────────────────────
+        if enhance_face:
+            progress(0.75, desc="Running face enhancement...")
+            _ensure_ckpts()
+            try:
+                from pipeline.face_enhance import enhance_multiview
+                enh_path = os.path.join(out_dir, "multiview_enhanced.png")
+                enhance_multiview(
+                    multiview_path=mv_path,
+                    reference_path=face_ref_path,
+                    output_path=enh_path,
+                    ckpt_dir=str(CKPT_DIR),
+                )
+                mv_path = enh_path
+            except Exception as _fe:
+                print(f"[apply_texture] face enhance failed: {_fe}")
+        # ── Bake textures onto mesh ─────────────────────────────────────
+        progress(0.85, desc="Baking UV texture onto mesh...")
+        from mvadapter.utils.mesh_utils import (
+            NVDiffRastContextWrapper, load_mesh, bake_texture,
+        )
+        ctx  = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda")
+        mesh = load_mesh(glb_path, rescale=True, device=DEVICE)
+        tex_pil = Image.open(mv_path)
+        baked = bake_texture(ctx, mesh, tex_pil, cameras=cameras, height=1024, width=1024)
+        out_glb = os.path.join(out_dir, "textured_shaded.glb")
+        baked.export(out_glb)
+        final_path = "/tmp/triposg_textured.glb"
+        shutil.copy(out_glb, final_path)
+        global _last_glb_path
+        _last_glb_path = final_path
+        torch.cuda.empty_cache()
+        return final_path, mv_path, "Texture applied!"
+    except Exception:
+        return None, None, f"Error:\n{traceback.format_exc()}"
+# ── Stage 3a: SKEL Anatomy ────────────────────────────────────────────────────
+@spaces.GPU(duration=90)
+def gradio_tpose(glb_state_path, export_skel_flag, progress=gr.Progress()):
+    try:
+        glb = glb_state_path or _last_glb_path or "/tmp/triposg_textured.glb"
+        if not os.path.exists(glb):
+            return None, None, "No GLB found — run Generate + Texture first."
+        progress(0.1, desc="YOLO pose detection + rigging...")
+        from pipeline.rig_yolo import rig_yolo
+        out_dir = "/tmp/rig_out"
+        os.makedirs(out_dir, exist_ok=True)
+        rigged, _rigged_skel = rig_yolo(glb, os.path.join(out_dir, "anatomy_rigged.glb"), debug_dir=None)
+        bones = None
+        if export_skel_flag:
+            progress(0.7, desc="Generating SKEL bone mesh...")
+            from pipeline.tpose_smpl import export_skel_bones
+            bones = export_skel_bones(torch.zeros(10), "/tmp/tposed_bones.glb", gender="male")
+        status = f"Rigged surface: {os.path.getsize(rigged)//1024} KB"
+        if bones:
+            status += f"\nSKEL bone mesh: {os.path.getsize(bones)//1024} KB"
+        elif export_skel_flag:
+            status += "\nSKEL bone mesh: failed (check logs)"
+        torch.cuda.empty_cache()
+        return rigged, bones, status
+    except Exception:
+        return None, None, f"Error:\n{traceback.format_exc()}"
+# ── Stage 3b: Rig & Export ────────────────────────────────────────────────────
+@spaces.GPU(duration=180)
+def gradio_rig(glb_state_path, export_fbx_flag, mdm_prompt, mdm_n_frames,
+               progress=gr.Progress()):
+    try:
+        from pipeline.rig_yolo import rig_yolo
+        from pipeline.rig_stage import export_fbx
+        glb = glb_state_path or _last_glb_path or "/tmp/triposg_textured.glb"
+        if not os.path.exists(glb):
+            return None, None, None, "No GLB found — run Generate + Texture first.", None, None, None
+        out_dir = "/tmp/rig_out"
+        os.makedirs(out_dir, exist_ok=True)
+        progress(0.1, desc="YOLO pose detection + rigging...")
+        rigged, rigged_skel = rig_yolo(glb, os.path.join(out_dir, "rigged.glb"),
+                                        debug_dir=os.path.join(out_dir, "debug"))
+        fbx = None
+        if export_fbx_flag:
+            progress(0.7, desc="Exporting FBX...")
+            fbx_path = os.path.join(out_dir, "rigged.fbx")
+            fbx = fbx_path if export_fbx(rigged, fbx_path) else None
+        animated = None
+        if mdm_prompt.strip():
+            progress(0.75, desc="Generating MDM animation...")
+            from pipeline.rig_stage import run_rig_pipeline
+            mdm_result = run_rig_pipeline(
+                glb_path=glb,
+                reference_image_path="/tmp/triposg_face_ref.png",
+                out_dir=out_dir,
+                device=DEVICE,
+                export_fbx_flag=False,
+                mdm_prompt=mdm_prompt.strip(),
+                mdm_n_frames=int(mdm_n_frames),
+            )
+            animated = mdm_result.get("animated_glb")
+        parts = ["Rigged: " + os.path.basename(rigged)]
+        if fbx:     parts.append("FBX: " + os.path.basename(fbx))
+        if animated: parts.append("Animation: " + os.path.basename(animated))
+        torch.cuda.empty_cache()
+        return rigged, animated, fbx, "  |  ".join(parts), rigged, rigged, rigged_skel
+    except Exception:
+        return None, None, None, f"Error:\n{traceback.format_exc()}", None, None, None
+# ── Stage 4: Surface enhancement ─────────────────────────────────────────────
+@spaces.GPU(duration=120)
+def gradio_enhance(glb_path, ref_img_np, do_normal, norm_res, norm_strength,
+                   do_depth, dep_res, disp_scale):
+    if not glb_path:
+        yield None, None, None, None, "No GLB loaded — run Generate first."
+        return
+    if ref_img_np is None:
+        yield None, None, None, None, "No reference image — run Generate first."
+        return
+    try:
+        from pipeline.enhance_surface import (
+            run_stable_normal, run_depth_anything,
+            bake_normal_into_glb, bake_depth_as_occlusion,
+        )
+        import pipeline.enhance_surface as _enh_mod
+        ref_pil  = Image.fromarray(ref_img_np.astype(np.uint8))
+        out_path = glb_path.replace(".glb", "_enhanced.glb")
+        shutil.copy2(glb_path, out_path)
+        normal_out = depth_out = None
+        log = []
+        if do_normal:
+            log.append("[StableNormal] Running...")
+            yield None, None, None, None, "\n".join(log)
+            normal_out = run_stable_normal(ref_pil, resolution=norm_res)
+            out_path = bake_normal_into_glb(out_path, normal_out, out_path,
+                                             normal_strength=norm_strength)
+            log.append(f"[StableNormal] Done → normalTexture (strength {norm_strength})")
+            yield normal_out, depth_out, None, None, "\n".join(log)
+        if do_depth:
+            log.append("[Depth-Anything] Running...")
+            yield normal_out, depth_out, None, None, "\n".join(log)
+            depth_out = run_depth_anything(ref_pil, resolution=dep_res)
+            out_path  = bake_depth_as_occlusion(out_path, depth_out, out_path,
+                                                 displacement_scale=disp_scale)
+            log.append(f"[Depth-Anything] Done → occlusionTexture (scale {disp_scale})")
+            yield normal_out, depth_out.convert("L").convert("RGB"), None, None, "\n".join(log)
+        torch.cuda.empty_cache()
+        log.append("Enhancement complete.")
+        yield normal_out, (depth_out.convert("L").convert("RGB") if depth_out else None), out_path, out_path, "\n".join(log)
+    except Exception:
+        yield None, None, None, None, f"Error:\n{traceback.format_exc()}"
+# ── Render views ──────────────────────────────────────────────────────────────
+@spaces.GPU(duration=60)
+def render_views(glb_file):
+    if not glb_file:
+        return []
+    glb_path = glb_file if isinstance(glb_file, str) else (glb_file.get("path") if isinstance(glb_file, dict) else str(glb_file))
+    if not glb_path or not os.path.exists(glb_path):
+        return []
+    try:
+        from mvadapter.utils.mesh_utils import (
+            NVDiffRastContextWrapper, load_mesh, render, get_orthogonal_camera,
+        )
+        ctx  = NVDiffRastContextWrapper(device="cuda", context_type="cuda")
+        mesh = load_mesh(glb_path, rescale=True, device="cuda")
+        cams = get_orthogonal_camera(
+            elevation_deg=[0]*5, distance=[1.8]*5,
+            left=-0.55, right=0.55, bottom=-0.55, top=0.55,
+            azimuth_deg=[x - 90 for x in [0, 45, 90, 180, 315]],
+            device="cuda",
+        )
+        out = render(ctx, mesh, cams, height=1024, width=768, render_attr=True, normal_background=0.0)
+        save_dir = os.path.dirname(glb_path)
+        results  = []
+        for i, name in enumerate(VIEW_NAMES):
+            arr  = (out.attr[i].cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+            path = os.path.join(save_dir, f"render_{name}.png")
+            Image.fromarray(arr).save(path)
+            results.append((path, name))
+        torch.cuda.empty_cache()
+        return results
+    except Exception:
+        print(f"render_views FAILED:\n{traceback.format_exc()}")
+        return []
+# ── Full pipeline ─────────────────────────────────────────────────────────────
+def run_full_pipeline(input_image, remove_background, num_steps, guidance, seed, face_count,
+                      variant, tex_seed, enhance_face, rembg_threshold, rembg_erode,
+                      export_fbx, mdm_prompt, mdm_n_frames, progress=gr.Progress()):
+    progress(0.0, desc="Stage 1/3: Generating shape...")
+    glb, status = generate_shape(input_image, remove_background, num_steps, guidance, seed, face_count)
+    if not glb:
+        return None, None, None, None, None, None, status
+    progress(0.33, desc="Stage 2/3: Applying texture...")
+    glb, mv_img, status = apply_texture(glb, input_image, remove_background, variant, tex_seed,
+                                         enhance_face, rembg_threshold, rembg_erode)
+    if not glb:
+        return None, None, None, None, None, None, status
+    progress(0.66, desc="Stage 3/3: Rigging + animation...")
+    rigged, animated, fbx, rig_status, _, _, _ = gradio_rig(glb, export_fbx, mdm_prompt, mdm_n_frames)
+    progress(1.0, desc="Pipeline complete!")
+    return glb, glb, mv_img, rigged, animated, fbx, f"[Texture] {status}\n[Rig] {rig_status}"
+# ── UI ────────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="Image2Model", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Image2Model — Portrait to Rigged 3D Mesh")
+    glb_state = gr.State(None)
+    with gr.Tabs():
+        # ════════════════════════════════════════════════════════════════════
+        with gr.Tab("Generate"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    input_image    = gr.Image(label="Input Image", type="numpy")
+                    remove_bg_check = gr.Checkbox(label="Remove Background", value=True)
+                    with gr.Row():
+                        rembg_threshold = gr.Slider(0.1, 0.95, value=0.5, step=0.05,
+                                                    label="BG Threshold")
+                        rembg_erode     = gr.Slider(0, 8, value=2, step=1,
+                                                    label="Edge Erode (px)")
+                    with gr.Accordion("Shape Settings", open=True):
+                        num_steps  = gr.Slider(20, 100, value=50, step=5,  label="Inference Steps")
+                        guidance   = gr.Slider(1.0, 20.0, value=7.0, step=0.5, label="Guidance Scale")
+                        seed       = gr.Number(value=_init_seed, label="Seed", precision=0)
+                        face_count = gr.Number(value=0, label="Max Faces (0 = unlimited)", precision=0)
+                    with gr.Accordion("Texture Settings", open=True):
+                        variant    = gr.Radio(["sdxl", "sd21"], value="sdxl",
+                                              label="Model (sdxl = quality, sd21 = less VRAM)")
+                        tex_seed   = gr.Number(value=_init_seed, label="Texture Seed", precision=0)
+                        enhance_face_check = gr.Checkbox(
+                            label="Enhance Face (HyperSwap + RealESRGAN)", value=True)
+                    with gr.Row():
+                        shape_btn   = gr.Button("Generate Shape",  variant="primary",   scale=2, interactive=False)
+                        texture_btn = gr.Button("Apply Texture",   variant="secondary", scale=2)
+                        render_btn  = gr.Button("Render Views",    variant="secondary", scale=1)
+                    run_all_btn = gr.Button("▶ Run Full Pipeline", variant="primary", interactive=False)
+                with gr.Column(scale=1):
+                    rembg_preview  = gr.Image(label="BG Removed Preview", type="numpy", interactive=False)
+                    status         = gr.Textbox(label="Status", lines=3, interactive=False)
+                    model_3d       = gr.Model3D(label="3D Preview", clear_color=[0.9, 0.9, 0.9, 1.0])
+                    download_file  = gr.File(label="Download GLB")
+                    multiview_img  = gr.Image(label="Multiview", type="filepath", interactive=False)
+            render_gallery = gr.Gallery(label="Rendered Views", columns=5, height=300)
+            _rembg_inputs  = [input_image, remove_bg_check, rembg_threshold, rembg_erode]
+            _pipeline_btns = [shape_btn, run_all_btn]
+            input_image.upload(
+                fn=lambda: (gr.update(interactive=True), gr.update(interactive=True)),
+                inputs=[], outputs=_pipeline_btns,
+            )
+            input_image.clear(
+                fn=lambda: (gr.update(interactive=False), gr.update(interactive=False)),
+                inputs=[], outputs=_pipeline_btns,
+            )
+            input_image.upload(fn=preview_rembg,      inputs=_rembg_inputs, outputs=[rembg_preview])
+            remove_bg_check.change(fn=preview_rembg,  inputs=_rembg_inputs, outputs=[rembg_preview])
+            rembg_threshold.release(fn=preview_rembg, inputs=_rembg_inputs, outputs=[rembg_preview])
+            rembg_erode.release(fn=preview_rembg,     inputs=_rembg_inputs, outputs=[rembg_preview])
+            shape_btn.click(
+                fn=generate_shape,
+                inputs=[input_image, remove_bg_check, num_steps, guidance, seed, face_count],
+                outputs=[glb_state, status],
+            ).then(
+                fn=lambda p: (p, p) if p else (None, None),
+                inputs=[glb_state], outputs=[model_3d, download_file],
+            )
+            texture_btn.click(
+                fn=apply_texture,
+                inputs=[glb_state, input_image, remove_bg_check, variant, tex_seed,
+                        enhance_face_check, rembg_threshold, rembg_erode],
+                outputs=[glb_state, multiview_img, status],
+            ).then(
+                fn=lambda p: (p, p) if p else (None, None),
+                inputs=[glb_state], outputs=[model_3d, download_file],
+            )
+            render_btn.click(fn=render_views, inputs=[download_file], outputs=[render_gallery])
+        # ════════════════════════════════════════════════════════════════════
+        with gr.Tab("Rig & Export"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### Step 1 — SKEL Anatomy Layer")
+                    tpose_skel_check  = gr.Checkbox(label="Export SKEL bone mesh", value=False)
+                    tpose_btn         = gr.Button("Rig + SKEL Anatomy", variant="secondary")
+                    tpose_status      = gr.Textbox(label="Anatomy Status", lines=3, interactive=False)
+                    with gr.Row():
+                        tpose_surface_dl = gr.File(label="Rigged Surface GLB")
+                        tpose_bones_dl   = gr.File(label="SKEL Bone Mesh GLB")
+                    gr.Markdown("---")
+                    gr.Markdown("### Step 2 — Rig & Export")
+                    export_fbx_check  = gr.Checkbox(label="Export FBX (requires Blender)", value=True)
+                    mdm_prompt_box    = gr.Textbox(label="Motion Prompt (MDM)",
+                                                   placeholder="a person walks forward", value="")
+                    mdm_frames_slider = gr.Slider(60, 300, value=120, step=30,
+                                                  label="Animation Frames (at 20 fps)")
+                    rig_btn           = gr.Button("Rig Mesh", variant="primary")
+                with gr.Column(scale=2):
+                    rig_status      = gr.Textbox(label="Rig Status", lines=4, interactive=False)
+                    show_skel_check = gr.Checkbox(label="Show Skeleton", value=False)
+                    rig_model_3d    = gr.Model3D(label="Preview", clear_color=[0.9, 0.9, 0.9, 1.0])
+                    with gr.Row():
+                        rig_glb_dl      = gr.File(label="Download Rigged GLB")
+                        rig_animated_dl = gr.File(label="Download Animated GLB")
+                        rig_fbx_dl      = gr.File(label="Download FBX")
+            rigged_base_state = gr.State(None)
+            skel_glb_state    = gr.State(None)
+            tpose_btn.click(
+                fn=gradio_tpose,
+                inputs=[glb_state, tpose_skel_check],
+                outputs=[tpose_surface_dl, tpose_bones_dl, tpose_status],
+            ).then(
+                fn=lambda p: (p["path"] if isinstance(p, dict) else p) if p else None,
+                inputs=[tpose_surface_dl], outputs=[rig_model_3d],
+            )
+            rig_btn.click(
+                fn=gradio_rig,
+                inputs=[glb_state, export_fbx_check, mdm_prompt_box, mdm_frames_slider],
+                outputs=[rig_glb_dl, rig_animated_dl, rig_fbx_dl, rig_status,
+                         rig_model_3d, rigged_base_state, skel_glb_state],
+            )
+            show_skel_check.change(
+                fn=lambda show, base, skel: skel if (show and skel) else base,
+                inputs=[show_skel_check, rigged_base_state, skel_glb_state],
+                outputs=[rig_model_3d],
+            )
+        # ════════════════════════════════════════════════════════════════════
+        with gr.Tab("Enhancement"):
+            gr.Markdown("**Surface Enhancement** — bakes normal + depth maps into the GLB as PBR textures.")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### StableNormal")
+                    run_normal_check   = gr.Checkbox(label="Run StableNormal", value=True)
+                    normal_res         = gr.Slider(512, 1024, value=768, step=128, label="Resolution")
+                    normal_strength    = gr.Slider(0.1, 3.0, value=1.0, step=0.1, label="Normal Strength")
+                    gr.Markdown("### Depth-Anything V2")
+                    run_depth_check    = gr.Checkbox(label="Run Depth-Anything V2", value=True)
+                    depth_res          = gr.Slider(512, 1024, value=768, step=128, label="Resolution")
+                    displacement_scale = gr.Slider(0.1, 3.0, value=1.0, step=0.1, label="Displacement Scale")
+                    enhance_btn = gr.Button("Run Enhancement", variant="primary")
+                with gr.Column(scale=2):
+                    enhance_status    = gr.Textbox(label="Status", lines=5, interactive=False)
+                    with gr.Row():
+                        normal_map_img = gr.Image(label="Normal Map", type="pil")
+                        depth_map_img  = gr.Image(label="Depth Map", type="pil")
+                    enhanced_glb_dl   = gr.File(label="Download Enhanced GLB")
+                    enhanced_model_3d = gr.Model3D(label="Preview", clear_color=[0.9, 0.9, 0.9, 1.0])
+            enhance_btn.click(
+                fn=gradio_enhance,
+                inputs=[glb_state, input_image,
+                        run_normal_check, normal_res, normal_strength,
+                        run_depth_check, depth_res, displacement_scale],
+                outputs=[normal_map_img, depth_map_img,
+                         enhanced_glb_dl, enhanced_model_3d, enhance_status],
+            )
+        # ── Run All wiring ────────────────────────────────────────────────
+        run_all_btn.click(
+            fn=run_full_pipeline,
+            inputs=[
+                input_image, remove_bg_check, num_steps, guidance, seed, face_count,
+                variant, tex_seed, enhance_face_check, rembg_threshold, rembg_erode,
+                export_fbx_check, mdm_prompt_box, mdm_frames_slider,
+            ],
+            outputs=[glb_state, download_file, multiview_img,
+                     rig_glb_dl, rig_animated_dl, rig_fbx_dl, status],
+        ).then(
+            fn=lambda p: (p, p) if p else (None, None),
+            inputs=[glb_state], outputs=[model_3d, download_file],
+        )
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)

packages.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+libgl1-mesa-glx
+libglib2.0-0
+libsm6
+libxext6
+libxrender-dev
+ffmpeg
+cmake
+ninja-build
+build-essential
+pkg-config

pipeline/__init__.py ADDED Viewed

File without changes

pipeline/enhance_surface.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""
+Surface enhancement for TripoSG GLB outputs.
+StableNormal  — high-quality normal map from portrait reference
+Depth-Anything V2 — metric depth map → displacement intensity
+Both run on the reference portrait, produce calibrated maps that
+are baked as PBR textures (normalTexture + occlusion/displacement)
+into the output GLB.
+"""
+import os
+import numpy as np
+import torch
+from PIL import Image
+STABLE_NORMAL_PATH  = "/root/models/stable-normal"
+DEPTH_ANYTHING_PATH = "/root/models/depth-anything-v2"
+_normal_pipe  = None
+_depth_pipe   = None
+# ── model loading ──────────────────────────────────────────────────────────────
+def load_normal_model():
+    global _normal_pipe
+    if _normal_pipe is not None:
+        return _normal_pipe
+    from stablenormal.pipeline_yoso_normal import YOSONormalsPipeline
+    from stablenormal.scheduler.heuristics_ddimsampler import HEURI_DDIMScheduler
+    import torch
+    x_start_pipeline = YOSONormalsPipeline.from_pretrained(
+        STABLE_NORMAL_PATH,
+        torch_dtype=torch.float16,
+        variant="fp16",
+        t_start=int(0.3 * 1000),
+    ).to("cuda")
+    _normal_pipe = YOSONormalsPipeline.from_pretrained(
+        STABLE_NORMAL_PATH,
+        torch_dtype=torch.float16,
+        variant="fp16",
+        scheduler=HEURI_DDIMScheduler.from_pretrained(
+            STABLE_NORMAL_PATH, subfolder="scheduler",
+            ddim_timestep_respacing="ddim10", x_start_pipeline=x_start_pipeline,
+        ),
+    ).to("cuda")
+    _normal_pipe.set_progress_bar_config(disable=True)
+    return _normal_pipe
+def load_depth_model():
+    global _depth_pipe
+    if _depth_pipe is not None:
+        return _depth_pipe
+    from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+    processor = AutoImageProcessor.from_pretrained(DEPTH_ANYTHING_PATH)
+    model = AutoModelForDepthEstimation.from_pretrained(
+        DEPTH_ANYTHING_PATH, torch_dtype=torch.float16
+    ).to("cuda")
+    _depth_pipe = (processor, model)
+    return _depth_pipe
+def unload_models():
+    global _normal_pipe, _depth_pipe
+    if _normal_pipe is not None:
+        del _normal_pipe; _normal_pipe = None
+    if _depth_pipe is not None:
+        del _depth_pipe; _depth_pipe = None
+    torch.cuda.empty_cache()
+# ── inference ──────────────────────────────────────────────────────────────────
+def run_stable_normal(image: Image.Image, resolution: int = 768) -> Image.Image:
+    """Returns normal map as RGB PIL image ([-1,1] encoded as [0,255])."""
+    pipe = load_normal_model()
+    img = image.convert("RGB").resize((resolution, resolution), Image.LANCZOS)
+    with torch.inference_mode(), torch.autocast("cuda"):
+        result = pipe(img)
+    normal_img = result.prediction  # numpy [H,W,3] in [-1,1]
+    normal_rgb = ((normal_img + 1) / 2 * 255).clip(0, 255).astype(np.uint8)
+    return Image.fromarray(normal_rgb)
+def run_depth_anything(image: Image.Image, resolution: int = 768) -> Image.Image:
+    """Returns depth map as 16-bit grayscale PIL image (normalized 0–65535)."""
+    processor, model = load_depth_model()
+    img = image.convert("RGB").resize((resolution, resolution), Image.LANCZOS)
+    inputs = processor(images=img, return_tensors="pt")
+    inputs = {k: v.to("cuda", dtype=torch.float16) for k, v in inputs.items()}
+    with torch.inference_mode():
+        depth = model(**inputs).predicted_depth[0].float().cpu().numpy()
+    # Normalize to 0–1
+    depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)
+    depth_16 = (depth * 65535).astype(np.uint16)
+    return Image.fromarray(depth_16, mode="I;16")
+# ── GLB baking ─────────────────────────────────────────────────────────────────
+def bake_normal_into_glb(
+    glb_path: str,
+    normal_img: Image.Image,
+    out_path: str,
+    normal_strength: float = 1.0,
+) -> str:
+    """
+    Adds normalTexture to the first material of the GLB.
+    Normal map is resized to match the existing base color texture resolution.
+    """
+    import pygltflib, struct, io
+    gltf = pygltflib.GLTF2().load(glb_path)
+    # Find existing base color texture size for matching resolution
+    target_size = 1024
+    if gltf.materials and gltf.materials[0].pbrMetallicRoughness:
+        pbr = gltf.materials[0].pbrMetallicRoughness
+        if pbr.baseColorTexture is not None:
+            tex_idx = pbr.baseColorTexture.index
+            img_idx = gltf.textures[tex_idx].source
+            blob = gltf.binary_blob()
+            bv = gltf.bufferViews[gltf.images[img_idx].bufferView]
+            img_bytes = blob[bv.byteOffset: bv.byteOffset + bv.byteLength]
+            existing = Image.open(io.BytesIO(img_bytes))
+            target_size = existing.width
+    normal_resized = normal_img.resize((target_size, target_size), Image.LANCZOS)
+    # Encode normal map as PNG and append to binary blob
+    buf = io.BytesIO()
+    normal_resized.save(buf, format="PNG")
+    png_bytes = buf.getvalue()
+    blob = bytearray(gltf.binary_blob() or b"")
+    byte_offset = len(blob)
+    blob.extend(png_bytes)
+    # Pad to 4-byte alignment
+    while len(blob) % 4:
+        blob.append(0)
+    # Add bufferView, image, texture
+    bv_idx = len(gltf.bufferViews)
+    gltf.bufferViews.append(pygltflib.BufferView(
+        buffer=0, byteOffset=byte_offset, byteLength=len(png_bytes),
+    ))
+    img_idx = len(gltf.images)
+    gltf.images.append(pygltflib.Image(
+        bufferView=bv_idx, mimeType="image/png",
+    ))
+    tex_idx = len(gltf.textures)
+    gltf.textures.append(pygltflib.Texture(source=img_idx))
+    # Update material
+    if gltf.materials:
+        gltf.materials[0].normalTexture = pygltflib.NormalMaterialTexture(
+            index=tex_idx, scale=normal_strength,
+        )
+    # Update buffer length
+    gltf.buffers[0].byteLength = len(blob)
+    gltf.set_binary_blob(bytes(blob))
+    gltf.save(out_path)
+    return out_path
+def bake_depth_as_occlusion(
+    glb_path: str,
+    depth_img: Image.Image,
+    out_path: str,
+    displacement_scale: float = 1.0,
+) -> str:
+    """
+    Bakes depth map as occlusionTexture (R channel) — approximates displacement
+    in PBR renderers. Depth is inverted and normalized for AO-style use.
+    """
+    import pygltflib, io
+    gltf = pygltflib.GLTF2().load(glb_path)
+    target_size = 1024
+    if gltf.materials and gltf.materials[0].pbrMetallicRoughness:
+        pbr = gltf.materials[0].pbrMetallicRoughness
+        if pbr.baseColorTexture is not None:
+            tex_idx = pbr.baseColorTexture.index
+            img_idx = gltf.textures[tex_idx].source
+            blob = gltf.binary_blob()
+            bv = gltf.bufferViews[gltf.images[img_idx].bufferView]
+            img_bytes = blob[bv.byteOffset: bv.byteOffset + bv.byteLength]
+            existing = Image.open(io.BytesIO(img_bytes))
+            target_size = existing.width
+    # Convert 16-bit depth to 8-bit RGB occlusion (inverted, scaled)
+    depth_arr = np.array(depth_img).astype(np.float32) / 65535.0
+    depth_arr = 1.0 - depth_arr  # invert: close = bright
+    depth_arr = np.clip(depth_arr * displacement_scale, 0, 1)
+    occ_8 = (depth_arr * 255).astype(np.uint8)
+    occ_rgb = Image.fromarray(np.stack([occ_8, occ_8, occ_8], axis=-1))
+    occ_rgb = occ_rgb.resize((target_size, target_size), Image.LANCZOS)
+    buf = io.BytesIO()
+    occ_rgb.save(buf, format="PNG")
+    png_bytes = buf.getvalue()
+    blob = bytearray(gltf.binary_blob() or b"")
+    byte_offset = len(blob)
+    blob.extend(png_bytes)
+    while len(blob) % 4:
+        blob.append(0)
+    bv_idx = len(gltf.bufferViews)
+    gltf.bufferViews.append(pygltflib.BufferView(
+        buffer=0, byteOffset=byte_offset, byteLength=len(png_bytes),
+    ))
+    img_idx = len(gltf.images)
+    gltf.images.append(pygltflib.Image(
+        bufferView=bv_idx, mimeType="image/png",
+    ))
+    tex_idx = len(gltf.textures)
+    gltf.textures.append(pygltflib.Texture(source=img_idx))
+    if gltf.materials:
+        gltf.materials[0].occlusionTexture = pygltflib.OcclusionTextureInfo(
+            index=tex_idx, strength=displacement_scale,
+        )
+    gltf.buffers[0].byteLength = len(blob)
+    gltf.set_binary_blob(bytes(blob))
+    gltf.save(out_path)
+    return out_path

pipeline/face_enhance.py ADDED Viewed

	@@ -0,0 +1,361 @@

+"""
+Face enhancement for MV-Adapter multiview textures.
+Pipeline per visible-face view:
+  1. InsightFace buffalo_l  — detect faces, extract 5-pt landmarks & 512-d embeddings
+  2. HyperSwap 1A 256       — swap reference identity (embedding) onto each view face
+     (falls back to inswapper_128 if hyperswap not present)
+  3. RealESRGAN x4plus      — upscale face bbox 4x, resize back (real detail,
+     identity-preserving). Falls back to GFPGAN v1.4 if weights not present.
+HyperSwap I/O:
+    source  [1, 512]          — face embedding from recognition model
+    target  [1, 3, 256, 256]  — aligned face crop (float32, RGB, [0,1])
+    output  [1, 3, 256, 256]  — swapped face crop
+    mask    [1, 1, 256, 256]  — alpha mask for seamless paste-back
+Usage (standalone):
+    python -m pipeline.face_enhance \
+        --multiview  /tmp/user_tex4/result.png \
+        --reference  /tmp/tex_input_768.png \
+        --output     /tmp/user_tex4/result_enhanced.png \
+        --checkpoints /root/MV-Adapter/checkpoints
+"""
+import argparse
+import os
+import cv2
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
+# ── helpers ────────────────────────────────────────────────────────────────────
+def pil_to_bgr(img: Image.Image) -> np.ndarray:
+    return cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
+def bgr_to_pil(arr: np.ndarray) -> Image.Image:
+    return Image.fromarray(cv2.cvtColor(arr, cv2.COLOR_BGR2RGB))
+def split_multiview(mv: Image.Image, n: int = 6):
+    w_each = mv.width // n
+    return [mv.crop((i * w_each, 0, (i + 1) * w_each, mv.height)) for i in range(n)]
+def stitch_views(views):
+    total_w = sum(v.width for v in views)
+    out = Image.new("RGB", (total_w, views[0].height))
+    x = 0
+    for v in views:
+        out.paste(v, (x, 0))
+        x += v.width
+    return out
+# ── HyperSwap 1A 256 — custom ONNX wrapper ────────────────────────────────────
+class HyperSwapper:
+    """
+    Direct ONNX inference for HyperSwap 1A 256.
+    source [1,512] × target [1,3,256,256] → output [1,3,256,256], mask [1,1,256,256]
+    """
+    # Standard 5-point face alignment template (112×112 base, scaled to crop_size)
+    _TEMPLATE_112 = np.array([
+        [38.2946, 51.6963],
+        [73.5318, 51.5014],
+        [56.0252, 71.7366],
+        [41.5493, 92.3655],
+        [70.7299, 92.2041],
+    ], dtype=np.float32)
+    def __init__(self, ckpt_path: str, providers=None):
+        self.crop_size = 256
+        self.providers = providers or ["CUDAExecutionProvider", "CPUExecutionProvider"]
+        self.sess = ort.InferenceSession(ckpt_path, providers=self.providers)
+        print(f"[HyperSwapper] Loaded {os.path.basename(ckpt_path)} "
+              f"(providers: {self.sess.get_providers()})")
+    def _get_affine(self, kps: np.ndarray) -> np.ndarray:
+        """Estimate affine transform from 5 face keypoints to standard template."""
+        template = self._TEMPLATE_112 / 112.0 * self.crop_size
+        from cv2 import estimateAffinePartial2D
+        M, _ = estimateAffinePartial2D(kps, template, method=cv2.RANSAC)
+        return M  # [2, 3]
+    def _crop_face(self, img_bgr: np.ndarray, kps: np.ndarray):
+        """Crop and align face to crop_size × crop_size."""
+        M = self._get_affine(kps)
+        crop = cv2.warpAffine(img_bgr, M, (self.crop_size, self.crop_size),
+                              flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
+        return crop, M
+    def _paste_back(self, img_bgr: np.ndarray, crop_bgr: np.ndarray,
+                    mask: np.ndarray, M: np.ndarray) -> np.ndarray:
+        """Paste swapped face crop back into the original frame using the mask."""
+        h, w = img_bgr.shape[:2]
+        IM = cv2.invertAffineTransform(M)
+        warped = cv2.warpAffine(crop_bgr, IM, (w, h),
+                                flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
+        mask_img = (mask * 255).clip(0, 255).astype(np.uint8)
+        mask_warped = cv2.warpAffine(mask_img, IM, (w, h), flags=cv2.INTER_LINEAR)
+        mask_f = mask_warped.astype(np.float32)[:, :, np.newaxis] / 255.0
+        result = img_bgr.astype(np.float32) * (1.0 - mask_f) + warped.astype(np.float32) * mask_f
+        return result.clip(0, 255).astype(np.uint8)
+    def get(self, img_bgr: np.ndarray, target_face, source_face,
+            paste_back: bool = True):
+        """
+        Swap source_face identity onto target_face in img_bgr.
+        face objects are InsightFace Face instances with .embedding and .kps.
+        """
+        # 1. Source embedding [1, 512]
+        emb = source_face.embedding.astype(np.float32)
+        emb /= np.linalg.norm(emb)          # L2-normalise
+        source_input = emb.reshape(1, -1)   # [1, 512]
+        # 2. Crop and align target face to 256×256
+        kps = target_face.kps.astype(np.float32)
+        crop_bgr, M = self._crop_face(img_bgr, kps)
+        # Convert BGR→RGB, normalize to [-1, 1], HWC→CHW, add batch dim
+        crop_rgb = crop_bgr[:, :, ::-1].astype(np.float32) / 255.0
+        crop_rgb = (crop_rgb - 0.5) / 0.5                         # [−1, 1]
+        target_input = crop_rgb.transpose(2, 0, 1)[np.newaxis]   # [1, 3, 256, 256]
+        # 3. Inference
+        outputs = self.sess.run(None, {"source": source_input, "target": target_input})
+        out_tensor = outputs[0][0]   # [3, 256, 256]  values in [-1, 1]
+        mask_tensor = outputs[1][0, 0]  # [256, 256]
+        # 4. Convert output back to BGR uint8  ([-1,1] → [0,255])
+        out_rgb = ((out_tensor.transpose(1, 2, 0) + 1) / 2 * 255).clip(0, 255).astype(np.uint8)
+        out_bgr = out_rgb[:, :, ::-1]
+        if not paste_back:
+            return out_bgr, mask_tensor
+        # 5. Paste back into the original frame
+        return self._paste_back(img_bgr, out_bgr, mask_tensor, M)
+# ── model loading ─────────────────────────────────────────────────────────────
+_ORT_PROVIDERS = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+def load_face_analyzer():
+    from insightface.app import FaceAnalysis
+    app = FaceAnalysis(name="buffalo_l", providers=_ORT_PROVIDERS)
+    app.prepare(ctx_id=0, det_size=(640, 640))
+    return app
+def load_swapper(ckpt_dir: str):
+    """HyperSwap 1A 256 if present, else fall back to inswapper_128."""
+    import insightface.model_zoo as model_zoo
+    hyperswap = os.path.join(ckpt_dir, "hyperswap_1a_256.onnx")
+    inswapper = os.path.join(ckpt_dir, "inswapper_128.onnx")
+    if os.path.exists(hyperswap):
+        print(f"[face_enhance] Using HyperSwap 1A 256")
+        return HyperSwapper(hyperswap, providers=_ORT_PROVIDERS)
+    if os.path.exists(inswapper):
+        print(f"[face_enhance] Using inswapper_128 (fallback)")
+        return model_zoo.get_model(inswapper, providers=_ORT_PROVIDERS)
+    raise FileNotFoundError(
+        f"No swapper model found in {ckpt_dir}. "
+        "Add hyperswap_1a_256.onnx or inswapper_128.onnx."
+    )
+def load_realesrgan(model_path: str, scale: int = 4, half: bool = False):
+    """Load RealESRGAN x4plus — full float32 (half=False), no tiling (tile=0)."""
+    from basicsr.archs.rrdbnet_arch import RRDBNet
+    from realesrgan import RealESRGANer
+    model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64,
+                    num_block=23, num_grow_ch=32, scale=scale)
+    return RealESRGANer(
+        scale=scale, model_path=model_path, model=model,
+        tile=0, tile_pad=10, pre_pad=0, half=half,
+    )
+def load_gfpgan(ckpt_dir: str, upscale: int = 1):
+    from gfpgan import GFPGANer
+    model_path = os.path.join(ckpt_dir, "GFPGANv1.4.pth")
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"GFPGANv1.4.pth not found in {ckpt_dir}")
+    return GFPGANer(model_path=model_path, upscale=upscale,
+                    arch="clean", channel_multiplier=2, bg_upsampler=None)
+def load_restorer(ckpt_dir: str):
+    """
+    Prefer RealESRGAN x4plus (full float32, no tiling, unsharp mask post-pass).
+    Falls back to GFPGAN v1.4 if RealESRGAN weights are absent.
+    Returns (restorer, 'realesrgan' | 'gfpgan').
+    """
+    realesr_path = os.path.join(ckpt_dir, "RealESRGAN_x4plus.pth")
+    if os.path.exists(realesr_path):
+        try:
+            r = load_realesrgan(realesr_path, scale=4, half=False)
+            print("[face_enhance] Restorer: RealESRGAN x4plus (float32, tile=0)")
+            return r, "realesrgan"
+        except Exception as e:
+            print(f"[face_enhance] RealESRGAN load failed ({e}), falling back to GFPGAN")
+    r = load_gfpgan(ckpt_dir, upscale=1)
+    print("[face_enhance] Restorer: GFPGAN v1.4 (fallback)")
+    return r, "gfpgan"
+# ── core enhancement ──────────────────────────────────────────────────────────
+def get_reference_face(analyzer, ref_bgr: np.ndarray):
+    faces = analyzer.get(ref_bgr)
+    if not faces:
+        raise RuntimeError("No face detected in reference image.")
+    faces.sort(key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]), reverse=True)
+    return faces[0]
+def _enhance_face_bbox(frame_bgr: np.ndarray, faces, restorer, restorer_type: str,
+                       pad: float = 0.4) -> np.ndarray:
+    """
+    Crop each face bbox (+ padding), enhance with restorer, blend back.
+    RealESRGAN: upscale 4x → resize back → unsharp mask → feathered blend.
+    GFPGAN: restore in-place on crop → resize back → hard paste.
+    """
+    result = frame_bgr.copy()
+    h, w = frame_bgr.shape[:2]
+    for face in faces:
+        x1, y1, x2, y2 = face.bbox[:4].astype(int)
+        bw, bh = x2 - x1, y2 - y1
+        px, py = int(bw * pad), int(bh * pad)
+        cx1 = max(0, x1 - px);  cy1 = max(0, y1 - py)
+        cx2 = min(w, x2 + px);  cy2 = min(h, y2 + py)
+        crop = frame_bgr[cy1:cy2, cx1:cx2].copy()
+        if crop.size == 0:
+            continue
+        cw, ch = cx2 - cx1, cy2 - cy1
+        try:
+            if restorer_type == "realesrgan":
+                enhanced, _ = restorer.enhance(crop, outscale=4)
+                enhanced = cv2.resize(enhanced, (cw, ch), interpolation=cv2.INTER_LANCZOS4)
+                # Unsharp mask — strength 1.8
+                blur = cv2.GaussianBlur(enhanced, (0, 0), 2)
+                enhanced = cv2.addWeighted(enhanced, 1.8, blur, -0.8, 0)
+            else:
+                _, _, enhanced = restorer.enhance(
+                    crop, has_aligned=False, only_center_face=True,
+                    paste_back=True, weight=0.5)
+                if enhanced.shape[:2] != (ch, cw):
+                    enhanced = cv2.resize(enhanced, (cw, ch), interpolation=cv2.INTER_LANCZOS4)
+        except Exception as e:
+            import traceback as _tb
+            print(f"[enhance_view] {restorer_type} failed on face bbox: {e}\n{_tb.format_exc()}")
+            continue
+        # Feathered blend at edges
+        feather = max(3, int(min(cw, ch) * 0.08))
+        mask = np.ones((ch, cw), dtype=np.float32)
+        for f in range(feather):
+            a = (f + 1) / feather
+            mask[f, :] = a;  mask[-(f+1), :] = a
+            mask[:, f] = np.minimum(mask[:, f], a)
+            mask[:, -(f+1)] = np.minimum(mask[:, -(f+1)], a)
+        mask = mask[:, :, np.newaxis]
+        result[cy1:cy2, cx1:cx2] = (
+            result[cy1:cy2, cx1:cx2].astype(np.float32) * (1 - mask) +
+            enhanced.astype(np.float32) * mask
+        ).clip(0, 255).astype(np.uint8)
+    return result
+def enhance_view(view_bgr, analyzer, swapper, restorer, restorer_type,
+                 source_face) -> np.ndarray:
+    target_faces = analyzer.get(view_bgr)
+    if not target_faces:
+        return view_bgr
+    swapped = view_bgr.copy()
+    for face in target_faces:
+        swapped = swapper.get(swapped, face, source_face, paste_back=True)
+    print(f"[enhance_view] HyperSwap applied to {len(target_faces)} face(s)")
+    # Re-detect in swapped image for accurate bboxes
+    swapped_faces = analyzer.get(swapped) or target_faces
+    result = _enhance_face_bbox(swapped, swapped_faces, restorer, restorer_type)
+    print(f"[enhance_view] {restorer_type} enhanced {len(swapped_faces)} face(s)")
+    return result
+def enhance_multiview(
+    multiview_path: str,
+    reference_path: str,
+    output_path: str,
+    ckpt_dir: str,
+    n_views: int = 6,
+    gfpgan_upscale: int = 1,
+    face_views: tuple = (0, 1, 3, 4),
+):
+    print("[face_enhance] Loading models...")
+    analyzer = load_face_analyzer()
+    swapper  = load_swapper(ckpt_dir)
+    restorer, restorer_type = load_restorer(ckpt_dir)
+    print("[face_enhance] Models loaded.")
+    ref_pil = Image.open(reference_path).convert("RGB")
+    ref_bgr = pil_to_bgr(ref_pil)
+    source_face = get_reference_face(analyzer, ref_bgr)
+    print(f"[face_enhance] Reference face bbox={source_face.bbox.astype(int)}")
+    mv = Image.open(multiview_path).convert("RGB")
+    views = split_multiview(mv, n=n_views)
+    enhanced = []
+    for i, view_pil in enumerate(views):
+        if i in face_views:
+            view_bgr = pil_to_bgr(view_pil)
+            result_bgr = enhance_view(view_bgr, analyzer, swapper, restorer,
+                                      restorer_type, source_face)
+            enhanced.append(bgr_to_pil(result_bgr))
+            n_faces = len(analyzer.get(view_bgr))
+            print(f"[face_enhance] View {i}: {n_faces} face(s) processed.")
+        else:
+            enhanced.append(view_pil)
+    stitch_views(enhanced).save(output_path)
+    print(f"[face_enhance] Saved → {output_path}")
+    return output_path
+# ── CLI ───────────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--multiview",   required=True)
+    parser.add_argument("--reference",   required=True)
+    parser.add_argument("--output",      required=True)
+    parser.add_argument("--checkpoints", default="./checkpoints")
+    parser.add_argument("--n_views",     type=int, default=6)
+    args = parser.parse_args()
+    enhance_multiview(
+        multiview_path=args.multiview,
+        reference_path=args.reference,
+        output_path=args.output,
+        ckpt_dir=args.checkpoints,
+        n_views=args.n_views,
+    )

pipeline/rig_stage.py ADDED Viewed

	@@ -0,0 +1,1282 @@

+"""
+Stage 7 — Multi-view pose estimation + mesh rigging
+Three progressive phases, each feeding the next:
+  Phase 1 (Easy)   — Multi-view beta averaging
+    Run HMR 2.0 on front / 3q_front / side renders + reference photo
+    Average shape betas weighted by detection confidence
+  Phase 2 (Better) — Silhouette fitting
+    Project SMPL mesh orthographically into each of the 5 views
+    Optimise betas so the SMPL silhouette matches the TripoSG render mask
+    Uses known orthographic camera matrices (exact same params as nvdiffrast)
+  Phase 3 (Best)   — Multi-view joint triangulation
+    For each view where HMR 2.0 fired, project its 2D keypoints back to 3D
+    using the known orthographic camera → set up linear system per joint
+    Least-squares triangulation gives world-space joint positions used
+    directly as the skeleton, overriding the regressed SMPL joints
+Output: rigged GLB (SMPL 24-joint skeleton + skin weights) + FBX via Blender
+"""
+import os, sys, json, struct, traceback, subprocess, tempfile
+# Must be set before any OpenGL/pyrender import (triggered by hmr2)
+os.environ.setdefault("PYOPENGL_PLATFORM", "egl")
+import numpy as np
+# ── SMPL constants ────────────────────────────────────────────────────────────
+SMPL_JOINT_NAMES = [
+    "pelvis","left_hip","right_hip","spine1",
+    "left_knee","right_knee","spine2",
+    "left_ankle","right_ankle","spine3",
+    "left_foot","right_foot","neck",
+    "left_collar","right_collar","head",
+    "left_shoulder","right_shoulder",
+    "left_elbow","right_elbow",
+    "left_wrist","right_wrist",
+    "left_hand","right_hand",
+]
+SMPL_PARENTS = [-1,0,0,0,1,2,3,4,5,6,7,8,9,9,9,
+                12,13,14,16,17,18,19,20,21]
+# Orthographic camera parameters — must match render_views in triposg_app.py
+ORTHO_LEFT, ORTHO_RIGHT = -0.55, 0.55
+ORTHO_BOT,  ORTHO_TOP   = -0.55, 0.55
+RENDER_W, RENDER_H      = 768, 1024
+# Azimuths passed to get_orthogonal_camera: [x-90 for x in [0,45,90,180,315]]
+VIEW_AZIMUTHS_DEG = [-90.0, -45.0, 0.0, 90.0, 225.0]
+VIEW_NAMES        = ["front", "3q_front", "side", "back", "3q_back"]
+VIEW_PATHS        = [f"/tmp/render_{n}.png" for n in VIEW_NAMES]
+# Views with a clearly visible front body (used for Phase 1 beta averaging)
+FRONT_VIEW_INDICES = [0, 1, 2]   # front, 3q_front, side
+# ══════════════════════════════════════════════════════════════════════════════
+# Camera utilities
+# ══════════════════════════════════════════════════════════════════════════════
+def _R_y(deg: float) -> np.ndarray:
+    """Rotation matrix around Y axis (right-hand, degrees)."""
+    t = np.radians(deg)
+    c, s = np.cos(t), np.sin(t)
+    return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]], dtype=np.float64)
+def world_to_cam(pts: np.ndarray, azimuth_deg: float) -> np.ndarray:
+    """
+    Orthographic projection: world (N,3) → camera (N,2) in world-unit space.
+    Convention: camera right = (cos θ, 0, -sin θ),  up = (0,1,0)
+    """
+    t = np.radians(azimuth_deg)
+    right = np.array([np.cos(t),  0.0, -np.sin(t)])
+    up    = np.array([0.0,        1.0,  0.0       ])
+    return np.stack([pts @ right, pts @ up], axis=-1)   # (N, 2)
+def cam_to_pixel(cam_xy: np.ndarray) -> np.ndarray:
+    """Camera world-unit coords → pixel coords (u, v) in 768×1024 image."""
+    u = (cam_xy[:, 0] - ORTHO_LEFT) / (ORTHO_RIGHT - ORTHO_LEFT) * RENDER_W
+    v = (ORTHO_TOP  - cam_xy[:, 1]) / (ORTHO_TOP   - ORTHO_BOT ) * RENDER_H
+    return np.stack([u, v], axis=-1)
+def pixel_to_cam(uv: np.ndarray) -> np.ndarray:
+    """Pixel coords → camera world-unit coords."""
+    cx = uv[:, 0] / RENDER_W * (ORTHO_RIGHT - ORTHO_LEFT) + ORTHO_LEFT
+    cy = ORTHO_TOP - uv[:, 1] / RENDER_H * (ORTHO_TOP - ORTHO_BOT)
+    return np.stack([cx, cy], axis=-1)
+def triangulate_joint(obs: list[tuple]) -> np.ndarray:
+    """
+    Triangulate a single joint from multi-view 2D observations.
+    obs: list of (azimuth_deg, pixel_u, pixel_v)
+    Returns world (x, y, z).
+    For orthographic cameras, Y is directly measured; X and Z satisfy:
+      px*cos(θ) - pz*sin(θ) = cx   for each view
+    → overdetermined linear system solved with lstsq.
+    """
+    ys, rows_A, rhs = [], [], []
+    for az_deg, pu, pv in obs:
+        cx, cy = pixel_to_cam(np.array([[pu, pv]]))[0]
+        ys.append(cy)
+        t = np.radians(az_deg)
+        rows_A.append([np.cos(t), -np.sin(t)])
+        rhs.append(cx)
+    A  = np.array(rows_A, dtype=np.float64)
+    b  = np.array(rhs,    dtype=np.float64)
+    wy = float(np.mean(ys))
+    if len(obs) >= 2:
+        xz, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
+        wx, wz = xz
+    else:
+        wx, wz = 0.0, 0.0
+    return np.array([wx, wy, wz], dtype=np.float32)
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 1 — Multi-view HMR 2.0 + beta averaging
+# ══════════════════════════════════════════════════════════════════════════════
+def _load_hmr2(device):
+    from hmr2.models import download_models, load_hmr2, DEFAULT_CHECKPOINT
+    download_models()   # downloads to CACHE_DIR_4DHUMANS (no-op if already done)
+    model, cfg = load_hmr2(DEFAULT_CHECKPOINT)
+    return model.to(device).eval(), cfg
+def _load_detector():
+    from detectron2.config import LazyConfig
+    from hmr2.utils.utils_detectron2 import DefaultPredictor_Lazy
+    import hmr2
+    cfg = LazyConfig.load(str(os.path.join(
+        os.path.dirname(hmr2.__file__),
+        "configs/cascade_mask_rcnn_vitdet_h_75ep.py")))
+    cfg.train.init_checkpoint = (
+        "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/"
+        "cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl")
+    for i in range(3):
+        cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
+    return DefaultPredictor_Lazy(cfg)
+def _run_hmr2_on_image(img_bgr, model, model_cfg, detector, device):
+    """
+    Run HMR 2.0 on a BGR image. Returns dict or None.
+    Keys: betas (10,), body_pose (23,3,3), global_orient (1,3,3),
+          kp2d (44,2) in [0,1] normalised, kp3d (44,3), score (float)
+    """
+    import torch
+    from hmr2.utils import recursive_to
+    from hmr2.datasets.vitdet_dataset import ViTDetDataset
+    det_out = detector(img_bgr)
+    instances = det_out["instances"]
+    valid = (instances.pred_classes == 0) & (instances.scores > 0.5)
+    if not valid.any():
+        return None
+    boxes = instances.pred_boxes.tensor[valid].cpu().numpy()
+    score = float(instances.scores[valid].max().cpu())
+    best  = boxes[np.argmax((boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1]))]
+    ds = ViTDetDataset(model_cfg, img_bgr, [best])
+    dl = torch.utils.data.DataLoader(ds, batch_size=1, shuffle=False)
+    batch = recursive_to(next(iter(dl)), device)
+    with torch.no_grad():
+        out = model(batch)
+    p = out["pred_smpl_params"]
+    return {
+        "betas":         p["betas"][0].cpu().numpy(),
+        "body_pose":     p["body_pose"][0].cpu().numpy(),
+        "global_orient": p["global_orient"][0].cpu().numpy(),
+        "kp2d":          out["pred_keypoints_2d"][0].cpu().numpy(),  # (44,2) [-1,1]
+        "kp3d":          out.get("pred_keypoints_3d", [None]*1)[0],
+        "score":         score,
+        "detected":      True,
+    }
+def estimate_betas_multiview(view_paths: list[str],
+                              ref_path: str,
+                              device: str = "cuda") -> tuple[np.ndarray, list]:
+    """
+    Phase 1: run HMR 2.0 on reference photo + front/3q/side renders.
+    Returns (averaged_betas [10,], list_of_all_results).
+    Falls back to zero betas (average body shape) if HMR2 is unavailable.
+    """
+    import cv2
+    print("[rig P1] Loading HMR2 + detector...")
+    try:
+        model, model_cfg = _load_hmr2(device)
+        detector = _load_detector()
+    except Exception as e:
+        print(f"[rig P1] HMR2 unavailable ({e}) — using zero betas (average body shape)")
+        return np.zeros(10, dtype=np.float32), []
+    sources = [(ref_path, None)]   # (path, azimuth_deg_or_None)
+    for idx in FRONT_VIEW_INDICES:
+        if idx < len(view_paths) and os.path.exists(view_paths[idx]):
+            sources.append((view_paths[idx], VIEW_AZIMUTHS_DEG[idx]))
+    results = []
+    weighted_betas, total_w = np.zeros(10, dtype=np.float64), 0.0
+    for path, az in sources:
+        img = cv2.imread(path)
+        if img is None:
+            continue
+        r = _run_hmr2_on_image(img, model, model_cfg, detector, device)
+        if r is None:
+            print(f"[rig P1]   {os.path.basename(path)}: no person detected")
+            continue
+        r["azimuth_deg"] = az
+        r["path"]        = path
+        results.append(r)
+        w = r["score"]
+        weighted_betas += r["betas"] * w
+        total_w        += w
+        print(f"[rig P1]   {os.path.basename(path)}: detected (score={w:.2f}), "
+              f"betas[:3]={r['betas'][:3]}")
+    avg_betas = (weighted_betas / total_w).astype(np.float32) if total_w > 0 \
+                else np.zeros(10, dtype=np.float32)
+    print(f"[rig P1] Averaged betas over {len(results)} detections.")
+    return avg_betas, results
+# ═════════════════════════════════════════════════��════════════════════════════
+# SMPL helpers
+# ══════════════════════════════════════════════════════════════════════════════
+def get_smpl_tpose(betas: np.ndarray, smpl_dir: str = "/root/smpl_models"):
+    """Returns (verts [N,3], faces [M,3], joints [24,3], lbs_weights [N,24]).
+    Uses smplx if SMPL_NEUTRAL.pkl is available, else falls back to a synthetic
+    proxy skeleton with proximity-based skinning weights."""
+    import torch
+    model_path = os.path.join(smpl_dir, "SMPL_NEUTRAL.pkl")
+    if not os.path.exists(model_path) or os.path.getsize(model_path) < 1000:
+        # Try download first, silently fall through to synthetic on failure
+        try:
+            _download_smpl_neutral(smpl_dir)
+        except Exception:
+            pass
+    if os.path.exists(model_path) and os.path.getsize(model_path) > 100_000:
+        import smplx
+        smpl = smplx.create(smpl_dir, model_type="smpl", gender="neutral", num_betas=10)
+        betas_t = torch.tensor(betas[:10], dtype=torch.float32).unsqueeze(0)
+        with torch.no_grad():
+            out = smpl(betas=betas_t, return_verts=True)
+        verts   = out.vertices[0].numpy().astype(np.float32)
+        joints  = out.joints[0, :24].numpy().astype(np.float32)
+        faces   = smpl.faces.astype(np.int32)
+        weights = smpl.lbs_weights.numpy().astype(np.float32)
+        return verts, faces, joints, weights
+    print("[rig] SMPL_NEUTRAL.pkl unavailable — using synthetic proxy skeleton")
+    return _synthetic_smpl_tpose()
+def _synthetic_smpl_tpose():
+    """Synthetic SMPL substitute: hardcoded T-pose joint positions + proximity weights.
+    Gives a rough but functional rig for pipeline testing when SMPL is unavailable.
+    For production, provide SMPL_NEUTRAL.pkl from https://smpl.is.tue.mpg.de/."""
+    # 24 SMPL T-pose joint positions (metres, Y-up, facing +Z)
+    joints = np.array([
+        [ 0.00,  0.92,  0.00],  # 0  pelvis
+        [-0.09,  0.86,  0.00],  # 1  left_hip
+        [ 0.09,  0.86,  0.00],  # 2  right_hip
+        [ 0.00,  1.05,  0.00],  # 3  spine1
+        [-0.09,  0.52,  0.00],  # 4  left_knee
+        [ 0.09,  0.52,  0.00],  # 5  right_knee
+        [ 0.00,  1.17,  0.00],  # 6  spine2
+        [-0.09,  0.10,  0.00],  # 7  left_ankle
+        [ 0.09,  0.10,  0.00],  # 8  right_ankle
+        [ 0.00,  1.29,  0.00],  # 9  spine3
+        [-0.09,  0.00,  0.07],  # 10 left_foot
+        [ 0.09,  0.00,  0.07],  # 11 right_foot
+        [ 0.00,  1.46,  0.00],  # 12 neck
+        [-0.07,  1.42,  0.00],  # 13 left_collar
+        [ 0.07,  1.42,  0.00],  # 14 right_collar
+        [ 0.00,  1.62,  0.00],  # 15 head
+        [-0.17,  1.40,  0.00],  # 16 left_shoulder
+        [ 0.17,  1.40,  0.00],  # 17 right_shoulder
+        [-0.42,  1.40,  0.00],  # 18 left_elbow
+        [ 0.42,  1.40,  0.00],  # 19 right_elbow
+        [-0.65,  1.40,  0.00],  # 20 left_wrist
+        [ 0.65,  1.40,  0.00],  # 21 right_wrist
+        [-0.72,  1.40,  0.00],  # 22 left_hand
+        [ 0.72,  1.40,  0.00],  # 23 right_hand
+    ], dtype=np.float32)
+    # Build synthetic proxy vertices: ~300 points clustered around each joint
+    rng = np.random.default_rng(42)
+    n_per_joint = 300
+    proxy_v = []
+    proxy_w = []
+    for ji, jpos in enumerate(joints):
+        pts = jpos + rng.normal(0, 0.06, (n_per_joint, 3)).astype(np.float32)
+        proxy_v.append(pts)
+        w = np.zeros((n_per_joint, 24), np.float32)
+        w[:, ji] = 1.0
+        proxy_w.append(w)
+    proxy_v = np.concatenate(proxy_v, axis=0)   # (7200, 3)
+    proxy_w = np.concatenate(proxy_w, axis=0)   # (7200, 24)
+    proxy_f = np.zeros((0, 3), dtype=np.int32)  # no faces needed for KNN transfer
+    return proxy_v, proxy_f, joints, proxy_w
+def _download_smpl_neutral(out_dir: str):
+    os.makedirs(out_dir, exist_ok=True)
+    url  = ("https://huggingface.co/spaces/TMElyralab/MusePose/resolve/main"
+            "/models/smpl/SMPL_NEUTRAL.pkl")
+    dest = os.path.join(out_dir, "SMPL_NEUTRAL.pkl")
+    print("[rig] Downloading SMPL_NEUTRAL.pkl...")
+    subprocess.run(["wget", "-q", url, "-O", dest], check=True)
+def _smpl_to_render_space(verts: np.ndarray, joints: np.ndarray):
+    """
+    Normalise SMPL vertices to fit inside the [-0.55, 0.55] orthographic
+    frustum used by the nvdiffrast renders (same as align_mesh_to_smpl).
+    Returns (verts_norm, joints_norm, scale, offset).
+    """
+    ymin, ymax = verts[:, 1].min(), verts[:, 1].max()
+    height = ymax - ymin
+    scale  = (ORTHO_TOP - ORTHO_BOT) / max(height, 1e-6)
+    # Centre on pelvis (joint 0) horizontally, floor-align vertically
+    v = verts  * scale
+    j = joints * scale
+    cx = (v[:, 0].max() + v[:, 0].min()) * 0.5
+    cz = (v[:, 2].max() + v[:, 2].min()) * 0.5
+    v[:, 0] -= cx;  j[:, 0] -= cx
+    v[:, 2] -= cz;  j[:, 2] -= cz
+    v[:, 1] -= v[:, 1].min() + ORTHO_BOT   # floor at ORTHO_BOT
+    j[:, 1] -= (verts[:, 1].min() * scale) - ORTHO_BOT
+    return v, j, scale, np.array([-cx, -v[:,1].min() + ORTHO_BOT, -cz])
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 2 — Silhouette fitting
+# ══════════════════════════════════════════════════════════════════════════════
+def _extract_silhouette(render_path: str, threshold: int = 20) -> np.ndarray:
+    """Binary mask (H×W bool) from a render: foreground = any channel > threshold."""
+    import cv2
+    img = cv2.imread(render_path)
+    if img is None:
+        return np.zeros((RENDER_H, RENDER_W), dtype=bool)
+    return img.max(axis=2) > threshold
+def _render_smpl_silhouette(verts_norm: np.ndarray, faces: np.ndarray,
+                              azimuth_deg: float) -> np.ndarray:
+    """
+    Rasterise SMPL mesh silhouette for given azimuth (orthographic).
+    Returns binary mask (H×W bool).
+    """
+    from PIL import Image, ImageDraw
+    cam_xy = world_to_cam(verts_norm, azimuth_deg)
+    pix    = cam_to_pixel(cam_xy)  # (N, 2)
+    img = Image.new("L", (RENDER_W, RENDER_H), 0)
+    draw = ImageDraw.Draw(img)
+    for f in faces:
+        pts = [(float(pix[i, 0]), float(pix[i, 1])) for i in f]
+        draw.polygon(pts, fill=255)
+    return np.array(img) > 0
+def _sil_loss(betas: np.ndarray, target_masks: list,
+               valid_views: list[int], faces: np.ndarray) -> float:
+    """1 - mean IoU between SMPL silhouettes and TripoSG render masks."""
+    try:
+        verts, _, _, _ = get_smpl_tpose(betas.astype(np.float32))
+        verts_n, _, _, _ = _smpl_to_render_space(verts, verts.copy())
+        iou_sum = 0.0
+        for i in valid_views:
+            pred = _render_smpl_silhouette(verts_n, faces, VIEW_AZIMUTHS_DEG[i])
+            tgt  = target_masks[i]
+            inter = (pred & tgt).sum()
+            union = (pred | tgt).sum()
+            iou_sum += inter / max(union, 1)
+        return 1.0 - iou_sum / len(valid_views)
+    except Exception:
+        return 1.0
+def fit_betas_silhouette(betas_init: np.ndarray, view_paths: list[str],
+                          max_iter: int = 60) -> np.ndarray:
+    """
+    Phase 2: optimise SMPL betas to match TripoSG render silhouettes.
+    Only uses views whose render file exists.
+    """
+    from scipy.optimize import minimize
+    valid = [i for i, p in enumerate(view_paths) if os.path.exists(p)]
+    if not valid:
+        print("[rig P2] No render files found — skipping silhouette fit")
+        return betas_init
+    print(f"[rig P2] Extracting silhouettes from {len(valid)} views...")
+    masks = [_extract_silhouette(view_paths[i]) if i in valid
+             else np.zeros((RENDER_H, RENDER_W), bool)
+             for i in range(len(VIEW_NAMES))]
+    # Use only back-facing views for shape, not back (which shows less shape info)
+    fit_views = [i for i in valid if i in [0, 1, 2]]
+    if not fit_views:
+        fit_views = valid
+    # Pre-fetch faces (constant across iterations)
+    verts0, faces0, _, _ = get_smpl_tpose(betas_init)
+    loss0 = _sil_loss(betas_init, masks, fit_views, faces0)
+    print(f"[rig P2] Initial silhouette loss: {loss0:.4f}")
+    result = minimize(
+        fun=lambda b: _sil_loss(b, masks, fit_views, faces0),
+        x0=betas_init.astype(np.float64),
+        method="L-BFGS-B",
+        bounds=[(-3.0, 3.0)] * 10,
+        options={"maxiter": max_iter, "ftol": 1e-4, "gtol": 1e-3},
+    )
+    refined = result.x.astype(np.float32)
+    loss1   = _sil_loss(refined, masks, fit_views, faces0)
+    print(f"[rig P2] Silhouette fit done: loss {loss0:.4f} → {loss1:.4f}  "
+          f"({result.nit} iters, {'converged' if result.success else 'stopped'})")
+    return refined
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 3 — Multi-view joint triangulation
+# ══════════════════════════════════════════════════════════════════════════════
+# HMR 2.0 outputs 44 keypoints; first 24 map to SMPL joints
+HMR2_TO_SMPL = list(range(24))
+def triangulate_joints_multiview(hmr2_results: list) -> np.ndarray | None:
+    """
+    Phase 3: triangulate world-space SMPL joints from multi-view HMR 2.0 2D keypoints.
+    hmr2_results: list of dicts from _run_hmr2_on_image, each with
+      kp2d (44,2) in [-1,1] normalised NDC  and  azimuth_deg (float or None).
+    Only uses results from rendered views (azimuth_deg is not None).
+    Returns (24,3) world joint positions, or None if < 2 valid views.
+    """
+    view_results = [r for r in hmr2_results
+                    if r.get("azimuth_deg") is not None and r.get("kp2d") is not None]
+    if len(view_results) < 2:
+        print(f"[rig P3] Only {len(view_results)} render views with detections "
+              "— need ≥2 for triangulation, skipping")
+        return None
+    print(f"[rig P3] Triangulating from {len(view_results)} views: "
+          + ", ".join(os.path.basename(r["path"]) for r in view_results))
+    # Convert HMR2 NDC keypoints → pixel coords
+    # kp2d is (44,2) in [-1,1]; pixel = (kp+1)/2 * [W, H]
+    joints_world = np.zeros((24, 3), dtype=np.float32)
+    for j in range(24):
+        obs = []
+        for r in view_results:
+            kp = r["kp2d"][j]            # (2,) in [-1,1]
+            pu = (kp[0] + 1.0) / 2.0 * RENDER_W
+            pv = (kp[1] + 1.0) / 2.0 * RENDER_H
+            obs.append((r["azimuth_deg"], pu, pv))
+        joints_world[j] = triangulate_joint(obs)
+    print(f"[rig P3] Triangulated 24 joints. "
+          f"Pelvis: {joints_world[0].round(3)}, "
+          f"Head: {joints_world[15].round(3)}")
+    return joints_world
+# ══════════════════════════════════════════════════════════════════════════════
+# Skinning weight transfer
+# ══════════════════════════════════════════════════════════════════════════════
+def transfer_skinning(smpl_verts: np.ndarray, smpl_weights: np.ndarray,
+                       target_verts: np.ndarray, k: int = 4) -> np.ndarray:
+    from scipy.spatial import cKDTree
+    tree = cKDTree(smpl_verts)
+    dists, idxs = tree.query(target_verts, k=k, workers=-1)
+    dists  = np.maximum(dists, 1e-8)
+    inv_d  = 1.0 / dists
+    inv_d /= inv_d.sum(axis=1, keepdims=True)
+    transferred = np.einsum("nk,nkj->nj", inv_d, smpl_weights[idxs])
+    row_sums = transferred.sum(axis=1, keepdims=True)
+    transferred /= np.where(row_sums > 0, row_sums, 1.0)
+    return transferred.astype(np.float32)
+def align_mesh_to_smpl(mesh_verts: np.ndarray, smpl_verts: np.ndarray,
+                        smpl_joints: np.ndarray) -> np.ndarray:
+    smpl_h = smpl_verts[:, 1].max() - smpl_verts[:, 1].min()
+    mesh_h = mesh_verts[:, 1].max() - mesh_verts[:, 1].min()
+    scale  = smpl_h / max(mesh_h, 1e-6)
+    v = mesh_verts * scale
+    cx = (v[:, 0].max() + v[:, 0].min()) * 0.5
+    cz = (v[:, 2].max() + v[:, 2].min()) * 0.5
+    v[:, 0] += smpl_joints[0, 0] - cx
+    v[:, 2] += smpl_joints[0, 2] - cz
+    v[:, 1] -= v[:, 1].min()
+    return v
+# ══════════════════════════════════════════════════════════════════════════════
+# GLB export
+# ══════════════════════════════════════════════════════════════════════════════
+def export_rigged_glb(verts, faces, uv, texture_img, joints, skin_weights, out_path):
+    import pygltflib
+    from pygltflib import (GLTF2, Scene, Node, Mesh, Primitive, Accessor,
+                            BufferView, Buffer, Material, Texture,
+                            Image as GImage, Sampler, Skin, Asset)
+    from pygltflib import (ARRAY_BUFFER, ELEMENT_ARRAY_BUFFER, FLOAT,
+                            UNSIGNED_INT, UNSIGNED_SHORT, LINEAR,
+                            LINEAR_MIPMAP_LINEAR, REPEAT, SCALAR, VEC2,
+                            VEC3, VEC4, MAT4)
+    gltf   = GLTF2()
+    gltf.asset = Asset(version="2.0", generator="rig_stage.py")
+    blobs  = []
+    def _add(data: np.ndarray, comp, acc_type, target=None):
+        b   = data.tobytes()
+        pad = (4 - len(b) % 4) % 4
+        off = sum(len(x) for x in blobs)
+        blobs.append(b + b"\x00" * pad)
+        bv  = len(gltf.bufferViews)
+        gltf.bufferViews.append(BufferView(buffer=0, byteOffset=off,
+                                            byteLength=len(b), target=target))
+        ac  = len(gltf.accessors)
+        flat = data.flatten()
+        gltf.accessors.append(Accessor(
+            bufferView=bv, byteOffset=0, componentType=comp,
+            type=acc_type, count=len(data),
+            min=[float(flat.min())], max=[float(flat.max())]))
+        return ac
+    pos_acc = _add(verts.astype(np.float32),  FLOAT,          VEC3,  ARRAY_BUFFER)
+    v0,v1,v2 = verts[faces[:,0]], verts[faces[:,1]], verts[faces[:,2]]
+    fn = np.cross(v1-v0, v2-v0); fn /= (np.linalg.norm(fn,axis=1,keepdims=True)+1e-8)
+    vn = np.zeros_like(verts)
+    for i in range(3): np.add.at(vn, faces[:,i], fn)
+    vn /= (np.linalg.norm(vn,axis=1,keepdims=True)+1e-8)
+    nor_acc = _add(vn.astype(np.float32),      FLOAT,          VEC3,  ARRAY_BUFFER)
+    if uv is None: uv = np.zeros((len(verts),2), np.float32)
+    uv_acc  = _add(uv.astype(np.float32),      FLOAT,          VEC2,  ARRAY_BUFFER)
+    idx_acc = _add(faces.astype(np.uint32).flatten(), UNSIGNED_INT, SCALAR, ELEMENT_ARRAY_BUFFER)
+    top4_idx = np.argsort(-skin_weights, axis=1)[:,:4].astype(np.uint16)
+    top4_w   = np.take_along_axis(skin_weights, top4_idx.astype(np.int64), axis=1).astype(np.float32)
+    top4_w  /= top4_w.sum(axis=1,keepdims=True).clip(1e-8,None)
+    j_acc   = _add(top4_idx, UNSIGNED_SHORT, "VEC4", ARRAY_BUFFER)
+    w_acc   = _add(top4_w,   FLOAT,          "VEC4", ARRAY_BUFFER)
+    if texture_img is not None:
+        import io
+        buf = io.BytesIO(); texture_img.save(buf, format="PNG"); ib = buf.getvalue()
+        off = sum(len(x) for x in blobs); pad = (4-len(ib)%4)%4
+        blobs.append(ib + b"\x00"*pad)
+        gltf.bufferViews.append(BufferView(buffer=0,byteOffset=off,byteLength=len(ib)))
+        gltf.images.append(GImage(mimeType="image/png",bufferView=len(gltf.bufferViews)-1))
+        gltf.samplers.append(Sampler(magFilter=LINEAR,minFilter=LINEAR_MIPMAP_LINEAR,
+                                      wrapS=REPEAT,wrapT=REPEAT))
+        gltf.textures.append(Texture(sampler=0,source=0))
+        gltf.materials.append(Material(name="body",
+            pbrMetallicRoughness={"baseColorTexture":{"index":0},
+                                   "metallicFactor":0.0,"roughnessFactor":0.8},
+            doubleSided=True))
+    else:
+        gltf.materials.append(Material(name="body",doubleSided=True))
+    prim = Primitive(attributes={"POSITION":pos_acc,"NORMAL":nor_acc,
+                                  "TEXCOORD_0":uv_acc,"JOINTS_0":j_acc,"WEIGHTS_0":w_acc},
+                     indices=idx_acc, material=0)
+    gltf.meshes.append(Mesh(name="body",primitives=[prim]))
+    jnodes = []
+    for i,(name,parent) in enumerate(zip(SMPL_JOINT_NAMES,SMPL_PARENTS)):
+        t = joints[i].tolist() if parent==-1 else (joints[i]-joints[parent]).tolist()
+        n = Node(name=name,translation=t,children=[])
+        jnodes.append(len(gltf.nodes)); gltf.nodes.append(n)
+    for i,p in enumerate(SMPL_PARENTS):
+        if p!=-1: gltf.nodes[jnodes[p]].children.append(jnodes[i])
+    ibms = np.stack([np.eye(4,dtype=np.float32) for _ in range(len(joints))])
+    for i in range(len(joints)): ibms[i,:3,3] = -joints[i]
+    ibm_acc = _add(ibms.astype(np.float32), FLOAT, MAT4)
+    skin_idx = len(gltf.skins)
+    gltf.skins.append(Skin(name="smpl_skin",skeleton=jnodes[0],
+                            joints=jnodes,inverseBindMatrices=ibm_acc))
+    mesh_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name="body_mesh",mesh=0,skin=skin_idx))
+    root_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name="root",children=[jnodes[0],mesh_node]))
+    gltf.scenes.append(Scene(name="Scene",nodes=[root_node]))
+    gltf.scene = 0
+    bin_data = b"".join(blobs)
+    gltf.buffers.append(Buffer(byteLength=len(bin_data)))
+    gltf.set_binary_blob(bin_data)
+    gltf.save_binary(out_path)
+    print(f"[rig] Rigged GLB → {out_path}  ({os.path.getsize(out_path)//1024} KB)")
+# ══════════════════════════════════════════════════════════════════════════════
+# FBX export via Blender headless
+# ══════════════════════════════════════════════════════════════════════════════
+_BLENDER_SCRIPT = """\
+import bpy, sys
+args = sys.argv[sys.argv.index('--') + 1:]
+glb_in, fbx_out = args[0], args[1]
+bpy.ops.wm.read_factory_settings(use_empty=True)
+bpy.ops.import_scene.gltf(filepath=glb_in)
+bpy.ops.export_scene.fbx(
+    filepath=fbx_out, use_selection=False,
+    add_leaf_bones=False, bake_anim=False,
+    path_mode='COPY', embed_textures=True,
+)
+print('FBX OK:', fbx_out)
+"""
+def export_fbx(rigged_glb: str, out_path: str) -> bool:
+    blender = next((c for c in ["/usr/bin/blender","/usr/local/bin/blender"]
+                    if os.path.exists(c)), None)
+    if blender is None:
+        r = subprocess.run(["which","blender"],capture_output=True,text=True)
+        blender = r.stdout.strip() or None
+    if blender is None:
+        print("[rig] Blender not found — skipping FBX")
+        return False
+    try:
+        with tempfile.NamedTemporaryFile("w",suffix=".py",delete=False) as f:
+            f.write(_BLENDER_SCRIPT); script = f.name
+        r = subprocess.run([blender,"--background","--python",script,
+                             "--",rigged_glb,out_path],
+                            capture_output=True,text=True,timeout=120)
+        ok = os.path.exists(out_path)
+        if not ok: print(f"[rig] Blender stderr:\n{r.stderr[-800:]}")
+        return ok
+    except Exception:
+        print(f"[rig] export_fbx:\n{traceback.format_exc()}")
+        return False
+    finally:
+        try: os.unlink(script)
+        except: pass
+# ══════════════════════════════════════════════════════════════════════════════
+# MDM — Motion Diffusion Model
+# ══════════════════════════════════════════════════════════════════════════════
+MDM_DIR  = "/root/MDM"
+MDM_CKPT = f"{MDM_DIR}/save/humanml_trans_enc_512/model000200000.pt"
+# HumanML3D 22-joint parent array (matches SMPL joints 0-21)
+_MDM_PARENTS = [-1,0,0,0,1,2,3,4,5,6,7,8,9,9,9,12,13,14,16,17,18,19]
+def setup_mdm() -> bool:
+    """Clone MDM repo, install deps, download checkpoint. Idempotent."""
+    if os.path.exists(MDM_CKPT):
+        return True
+    print("[MDM] First-time setup...")
+    if not os.path.exists(MDM_DIR):
+        r = subprocess.run(
+            ["git", "clone", "--depth=1",
+             "https://github.com/GuyTevet/motion-diffusion-model.git", MDM_DIR],
+            capture_output=True, text=True, timeout=120)
+        if r.returncode != 0:
+            print(f"[MDM] git clone failed:\n{r.stderr}")
+            return False
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q",
+        "git+https://github.com/openai/CLIP.git",
+        "einops", "rotary-embedding-torch", "gdown"], check=False, timeout=300)
+    # HumanML3D normalisation stats (small .npy files needed for inference)
+    stats_dir = f"{MDM_DIR}/dataset/HumanML3D"
+    os.makedirs(stats_dir, exist_ok=True)
+    base = "https://github.com/EricGuo5513/HumanML3D/raw/main/HumanML3D"
+    for fn in ["Mean.npy", "Std.npy"]:
+        dest = f"{stats_dir}/{fn}"
+        if not os.path.exists(dest):
+            subprocess.run(["wget", "-q", f"{base}/{fn}", "-O", dest],
+                           check=False, timeout=60)
+    # Checkpoint (~1.3 GB) — try HuggingFace mirror first, then gdown
+    ckpt_dir = os.path.dirname(MDM_CKPT)
+    os.makedirs(ckpt_dir, exist_ok=True)
+    hf = ("https://huggingface.co/Mathux/motion-diffusion-model/resolve/main/"
+          "humanml_trans_enc_512/model000200000.pt")
+    r = subprocess.run(["wget", "-q", "--show-progress", hf, "-O", MDM_CKPT],
+                        capture_output=True, timeout=3600)
+    if r.returncode != 0 or not os.path.exists(MDM_CKPT) or \
+            os.path.getsize(MDM_CKPT) < 10_000_000:
+        print("[MDM] HF download failed — trying gdown (official Google Drive)...")
+        subprocess.run([sys.executable, "-m", "gdown",
+                        "--id", "1PE0PK8e5a5j-7-Xhs5YET5U5pGh0c821",
+                        "-O", MDM_CKPT], check=False, timeout=3600)
+    ok = os.path.exists(MDM_CKPT) and os.path.getsize(MDM_CKPT) > 10_000_000
+    print(f"[MDM] Setup {'OK' if ok else 'FAILED'}")
+    return ok
+def generate_motion_mdm(text_prompt: str, n_frames: int = 120,
+                          fps: int = 20, device: str = "cuda") -> dict | None:
+    """
+    Run MDM text-to-motion. Returns {'positions': (n_frames,22,3), 'fps': fps}
+    or None on failure. First call runs setup_mdm() which may take ~10 min.
+    """
+    if not setup_mdm():
+        return None
+    out_dir = tempfile.mkdtemp(prefix="mdm_")
+    motion_len = round(n_frames / fps, 2)
+    # Minimal inline driver — avoids MDM's argparse setup entirely
+    driver_src = f"""
+import sys, os
+sys.path.insert(0, {repr(MDM_DIR)})
+os.chdir({repr(MDM_DIR)})
+import numpy as np, torch
+from utils.fixseed import fixseed
+from utils.model_util import create_model_and_diffusion
+from utils import dist_util
+from data_loaders.humanml.utils.paramUtil import t2m_kinematic_chain
+from data_loaders.humanml.scripts.motion_process import recover_from_ric
+import clip as clip_lib
+fixseed(42)
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+dist_util.dev = lambda: device
+import argparse
+args = argparse.Namespace(
+    arch='trans_enc', emb_trans_dec=False,
+    layers=8, latent_dim=512, ff_size=1024, num_heads=4,
+    dropout=0.1, activation='gelu', data_rep='rot6d',
+    dataset='humanml', cond_mode='text', cond_mask_prob=0.1,
+    lambda_rcxyz=0, lambda_vel=0, lambda_fc=0,
+    njoints=263, nfeats=1,
+    num_actions=1, translation=True, pose_rep='rot6d',
+    glob=True, glob_rot=True, npose=315,
+    device=0, seed=42, batch_size=1, num_samples=1,
+    num_repetitions=1, motion_length={motion_len!r},
+    input_text='', text_prompt='', action_file='', action_name='',
+    output_dir={repr(out_dir)}, guidance_param=2.5,
+    unconstrained=False,
+    # additional args required by get_model_args / create_gaussian_diffusion
+    text_encoder_type='clip',
+    pos_embed_max_len=5000,
+    mask_frames=False,
+    pred_len=0,
+    context_len=0,
+    diffusion_steps=1000,
+    noise_schedule='cosine',
+    sigma_small=True,
+    lambda_target_loc=0,
+)
+class _MockData:
+    class dataset:
+        pass
+model, diffusion = create_model_and_diffusion(args, _MockData())
+state = torch.load({repr(MDM_CKPT)}, map_location='cpu', weights_only=False)
+missing, unexpected = model.load_state_dict(state, strict=False)
+model.eval().to(device)
+max_frames = int({n_frames})
+shape = (1, model.njoints, model.nfeats, max_frames)
+clip_model, _ = clip_lib.load('ViT-B/32', device=device, jit=False)
+clip_model.eval()
+tokens = clip_lib.tokenize([{repr(text_prompt)}]).to(device)
+with torch.no_grad():
+    text_emb = clip_model.encode_text(tokens).float()
+model_kwargs = {{
+    'y': {{
+        'mask': torch.ones(1, 1, 1, max_frames).to(device),
+        'lengths': torch.tensor([max_frames]).to(device),
+        'text': [{repr(text_prompt)}],
+        'tokens': [''],
+        'scale': torch.ones(1).to(device) * 2.5,
+    }}
+}}
+with torch.no_grad():
+    sample = diffusion.p_sample_loop(
+        model, shape, clip_denoised=False,
+        model_kwargs=model_kwargs, skip_timesteps=0,
+        init_image=None, progress=False, dump_steps=None,
+        noise=None, const_noise=False,
+    )  # (1, 263, 1, n_frames)
+# Convert HumanML3D features → joint XYZ using recover_from_ric (no SMPL needed)
+# sample: (1, 263, 1, n_frames) → (1, n_frames, 263)
+sample_ric = sample[:, :, 0, :].permute(0, 2, 1)
+xyz = recover_from_ric(sample_ric, 22)  # (1, n_frames, 22, 3)
+positions = xyz[0].cpu().numpy()        # (n_frames, 22, 3)
+np.save(os.path.join({repr(out_dir)}, 'positions.npy'), positions)
+print('MDM_DONE')
+"""
+    driver_f = None
+    try:
+        with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
+            f.write(driver_src)
+            driver_f = f.name
+        r = subprocess.run(
+            [sys.executable, driver_f],
+            capture_output=True, text=True, timeout=600,
+            env={**os.environ, "PYTHONPATH": MDM_DIR, "CUDA_VISIBLE_DEVICES": "0"},
+        )
+        print(f"[MDM] stdout: {r.stdout[-400:]}")
+        if r.returncode != 0:
+            print(f"[MDM] FAILED:\n{r.stderr[-600:]}")
+            return None
+        npy = os.path.join(out_dir, "positions.npy")
+        if not os.path.exists(npy):
+            print("[MDM] positions.npy not found")
+            return None
+        arr = np.load(npy)                       # (n_frames, 22, 3)
+        positions = arr                          # already (n_frames, 22, 3)
+        print(f"[MDM] Motion: {positions.shape}, fps={fps}")
+        return {"positions": positions, "fps": fps, "n_frames": positions.shape[0]}
+    except Exception:
+        print(f"[MDM] Exception:\n{traceback.format_exc()}")
+        return None
+    finally:
+        if driver_f:
+            try: os.unlink(driver_f)
+            except: pass
+# ══════════════════════════════════════════════════════════════════════════════
+# FK Inversion — joint world-positions → local quaternions per frame
+# ══════════════════════════════════════════════════════════════════════════════
+def _quat_between(v0: np.ndarray, v1: np.ndarray) -> np.ndarray:
+    """Shortest-arc quaternion [x,y,z,w] that rotates unit vector v0 → v1."""
+    cross = np.cross(v0, v1)
+    dot   = float(np.clip(np.dot(v0, v1), -1.0, 1.0))
+    cn    = np.linalg.norm(cross)
+    if cn < 1e-8:
+        return np.array([0., 0., 0., 1.], np.float32) if dot > 0 \
+               else np.array([1., 0., 0., 0.], np.float32)
+    axis  = cross / cn
+    angle = np.arctan2(cn, dot)
+    s     = np.sin(angle * 0.5)
+    return np.array([axis[0]*s, axis[1]*s, axis[2]*s, np.cos(angle*0.5)], np.float32)
+def _quat_mul(q1: np.ndarray, q2: np.ndarray) -> np.ndarray:
+    """Hamilton product of two [x,y,z,w] quaternions."""
+    x1,y1,z1,w1 = q1; x2,y2,z2,w2 = q2
+    return np.array([
+        w1*x2 + x1*w2 + y1*z2 - z1*y2,
+        w1*y2 - x1*z2 + y1*w2 + z1*x2,
+        w1*z2 + x1*y2 - y1*x2 + z1*w2,
+        w1*w2 - x1*x2 - y1*y2 - z1*z2,
+    ], np.float32)
+def _quat_inv(q: np.ndarray) -> np.ndarray:
+    return np.array([-q[0], -q[1], -q[2], q[3]], np.float32)
+def _quat_rotate(q: np.ndarray, v: np.ndarray) -> np.ndarray:
+    """Rotate vector v by quaternion q."""
+    qv = np.array([v[0], v[1], v[2], 0.], np.float32)
+    return _quat_mul(_quat_mul(q, qv), _quat_inv(q))[:3]
+def positions_to_local_quats(positions: np.ndarray,
+                               t_pose_joints: np.ndarray,
+                               parents: list) -> np.ndarray:
+    """
+    Derive per-joint local quaternions from world-space joint positions.
+    positions   : (n_frames, n_joints, 3)
+    t_pose_joints : (n_joints, 3)  — SMPL T-pose joints in same scale/space
+    parents     : list of length n_joints, parent index (-1 for root)
+    Returns     : (n_frames, n_joints, 4) XYZW local quaternions
+    """
+    n_frames, n_joints, _ = positions.shape
+    quats = np.zeros((n_frames, n_joints, 4), np.float32)
+    quats[:, :, 3] = 1.0  # default identity
+    # Compute global quats first, then convert to local
+    global_quats = np.zeros_like(quats)
+    global_quats[:, :, 3] = 1.0
+    for j in range(n_joints):
+        p = parents[j]
+        if p < 0:
+            # Root: no rotation relative to world (translation handles it)
+            global_quats[:, j] = [0, 0, 0, 1]
+            continue
+        # T-pose parent→child bone direction
+        tp_dir = t_pose_joints[j] - t_pose_joints[p]
+        tp_len = np.linalg.norm(tp_dir)
+        if tp_len < 1e-6:
+            continue
+        tp_dir /= tp_len
+        for f in range(n_frames):
+            an_dir = positions[f, j] - positions[f, p]
+            an_len = np.linalg.norm(an_dir)
+            if an_len < 1e-6:
+                global_quats[f, j] = global_quats[f, p]
+                continue
+            an_dir /= an_len
+            # Global rotation = parent_global ∘ local
+            # We want global bone direction to match an_dir
+            # global_bone_tpose = rotate(global_parent, tp_dir_in_parent_space)
+            # For SMPL T-pose, bone dirs are in world space already
+            gq = _quat_between(tp_dir, an_dir)
+            global_quats[f, j] = gq
+    # Convert global → local (local = inv_parent_global ∘ global)
+    for j in range(n_joints):
+        p = parents[j]
+        if p < 0:
+            quats[:, j] = global_quats[:, j]
+        else:
+            for f in range(n_frames):
+                quats[f, j] = _quat_mul(_quat_inv(global_quats[f, p]),
+                                         global_quats[f, j])
+    return quats
+# ══════════════════════════════════════════════════════════════════════════════
+# Animated GLB export
+# ══════════════════════════════════════════════════════════════════════════════
+def export_animated_glb(verts, faces, uv, texture_img,
+                          joints,         # (24, 3) T-pose joint world positions
+                          skin_weights,   # (N_verts, 24)
+                          joint_quats,    # (n_frames, 24, 4) XYZW local quaternions
+                          root_trans,     # (n_frames, 3) world translation of root
+                          fps: int,
+                          out_path: str):
+    """
+    Export fully animated rigged GLB.
+    Skeleton + skin weights identical to export_rigged_glb;
+    adds a GLTF animation with per-joint rotation channels + root translation.
+    """
+    import pygltflib
+    from pygltflib import (GLTF2, Scene, Node, Mesh, Primitive, Accessor,
+                            BufferView, Buffer, Material, Texture,
+                            Image as GImage, Sampler, Skin, Asset,
+                            Animation, AnimationChannel, AnimationChannelTarget,
+                            AnimationSampler)
+    from pygltflib import (ARRAY_BUFFER, ELEMENT_ARRAY_BUFFER, FLOAT,
+                            UNSIGNED_INT, UNSIGNED_SHORT, LINEAR,
+                            LINEAR_MIPMAP_LINEAR, REPEAT, SCALAR, VEC2,
+                            VEC3, VEC4, MAT4)
+    n_frames, n_joints_anim, _ = joint_quats.shape
+    n_joints = len(joints)
+    gltf  = GLTF2()
+    gltf.asset = Asset(version="2.0", generator="rig_stage.py/animated")
+    blobs = []
+    def _add(data: np.ndarray, comp, acc_type, target=None,
+             set_min_max=False):
+        b   = data.tobytes()
+        pad = (4 - len(b) % 4) % 4
+        off = sum(len(x) for x in blobs)
+        blobs.append(b + b"\x00" * pad)
+        bv  = len(gltf.bufferViews)
+        gltf.bufferViews.append(BufferView(buffer=0, byteOffset=off,
+                                            byteLength=len(b), target=target))
+        ac  = len(gltf.accessors)
+        flat = data.flatten().astype(np.float32)
+        kw = {}
+        if set_min_max:
+            kw = {"min": [float(flat.min())], "max": [float(flat.max())]}
+        gltf.accessors.append(Accessor(
+            bufferView=bv, byteOffset=0, componentType=comp,
+            type=acc_type, count=len(data), **kw))
+        return ac
+    # ── Mesh geometry ──────────────────────────────────────────────────────
+    pos_acc = _add(verts.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)
+    v0,v1,v2 = verts[faces[:,0]], verts[faces[:,1]], verts[faces[:,2]]
+    fn = np.cross(v1-v0, v2-v0)
+    fn /= (np.linalg.norm(fn, axis=1, keepdims=True) + 1e-8)
+    vn = np.zeros_like(verts)
+    for i in range(3): np.add.at(vn, faces[:,i], fn)
+    vn /= (np.linalg.norm(vn, axis=1, keepdims=True) + 1e-8)
+    nor_acc = _add(vn.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)
+    if uv is None: uv = np.zeros((len(verts), 2), np.float32)
+    uv_acc  = _add(uv.astype(np.float32), FLOAT, VEC2, ARRAY_BUFFER)
+    idx_acc = _add(faces.astype(np.uint32).flatten(), UNSIGNED_INT,
+                   SCALAR, ELEMENT_ARRAY_BUFFER)
+    top4_idx = np.argsort(-skin_weights, axis=1)[:, :4].astype(np.uint16)
+    top4_w   = np.take_along_axis(skin_weights, top4_idx.astype(np.int64), axis=1).astype(np.float32)
+    top4_w  /= top4_w.sum(axis=1, keepdims=True).clip(1e-8, None)
+    j_acc = _add(top4_idx, UNSIGNED_SHORT, "VEC4", ARRAY_BUFFER)
+    w_acc = _add(top4_w,   FLOAT,          "VEC4", ARRAY_BUFFER)
+    # ── Texture ────────────────────────────────────────────────────────────
+    if texture_img is not None:
+        import io
+        buf = io.BytesIO(); texture_img.save(buf, format="PNG"); ib = buf.getvalue()
+        off = sum(len(x) for x in blobs); pad2 = (4 - len(ib) % 4) % 4
+        blobs.append(ib + b"\x00" * pad2)
+        gltf.bufferViews.append(BufferView(buffer=0, byteOffset=off, byteLength=len(ib)))
+        gltf.images.append(GImage(mimeType="image/png", bufferView=len(gltf.bufferViews)-1))
+        gltf.samplers.append(Sampler(magFilter=LINEAR, minFilter=LINEAR_MIPMAP_LINEAR,
+                                      wrapS=REPEAT, wrapT=REPEAT))
+        gltf.textures.append(Texture(sampler=0, source=0))
+        gltf.materials.append(Material(name="body",
+            pbrMetallicRoughness={"baseColorTexture": {"index": 0},
+                                   "metallicFactor": 0.0, "roughnessFactor": 0.8},
+            doubleSided=True))
+    else:
+        gltf.materials.append(Material(name="body", doubleSided=True))
+    prim = Primitive(
+        attributes={"POSITION": pos_acc, "NORMAL": nor_acc,
+                    "TEXCOORD_0": uv_acc, "JOINTS_0": j_acc, "WEIGHTS_0": w_acc},
+        indices=idx_acc, material=0)
+    gltf.meshes.append(Mesh(name="body", primitives=[prim]))
+    # ── Skeleton nodes ─────────────────────────────────────────────────────
+    jnodes = []
+    for i, (name, parent) in enumerate(zip(SMPL_JOINT_NAMES, SMPL_PARENTS)):
+        t = joints[i].tolist() if parent == -1 else (joints[i] - joints[parent]).tolist()
+        n = Node(name=name, translation=t, children=[])
+        jnodes.append(len(gltf.nodes)); gltf.nodes.append(n)
+    for i, p in enumerate(SMPL_PARENTS):
+        if p != -1: gltf.nodes[jnodes[p]].children.append(jnodes[i])
+    ibms = np.stack([np.eye(4, dtype=np.float32) for _ in range(n_joints)])
+    for i in range(n_joints): ibms[i, :3, 3] = -joints[i]
+    ibm_acc  = _add(ibms.astype(np.float32), FLOAT, MAT4)
+    skin_idx = len(gltf.skins)
+    gltf.skins.append(Skin(name="smpl_skin", skeleton=jnodes[0],
+                            joints=jnodes, inverseBindMatrices=ibm_acc))
+    mesh_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name="body_mesh", mesh=0, skin=skin_idx))
+    root_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name="root", children=[jnodes[0], mesh_node]))
+    gltf.scenes.append(Scene(name="Scene", nodes=[root_node]))
+    gltf.scene = 0
+    # ── Animation ──────────────────────────────────────────────────────────
+    dt   = 1.0 / fps
+    times = np.arange(n_frames, dtype=np.float32) * dt   # (n_frames,)
+    time_acc = _add(times, FLOAT, SCALAR, set_min_max=True)
+    channels, samplers = [], []
+    # Per-joint rotation tracks
+    for j in range(min(n_joints_anim, n_joints)):
+        q = joint_quats[:, j, :].astype(np.float32)   # (n_frames, 4) XYZW
+        q_acc = _add(q, FLOAT, VEC4)
+        si = len(samplers)
+        samplers.append(AnimationSampler(input=time_acc, output=q_acc,
+                                          interpolation="LINEAR"))
+        channels.append(AnimationChannel(
+            sampler=si,
+            target=AnimationChannelTarget(node=jnodes[j], path="rotation")))
+    # Root translation track
+    if root_trans is not None:
+        tr = root_trans.astype(np.float32)   # (n_frames, 3)
+        tr_acc = _add(tr, FLOAT, VEC3)
+        si = len(samplers)
+        samplers.append(AnimationSampler(input=time_acc, output=tr_acc,
+                                          interpolation="LINEAR"))
+        channels.append(AnimationChannel(
+            sampler=si,
+            target=AnimationChannelTarget(node=jnodes[0], path="translation")))
+    gltf.animations.append(Animation(name="mdm_motion",
+                                      channels=channels, samplers=samplers))
+    # ── Finalise ───────────────────────────────────────────────────────────
+    bin_data = b"".join(blobs)
+    gltf.buffers.append(Buffer(byteLength=len(bin_data)))
+    gltf.set_binary_blob(bin_data)
+    gltf.save_binary(out_path)
+    dur = times[-1] if len(times) else 0
+    print(f"[rig] Animated GLB → {out_path}  "
+          f"({os.path.getsize(out_path)//1024} KB, {n_frames} frames @ {fps}fps = {dur:.1f}s)")
+# ══════════════════════════════════════════════════════════════════════════════
+# Main pipeline
+# ══════════════════════════════════════════════════════════════════════════════
+def run_rig_pipeline(glb_path: str, reference_image_path: str,
+                      out_dir: str, device: str = "cuda",
+                      export_fbx_flag: bool = True,
+                      mdm_prompt: str = "",
+                      mdm_n_frames: int = 120,
+                      mdm_fps: int = 20) -> dict:
+    import trimesh
+    os.makedirs(out_dir, exist_ok=True)
+    result = {"rigged_glb": None, "animated_glb": None, "fbx": None,
+              "smpl_params": None, "status": "", "phases": {}}
+    try:
+        # ── load TripoSG mesh ─────────────────────────────────────────────
+        print("[rig] Loading TripoSG mesh...")
+        scene = trimesh.load(glb_path, force="scene")
+        if isinstance(scene, trimesh.Scene):
+            geom = list(scene.geometry.values())
+            mesh = trimesh.util.concatenate(geom) if len(geom)>1 else geom[0]
+        else:
+            mesh = scene
+        verts = np.array(mesh.vertices, dtype=np.float32)
+        faces = np.array(mesh.faces,    dtype=np.int32)
+        # UV + texture: try source geoms before concatenation (more reliable)
+        uv, tex = None, None
+        src_geoms = list(scene.geometry.values()) if isinstance(scene, trimesh.Scene) else [scene]
+        for g in src_geoms:
+            if not hasattr(g.visual, "uv") or g.visual.uv is None:
+                continue
+            try:
+                candidate_uv = np.array(g.visual.uv, dtype=np.float32)
+                if len(candidate_uv) == len(verts):
+                    uv = candidate_uv
+                    mat = getattr(g.visual, "material", None)
+                    if mat is not None:
+                        for attr in ("image", "baseColorTexture", "diffuse"):
+                            img = getattr(mat, attr, None)
+                            if img is not None:
+                                from PIL import Image as _PILImage
+                                tex = img if isinstance(img, _PILImage.Image) else None
+                                break
+                    break
+            except Exception:
+                pass
+        if uv is None:
+            print("[rig] WARNING: UV not found or vertex count mismatch — mesh will be untextured")
+        print(f"[rig] Mesh: {len(verts)} verts, {len(faces)} faces, "
+              f"UV={'yes' if uv is not None else 'no'}, "
+              f"texture={'yes' if tex is not None else 'no'}")
+        # ── Phase 1: multi-view beta averaging ───────────────────────────
+        print("\n[rig] ── Phase 1: multi-view beta averaging ──")
+        betas, hmr2_results = estimate_betas_multiview(VIEW_PATHS, reference_image_path, device)
+        result["phases"]["p1_betas"] = betas.tolist()
+        # ── Phase 2: silhouette fitting ───────────────────────────────────
+        print("\n[rig] ── Phase 2: silhouette fitting ──")
+        betas = fit_betas_silhouette(betas, VIEW_PATHS)
+        result["phases"]["p2_betas"] = betas.tolist()
+        # ── Phase 3: multi-view joint triangulation ───────────────────────
+        print("\n[rig] ── Phase 3: multi-view joint triangulation ──")
+        tri_joints = triangulate_joints_multiview(hmr2_results)
+        result["phases"]["p3_triangulated"] = tri_joints is not None
+        # ── build SMPL T-pose with refined betas ──────────────────────────
+        print("\n[rig] Building SMPL T-pose...")
+        smpl_v, smpl_f, smpl_j, smpl_w = get_smpl_tpose(betas)
+        # Override with triangulated joints if available
+        if tri_joints is not None:
+            # Triangulated joints are in render-normalised space; convert to SMPL scale
+            _, _, scale, _ = _smpl_to_render_space(smpl_v.copy(), smpl_j.copy())
+            smpl_j = tri_joints / scale          # back to SMPL metric space
+            print("[rig] Using triangulated skeleton joints.")
+        # ── align TripoSG mesh to SMPL ────────────────────────────────────
+        verts_aligned = align_mesh_to_smpl(verts, smpl_v, smpl_j)
+        # ── skinning weight transfer ──────────────────────────────────────
+        print("[rig] Transferring skinning weights...")
+        skin_w = transfer_skinning(smpl_v, smpl_w, verts_aligned)
+        # ── export rigged GLB ─────────────────────────────────────────────
+        rigged_glb = os.path.join(out_dir, "rigged.glb")
+        export_rigged_glb(verts_aligned, faces, uv, tex, smpl_j, skin_w, rigged_glb)
+        result["rigged_glb"] = rigged_glb
+        # ── export FBX ────────────────────────────────────────────────────
+        if export_fbx_flag:
+            fbx = os.path.join(out_dir, "rigged.fbx")
+            result["fbx"] = fbx if export_fbx(rigged_glb, fbx) else None
+        # ── MDM animation ─────────────────────────────────────────────────
+        if mdm_prompt.strip():
+            print(f"\n[rig] ── MDM animation: {mdm_prompt!r} ({mdm_n_frames} frames) ──")
+            mdm_out = generate_motion_mdm(mdm_prompt, n_frames=mdm_n_frames,
+                                           fps=mdm_fps, device=device)
+            if mdm_out is not None:
+                pos = mdm_out["positions"]   # (n_frames, 22, 3)
+                actual_frames = pos.shape[0]
+                # Align MDM joint positions to SMPL scale/space
+                # MDM outputs in metres roughly matching SMPL metric
+                # Scale so pelvis height matches our SMPL pelvis
+                mdm_pelvis_h = float(np.median(pos[:, 0, 1]))
+                smpl_pelvis_h = float(smpl_j[0, 1])
+                if abs(mdm_pelvis_h) > 1e-4:
+                    pos = pos * (smpl_pelvis_h / mdm_pelvis_h)
+                # FK inversion: positions → local quaternions for joints 0-21
+                t_pose_22 = smpl_j[:22]
+                quats_22  = positions_to_local_quats(pos, t_pose_22, _MDM_PARENTS)
+                # Pad to 24 joints (SMPL hands = identity)
+                quats_24  = np.zeros((actual_frames, 24, 4), np.float32)
+                quats_24[:, :, 3] = 1.0
+                quats_24[:, :22, :] = quats_22
+                # Root translation: MDM root XZ + SMPL Y offset
+                root_trans = pos[:, 0, :].copy()   # (n_frames, 3)
+                anim_glb = os.path.join(out_dir, "animated.glb")
+                export_animated_glb(
+                    verts_aligned, faces, uv, tex,
+                    smpl_j, skin_w,
+                    quats_24, root_trans, mdm_fps, anim_glb
+                )
+                result["animated_glb"] = anim_glb
+                print(f"[rig] MDM animation complete → {anim_glb}")
+            else:
+                print("[rig] MDM generation failed — static GLB only")
+        result["smpl_params"] = {
+            "betas": betas.tolist(),
+            "p1_sources": len(hmr2_results),
+            "p3_triangulated": tri_joints is not None,
+        }
+        p3_note  = " + triangulated skeleton" if tri_joints is not None else ""
+        fbx_note = " + FBX" if result["fbx"] else ""
+        anim_note = f" + MDM({mdm_n_frames}f)" if result.get("animated_glb") else ""
+        result["status"] = (
+            f"Rigged ({len(hmr2_results)} views used{p3_note}{fbx_note}{anim_note}). "
+            f"{len(verts)} verts, 24 joints."
+        )
+    except Exception:
+        err = traceback.format_exc()
+        print(f"[rig] FAILED:\n{err}")
+        result["status"] = f"Rigging failed:\n{err[-600:]}"
+    return result

pipeline/rig_yolo.py ADDED Viewed

	@@ -0,0 +1,679 @@

+"""
+rig_yolo.py — Rig a humanoid mesh using YOLO-pose joint detection.
+Instead of estimating T-pose rotations (which failed), detect where joints
+actually ARE in the mesh's current pose and use those positions as the bind pose.
+Pipeline:
+  1. Render front view (azimuth=-90, same camera as triposg_app.py views)
+  2. YOLOv8x-pose → COCO-17 2D keypoints
+  3. Unproject to 3D in original mesh coordinate space
+  4. Map COCO-17 → SMPL-24 (interpolate spine, collar, hand, foot joints)
+  5. LBS weights: proximity-based (k=4 nearest joints per vertex)
+  6. Export rigged GLB — bind pose = current pose
+Usage:
+    python rig_yolo.py --body /tmp/triposg_textured.glb \
+                       --out  /tmp/rig_out/rigged.glb \
+                       [--debug_dir /tmp/rig_debug]
+"""
+import os, sys, argparse, warnings
+warnings.filterwarnings('ignore')
+import numpy as np
+import cv2
+import trimesh
+from scipy.spatial import cKDTree
+sys.path.insert(0, '/root/MV-Adapter')
+# ── Camera constants — MUST match triposg_app.py ──────────────────────────────
+ORTHO_LEFT, ORTHO_RIGHT = -0.55, 0.55
+ORTHO_BOT,  ORTHO_TOP   = -0.55, 0.55
+RENDER_W, RENDER_H      = 768, 1024
+FRONT_AZ                = -90       # azimuth that gives front view
+# Orthographic proj scale: 2/(right-left) = 1.818...
+PROJ_SCALE = 2.0 / (ORTHO_RIGHT - ORTHO_LEFT)
+SMPL_PARENTS = [-1,0,0,0,1,2,3,4,5,6,7,8,9,9,9,
+                12,13,14,16,17,18,19,20,21]
+SMPL_JOINT_NAMES = [
+    'pelvis','left_hip','right_hip','spine1',
+    'left_knee','right_knee','spine2',
+    'left_ankle','right_ankle','spine3',
+    'left_foot','right_foot','neck',
+    'left_collar','right_collar','head',
+    'left_shoulder','right_shoulder',
+    'left_elbow','right_elbow',
+    'left_wrist','right_wrist',
+    'left_hand','right_hand',
+]
+# COCO-17 order
+COCO_NAMES = ['nose','L_eye','R_eye','L_ear','R_ear',
+              'L_shoulder','R_shoulder','L_elbow','R_elbow','L_wrist','R_wrist',
+              'L_hip','R_hip','L_knee','R_knee','L_ankle','R_ankle']
+# ── Step 0: Load mesh directly from GLB (correct UV channel) ─────────────────
+def load_mesh_from_gltf(body_glb):
+    """
+    Load mesh from GLB using pygltflib, reading the UV channel the material
+    actually references (TEXCOORD_0 or TEXCOORD_1).
+    Returns: verts (N,3) float64, faces (F,3) int32,
+             uv (N,2) float32 or None, texture_pil PIL.Image or None
+    """
+    import pygltflib
+    from PIL import Image as PILImage
+    import io
+    gltf = pygltflib.GLTF2().load(body_glb)
+    blob = gltf.binary_blob()
+    # componentType → (numpy dtype, bytes per element)
+    _DTYPE = {5120: np.int8, 5121: np.uint8, 5122: np.int16,
+              5123: np.uint16, 5125: np.uint32, 5126: np.float32}
+    _NCOMP = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT4': 16}
+    def read_accessor(idx):
+        if idx is None:
+            return None
+        acc = gltf.accessors[idx]
+        bv  = gltf.bufferViews[acc.bufferView]
+        dtype  = _DTYPE[acc.componentType]
+        n_comp = _NCOMP[acc.type]
+        bv_off  = bv.byteOffset  or 0
+        acc_off = acc.byteOffset or 0
+        elem_bytes = np.dtype(dtype).itemsize * n_comp
+        stride = bv.byteStride if (bv.byteStride and bv.byteStride != elem_bytes) else elem_bytes
+        if stride == elem_bytes:
+            start = bv_off + acc_off
+            size  = acc.count * elem_bytes
+            arr   = np.frombuffer(blob[start:start + size], dtype=dtype)
+        else:
+            # interleaved buffer
+            rows = []
+            for i in range(acc.count):
+                start = bv_off + acc_off + i * stride
+                rows.append(np.frombuffer(blob[start:start + elem_bytes], dtype=dtype))
+            arr = np.concatenate(rows)
+        return arr.reshape(acc.count, n_comp) if n_comp > 1 else arr
+    # ── Find which texCoord index the material references ──────────────────────
+    texcoord_idx = 0
+    if gltf.materials:
+        pbr = gltf.materials[0].pbrMetallicRoughness
+        if pbr and pbr.baseColorTexture:
+            texcoord_idx = getattr(pbr.baseColorTexture, 'texCoord', 0) or 0
+    print(f'  material uses TEXCOORD_{texcoord_idx}')
+    # ── Read primitive ─────────────────────────────────────────────────────────
+    prim  = gltf.meshes[0].primitives[0]
+    attrs = prim.attributes
+    verts = read_accessor(attrs.POSITION).astype(np.float64)
+    idx_data = read_accessor(prim.indices).flatten()
+    faces = idx_data.reshape(-1, 3).astype(np.int32)
+    # Read the correct UV channel; fall back to TEXCOORD_0
+    uv_acc_idx = getattr(attrs, f'TEXCOORD_{texcoord_idx}', None)
+    if uv_acc_idx is None and texcoord_idx != 0:
+        uv_acc_idx = getattr(attrs, 'TEXCOORD_0', None)
+    uv_raw = read_accessor(uv_acc_idx)
+    uv = uv_raw.astype(np.float32) if uv_raw is not None else None
+    print(f'  verts={len(verts)}  faces={len(faces)}  uv={len(uv) if uv is not None else None}')
+    # ── Extract embedded texture ───────────────────────────────────────────────
+    texture_pil = None
+    try:
+        pbr = gltf.materials[0].pbrMetallicRoughness
+        if pbr and pbr.baseColorTexture is not None:
+            tex_idx = pbr.baseColorTexture.index
+            if tex_idx is not None and tex_idx < len(gltf.textures):
+                src_idx = gltf.textures[tex_idx].source
+                if src_idx is not None and src_idx < len(gltf.images):
+                    img_obj = gltf.images[src_idx]
+                    if img_obj.bufferView is not None:
+                        bv = gltf.bufferViews[img_obj.bufferView]
+                        bv_off = bv.byteOffset or 0
+                        img_bytes = blob[bv_off:bv_off + bv.byteLength]
+                        texture_pil = PILImage.open(io.BytesIO(img_bytes)).convert('RGBA')
+                        print(f'  texture: {texture_pil.size}')
+    except Exception as e:
+        print(f'  texture extraction failed: {e}')
+    return verts, faces, uv, texture_pil
+# ── Step 1: Render front view ─────────────────────────────────────────────────
+def render_front(body_glb, debug_dir=None):
+    """
+    Render front view using MV-Adapter.
+    Returns (img_bgr, scale_factor) where scale_factor = max_abs / 0.5
+    (used to convert std-space back to original mesh space).
+    """
+    from mvadapter.utils.mesh_utils import (
+        NVDiffRastContextWrapper, load_mesh, get_orthogonal_camera, render,
+    )
+    ctx = NVDiffRastContextWrapper(device='cuda', context_type='cuda')
+    mesh_mv, _offset, scale_factor = load_mesh(
+        body_glb, rescale=True, return_transform=True, device='cuda')
+    camera = get_orthogonal_camera(
+        elevation_deg=[0], distance=[1.8],
+        left=ORTHO_LEFT, right=ORTHO_RIGHT,
+        bottom=ORTHO_BOT, top=ORTHO_TOP,
+        azimuth_deg=[FRONT_AZ], device='cuda')
+    out = render(ctx, mesh_mv, camera,
+                 height=RENDER_H, width=RENDER_W,
+                 render_attr=True, render_depth=False, render_normal=False,
+                 attr_background=0.5)
+    img_np  = (out.attr[0].cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+    img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+    if debug_dir:
+        cv2.imwrite(os.path.join(debug_dir, 'front_render.png'), img_bgr)
+    print(f'  render: {RENDER_W}x{RENDER_H}, scale_factor={scale_factor:.4f}')
+    return img_bgr, scale_factor
+# ── Step 2: YOLO-pose keypoints ───────────────────────────────────────────────
+def detect_keypoints(img_bgr, debug_dir=None):
+    """
+    Run YOLOv8x-pose on the rendered image.
+    Returns (17, 3) array: [pixel_x, pixel_y, confidence] for COCO-17 joints.
+    Picks the largest detected bounding box (the character body).
+    """
+    from ultralytics import YOLO
+    model = YOLO('yolov8x-pose.pt')
+    results = model(img_bgr, verbose=False)
+    if not results or results[0].keypoints is None or len(results[0].boxes) == 0:
+        raise RuntimeError('YOLO: no person detected in front render')
+    r = results[0]
+    boxes = r.boxes.xyxy.cpu().numpy()
+    areas = (boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1])
+    idx   = int(areas.argmax())
+    kp_xy   = r.keypoints[idx].xy[0].cpu().numpy()    # (17, 2) pixel
+    kp_conf = r.keypoints[idx].conf[0].cpu().numpy()  # (17,) confidence
+    kp      = np.concatenate([kp_xy, kp_conf[:,None]], axis=1)  # (17, 3)
+    print('  YOLO detections: %d boxes, using largest' % len(boxes))
+    for i, name in enumerate(COCO_NAMES):
+        if kp_conf[i] > 0.3:
+            print('    [%d] %-14s  px=(%.0f, %.0f)  conf=%.2f' % (
+                i, name, kp_xy[i,0], kp_xy[i,1], kp_conf[i]))
+    if debug_dir:
+        vis = img_bgr.copy()
+        for i in range(17):
+            if kp_conf[i] > 0.3:
+                x, y = int(kp_xy[i,0]), int(kp_xy[i,1])
+                cv2.circle(vis, (x, y), 6, (0, 255, 0), -1)
+                cv2.putText(vis, COCO_NAMES[i][:4], (x+4, y-4),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,255,0), 1)
+        cv2.imwrite(os.path.join(debug_dir, 'yolo_keypoints.png'), vis)
+    return kp
+# ── Step 3: Unproject 2D → 3D ────────────────────────────────────────────────
+def unproject_to_3d(kp_2d_conf, scale_factor, mesh_verts_orig):
+    """
+    Convert COCO-17 pixel positions to 3D positions in original mesh space.
+    MV-Adapter orthographic camera at azimuth=-90 maps:
+      pixel_x  →  orig_x   (character lateral axis)
+      pixel_y  →  orig_y   (character height axis, flipped from pixel)
+      orig_z   estimated from k-nearest mesh vertices in image space
+    Forward projection (for reference):
+      std_x = orig_x / scale_factor
+      NDC_x = PROJ_SCALE * std_x
+      pixel_x = (NDC_x + 1) / 2 * W
+      std_z = orig_y / scale_factor        (mesh Y ↔ std Z ↔ image vertical)
+      NDC_y = -PROJ_SCALE * std_z          (Y-flipped by proj matrix)
+      pixel_y = (NDC_y + 1) / 2 * H
+    Inverse:
+      orig_x = (2*px/W - 1) / PROJ_SCALE * scale_factor
+      orig_y = -(2*py/H - 1) / PROJ_SCALE * scale_factor
+    """
+    W, H = RENDER_W, RENDER_H
+    # Project all mesh vertices to image space (for Z lookup)
+    verts_px_x = ((mesh_verts_orig[:,0] / scale_factor * PROJ_SCALE) + 1.0) / 2.0 * W
+    verts_px_y = ((-mesh_verts_orig[:,1] / scale_factor * PROJ_SCALE) + 1.0) / 2.0 * H
+    joints_3d = np.full((17, 3), np.nan)
+    for i in range(17):
+        px, py, conf = kp_2d_conf[i]
+        if conf < 0.15 or px < 1 or py < 1:
+            continue
+        orig_x = (2.0*px/W - 1.0) / PROJ_SCALE * scale_factor
+        orig_y = -(2.0*py/H - 1.0) / PROJ_SCALE * scale_factor
+        # Z: median of k-nearest mesh vertices in image space
+        dist_2d = np.hypot(verts_px_x - px, verts_px_y - py)
+        k = 30
+        near_idx = np.argpartition(dist_2d, k-1)[:k]
+        orig_z   = float(np.median(mesh_verts_orig[near_idx, 2]))
+        joints_3d[i] = [orig_x, orig_y, orig_z]
+    return joints_3d
+# ── Step 4: COCO-17 → SMPL-24 ────────────────────────────────────────────────
+def coco17_to_smpl24(coco_3d, mesh_verts):
+    """
+    Build 24 SMPL joint positions from COCO-17 detections.
+    Spine / collar / hand / foot joints are interpolated.
+    Low-confidence (NaN) COCO joints fall back to mesh geometry.
+    """
+    def lerp(a, b, t):
+        return a + t * (b - a)
+    def valid(i):
+        return not np.any(np.isnan(coco_3d[i]))
+    # Fill NaN joints from mesh geometry (centroid fallback)
+    c = coco_3d.copy()
+    centroid = mesh_verts.mean(axis=0)
+    for i in range(17):
+        if not valid(i):
+            c[i] = centroid
+    # Key anchor points
+    L_shoulder = c[5]
+    R_shoulder = c[6]
+    L_hip      = c[11]
+    R_hip      = c[12]
+    pelvis = lerp(L_hip, R_hip, 0.5)
+    mid_shoulder = lerp(L_shoulder, R_shoulder, 0.5)
+    # Neck: midpoint of shoulders, raised slightly (~ collar bone level)
+    neck   = mid_shoulder + np.array([0.0, 0.04 * (mid_shoulder[1] - pelvis[1]), 0.0])
+    J = np.zeros((24, 3), dtype=np.float64)
+    J[0]  = pelvis                         # pelvis
+    J[1]  = L_hip                          # left_hip
+    J[2]  = R_hip                          # right_hip
+    J[3]  = lerp(pelvis, neck, 0.25)       # spine1
+    J[4]  = c[13]                          # left_knee
+    J[5]  = c[14]                          # right_knee
+    J[6]  = lerp(pelvis, neck, 0.5)        # spine2
+    J[7]  = c[15]                          # left_ankle
+    J[8]  = c[16]                          # right_ankle
+    J[9]  = lerp(pelvis, neck, 0.75)       # spine3
+    J[12] = neck                           # neck
+    # Feet: project ankle downward toward mesh floor
+    mesh_floor_y = mesh_verts[:,1].min()
+    foot_y = mesh_floor_y + 0.02 * (c[15][1] - mesh_floor_y)  # 2% above floor
+    J[10] = np.array([c[15][0], foot_y, c[15][2]])  # left_foot
+    J[11] = np.array([c[16][0], foot_y, c[16][2]])  # right_foot
+    J[13] = lerp(neck, L_shoulder, 0.5)   # left_collar
+    J[14] = lerp(neck, R_shoulder, 0.5)   # right_collar
+    J[15] = c[0]                           # head (nose as proxy)
+    J[16] = L_shoulder                    # left_shoulder
+    J[17] = R_shoulder                    # right_shoulder
+    J[18] = c[7]                           # left_elbow
+    J[19] = c[8]                           # right_elbow
+    J[20] = c[9]                           # left_wrist
+    J[21] = c[10]                          # right_wrist
+    # Hands: extrapolate one step beyond wrist in elbow→wrist direction
+    for side, (elbow_i, wrist_i, hand_i) in enumerate([(7,9,22), (8,10,23)]):
+        elbow = c[elbow_i]; wrist = c[wrist_i]
+        bone  = wrist - elbow
+        blen  = np.linalg.norm(bone)
+        if blen > 1e-3:
+            J[hand_i] = wrist + bone / blen * 0.05
+        else:
+            J[hand_i] = wrist
+    print('  SMPL-24 joints:')
+    print('    pelvis   : (%.3f, %.3f, %.3f)' % tuple(J[0]))
+    print('    L_hip    : (%.3f, %.3f, %.3f)' % tuple(J[1]))
+    print('    R_hip    : (%.3f, %.3f, %.3f)' % tuple(J[2]))
+    print('    neck     : (%.3f, %.3f, %.3f)' % tuple(J[12]))
+    print('    L_shoulder: (%.3f, %.3f, %.3f)' % tuple(J[16]))
+    print('    R_shoulder: (%.3f, %.3f, %.3f)' % tuple(J[17]))
+    print('    head     : (%.3f, %.3f, %.3f)' % tuple(J[15]))
+    return J.astype(np.float32)
+# ── Step 5: LBS skinning weights ─────────────────────────────────────────────
+def compute_skinning_weights(mesh_verts, joints, k=4):
+    """
+    Proximity-based LBS weights: each vertex gets k-nearest joint weights
+    via inverse-distance weighting.
+    Returns (N, 24) float32 full weight matrix.
+    """
+    N = len(mesh_verts)
+    tree = cKDTree(joints)
+    dists, idxs = tree.query(mesh_verts, k=k, workers=-1)
+    # Clamp minimum distance to avoid division by zero
+    inv_d = 1.0 / np.maximum(dists, 1e-6)
+    inv_d /= inv_d.sum(axis=1, keepdims=True)
+    W_full = np.zeros((N, 24), dtype=np.float32)
+    for ki in range(k):
+        W_full[np.arange(N), idxs[:, ki]] += inv_d[:, ki].astype(np.float32)
+    # Normalize (should already be normalized, but just in case)
+    row_sum = W_full.sum(axis=1, keepdims=True)
+    W_full /= np.where(row_sum > 0, row_sum, 1.0)
+    print('  weights: max_joint=%d  mean_support=%.2f joints/vert' % (
+        W_full.argmax(axis=1).max(),
+        (W_full > 0.01).sum(axis=1).mean()))
+    return W_full
+# ── Skeleton mesh builder ─────────────────────────────────────────────────────
+def make_skeleton_mesh(joints, radius=0.008):
+    """
+    Build a mesh of hexagonal-prism cylinders connecting parent→child joints.
+    Returns (verts, faces) as float32 / int32 numpy arrays.
+    """
+    SEG = 6  # hexagonal cross-section
+    angles = np.linspace(0, 2 * np.pi, SEG, endpoint=False)
+    circle = np.stack([np.cos(angles), np.sin(angles)], axis=1)  # (SEG, 2)
+    all_verts, all_faces = [], []
+    vert_offset = 0
+    for i, parent in enumerate(SMPL_PARENTS):
+        if parent == -1:
+            continue
+        p0 = joints[parent].astype(np.float64)
+        p1 = joints[i].astype(np.float64)
+        bone_vec = p1 - p0
+        length = np.linalg.norm(bone_vec)
+        if length < 1e-4:
+            continue
+        z_axis = bone_vec / length
+        ref = np.array([0., 1., 0.]) if abs(z_axis[1]) < 0.9 else np.array([1., 0., 0.])
+        x_axis = np.cross(ref, z_axis)
+        x_axis /= np.linalg.norm(x_axis)
+        y_axis = np.cross(z_axis, x_axis)
+        # Bottom ring at p0, top ring at p1
+        offsets = radius * (circle[:, 0:1] * x_axis + circle[:, 1:2] * y_axis)
+        bottom  = p0 + offsets   # (SEG, 3)
+        top     = p1 + offsets   # (SEG, 3)
+        all_verts.append(np.vstack([bottom, top]).astype(np.float32))
+        for j in range(SEG):
+            j1 = (j + 1) % SEG
+            b0, b1 = vert_offset + j,       vert_offset + j1
+            t0, t1 = vert_offset + SEG + j, vert_offset + SEG + j1
+            all_faces.extend([[b0, b1, t0], [b1, t1, t0]])
+        vert_offset += 2 * SEG
+    if not all_verts:
+        return np.zeros((0, 3), np.float32), np.zeros((0, 3), np.int32)
+    return np.vstack(all_verts), np.array(all_faces, dtype=np.int32)
+# ── Step 6: Export rigged GLB ─────────────────────────────────────────────────
+def export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights,
+                      out_path, skel_verts=None, skel_faces=None):
+    """
+    Export skinned GLB using pygltflib.
+    bind pose = current pose (joints at detected positions).
+    IBM[j] = Translation(-J_world[j])  (pure offset, no rotation).
+    If skel_verts/skel_faces are provided, a second mesh (bright green skeleton
+    sticks) is embedded alongside the body mesh.
+    """
+    import pygltflib
+    from pygltflib import (GLTF2, Scene, Node, Mesh, Primitive, Accessor,
+                            BufferView, Buffer, Material, Texture,
+                            Image as GImage, Sampler, Skin, Asset)
+    from pygltflib import (ARRAY_BUFFER, ELEMENT_ARRAY_BUFFER, FLOAT,
+                            UNSIGNED_INT, UNSIGNED_SHORT, LINEAR,
+                            LINEAR_MIPMAP_LINEAR, REPEAT, SCALAR, VEC2,
+                            VEC3, VEC4, MAT4)
+    gltf  = GLTF2()
+    gltf.asset = Asset(version='2.0', generator='rig_yolo.py')
+    blobs = []
+    def _add(data, comp, acc_type, target=None):
+        b   = data.tobytes()
+        pad = (4 - len(b) % 4) % 4
+        off = sum(len(x) for x in blobs)
+        blobs.append(b + b'\x00' * pad)
+        bv = len(gltf.bufferViews)
+        gltf.bufferViews.append(BufferView(
+            buffer=0, byteOffset=off, byteLength=len(b), target=target))
+        ac = len(gltf.accessors)
+        flat = data.flatten()
+        gltf.accessors.append(Accessor(
+            bufferView=bv, byteOffset=0, componentType=comp,
+            type=acc_type, count=len(data),
+            min=[float(flat.min())], max=[float(flat.max())]))
+        return ac
+    # Geometry
+    pos_acc = _add(verts.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)
+    v0, v1, v2 = verts[faces[:,0]], verts[faces[:,1]], verts[faces[:,2]]
+    fn = np.cross(v1-v0, v2-v0)
+    fn /= (np.linalg.norm(fn, axis=1, keepdims=True) + 1e-8)
+    vn = np.zeros_like(verts)
+    for i in range(3):
+        np.add.at(vn, faces[:,i], fn)
+    vn /= (np.linalg.norm(vn, axis=1, keepdims=True) + 1e-8)
+    nor_acc = _add(vn.astype(np.float32), FLOAT, VEC3, ARRAY_BUFFER)
+    if uv is None:
+        uv = np.zeros((len(verts), 2), np.float32)
+    uv_acc  = _add(uv.astype(np.float32), FLOAT, VEC2, ARRAY_BUFFER)
+    idx_acc = _add(faces.astype(np.uint32).flatten(), UNSIGNED_INT, SCALAR,
+                   ELEMENT_ARRAY_BUFFER)
+    # Skinning: top-4 joints per vertex
+    top4_idx = np.argsort(-skin_weights, axis=1)[:, :4].astype(np.uint16)
+    top4_w   = np.take_along_axis(skin_weights, top4_idx.astype(np.int64), axis=1)
+    top4_w   = top4_w.astype(np.float32)
+    top4_w  /= top4_w.sum(axis=1, keepdims=True).clip(1e-8, None)
+    j_acc   = _add(top4_idx, UNSIGNED_SHORT, VEC4, ARRAY_BUFFER)
+    w_acc   = _add(top4_w,   FLOAT,          VEC4, ARRAY_BUFFER)
+    # Texture
+    if texture_pil is not None:
+        import io
+        buf = io.BytesIO()
+        texture_pil.save(buf, format='PNG')
+        ib  = buf.getvalue()
+        off = sum(len(x) for x in blobs)
+        pad = (4 - len(ib) % 4) % 4
+        blobs.append(ib + b'\x00' * pad)
+        gltf.bufferViews.append(
+            BufferView(buffer=0, byteOffset=off, byteLength=len(ib)))
+        gltf.images.append(
+            GImage(mimeType='image/png', bufferView=len(gltf.bufferViews)-1))
+        gltf.samplers.append(
+            Sampler(magFilter=LINEAR, minFilter=LINEAR_MIPMAP_LINEAR,
+                    wrapS=REPEAT, wrapT=REPEAT))
+        gltf.textures.append(Texture(sampler=0, source=0))
+        gltf.materials.append(Material(
+            name='body',
+            pbrMetallicRoughness={
+                'baseColorTexture': {'index': 0},
+                'metallicFactor': 0.0,
+                'roughnessFactor': 0.8},
+            doubleSided=True))
+    else:
+        gltf.materials.append(Material(name='body', doubleSided=True))
+    body_prim = Primitive(
+        attributes={'POSITION': pos_acc, 'NORMAL': nor_acc,
+                    'TEXCOORD_0': uv_acc, 'JOINTS_0': j_acc, 'WEIGHTS_0': w_acc},
+        indices=idx_acc, material=0)
+    gltf.meshes.append(Mesh(name='body', primitives=[body_prim]))
+    # ── Optional skeleton mesh ─────────────────────────────────────────────────
+    skel_mesh_idx = None
+    if skel_verts is not None and len(skel_verts) > 0:
+        sv = skel_verts.astype(np.float32)
+        sf = skel_faces.astype(np.int32)
+        sv0, sv1, sv2 = sv[sf[:,0]], sv[sf[:,1]], sv[sf[:,2]]
+        sfn = np.cross(sv1-sv0, sv2-sv0)
+        sfn /= (np.linalg.norm(sfn, axis=1, keepdims=True) + 1e-8)
+        svn = np.zeros_like(sv)
+        for i in range(3):
+            np.add.at(svn, sf[:,i], sfn)
+        svn /= (np.linalg.norm(svn, axis=1, keepdims=True) + 1e-8)
+        s_pos_acc = _add(sv,                  FLOAT,        VEC3, ARRAY_BUFFER)
+        s_nor_acc = _add(svn.astype(np.float32), FLOAT,     VEC3, ARRAY_BUFFER)
+        s_idx_acc = _add(sf.astype(np.uint32).flatten(), UNSIGNED_INT, SCALAR,
+                         ELEMENT_ARRAY_BUFFER)
+        # Lime-green unlit material for skeleton sticks
+        mat_idx = len(gltf.materials)
+        gltf.materials.append(Material(
+            name='skeleton',
+            pbrMetallicRoughness={
+                'baseColorFactor': [0.2, 1.0, 0.3, 1.0],
+                'metallicFactor': 0.0,
+                'roughnessFactor': 0.5},
+            doubleSided=True))
+        skel_mesh_idx = len(gltf.meshes)
+        skel_prim = Primitive(
+            attributes={'POSITION': s_pos_acc, 'NORMAL': s_nor_acc},
+            indices=s_idx_acc, material=mat_idx)
+        gltf.meshes.append(Mesh(name='skeleton', primitives=[skel_prim]))
+    # ── Skeleton nodes ─────────────────────────────────────────────────────────
+    jnodes = []
+    for i, (name, parent) in enumerate(zip(SMPL_JOINT_NAMES, SMPL_PARENTS)):
+        t = joints[i].tolist() if parent == -1 else (joints[i] - joints[parent]).tolist()
+        n = Node(name=name, translation=t, children=[])
+        jnodes.append(len(gltf.nodes))
+        gltf.nodes.append(n)
+    for i, p in enumerate(SMPL_PARENTS):
+        if p != -1:
+            gltf.nodes[jnodes[p]].children.append(jnodes[i])
+    # Inverse bind matrices: IBM[j] = Translation(-J_world[j])
+    # glTF MAT4 is column-major; numpy .tobytes() is row-major.
+    # glTF reads the numpy buffer as the TRANSPOSE of what numpy stores.
+    # So we set the translation in the last ROW of the numpy matrix — glTF
+    # reads that as the last COLUMN (translation column) of a 4x4 mat.
+    ibms = np.stack([np.eye(4, dtype=np.float32) for _ in range(len(joints))])
+    for i in range(len(joints)):
+        ibms[i, 3, :3] = -joints[i]
+    ibm_acc = _add(ibms.astype(np.float32), FLOAT, MAT4)
+    skin_idx = len(gltf.skins)
+    gltf.skins.append(Skin(
+        name='smpl_skin', skeleton=jnodes[0],
+        joints=jnodes, inverseBindMatrices=ibm_acc))
+    mesh_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name='body_mesh', mesh=0, skin=skin_idx))
+    root_children = [jnodes[0], mesh_node]
+    if skel_mesh_idx is not None:
+        skel_node_idx = len(gltf.nodes)
+        gltf.nodes.append(Node(name='skeleton_mesh', mesh=skel_mesh_idx))
+        root_children.append(skel_node_idx)
+    root_node = len(gltf.nodes)
+    gltf.nodes.append(Node(name='root', children=root_children))
+    gltf.scenes.append(Scene(name='Scene', nodes=[root_node]))
+    gltf.scene = 0
+    bin_data = b''.join(blobs)
+    gltf.buffers.append(Buffer(byteLength=len(bin_data)))
+    gltf.set_binary_blob(bin_data)
+    gltf.save_binary(out_path)
+    print('  rigged GLB -> %s  (%d KB)' % (out_path, os.path.getsize(out_path) // 1024))
+# ── Main ──────────────────────────────────────────────────────────────────────
+def rig_yolo(body_glb, out_glb, debug_dir=None):
+    """
+    Rig body_glb and write to out_glb.
+    Returns (out_glb, out_skel_glb) where out_skel_glb includes visible
+    skeleton bone sticks alongside the body mesh.
+    """
+    os.makedirs(os.path.dirname(out_glb) or '.', exist_ok=True)
+    if debug_dir:
+        os.makedirs(debug_dir, exist_ok=True)
+    print('[rig_yolo] Rendering front view ...')
+    img_bgr, scale_factor = render_front(body_glb, debug_dir)
+    print('[rig_yolo] Running YOLO-pose ...')
+    kp = detect_keypoints(img_bgr, debug_dir)
+    print('[rig_yolo] Loading original mesh (pygltflib, correct UV channel) ...')
+    verts, faces, uv, texture_pil = load_mesh_from_gltf(body_glb)
+    print('[rig_yolo] Unprojecting YOLO keypoints to 3D ...')
+    coco_3d = unproject_to_3d(kp, scale_factor, verts)
+    print('[rig_yolo] Building SMPL-24 skeleton ...')
+    joints = coco17_to_smpl24(coco_3d, verts)
+    print('[rig_yolo] Computing skinning weights ...')
+    skin_weights = compute_skinning_weights(verts, joints, k=4)
+    print('[rig_yolo] Exporting rigged GLB (no skeleton) ...')
+    export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights, out_glb)
+    print('[rig_yolo] Building skeleton mesh ...')
+    skel_verts, skel_faces = make_skeleton_mesh(joints)
+    out_skel_glb = out_glb.replace('.glb', '_skel.glb')
+    print('[rig_yolo] Exporting rigged GLB (with skeleton) ...')
+    export_rigged_glb(verts, faces, uv, texture_pil, joints, skin_weights,
+                      out_skel_glb, skel_verts=skel_verts, skel_faces=skel_faces)
+    print('[rig_yolo] Done.')
+    return out_glb, out_skel_glb
+if __name__ == '__main__':
+    ap = argparse.ArgumentParser()
+    ap.add_argument('--body',      required=True, help='Input textured GLB')
+    ap.add_argument('--out',       required=True, help='Output rigged GLB')
+    ap.add_argument('--debug_dir', default=None,  help='Save debug renders here')
+    args = ap.parse_args()
+    rigged, rigged_skel = rig_yolo(args.body, args.out, args.debug_dir)
+    print('Rigged:        ', rigged)
+    print('Rigged + skel: ', rigged_skel)

pipeline/tpose_smpl.py ADDED Viewed

	@@ -0,0 +1,413 @@

+"""
+tpose_smpl.py -- T-pose a humanoid GLB via inverse Linear Blend Skinning.
+Pipeline:
+  1. Render front view and run HMR2 -> SMPL body_pose + betas
+  2. Read rigged.glb: mesh verts (rig world space), skinning weights, T-pose joints
+  3. Compute FK transforms in rig world space using HMR2 body_pose
+  4. Apply inverse LBS: v_tpose = (Sum_j W_j * A_j)^-1 * v_posed
+  5. Map T-posed verts back to original mesh coordinate space, preserve UV/texture
+  6. Optionally export SKEL bone mesh in T-pose
+Usage:
+    python tpose_smpl.py --body /tmp/triposg_textured.glb \
+                         --rig  /tmp/rig_out/rigged.glb \
+                         --out  /tmp/tposed_surface.glb \
+                         [--skel_out /tmp/tposed_bones.glb] \
+                         [--debug_dir /tmp/tpose_debug]
+"""
+import os, sys, argparse, struct, json, warnings
+warnings.filterwarnings('ignore')
+import numpy as np
+import cv2
+import torch
+import trimesh
+from trimesh.visual.texture import TextureVisuals
+from trimesh.visual.material import PBRMaterial
+from scipy.spatial.transform import Rotation as R
+sys.path.insert(0, '/root/MV-Adapter')
+SMPL_NEUTRAL = '/root/body_models/smpl/SMPL_NEUTRAL.pkl'
+SKEL_DIR     = '/root/body_models/skel'
+SMPL_PARENTS = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9,
+                12, 13, 14, 16, 17, 18, 19, 20, 21]
+# ---- Step 1: Render front view -----------------------------------------------
+def render_front(body_glb, H=1024, W=768, device='cuda'):
+    from mvadapter.utils.mesh_utils import (
+        NVDiffRastContextWrapper, load_mesh, get_orthogonal_camera, render,
+    )
+    ctx     = NVDiffRastContextWrapper(device=device, context_type='cuda')
+    mesh_mv = load_mesh(body_glb, rescale=True, device=device)
+    camera  = get_orthogonal_camera(
+        elevation_deg=[0], distance=[1.8],
+        left=-0.55, right=0.55, bottom=-0.55, top=0.55,
+        azimuth_deg=[-90], device=device,
+    )
+    out = render(ctx, mesh_mv, camera, height=H, width=W,
+                 render_attr=True, render_depth=False, render_normal=False,
+                 attr_background=0.5)
+    img_np = (out.attr[0].cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+    return cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+# ---- Step 2: HMR2 pose estimation --------------------------------------------
+def run_hmr2(img_bgr, device='cuda'):
+    from pathlib import Path
+    from hmr2.configs import CACHE_DIR_4DHUMANS
+    from hmr2.models import load_hmr2, DEFAULT_CHECKPOINT, download_models
+    from hmr2.utils import recursive_to
+    from hmr2.datasets.vitdet_dataset import ViTDetDataset
+    from hmr2.utils.utils_detectron2 import DefaultPredictor_Lazy
+    from detectron2.config import LazyConfig
+    import hmr2 as hmr2_pkg
+    download_models(CACHE_DIR_4DHUMANS)
+    model, model_cfg = load_hmr2(DEFAULT_CHECKPOINT)
+    model = model.to(device).eval()
+    cfg_path = Path(hmr2_pkg.__file__).parent / 'configs' / 'cascade_mask_rcnn_vitdet_h_75ep.py'
+    det_cfg  = LazyConfig.load(str(cfg_path))
+    det_cfg.train.init_checkpoint = (
+        'https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h'
+        '/f328730692/model_final_f05665.pkl'
+    )
+    for i in range(3):
+        det_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25
+    detector = DefaultPredictor_Lazy(det_cfg)
+    det_out   = detector(img_bgr)
+    instances = det_out['instances']
+    valid     = (instances.pred_classes == 0) & (instances.scores > 0.5)
+    boxes     = instances.pred_boxes.tensor[valid].cpu().numpy()
+    if len(boxes) == 0:
+        raise RuntimeError('HMR2: no person detected in render')
+    areas = (boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1])
+    boxes = boxes[areas.argmax():areas.argmax()+1]
+    dataset    = ViTDetDataset(model_cfg, img_bgr, boxes)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
+    for batch in dataloader:
+        batch = recursive_to(batch, device)
+        with torch.no_grad():
+            out = model(batch)
+        sp = out['pred_smpl_params']
+        return {
+            'body_pose': sp['body_pose'][0].cpu(),    # (23, 3, 3)
+            'betas':     sp['betas'][0].cpu(),         # (10,)
+        }
+# ---- Step 3: Read all data from rigged.glb -----------------------------------
+def read_rigged_glb(rig_glb):
+    """
+    Returns dict with:
+      verts        : (N, 3) mesh vertices in rig world space
+      j_idx        : (N, 4) joint indices
+      w_arr        : (N, 4) skinning weights
+      J_bind       : (24, 3) T-pose joint world positions
+    """
+    with open(rig_glb, 'rb') as fh:
+        raw = fh.read()
+    ch_len, _ = struct.unpack_from('<II', raw, 12)
+    gltf = json.loads(raw[20:20+ch_len])
+    bin_data = raw[20+ch_len+8:]
+    def _read(acc_i):
+        acc = gltf['accessors'][acc_i]
+        bv  = gltf['bufferViews'][acc['bufferView']]
+        off = bv.get('byteOffset', 0) + acc.get('byteOffset', 0)
+        cnt = acc['count']
+        n   = {'SCALAR':1,'VEC2':2,'VEC3':3,'VEC4':4,'MAT4':16}[acc['type']]
+        fmt = {5121:'B',5123:'H',5125:'I',5126:'f'}[acc['componentType']]
+        nb  = {'B':1,'H':2,'I':4,'f':4}[fmt]
+        return np.frombuffer(bin_data[off:off+cnt*n*nb],
+                             dtype=np.dtype(fmt)).reshape(cnt, n)
+    prim  = gltf['meshes'][0]['primitives'][0]['attributes']
+    verts = _read(prim['POSITION']).astype(np.float64)   # (N, 3)
+    j_idx = _read(prim['JOINTS_0']).astype(int)          # (N, 4)
+    w_arr = _read(prim['WEIGHTS_0']).astype(np.float64)  # (N, 4)
+    row_sum = w_arr.sum(axis=1, keepdims=True)
+    w_arr /= np.where(row_sum > 0, row_sum, 1.0)
+    # Read T-pose joint world positions by accumulating node translations
+    nodes   = gltf['nodes']
+    skin    = gltf['skins'][0]
+    j_nodes = skin['joints']                             # [0, 1, ..., 23]
+    J_bind  = np.zeros((24, 3), dtype=np.float64)
+    for ji, ni in enumerate(j_nodes):
+        t_local = np.array(nodes[ni].get('translation', [0, 0, 0]))
+        p = SMPL_PARENTS[ji]
+        J_bind[ji] = (J_bind[p] if p >= 0 else np.zeros(3)) + t_local
+    print('  Rig verts: %d  Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
+        len(verts),
+        verts[:,1].min(), verts[:,1].max(),
+        verts[:,0].min(), verts[:,0].max()))
+    print('  J_bind pelvis: (%.3f, %.3f, %.3f)  L_shoulder: (%.3f, %.3f, %.3f)' % (
+        *J_bind[0], *J_bind[16]))
+    return {'verts': verts, 'j_idx': j_idx, 'w_arr': w_arr, 'J_bind': J_bind}
+# ---- Step 4: FK in rig world space -> A matrices -----------------------------
+_FLIP_X = np.diag([-1.0, 1.0, 1.0])   # X-axis mirror matrix
+def _adapt_rotmat_to_flipped_x(R_smpl):
+    """
+    Convert an SO(3) rotation matrix from SMPL convention (left=+X)
+    to rig convention (left=-X).  F @ R @ F  where F = diag(-1,1,1).
+    """
+    return _FLIP_X @ R_smpl @ _FLIP_X
+def compute_rig_fk_transforms(J_bind, body_pose_rotmats):
+    """
+    Compute A_j = G_j_posed * IBM_j in rig world space.
+    A_j maps T-pose -> posed, so A_j^{-1} maps posed -> T-pose.
+    HMR2 returns rotations in SMPL convention (left shoulder at +X).
+    The rig uses the opposite convention (left shoulder at -X).
+    We convert by conjugating with the X-flip matrix before building FK.
+    J_bind          : (24, 3) T-pose joint world positions from rig
+    body_pose_rotmats: (23, 3, 3) HMR2 body pose rotation matrices (joints 1-23)
+    Returns A: (24, 4, 4)
+    """
+    G = [None] * 24
+    for j in range(24):
+        p = SMPL_PARENTS[j]
+        # Convert rotation from SMPL (+X=left) to rig (-X=left) convention
+        R_smpl = body_pose_rotmats[j-1].numpy() if j >= 1 else np.eye(3)
+        R_j    = _adapt_rotmat_to_flipped_x(R_smpl)
+        if p < 0:
+            t_j = J_bind[j]           # root: absolute world position
+        else:
+            t_j = J_bind[j] - J_bind[p]
+        L = np.eye(4, dtype=np.float64)
+        L[:3, :3] = R_j
+        L[:3, 3]  = t_j
+        G[j] = L if p < 0 else G[p] @ L
+    G = np.stack(G)
+    A = np.zeros((24, 4, 4), dtype=np.float64)
+    for j in range(24):
+        IBM = np.eye(4, dtype=np.float64)
+        IBM[:3, 3] = -J_bind[j]
+        A[j] = G[j] @ IBM
+    return A
+# ---- Step 5: Inverse LBS -----------------------------------------------------
+def inverse_lbs(verts, j_idx, w_arr, A):
+    """
+    v_tpose = (Sum_j W_j * A_j)^{-1} * v_posed
+    All inputs in rig world space.
+    Returns (N, 3) T-posed vertices.
+    """
+    N = len(verts)
+    # Blend forward transforms
+    T_fwd = np.zeros((N, 4, 4), dtype=np.float64)
+    for k in range(4):
+        ji   = j_idx[:, k]
+        w    = w_arr[:, k]
+        mask = w > 1e-6
+        if mask.any():
+            T_fwd[mask] += w[mask, None, None] * A[ji[mask]]
+    T_inv = np.linalg.inv(T_fwd)
+    v_h   = np.concatenate([verts, np.ones((N, 1))], axis=1)
+    v_tp  = np.einsum('nij,nj->ni', T_inv, v_h)[:, :3]
+    disp  = np.linalg.norm(v_tp - verts, axis=1)
+    print('  inverse LBS: mean_disp=%.4f  max_disp=%.4f' % (disp.mean(), disp.max()))
+    return v_tp
+# ---- Step 6: Map T-posed rig verts back to original mesh space ---------------
+def rig_to_original_space(rig_verts_tposed, rig_verts_original, orig_mesh_verts):
+    """
+    Rig verts are a scaled + translated version of the original mesh verts.
+    Recover the (scale, offset) from the mapping:
+      rig_vert = orig_vert * scale + offset
+    Estimates scale from height ratio, offset from floor alignment.
+    Returns T-posed vertices in original mesh coordinate space.
+    """
+    rig_h  = rig_verts_original[:, 1].max() - rig_verts_original[:, 1].min()
+    orig_h = orig_mesh_verts[:, 1].max()    - orig_mesh_verts[:, 1].min()
+    scale  = rig_h / max(orig_h, 1e-6)
+    # The rig aligns: orig * scale, then v[:,1] -= v[:,1].min() (floor at 0)
+    # and v[:,0] += smpl_joints[0,0] - cx; v[:,2] += smpl_joints[0,2] - cz
+    # We can recover offset from comparing means/floors
+    # offset = rig_floor_Y - (orig_floor_Y * scale)
+    rig_floor  = rig_verts_original[:, 1].min()
+    orig_floor = orig_mesh_verts[:, 1].min()
+    y_offset   = rig_floor - orig_floor * scale
+    # X, Z: center offset
+    rig_cx  = (rig_verts_original[:, 0].max() + rig_verts_original[:, 0].min()) * 0.5
+    orig_cx = (orig_mesh_verts[:, 0].max()    + orig_mesh_verts[:, 0].min())    * 0.5
+    x_offset = rig_cx - orig_cx * scale
+    rig_cz  = (rig_verts_original[:, 2].max() + rig_verts_original[:, 2].min()) * 0.5
+    orig_cz = (orig_mesh_verts[:, 2].max()    + orig_mesh_verts[:, 2].min())    * 0.5
+    z_offset = rig_cz - orig_cz * scale
+    print('  rig->orig: scale=%.4f  offset=[%.3f, %.3f, %.3f]' % (scale, x_offset, y_offset, z_offset))
+    # Invert: orig_vert = (rig_vert - offset) / scale
+    # For T-posed verts: they're in rig space but T-posed, so same inversion
+    tposed_orig = np.zeros_like(rig_verts_tposed)
+    tposed_orig[:, 0] = (rig_verts_tposed[:, 0] - x_offset) / scale
+    tposed_orig[:, 1] = (rig_verts_tposed[:, 1] - y_offset) / scale
+    tposed_orig[:, 2] = (rig_verts_tposed[:, 2] - z_offset) / scale
+    return tposed_orig
+# ---- SKEL bone geometry ------------------------------------------------------
+def export_skel_bones(betas, out_path, gender='male'):
+    try:
+        from skel.skel_model import SKEL
+    except ImportError:
+        print('  [skel] Not installed')
+        return None
+    skel_file = os.path.join(SKEL_DIR, 'skel_%s.pkl' % gender)
+    if not os.path.exists(skel_file):
+        print('  [skel] Weights not found: %s' % skel_file)
+        return None
+    try:
+        skel_model = SKEL(gender=gender, model_path=SKEL_DIR)
+        betas_t    = betas.unsqueeze(0)[:, :10]
+        poses_zero = torch.zeros(1, 46)
+        trans_zero = torch.zeros(1, 3)
+        with torch.no_grad():
+            out = skel_model(poses=poses_zero, betas=betas_t, trans=trans_zero, skelmesh=True)
+        bone_verts = out.skel_verts[0].numpy()
+        bone_faces = skel_model.skel_f.numpy()
+        mesh = trimesh.Trimesh(vertices=bone_verts, faces=bone_faces, process=False)
+        mesh.export(out_path)
+        print('  [skel] Bone mesh -> %s  (%d verts)' % (out_path, len(bone_verts)))
+        return out_path
+    except Exception as e:
+        print('  [skel] Export failed: %s' % e)
+        return None
+# ---- Main --------------------------------------------------------------------
+def tpose_smpl(body_glb, out_glb, rig_glb=None, debug_dir=None, skel_out=None):
+    device = 'cuda'
+    if not rig_glb or not os.path.exists(rig_glb):
+        raise RuntimeError('--rig is required: provide the rigged.glb from the Rig step.')
+    print('[tpose_smpl] Rendering front view ...')
+    img_bgr = render_front(body_glb, device=device)
+    if debug_dir:
+        cv2.imwrite(os.path.join(debug_dir, 'tpose_render.png'), img_bgr)
+    print('[tpose_smpl] Running HMR2 pose estimation ...')
+    hmr2_out = run_hmr2(img_bgr, device=device)
+    print('  betas: %s' % hmr2_out['betas'].numpy().round(3))
+    print('[tpose_smpl] Reading rigged GLB (rig world space) ...')
+    rig_data = read_rigged_glb(rig_glb)
+    print('[tpose_smpl] Loading original mesh for UV/texture ...')
+    scene = trimesh.load(body_glb)
+    if isinstance(scene, trimesh.Scene):
+        geom_name = list(scene.geometry.keys())[0]
+        orig_mesh  = scene.geometry[geom_name]
+    else:
+        orig_mesh = scene; geom_name = None
+    orig_verts = np.array(orig_mesh.vertices, dtype=np.float64)
+    uvs        = np.array(orig_mesh.visual.uv, dtype=np.float64)
+    orig_tex   = orig_mesh.visual.material.baseColorTexture
+    print('  Orig mesh: %d verts  Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
+        len(orig_verts),
+        orig_verts[:,1].min(), orig_verts[:,1].max(),
+        orig_verts[:,0].min(), orig_verts[:,0].max()))
+    print('[tpose_smpl] Computing FK transforms in rig world space ...')
+    body_pose_rotmats = hmr2_out['body_pose']   # (23, 3, 3)
+    A = compute_rig_fk_transforms(rig_data['J_bind'], body_pose_rotmats)
+    # Verify zero-pose gives identity (sanity check)
+    A_zero = compute_rig_fk_transforms(rig_data['J_bind'],
+                                        torch.zeros(23, 3, 3) + torch.eye(3))
+    v_test = rig_data['verts'][:3]
+    v_h = np.concatenate([v_test, np.ones((3,1))], axis=1)
+    T_fwd_test = np.zeros((3, 4, 4))
+    for k in range(4):
+        ji = rig_data['j_idx'][:3, k]; w = rig_data['w_arr'][:3, k]
+        T_fwd_test += w[:, None, None] * A_zero[ji]
+    identity_err = np.abs(T_fwd_test - np.eye(4)).max()
+    print('  zero-pose identity check: max_err=%.6f (expect ~0)' % identity_err)
+    print('[tpose_smpl] Applying inverse LBS ...')
+    rig_verts_tposed = inverse_lbs(
+        rig_data['verts'], rig_data['j_idx'], rig_data['w_arr'], A)
+    print('[tpose_smpl] T-posed rig verts: Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
+        rig_verts_tposed[:,1].min(), rig_verts_tposed[:,1].max(),
+        rig_verts_tposed[:,0].min(), rig_verts_tposed[:,0].max()))
+    print('[tpose_smpl] Mapping back to original mesh coordinate space ...')
+    tposed_orig = rig_to_original_space(
+        rig_verts_tposed, rig_data['verts'], orig_verts)
+    print('[tpose_smpl] T-posed orig: Y: [%.3f, %.3f]  X: [%.3f, %.3f]' % (
+        tposed_orig[:,1].min(), tposed_orig[:,1].max(),
+        tposed_orig[:,0].min(), tposed_orig[:,0].max()))
+    orig_mesh.vertices = tposed_orig
+    orig_mesh.visual = TextureVisuals(uv=uvs,
+                                      material=PBRMaterial(baseColorTexture=orig_tex))
+    if geom_name and isinstance(scene, trimesh.Scene):
+        scene.geometry[geom_name] = orig_mesh
+        scene.export(out_glb)
+    else:
+        orig_mesh.export(out_glb)
+    print('[tpose_smpl] Saved: %s  (%d KB)' % (out_glb, os.path.getsize(out_glb)//1024))
+    if skel_out:
+        print('[tpose_smpl] Exporting SKEL bone geometry ...')
+        export_skel_bones(hmr2_out['betas'], skel_out)
+    return out_glb
+if __name__ == '__main__':
+    ap = argparse.ArgumentParser()
+    ap.add_argument('--body',      required=True)
+    ap.add_argument('--out',       required=True)
+    ap.add_argument('--rig',       required=True, help='Rigged GLB from rig step')
+    ap.add_argument('--skel_out',  default=None,  help='SKEL BSM bone mesh output')
+    ap.add_argument('--debug_dir', default=None)
+    args = ap.parse_args()
+    os.makedirs(args.debug_dir, exist_ok=True) if args.debug_dir else None
+    tpose_smpl(args.body, args.out, rig_glb=args.rig,
+               debug_dir=args.debug_dir, skel_out=args.skel_out)

requirements.txt ADDED Viewed

	@@ -0,0 +1,74 @@

+# HuggingFace ZeroGPU Space — Docker SDK
+# chumpy is pre-installed in the Dockerfile with --no-build-isolation
+# (its setup.py does `import pip` which breaks in modern pip isolated builds)
+spaces
+# Git-pinned installs
+hmr2 @ git+https://github.com/shubham-goel/4D-Humans.git@efe18deff163b29dff87ddbd575fa29b716a356c
+clip @ git+https://github.com/openai/CLIP.git@d05afc436d78f1c48dc0dbf8e5980a9d471f35f6
+mvadapter @ git+https://github.com/huanngzh/MV-Adapter.git@4277e0018232bac82bb2c103caf0893cedb711be
+chumpy @ git+https://github.com/mattloper/chumpy.git@580566eafc9ac68b2614b64d6f7aaa84eebb70da
+skel @ git+https://github.com/MarilynKeller/SKEL.git@c32cf16581295bff19399379efe5b776d707cd95
+nvdiffrast @ git+https://github.com/NVlabs/nvdiffrast.git@253ac4fcea7de5f396371124af597e6cc957bfae
+diso @ git+https://github.com/SarahWeiii/diso.git@9792ad928ccb09bdec938779651ee03e395758a6
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git@8a9d885b3d4dcf1bef015f0593b872ed8d32b4ab
+# Core ML
+accelerate
+diffusers>=0.37.0
+transformers>=5.0.0
+safetensors
+huggingface_hub
+peft
+einops
+timm
+xformers
+# 3D / Mesh
+trimesh
+open3d
+pymeshlab
+pygltflib
+pyrender
+moderngl
+moderngl-window
+# Body model
+smplx
+smplpytorch
+# Pose / Motion
+ultralytics
+pyquaternion
+kornia
+# Face enhancement
+insightface
+onnxruntime-gpu
+basicsr
+realesrgan
+gfpgan
+facexlib
+face-alignment
+# Surface enhancement
+stablenormal
+controlnet_aux
+# CV
+opencv-python-headless
+scikit-image
+albumentations
+# Scientific
+numpy
+scipy
+scikit-learn
+pandas
+# Utils
+easydict
+omegaconf
+yacs
+gdown
+pycocotools