Spaces: Running on Zero

import tempfile
from datetime import datetime
from pathlib import Path
from textwrap import dedent

import gradio as gr
import spaces
import torch
from tqdm import tqdm

from motion_latent_diffusion_standalone import MotionLatentDiffusionModel
from visualize import create_video_from_joints
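
# The standalone MLD pipeline is assembled from three pretrained parts: a motion
# VAE (encoder/decoder), a latent-space denoiser, and a CLIP text encoder that
# conditions the diffusion on the prompt. The whole model is frozen for inference.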
model = MotionLatentDiffusionModel(
    vae_repo_id="blanchon/motion-latent-diffusion-standalone-vae",
    denoiser_repo_id="blanchon/motion-latent-diffusion-standalone-denoiser",
    text_encoder_repo_id="openai/clip-vit-large-patch14",
)
model.to("cuda")
model.eval()
model.requires_grad_(False)
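
# `spaces` is imported but no GPU decorator appears in this listing; on a ZeroGPU
# Space ("Running on Zero") the function that needs the GPU is expected to be
# decorated with @spaces.GPU, so it is added here (assumed from the Space setup).
@spaces.GPU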
def generate_motion(
    text_prompt: str, motion_length: int, progress=gr.Progress(track_tqdm=True)
) -> tuple[str | None, str, str | None]:
    """Generate a motion clip from a text prompt; return (video path, info text, .pt path)."""
    try:
        # Create temporary output paths
        temp_dir = tempfile.gettempdir()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"motion_{timestamp}"
        pt_path = Path(temp_dir) / f"{filename}.pt"
        video_path = Path(temp_dir) / f"{filename}.mp4"

        print("🎬 Generating motion...")
        with tqdm(
            total=motion_length,
            desc="Generating motion",
        ) as pbar:

            def callback_on_step_end(i: int, latents: torch.Tensor):
                # Advance the bar by one denoising step per callback
                pbar.update(1)

            # Generate motion (returns a PyTorch tensor of joint positions)
            joints, latent = model.generate(
                text_prompt,
                motion_length,
                return_latent=True,
                callback_on_step_end=callback_on_step_end,
            )

        # Save the raw motion data as a PyTorch tensor
        torch.save(joints, pt_path)

        print("🎥 Creating visualization...")
        # Render the joints as a 3D skeleton video
        video_path = create_video_from_joints(joints, video_path.as_posix(), fps=20)
        print("✅ Done!")

        # Build the summary shown next to the video
        info_text = dedent(f"""
            ✅ **Generation Complete!**

            **Prompt:** {text_prompt}

            **Motion Length:** {motion_length} frames ({motion_length / 20:.1f}s at 20fps)

            **Output Shape:** {joints.shape} (frames × joints × coords)

            The video shows a 3D skeleton performing the motion.
            You can download both the video and the raw motion data below.
        """)

        return video_path, info_text, pt_path.as_posix()

    except Exception as e:
        error_msg = f"Error during generation: {e}"
        import traceback

        traceback.print_exc()
        return None, error_msg, None
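
# Note: the saved .pt file can be reused outside the app. A minimal sketch,
# assuming the same environment (torch plus visualize.py) and a placeholder filename:
#
#   joints = torch.load("motion_<timestamp>.pt")            # (frames, 22, 3) joint positions
#   create_video_from_joints(joints, "replay.mp4", fps=20)  # re-render the skeleton video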

def create_example_prompts():
    """Return example prompts for the interface"""
    return [
        ["a person walks forward slowly", 80],
        ["jumping up and down", 100],
        ["a person waves hello", 60],
        ["running in place", 100],
        ["a person does jumping jacks", 120],
        ["someone performs a cartwheel", 140],
        ["walking backwards carefully", 90],
        ["a person stretches their arms", 80],
    ]

with gr.Blocks(title="MLD Text-to-Motion Generator", theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown("""
    # 🎬 MLD Text-to-Motion Generator

    Generate realistic human motion animations from text descriptions!
    Powered by Motion Latent Diffusion (MLD).

    ### 💡 Tips for Best Results:
    - Be specific: "a person walks forward slowly" works better than just "walking"
    - Use present tense: "walks" or "is walking"
    - Describe single continuous actions
    - Recommended length: 40-60 frames for short actions, 80-120 for walking/running
    """)

    with gr.Row():
        # Left column - Inputs
        with gr.Column(scale=1):
            gr.Markdown("## 📝 Input")

            text_input = gr.Textbox(
                label="Text Prompt",
                placeholder="Enter motion description (e.g., 'a person walks forward slowly')",
                lines=3,
                value="a person walks forward",
            )

            with gr.Row():
                length_slider = gr.Slider(
                    minimum=16,
                    maximum=196,
                    value=100,
                    step=1,
                    label="Motion Length (frames)",
                    info="20 frames = 1 second",
                )

            generate_btn = gr.Button("🎬 Generate Motion", variant="primary", size="lg")

            gr.Markdown("### 📋 Example Prompts")
            gr.Examples(
                examples=create_example_prompts(),
                inputs=[text_input, length_slider],
                label=None,
            )

        # Right column - Outputs
        with gr.Column(scale=1):
            gr.Markdown("## 🎥 Output")

            info_output = gr.Markdown(
                "Generate a motion to see the results here.",
                elem_classes=["output-info"],
            )

            video_output = gr.Video(
                label="Generated Motion Video",
                elem_classes=["output-video"],
                autoplay=True,
                show_share_button=True,
            )

            with gr.Row():
                pt_download = gr.File(label="Download Motion Data (.pt)", visible=False)

    # Footer
    gr.Markdown(
        dedent("""
        ---
        ### ℹ️ About

        **Motion Latent Diffusion (MLD)** generates 3D human motion by:
        1. Encoding text with CLIP
        2. Generating motion in latent space via diffusion (50 steps)
        3. Decoding to 3D joint positions (22 joints)
        4. Visualizing as a 3D skeleton animation

        **Citation:** Chen et al., "Executing your Commands via Motion Diffusion in Latent Space", CVPR 2023

        **Repository:** [motion-latent-diffusion](https://github.com/ChenFengYe/motion-latent-diffusion)
        """)
    )

    # Event handlers
    def generate_and_update(text, length, progress=gr.Progress(track_tqdm=True)):
        # The gr.Progress default is declared on the registered handler so that
        # Gradio picks up the tqdm bar inside generate_motion (track_tqdm=True).
        video, info, pt = generate_motion(text, length)
        if pt:
            return video, info, gr.update(value=pt, visible=True)
        return video, info, gr.update(visible=False)

    generate_btn.click(
        fn=generate_and_update,
        inputs=[text_input, length_slider],
        outputs=[video_output, info_output, pt_download],
    )
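
# Note: gr.Progress(track_tqdm=True) reports progress through Gradio's queue.
# Recent Gradio versions enable the queue by default; on older versions, call
# demo.queue() before demo.launch().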
demo.launch(
    server_name="0.0.0.0",  # Allow external access
    server_port=7860,
    share=False,
    show_error=True,
)
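
# On Hugging Face Spaces this script is started automatically; locally, running it
# directly (e.g. `python app.py`, assuming that filename) serves the UI on port 7860.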