Instructions to use FastVideo/Waypoint-1-Small-Diffusers with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use FastVideo/Waypoint-1-Small-Diffusers with Diffusers:
pip install -U diffusers transformers accelerate
```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("FastVideo/Waypoint-1-Small-Diffusers", dtype=torch.bfloat16, device_map="cuda")
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
| # Copyright (C) 2025 Hugging Face Team and Overworld | |
| # | |
| # This program is free software: you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License as published by | |
| # the Free Software Foundation, either version 3 of the License, or | |
| # (at your option) any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program. If not, see <https://www.gnu.org/licenses/>. | |
| """VAE model for WorldEngine frame encoding/decoding.""" | |
| from dataclasses import dataclass | |
| from typing import List, Tuple | |
| import torch | |
| from torch import Tensor | |
| from diffusers.configuration_utils import ConfigMixin, register_to_config | |
| from diffusers.models.modeling_utils import ModelMixin | |
| from .dcae import Encoder, Decoder, bake_weight_norm | |
@dataclass
class EncoderDecoderConfig:
    """Config object for Encoder/Decoder initialization.

    Bundles the channel/stage hyperparameters passed to the DCAE
    ``Encoder``/``Decoder`` constructors. Declared as a dataclass so it can
    be constructed with keyword arguments (as done in ``WorldEngineVAE``);
    without ``@dataclass`` the class-level annotations would not generate
    an ``__init__`` and instantiation would raise ``TypeError``.
    """

    # Number of image channels (e.g. 3 for RGB).
    channels: int
    # Number of channels in the latent representation.
    latent_channels: int
    # Base channel width of the first stage.
    ch_0: int
    # Maximum channel width across stages.
    ch_max: int
    # Residual blocks per encoder stage.
    encoder_blocks_per_stage: List[int]
    # Residual blocks per decoder stage.
    decoder_blocks_per_stage: List[int]
    # If True, skip producing the log-variance head.
    skip_logvar: bool = False
class WorldEngineVAE(ModelMixin, ConfigMixin):
    """
    VAE for encoding/decoding video frames using DCAE architecture.

    Encodes single RGB uint8 images of shape [H, W, C] into a latent space
    and decodes latents back to RGB uint8 images.
    """

    _supports_gradient_checkpointing = False

    # register_to_config (imported at module top but previously unused)
    # records the __init__ kwargs on self.config, which ConfigMixin needs
    # for save_pretrained / from_pretrained round-trips.
    @register_to_config
    def __init__(
        self,
        # Common parameters
        sample_size: Tuple[int, int] = (360, 640),
        channels: int = 3,
        latent_channels: int = 16,
        # Encoder parameters
        encoder_ch_0: int = 64,
        encoder_ch_max: int = 256,
        encoder_blocks_per_stage: List[int] = None,
        # Decoder parameters
        decoder_ch_0: int = 128,
        decoder_ch_max: int = 1024,
        decoder_blocks_per_stage: List[int] = None,
        # Shared parameters
        skip_logvar: bool = False,
        # Scaling factors
        scale_factor: float = 1.0,
        shift_factor: float = 0.0,
    ):
        """Build the encoder/decoder pair from the given hyperparameters.

        Args:
            sample_size: Expected (height, width) of input frames.
            channels: Number of image channels (3 for RGB).
            latent_channels: Channel width of the latent space.
            encoder_ch_0 / encoder_ch_max: Encoder base/max channel widths.
            encoder_blocks_per_stage: Blocks per encoder stage
                (defaults to [1, 1, 1, 1]).
            decoder_ch_0 / decoder_ch_max: Decoder base/max channel widths.
            decoder_blocks_per_stage: Blocks per decoder stage
                (defaults to [1, 1, 1, 1]).
            skip_logvar: If True, skip the log-variance head.
            scale_factor / shift_factor: Latent scaling factors, stored in
                the config for downstream pipelines.
        """
        super().__init__()

        # None sentinel avoids the mutable-default-argument pitfall.
        if encoder_blocks_per_stage is None:
            encoder_blocks_per_stage = [1, 1, 1, 1]
        if decoder_blocks_per_stage is None:
            decoder_blocks_per_stage = [1, 1, 1, 1]

        # Encoder and decoder share the same stage layout but have
        # independent channel widths. list(...) copies defend against
        # callers mutating the lists after construction.
        encoder_config = EncoderDecoderConfig(
            channels=channels,
            latent_channels=latent_channels,
            ch_0=encoder_ch_0,
            ch_max=encoder_ch_max,
            encoder_blocks_per_stage=list(encoder_blocks_per_stage),
            decoder_blocks_per_stage=list(decoder_blocks_per_stage),
            skip_logvar=skip_logvar,
        )
        decoder_config = EncoderDecoderConfig(
            channels=channels,
            latent_channels=latent_channels,
            ch_0=decoder_ch_0,
            ch_max=decoder_ch_max,
            encoder_blocks_per_stage=list(encoder_blocks_per_stage),
            decoder_blocks_per_stage=list(decoder_blocks_per_stage),
            skip_logvar=skip_logvar,
        )

        self.encoder = Encoder(encoder_config)
        self.decoder = Decoder(decoder_config)

    def encode(self, img: Tensor) -> Tensor:
        """RGB -> RGB+D -> latent.

        Args:
            img: A single [H, W, C] uint8-range image tensor.

        Returns:
            The encoder output for the batched, normalized image.
        """
        assert img.dim() == 3, "Expected [H, W, C] image tensor"
        img = img.unsqueeze(0).to(device=self.device, dtype=self.dtype)
        # HWC [0, 255] -> NCHW [-1, 1]
        rgb = img.permute(0, 3, 1, 2).contiguous().div(255).mul(2).sub(1)
        return self.encoder(rgb)

    def decode(self, latent: Tensor) -> Tensor:
        """Decode a latent tensor back to an [H, W, C] RGB uint8 image."""
        decoded = self.decoder(latent)
        # [-1, 1] -> [0, 1], then quantize to uint8 [0, 255].
        decoded = (decoded / 2 + 0.5).clamp(0, 1)
        decoded = (decoded * 255).round().to(torch.uint8)
        # Drop batch dim, NCHW -> HWC, keep only the first 3 (RGB) channels.
        return decoded.squeeze(0).permute(1, 2, 0)[..., :3]

    def forward(self, x: Tensor, encode: bool = True) -> Tensor:
        """
        Forward pass - encode or decode based on flag.

        Args:
            x: Input tensor (image for encode, latent for decode)
            encode: If True, encode; if False, decode

        Returns:
            Encoded latent or decoded image
        """
        return self.encode(x) if encode else self.decode(x)

    def bake_weight_norm(self):
        """Remove weight_norm parametrizations, baking normalized weights into regular tensors.

        Call this after loading weights and before torch.compile to avoid
        CUDA graph capture errors from in-place weight updates.
        """
        # Delegates to the module-level dcae.bake_weight_norm helper
        # (the method name shadows it as a class attribute only, so the
        # global lookup inside this body still finds the function).
        bake_weight_norm(self)
        return self