BonanDing committed on
Commit
8652b14
·
1 Parent(s): f40de20

update lfs

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +5 -0
  2. LICENSE.md +14 -0
  3. README.md +201 -0
  4. algorithms/README.md +21 -0
  5. algorithms/__init__.py +0 -0
  6. algorithms/common/README.md +5 -0
  7. algorithms/common/__init__.py +0 -0
  8. algorithms/common/base_algo.py +21 -0
  9. algorithms/common/base_pytorch_algo.py +252 -0
  10. algorithms/common/metrics/__init__.py +3 -0
  11. algorithms/common/metrics/fid.py +1 -0
  12. algorithms/common/metrics/fvd.py +158 -0
  13. algorithms/common/metrics/lpips.py +1 -0
  14. algorithms/common/models/__init__.py +0 -0
  15. algorithms/common/models/cnn.py +141 -0
  16. algorithms/common/models/mlp.py +22 -0
  17. algorithms/worldmem/__init__.py +2 -0
  18. algorithms/worldmem/df_base.py +307 -0
  19. algorithms/worldmem/df_video.py +926 -0
  20. algorithms/worldmem/models/attention.py +342 -0
  21. algorithms/worldmem/models/cameractrl_module.py +12 -0
  22. algorithms/worldmem/models/diffusion.py +520 -0
  23. algorithms/worldmem/models/dit.py +572 -0
  24. algorithms/worldmem/models/pose_prediction.py +42 -0
  25. algorithms/worldmem/models/rotary_embedding_torch.py +302 -0
  26. algorithms/worldmem/models/utils.py +163 -0
  27. algorithms/worldmem/models/vae.py +359 -0
  28. algorithms/worldmem/pose_prediction.py +374 -0
  29. app.py +576 -0
  30. assets/desert.png +3 -0
  31. assets/ice_plains.png +3 -0
  32. assets/place.png +3 -0
  33. assets/plains.png +3 -0
  34. assets/rain_sunflower_plains.png +3 -0
  35. assets/savanna.png +3 -0
  36. assets/sunflower_plains.png +3 -0
  37. assets/worldmem_logo.png +3 -0
  38. calculate_fid.py +277 -0
  39. configurations/algorithm/base_algo.yaml +3 -0
  40. configurations/algorithm/base_pytorch_algo.yaml +4 -0
  41. configurations/algorithm/df_base.yaml +42 -0
  42. configurations/algorithm/df_video_worldmemminecraft.yaml +38 -0
  43. configurations/dataset/base_dataset.yaml +3 -0
  44. configurations/dataset/base_video.yaml +14 -0
  45. configurations/dataset/video_minecraft.yaml +14 -0
  46. configurations/experiment/base_experiment.yaml +2 -0
  47. configurations/experiment/base_pytorch.yaml +50 -0
  48. configurations/experiment/exp_video.yaml +31 -0
  49. configurations/huggingface.yaml +60 -0
  50. configurations/training.yaml +16 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ <<<<<<< HEAD
37
+ =======
38
+ assets/* filter=lfs diff=lfs merge=lfs -text
39
+ *.png filter=lfs diff=lfs merge=lfs -text
40
+ >>>>>>> def529c (Baseline WorldMem)
LICENSE.md ADDED
@@ -0,0 +1,14 @@
1
+ # S-Lab License 1.0
2
+
3
+ Copyright 2025 S-Lab
4
+
5
+ Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
10
+ 4. In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work.
11
+
12
+
13
+ ---
14
+ For the commercial use of the code, please consult Prof. Chen Change Loy (ccloy@ntu.edu.sg)
README.md ADDED
@@ -0,0 +1,201 @@
1
+
2
+ <br>
3
+ <p align="center">
4
+
5
+ <p align="center">
6
+ <img src="assets/worldmem_logo.png" alt="WORLDMEM Icon" width="80"/>
7
+ </p>
8
+ <h1 align="center"><strong>WorldMem: Long-term Consistent World Simulation <br> with Memory</strong></h1>
9
+ <p align="center"><span><a href=""></a></span>
10
+ <a href="https://xizaoqu.github.io">Zeqi Xiao<sup>1</sup></a>
11
+ <a href="https://nirvanalan.github.io/">Yushi Lan<sup>1</sup></a>
12
+ <a href="https://zhouyifan.net/about/">Yifan Zhou<sup>1</sup></a>
13
+ <a href="https://vicky0522.github.io/Wenqi-Ouyang/">Wenqi Ouyang<sup>1</sup></a>
14
+ <a href="https://williamyang1991.github.io/">Shuai Yang<sup>2</sup></a>
15
+ <a href="https://zengyh1900.github.io/">Yanhong Zeng<sup>3</sup></a>
16
+ <a href="https://xingangpan.github.io/">Xingang Pan<sup>1</sup></a> <br>
17
+ <sup>1</sup>S-Lab, Nanyang Technological University, <br> <sup>2</sup>Wangxuan Institute of Computer Technology, Peking University,<br> <sup>3</sup>Shanghai AI Laboratory
18
+ </p>
19
+ </p>
20
+
21
+ <p align="center">
22
+ <a href="https://arxiv.org/abs/2504.12369" target='_blank'>
23
+ <img src="https://img.shields.io/badge/arXiv-2504.12369-blue?">
24
+ </a>
25
+ <a href="https://xizaoqu.github.io/worldmem/" target='_blank'>
26
+ <img src="https://img.shields.io/badge/Project-&#x1F680-blue">
27
+ </a>
28
+ <a href="https://huggingface.co/spaces/yslan/worldmem" target="_blank">
29
+ <img src="https://img.shields.io/badge/🤗 HuggingFace-Demo-orange" />
30
+ </a>
31
+ </p>
32
+
33
+ https://github.com/user-attachments/assets/fb8a32e2-9470-4819-a93d-c38caf76d72c
34
+
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ conda create python=3.10 -n worldmem
40
+ conda activate worldmem
41
+ pip install -r requirements.txt
42
+ conda install -c conda-forge ffmpeg=4.3.2
43
+ ```
44
+
45
+
46
+ ## Quick start
47
+
48
+ ```bash
49
+ python app.py
50
+ ```
51
+
52
+ ## Run
53
+
54
+ To enable cloud logging with [Weights & Biases (wandb)](https://wandb.ai/site), follow these steps:
55
+
56
+ 1. Sign up for a wandb account.
57
+ 2. Run the following command to log in:
58
+
59
+ ```bash
60
+ wandb login
61
+ ```
62
+
63
+ 3. Open `configurations/training.yaml` and set the `entity` and `project` fields to match your wandb account (e.g., `entity` is your wandb username).
64
+
65
+ ---
66
+
67
+ ### Training
68
+
69
+ Download pretrained weights from [Oasis](https://github.com/etched-ai/open-oasis).
70
+
71
+ We train the model on 4 H100 GPUs; it converges after approximately 500K steps.
72
+ We observe that gradually increasing task difficulty improves performance. Thus, we adopt a multi-stage training strategy:
73
+ ,
74
+ ```bash
75
+ sh train_stage_1.sh # Small range, no vertical turning
76
+ sh train_stage_2.sh # Large range, no vertical turning
77
+ sh train_stage_3.sh # Large range, with vertical turning
78
+ ```
79
+
80
+ To resume training from a previous checkpoint, configure the `resume` and `output_dir` variables in the corresponding `.sh` script.
81
+
82
+ ---
83
+
84
+ ### Inference
85
+
86
+ To run inference:
87
+
88
+ ```bash
89
+ sh infer.sh
90
+ ```
91
+
92
+ You can either **load the diffusion model and VAE separately**:
93
+
94
+ ```bash
95
+ +diffusion_model_path=zeqixiao/worldmem_checkpoints/diffusion_only.ckpt \
96
+ +vae_path=zeqixiao/worldmem_checkpoints/vae_only.ckpt \
97
+ +customized_load=true \
98
+ +seperate_load=true \
99
+ ```
100
+
101
+ Or **load a combined checkpoint**:
102
+
103
+ ```bash
104
+ +load=your_model_path \
105
+ +customized_load=true \
106
+ +seperate_load=false \
107
+ ```
108
+
109
+ ### Evaluation
110
+
111
+ To run evaluation:
112
+
113
+ ```bash
114
+ sh evaluate.sh
115
+ ```
116
+
117
+ This script reproduces the results in Table 1 (beyond the context window). It reports PSNR and LPIPS. Evaluating 1 case on 1 A100 GPU takes approximately 6 minutes. You can adjust `experiment.test.limit_batch` to specify the number of cases to evaluate.
118
+
119
+ Visual results will be saved by default to a timestamped directory (e.g., `outputs/2025-11-30/00-02-42`).
120
+
121
+ To calculate the FID score, run:
122
+
123
+ ```bash
124
+ python calculate_fid.py --videos_dir <path_to_videos>
125
+ ```
126
+
127
+ For example:
128
+
129
+ ```bash
130
+ python calculate_fid.py --videos_dir outputs/2025-11-30/00-02-42/videos/test_vis
131
+ ```
132
+
133
+ **Expected Results:**
134
+
135
+ | Metric | Value |
136
+ |--------|--------|
137
+ | PSNR | 24.01 |
138
+ | LPIPS | 0.1667 |
139
+ | FID | 15.13 |
140
+
141
+ *Note: FID is computed over 5000 frames.*
142
+
143
+ ---
144
+
145
+ ## Dataset
146
+
147
+ Download the Minecraft dataset from [Hugging Face](https://huggingface.co/datasets/zeqixiao/worldmem_minecraft_dataset).
148
+
149
+ Place the dataset in the following directory structure:
150
+
151
+ ```
152
+ data/
153
+ └── minecraft/
154
+     ├── training/
155
+     ├── validation/
156
+     └── test/
157
+ ```
158
+
159
+ ## Data Generation
160
+
161
+ After setting up the environment as described in [MineDojo's GitHub repository](https://github.com/MineDojo/MineDojo), you can generate data using the following command:
162
+
163
+ ```bash
164
+ xvfb-run -a python data_generator.py -o data/test -z 4 --env_type plains
165
+ ```
166
+
167
+ **Parameters:**
168
+ - `-o`: Output directory for generated data
169
+ - `-z`: Number of parallel workers
170
+ - `--env_type`: Environment type (e.g., `plains`)
171
+
172
+
173
+ ## TODO
174
+
175
+ - [x] Release inference models and weights;
176
+ - [x] Release training pipeline on Minecraft;
177
+ - [x] Release training data on Minecraft;
178
+ - [x] Release evaluation scripts and data generator.
179
+
180
+
181
+
182
+ ## 🔗 Citation
183
+
184
+ If you find our work helpful, please cite:
185
+
186
+ ```
187
+ @misc{xiao2025worldmemlongtermconsistentworld,
188
+ title={WORLDMEM: Long-term Consistent World Simulation with Memory},
189
+ author={Zeqi Xiao and Yushi Lan and Yifan Zhou and Wenqi Ouyang and Shuai Yang and Yanhong Zeng and Xingang Pan},
190
+ year={2025},
191
+ eprint={2504.12369},
192
+ archivePrefix={arXiv},
193
+ primaryClass={cs.CV},
194
+ url={https://arxiv.org/abs/2504.12369},
195
+ }
196
+ ```
197
+
198
+ ## 👏 Acknowledgements
199
+ - [Diffusion Forcing](https://github.com/buoyancy99/diffusion-forcing): Diffusion Forcing provides flexible training and inference strategies for our methods.
200
+ - [Minedojo](https://github.com/MineDojo/MineDojo): We collect our Minecraft dataset from Minedojo.
201
+ - [Open-oasis](https://github.com/etched-ai/open-oasis): Our model architecture is based on Open-oasis. We also use pretrained VAE and DiT weights from it.
algorithms/README.md ADDED
@@ -0,0 +1,21 @@
1
+ # algorithms
2
+
3
+ The `algorithms` folder is designed to contain implementations of algorithms or models.
4
+ Content in `algorithms` can be loosely grouped components (e.g. models) or an algorithm that already has all
5
+ components chained together (e.g. a Lightning Module or an RL algorithm).
6
+ You should create a folder named after your own algorithm or baseline in it.
7
+
8
+ Two examples can be found in the `examples` subfolder.
9
+
10
+ The `common` subfolder is designed to contain general-purpose classes that are useful for many projects, e.g. MLP.
11
+
12
+ You should not run any `.py` file from the `algorithms` folder.
13
+ Instead, write unit tests / debug scripts in `debug` and launch scripts in `experiments`.
14
+
15
+ You are discouraged from putting visualization utilities in `algorithms`, as those should go to `utils` in the project root.
16
+
17
+ Each algorithm class takes a DictConfig `cfg` in its `__init__`, which allows you to pass in arguments via a configuration file in `configurations/algorithm` or a [command line override](https://hydra.cc/docs/tutorials/basic/your_first_app/simple_cli/), as sketched after this README.
18
+
19
+ ---
20
+
21
+ This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research template [repo](https://github.com/buoyancy99/research-template). By its MIT license, you must keep the above sentence in `README.md` and the `LICENSE` file to credit the author.
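As a rough illustration of the `cfg` convention described in this README (a sketch only; `MyAlgo` and its config keys are hypothetical and not part of this commit):

```python
from omegaconf import DictConfig, OmegaConf

from algorithms.common.base_algo import BaseAlgo


class MyAlgo(BaseAlgo):
    """Hypothetical algorithm; the config keys below are made up for illustration."""

    def __init__(self, cfg: DictConfig):
        super().__init__(cfg)             # BaseAlgo stores cfg on self.cfg
        self.lr = cfg.lr                  # e.g. from configurations/algorithm/my_algo.yaml
        self.hidden_dim = cfg.hidden_dim  # or a Hydra command-line override

    def run(self):
        print(f"running with lr={self.lr}, hidden_dim={self.hidden_dim}")


# Stand-in for Hydra composing the YAML config and any CLI overrides.
MyAlgo(OmegaConf.create({"lr": 1e-4, "hidden_dim": 64})).run()
```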
algorithms/__init__.py ADDED
File without changes
algorithms/common/README.md ADDED
@@ -0,0 +1,5 @@
1
+ This folder contains models / algorithms that are general-purpose and useful across many algorithms.
2
+
3
+ ---
4
+
5
+ This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research template [repo](https://github.com/buoyancy99/research-template). By its MIT license, you must keep the above sentence in `README.md` and the `LICENSE` file to credit the author.
algorithms/common/__init__.py ADDED
File without changes
algorithms/common/base_algo.py ADDED
@@ -0,0 +1,21 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+
4
+ from omegaconf import DictConfig
5
+
6
+
7
+ class BaseAlgo(ABC):
8
+ """
9
+ A base class for generic algorithms.
10
+ """
11
+
12
+ def __init__(self, cfg: DictConfig):
13
+ super().__init__()
14
+ self.cfg = cfg
15
+
16
+ @abstractmethod
17
+ def run(self, *args: Any, **kwargs: Any) -> Any:
18
+ """
19
+ Run the algorithm.
20
+ """
21
+ raise NotImplementedError
algorithms/common/base_pytorch_algo.py ADDED
@@ -0,0 +1,252 @@
1
+ from abc import ABC, abstractmethod
2
+ import warnings
3
+ from typing import Any, Union, Sequence, Optional
4
+
5
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
6
+ from omegaconf import DictConfig
7
+ import lightning.pytorch as pl
8
+ import torch
9
+ import numpy as np
10
+ from PIL import Image
11
+ import wandb
12
+ import einops
13
+
14
+
15
+ class BasePytorchAlgo(pl.LightningModule, ABC):
16
+ """
17
+ A base class for Pytorch algorithms using Pytorch Lightning.
18
+ See https://lightning.ai/docs/pytorch/stable/starter/introduction.html for more details.
19
+ """
20
+
21
+ def __init__(self, cfg: DictConfig):
22
+ super().__init__()
23
+ self.cfg = cfg
24
+ self._build_model()
25
+
26
+ @abstractmethod
27
+ def _build_model(self):
28
+ """
29
+ Create all pytorch nn.Modules here.
30
+ """
31
+ raise NotImplementedError
32
+
33
+ @abstractmethod
34
+ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
35
+ r"""Here you compute and return the training loss and some additional metrics for e.g. the progress bar or
36
+ logger.
37
+
38
+ Args:
39
+ batch: The output of your data iterable, normally a :class:`~torch.utils.data.DataLoader`.
40
+ batch_idx: The index of this batch.
41
+ dataloader_idx: (only if multiple dataloaders used) The index of the dataloader that produced this batch.
42
+
43
+ Return:
44
+ Any of these options:
45
+ - :class:`~torch.Tensor` - The loss tensor
46
+ - ``dict`` - A dictionary. Can include any keys, but must include the key ``'loss'``.
47
+ - ``None`` - Skip to the next batch. This is only supported for automatic optimization.
48
+ This is not supported for multi-GPU, TPU, IPU, or DeepSpeed.
49
+
50
+ In this step you'd normally do the forward pass and calculate the loss for a batch.
51
+ You can also do fancier things like multiple forward passes or something model specific.
52
+
53
+ Example::
54
+
55
+ def training_step(self, batch, batch_idx):
56
+ x, y, z = batch
57
+ out = self.encoder(x)
58
+ loss = self.loss(out, x)
59
+ return loss
60
+
61
+ To use multiple optimizers, you can switch to 'manual optimization' and control their stepping:
62
+
63
+ .. code-block:: python
64
+
65
+ def __init__(self):
66
+ super().__init__()
67
+ self.automatic_optimization = False
68
+
69
+
70
+ # Multiple optimizers (e.g.: GANs)
71
+ def training_step(self, batch, batch_idx):
72
+ opt1, opt2 = self.optimizers()
73
+
74
+ # do training_step with encoder
75
+ ...
76
+ opt1.step()
77
+ # do training_step with decoder
78
+ ...
79
+ opt2.step()
80
+
81
+ Note:
82
+ When ``accumulate_grad_batches`` > 1, the loss returned here will be automatically
83
+ normalized by ``accumulate_grad_batches`` internally.
84
+
85
+ """
86
+ return super().training_step(*args, **kwargs)
87
+
88
+ def configure_optimizers(self):
89
+ """
90
+ Return an optimizer. If you need to use more than one optimizer, refer to pytorch lightning documentation:
91
+ https://lightning.ai/docs/pytorch/stable/common/optimization.html
92
+ """
93
+ parameters = self.parameters()
94
+ return torch.optim.Adam(parameters, lr=self.cfg.lr)
95
+
96
+ def log_video(
97
+ self,
98
+ key: str,
99
+ video: Union[np.ndarray, torch.Tensor],
100
+ mean: Union[np.ndarray, torch.Tensor, Sequence, float] = None,
101
+ std: Union[np.ndarray, torch.Tensor, Sequence, float] = None,
102
+ fps: int = 5,
103
+ format: str = "mp4",
104
+ ):
105
+ """
106
+ Log video to wandb. WandbLogger in pytorch lightning does not support video logging yet, so we call wandb directly.
107
+
108
+ Args:
109
+ video: a numpy array or tensor, either in form (time, channel, height, width) or in the form
110
+ (batch, time, channel, height, width). The content must be in 0-255 if under dtype uint8
111
+ or [0, 1] otherwise.
112
+ mean: optional, the mean to unnormalize video tensor, assuming unnormalized data is in [0, 1].
113
+ std: optional, the std to unnormalize video tensor, assuming unnormalized data is in [0, 1].
114
+ key: the name of the video.
115
+ fps: the frame rate of the video.
116
+ format: the format of the video. Can be either "mp4" or "gif".
117
+ """
118
+
119
+ if isinstance(video, torch.Tensor):
120
+ video = video.detach().cpu().numpy()
121
+
122
+ expand_shape = [1] * (len(video.shape) - 2) + [3, 1, 1]
123
+ if std is not None:
124
+ if isinstance(std, (float, int)):
125
+ std = [std] * 3
126
+ if isinstance(std, torch.Tensor):
127
+ std = std.detach().cpu().numpy()
128
+ std = np.array(std).reshape(*expand_shape)
129
+ video = video * std
130
+ if mean is not None:
131
+ if isinstance(mean, (float, int)):
132
+ mean = [mean] * 3
133
+ if isinstance(mean, torch.Tensor):
134
+ mean = mean.detach().cpu().numpy()
135
+ mean = np.array(mean).reshape(*expand_shape)
136
+ video = video + mean
137
+
138
+ if video.dtype != np.uint8:
139
+ video = np.clip(video, a_min=0, a_max=1) * 255
140
+ video = video.astype(np.uint8)
141
+
142
+ self.logger.experiment.log(
143
+ {
144
+ key: wandb.Video(video, fps=fps, format=format),
145
+ },
146
+ step=self.global_step,
147
+ )
148
+
149
+ def log_image(
150
+ self,
151
+ key: str,
152
+ image: Union[np.ndarray, torch.Tensor, Image.Image, Sequence[Image.Image]],
153
+ mean: Union[np.ndarray, torch.Tensor, Sequence, float] = None,
154
+ std: Union[np.ndarray, torch.Tensor, Sequence, float] = None,
155
+ **kwargs: Any,
156
+ ):
157
+ """
158
+ Log image(s) using WandbLogger.
159
+ Args:
160
+ key: the name of the video.
161
+ image: a single image or a batch of images. If a batch of images, the shape should be (batch, channel, height, width).
162
+ mean: optional, the mean to unnormalize image tensor, assuming unnormalized data is in [0, 1].
163
+ std: optional, the std to unnormalize tensor, assuming unnormalized data is in [0, 1].
164
+ kwargs: optional, WandbLogger log_image kwargs, such as captions=xxx.
165
+ """
166
+ if isinstance(image, Image.Image):
167
+ image = [image]
168
+ elif len(image) and not isinstance(image[0], Image.Image):
169
+ if isinstance(image, torch.Tensor):
170
+ image = image.detach().cpu().numpy()
171
+
172
+ if len(image.shape) == 3:
173
+ image = image[None]
174
+
175
+ if image.shape[1] == 3:
176
+ if image.shape[-1] == 3:
177
+ warnings.warn(f"Two channels in shape {image.shape} have size 3, assuming channel first.")
178
+ image = einops.rearrange(image, "b c h w -> b h w c")
179
+
180
+ if std is not None:
181
+ if isinstance(std, (float, int)):
182
+ std = [std] * 3
183
+ if isinstance(std, torch.Tensor):
184
+ std = std.detach().cpu().numpy()
185
+ std = np.array(std)[None, None, None]
186
+ image = image * std
187
+ if mean is not None:
188
+ if isinstance(mean, (float, int)):
189
+ mean = [mean] * 3
190
+ if isinstance(mean, torch.Tensor):
191
+ mean = mean.detach().cpu().numpy()
192
+ mean = np.array(mean)[None, None, None]
193
+ image = image + mean
194
+
195
+ if image.dtype != np.uint8:
196
+ image = np.clip(image, a_min=0.0, a_max=1.0) * 255
197
+ image = image.astype(np.uint8)
198
+ image = [img for img in image]
199
+
200
+ self.logger.log_image(key=key, images=image, **kwargs)
201
+
202
+ def log_gradient_stats(self):
203
+ """Log gradient statistics such as the mean or std of norm."""
204
+
205
+ with torch.no_grad():
206
+ grad_norms = []
207
+ gpr = [] # gradient-to-parameter ratio
208
+ for param in self.parameters():
209
+ if param.grad is not None:
210
+ grad_norms.append(torch.norm(param.grad).item())
211
+ gpr.append(torch.norm(param.grad) / torch.norm(param))
212
+ if len(grad_norms) == 0:
213
+ return
214
+ grad_norms = torch.tensor(grad_norms)
215
+ gpr = torch.tensor(gpr)
216
+ self.log_dict(
217
+ {
218
+ "train/grad_norm/min": grad_norms.min(),
219
+ "train/grad_norm/max": grad_norms.max(),
220
+ "train/grad_norm/std": grad_norms.std(),
221
+ "train/grad_norm/mean": grad_norms.mean(),
222
+ "train/grad_norm/median": torch.median(grad_norms),
223
+ "train/gpr/min": gpr.min(),
224
+ "train/gpr/max": gpr.max(),
225
+ "train/gpr/std": gpr.std(),
226
+ "train/gpr/mean": gpr.mean(),
227
+ "train/gpr/median": torch.median(gpr),
228
+ }
229
+ )
230
+
231
+ def register_data_mean_std(
232
+ self, mean: Union[str, float, Sequence], std: Union[str, float, Sequence], namespace: str = "data"
233
+ ):
234
+ """
235
+ Register mean and std of data as tensor buffer.
236
+
237
+ Args:
238
+ mean: the mean of data.
239
+ std: the std of data.
240
+ namespace: the namespace of the registered buffer.
241
+ """
242
+ for k, v in [("mean", mean), ("std", std)]:
243
+ if isinstance(v, str):
244
+ if v.endswith(".npy"):
245
+ v = torch.from_numpy(np.load(v))
246
+ elif v.endswith(".pt"):
247
+ v = torch.load(v)
248
+ else:
249
+ raise ValueError(f"Unsupported file type {v.split('.')[-1]}.")
250
+ else:
251
+ v = torch.tensor(v)
252
+ self.register_buffer(f"{namespace}_{k}", v.float().to(self.device))
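A minimal subclass sketch for `BasePytorchAlgo` above (the toy model and config keys are hypothetical, not part of this commit):

```python
import torch.nn as nn
import torch.nn.functional as F
from omegaconf import OmegaConf

from algorithms.common.base_pytorch_algo import BasePytorchAlgo


class ToyAlgo(BasePytorchAlgo):
    def _build_model(self):
        # Called from BasePytorchAlgo.__init__ once self.cfg is set.
        self.net = nn.Linear(self.cfg.in_dim, self.cfg.out_dim)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.mse_loss(self.net(x), y)
        self.log("training/loss", loss)
        return loss


# The default configure_optimizers in the base class reads cfg.lr.
algo = ToyAlgo(OmegaConf.create({"in_dim": 8, "out_dim": 1, "lr": 1e-3}))
```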
algorithms/common/metrics/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .fid import FrechetInceptionDistance
2
+ from .lpips import LearnedPerceptualImagePatchSimilarity
3
+ from .fvd import FrechetVideoDistance
algorithms/common/metrics/fid.py ADDED
@@ -0,0 +1 @@
1
+ from torchmetrics.image.fid import FrechetInceptionDistance
algorithms/common/metrics/fvd.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Adapted from https://github.com/cvpr2022-stylegan-v/stylegan-v
3
+ Verified to be the same as tf version by https://github.com/universome/fvd-comparison
4
+ """
5
+
6
+ import io
7
+ import re
8
+ import requests
9
+ import html
10
+ import hashlib
11
+ import urllib
12
+ import urllib.request
13
+ from typing import Any, List, Tuple, Union, Dict
14
+ import scipy
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ import numpy as np
19
+
20
+
21
+ def open_url(
22
+ url: str,
23
+ num_attempts: int = 10,
24
+ verbose: bool = True,
25
+ return_filename: bool = False,
26
+ ) -> Any:
27
+ """Download the given URL and return a binary-mode file object to access the data."""
28
+ assert num_attempts >= 1
29
+
30
+ # Doesn't look like an URL scheme so interpret it as a local filename.
31
+ if not re.match("^[a-z]+://", url):
32
+ return url if return_filename else open(url, "rb")
33
+
34
+ # Handle file URLs. This code handles unusual file:// patterns that
35
+ # arise on Windows:
36
+ #
37
+ # file:///c:/foo.txt
38
+ #
39
+ # which would translate to a local '/c:/foo.txt' filename that's
40
+ # invalid. Drop the forward slash for such pathnames.
41
+ #
42
+ # If you touch this code path, you should test it on both Linux and
43
+ # Windows.
44
+ #
45
+ # Some internet resources suggest using urllib.request.url2pathname() but
46
+ # that converts forward slashes to backslashes and this causes
47
+ # its own set of problems.
48
+ if url.startswith("file://"):
49
+ filename = urllib.parse.urlparse(url).path
50
+ if re.match(r"^/[a-zA-Z]:", filename):
51
+ filename = filename[1:]
52
+ return filename if return_filename else open(filename, "rb")
53
+
54
+ url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
55
+
56
+ # Download.
57
+ url_name = None
58
+ url_data = None
59
+ with requests.Session() as session:
60
+ if verbose:
61
+ print("Downloading %s ..." % url, end="", flush=True)
62
+ for attempts_left in reversed(range(num_attempts)):
63
+ try:
64
+ with session.get(url) as res:
65
+ res.raise_for_status()
66
+ if len(res.content) == 0:
67
+ raise IOError("No data received")
68
+
69
+ if len(res.content) < 8192:
70
+ content_str = res.content.decode("utf-8")
71
+ if "download_warning" in res.headers.get("Set-Cookie", ""):
72
+ links = [
73
+ html.unescape(link)
74
+ for link in content_str.split('"')
75
+ if "export=download" in link
76
+ ]
77
+ if len(links) == 1:
78
+ url = requests.compat.urljoin(url, links[0])
79
+ raise IOError("Google Drive virus checker nag")
80
+ if "Google Drive - Quota exceeded" in content_str:
81
+ raise IOError(
82
+ "Google Drive download quota exceeded -- please try again later"
83
+ )
84
+
85
+ match = re.search(
86
+ r'filename="([^"]*)"',
87
+ res.headers.get("Content-Disposition", ""),
88
+ )
89
+ url_name = match[1] if match else url
90
+ url_data = res.content
91
+ if verbose:
92
+ print(" done")
93
+ break
94
+ except KeyboardInterrupt:
95
+ raise
96
+ except:
97
+ if not attempts_left:
98
+ if verbose:
99
+ print(" failed")
100
+ raise
101
+ if verbose:
102
+ print(".", end="", flush=True)
103
+
104
+ # Return data as file object.
105
+ assert not return_filename
106
+ return io.BytesIO(url_data)
107
+
108
+
109
+ def compute_fvd(feats_fake: np.ndarray, feats_real: np.ndarray) -> float:
110
+ mu_gen, sigma_gen = compute_stats(feats_fake)
111
+ mu_real, sigma_real = compute_stats(feats_real)
112
+
113
+ m = np.square(mu_gen - mu_real).sum()
114
+ s, _ = scipy.linalg.sqrtm(
115
+ np.dot(sigma_gen, sigma_real), disp=False
116
+ ) # pylint: disable=no-member
117
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
118
+
119
+ return float(fid)
120
+
121
+
122
+ def compute_stats(feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
123
+ mu = feats.mean(axis=0) # [d]
124
+ sigma = np.cov(feats, rowvar=False) # [d, d]
125
+
126
+ return mu, sigma
127
+
128
+
129
+ class FrechetVideoDistance(nn.Module):
130
+ def __init__(self):
131
+ super().__init__()
132
+ detector_url = (
133
+ "https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1"
134
+ )
135
+ # Return raw features before the softmax layer.
136
+ self.detector_kwargs = dict(rescale=False, resize=True, return_features=True)
137
+ with open_url(detector_url, verbose=False) as f:
138
+ self.detector = torch.jit.load(f).eval()
139
+
140
+ @torch.no_grad()
141
+ def compute(self, videos_fake: torch.Tensor, videos_real: torch.Tensor):
142
+ """
143
+ :param videos_fake: predicted video tensor of shape (frame, batch, channel, height, width)
144
+ :param videos_real: ground-truth observation tensor of shape (frame, batch, channel, height, width)
145
+ :return:
146
+ """
147
+ n_frames, batch_size, c, h, w = videos_fake.shape
148
+ if n_frames < 2:
149
+ raise ValueError("Video must have more than 1 frame for FVD")
150
+
151
+ videos_fake = videos_fake.permute(1, 2, 0, 3, 4).contiguous()
152
+ videos_real = videos_real.permute(1, 2, 0, 3, 4).contiguous()
153
+
154
+ # detector takes in tensors of shape [batch_size, c, video_len, h, w] with range -1 to 1
155
+ feats_fake = self.detector(videos_fake, **self.detector_kwargs).cpu().numpy()
156
+ feats_real = self.detector(videos_real, **self.detector_kwargs).cpu().numpy()
157
+
158
+ return compute_fvd(feats_fake, feats_real)
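A minimal usage sketch for the metric above (illustrative only; the random tensors are placeholders, the shapes and the [-1, 1] value range follow the `compute` docstring, and constructing the class downloads the TorchScript I3D detector):

```python
import torch

from algorithms.common.metrics.fvd import FrechetVideoDistance

fvd = FrechetVideoDistance()  # downloads i3d_torchscript.pt on construction

# (frame, batch, channel, height, width), values in [-1, 1]
fake = torch.rand(16, 4, 3, 224, 224) * 2 - 1
real = torch.rand(16, 4, 3, 224, 224) * 2 - 1

print(fvd.compute(fake, real))  # scalar FVD between the two sets of videos
```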
algorithms/common/metrics/lpips.py ADDED
@@ -0,0 +1 @@
1
+ from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
algorithms/common/models/__init__.py ADDED
File without changes
algorithms/common/models/cnn.py ADDED
@@ -0,0 +1,141 @@
1
+ import math
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+
5
+
6
+ def is_square_of_two(num):
7
+ if num <= 0:
8
+ return False
9
+ return num & (num - 1) == 0
10
+
11
+ class CnnEncoder(nn.Module):
12
+ """
13
+ Simple cnn encoder that encodes a 64x64 image to embeddings
14
+ """
15
+ def __init__(self, embedding_size, activation_function='relu'):
16
+ super().__init__()
17
+ self.act_fn = getattr(F, activation_function)
18
+ self.embedding_size = embedding_size
19
+ self.fc = nn.Linear(1024, self.embedding_size)
20
+ self.conv1 = nn.Conv2d(3, 32, 4, stride=2)
21
+ self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
22
+ self.conv3 = nn.Conv2d(64, 128, 4, stride=2)
23
+ self.conv4 = nn.Conv2d(128, 256, 4, stride=2)
24
+ self.modules = [self.conv1, self.conv2, self.conv3, self.conv4]
25
+
26
+ def forward(self, observation):
27
+ batch_size = observation.shape[0]
28
+ hidden = self.act_fn(self.conv1(observation))
29
+ hidden = self.act_fn(self.conv2(hidden))
30
+ hidden = self.act_fn(self.conv3(hidden))
31
+ hidden = self.act_fn(self.conv4(hidden))
32
+ hidden = self.fc(hidden.view(batch_size, 1024))
33
+ return hidden
34
+
35
+
36
+ class CnnDecoder(nn.Module):
37
+ """
38
+ Simple Cnn decoder that decodes an embedding to 64x64 images
39
+ """
40
+ def __init__(self, embedding_size, activation_function='relu'):
41
+ super().__init__()
42
+ self.act_fn = getattr(F, activation_function)
43
+ self.embedding_size = embedding_size
44
+ self.fc = nn.Linear(embedding_size, 128)
45
+ self.conv1 = nn.ConvTranspose2d(128, 128, 5, stride=2)
46
+ self.conv2 = nn.ConvTranspose2d(128, 64, 5, stride=2)
47
+ self.conv3 = nn.ConvTranspose2d(64, 32, 6, stride=2)
48
+ self.conv4 = nn.ConvTranspose2d(32, 3, 6, stride=2)
49
+ self.modules = [self.conv1, self.conv2, self.conv3, self.conv4]
50
+
51
+ def forward(self, embedding):
52
+ batch_size = embedding.shape[0]
53
+ hidden = self.fc(embedding)
54
+ hidden = hidden.view(batch_size, 128, 1, 1)
55
+ hidden = self.act_fn(self.conv1(hidden))
56
+ hidden = self.act_fn(self.conv2(hidden))
57
+ hidden = self.act_fn(self.conv3(hidden))
58
+ observation = self.conv4(hidden)
59
+ return observation
60
+
61
+
62
+ class FullyConvEncoder(nn.Module):
63
+ """
64
+ Simple fully convolutional encoder, with 2D input and 2D output
65
+ """
66
+ def __init__(self,
67
+ input_shape=(3, 64, 64),
68
+ embedding_shape=(8, 16, 16),
69
+ activation_function='relu',
70
+ init_channels=16,
71
+ ):
72
+ super().__init__()
73
+
74
+ assert len(input_shape) == 3, "input_shape must be a tuple of length 3"
75
+ assert len(embedding_shape) == 3, "embedding_shape must be a tuple of length 3"
76
+ assert input_shape[1] == input_shape[2] and is_square_of_two(input_shape[1]), "input_shape must be square"
77
+ assert embedding_shape[1] == embedding_shape[2], "embedding_shape must be square"
78
+ assert input_shape[1] % embedding_shape[1] == 0, "input_shape must be divisible by embedding_shape"
79
+ assert is_square_of_two(init_channels), "init_channels must be a square of 2"
80
+
81
+ depth = int(math.sqrt(input_shape[1] / embedding_shape[1])) + 1
82
+ channels_per_layer = [init_channels * (2 ** i) for i in range(depth)]
83
+ self.act_fn = getattr(F, activation_function)
84
+
85
+ self.downs = nn.ModuleList([])
86
+ self.downs.append(nn.Conv2d(input_shape[0], channels_per_layer[0], kernel_size=3, stride=1, padding=1))
87
+
88
+ for i in range(1, depth):
89
+ self.downs.append(nn.Conv2d(channels_per_layer[i-1], channels_per_layer[i],
90
+ kernel_size=3, stride=2, padding=1))
91
+
92
+ # Bottleneck layer
93
+ self.downs.append(nn.Conv2d(channels_per_layer[-1], embedding_shape[0], kernel_size=1, stride=1, padding=0))
94
+
95
+ def forward(self, observation):
96
+ hidden = observation
97
+ for layer in self.downs:
98
+ hidden = self.act_fn(layer(hidden))
99
+ return hidden
100
+
101
+
102
+ class FullyConvDecoder(nn.Module):
103
+ """
104
+ Simple fully convolutional decoder, with 2D input and 2D output
105
+ """
106
+ def __init__(self,
107
+ embedding_shape=(8, 16, 16),
108
+ output_shape=(3, 64, 64),
109
+ activation_function='relu',
110
+ init_channels=16,
111
+ ):
112
+ super().__init__()
113
+
114
+ assert len(embedding_shape) == 3, "embedding_shape must be a tuple of length 3"
115
+ assert len(output_shape) == 3, "output_shape must be a tuple of length 3"
116
+ assert output_shape[1] == output_shape[2] and is_square_of_two(output_shape[1]), "output_shape must be square"
117
+ assert embedding_shape[1] == embedding_shape[2], "input_shape must be square"
118
+ assert output_shape[1] % embedding_shape[1] == 0, "output_shape must be divisible by input_shape"
119
+ assert is_square_of_two(init_channels), "init_channels must be a square of 2"
120
+
121
+ depth = int(math.sqrt(output_shape[1] / embedding_shape[1])) + 1
122
+ channels_per_layer = [init_channels * (2 ** i) for i in range(depth)]
123
+ self.act_fn = getattr(F, activation_function)
124
+
125
+ self.ups = nn.ModuleList([])
126
+ self.ups.append(nn.ConvTranspose2d(embedding_shape[0], channels_per_layer[-1],
127
+ kernel_size=1, stride=1, padding=0))
128
+
129
+ for i in range(1, depth):
130
+ self.ups.append(nn.ConvTranspose2d(channels_per_layer[-i], channels_per_layer[-i-1],
131
+ kernel_size=3, stride=2, padding=1, output_padding=1))
132
+
133
+ self.output_layer = nn.ConvTranspose2d(channels_per_layer[0], output_shape[0],
134
+ kernel_size=3, stride=1, padding=1)
135
+
136
+ def forward(self, embedding):
137
+ hidden = embedding
138
+ for layer in self.ups:
139
+ hidden = self.act_fn(layer(hidden))
140
+
141
+ return self.output_layer(hidden)
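A quick shape-check sketch for the fully convolutional pair above (using the constructors' default shapes; the random input is a placeholder):

```python
import torch

from algorithms.common.models.cnn import FullyConvEncoder, FullyConvDecoder

enc = FullyConvEncoder(input_shape=(3, 64, 64), embedding_shape=(8, 16, 16))
dec = FullyConvDecoder(embedding_shape=(8, 16, 16), output_shape=(3, 64, 64))

x = torch.randn(2, 3, 64, 64)   # (batch, channel, height, width)
z = enc(x)                      # -> (2, 8, 16, 16): two stride-2 convs plus a 1x1 bottleneck
print(dec(z).shape)             # -> torch.Size([2, 3, 64, 64])
```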
algorithms/common/models/mlp.py ADDED
@@ -0,0 +1,22 @@
1
+ from typing import Type, Optional
2
+
3
+ import torch
4
+ from torch import nn as nn
5
+
6
+
7
+ class SimpleMlp(nn.Module):
8
+ """
9
+ A class for very simple multi layer perceptron
10
+ """
11
+ def __init__(self, in_dim=2, out_dim=1, hidden_dim=64, n_layers=2,
12
+ activation: Type[nn.Module] = nn.ReLU, output_activation: Optional[Type[nn.Module]] = None):
13
+ super(SimpleMlp, self).__init__()
14
+ layers = [nn.Linear(in_dim, hidden_dim), activation()]
15
+ layers.extend([nn.Linear(hidden_dim, hidden_dim), activation()] * (n_layers - 2))
16
+ layers.append(nn.Linear(hidden_dim, out_dim))
17
+ if output_activation:
18
+ layers.append(output_activation())
19
+ self.net = nn.Sequential(*layers)
20
+
21
+ def forward(self, x):
22
+ return self.net(x)
algorithms/worldmem/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .df_video import WorldMemMinecraft
2
+ from .pose_prediction import PosePrediction
algorithms/worldmem/df_base.py ADDED
@@ -0,0 +1,307 @@
1
+ """
2
+ This repo is forked from [Boyuan Chen](https://boyuan.space/)'s research
3
+ template [repo](https://github.com/buoyancy99/research-template).
4
+ By its MIT license, you must keep the above sentence in `README.md`
5
+ and the `LICENSE` file to credit the author.
6
+ """
7
+
8
+ from typing import Optional
9
+ from tqdm import tqdm
10
+ from omegaconf import DictConfig
11
+ import numpy as np
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from typing import Any
15
+ from einops import rearrange
16
+
17
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
18
+
19
+ from algorithms.common.base_pytorch_algo import BasePytorchAlgo
20
+ from .models.diffusion import Diffusion
21
+
22
+
23
+ class DiffusionForcingBase(BasePytorchAlgo):
24
+ def __init__(self, cfg: DictConfig):
25
+ self.cfg = cfg
26
+ self.x_shape = cfg.x_shape
27
+ self.frame_stack = cfg.frame_stack
28
+ self.x_stacked_shape = list(self.x_shape)
29
+ self.x_stacked_shape[0] *= cfg.frame_stack
30
+ self.guidance_scale = cfg.guidance_scale
31
+ self.context_frames = cfg.context_frames
32
+ self.chunk_size = cfg.chunk_size
33
+ self.action_cond_dim = cfg.action_cond_dim
34
+ self.causal = cfg.causal
35
+
36
+ self.uncertainty_scale = cfg.uncertainty_scale
37
+ self.timesteps = cfg.diffusion.timesteps
38
+ self.sampling_timesteps = cfg.diffusion.sampling_timesteps
39
+ self.clip_noise = cfg.diffusion.clip_noise
40
+
41
+ self.cfg.diffusion.cum_snr_decay = self.cfg.diffusion.cum_snr_decay ** (self.frame_stack * cfg.frame_skip)
42
+
43
+ self.validation_step_outputs = []
44
+ super().__init__(cfg)
45
+
46
+ def _build_model(self):
47
+ self.diffusion_model = Diffusion(
48
+ x_shape=self.x_stacked_shape,
49
+ action_cond_dim=self.action_cond_dim,
50
+ is_causal=self.causal,
51
+ cfg=self.cfg.diffusion,
52
+ )
53
+ self.register_data_mean_std(self.cfg.data_mean, self.cfg.data_std)
54
+
55
+ def configure_optimizers(self):
56
+ params = tuple(self.diffusion_model.parameters())
57
+ optimizer_dynamics = torch.optim.AdamW(
58
+ params, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay, betas=self.cfg.optimizer_beta
59
+ )
60
+ return optimizer_dynamics
61
+
62
+ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
63
+ # update params
64
+ optimizer.step(closure=optimizer_closure)
65
+
66
+ # manually warm up lr without a scheduler
67
+ if self.trainer.global_step < self.cfg.warmup_steps:
68
+ lr_scale = min(1.0, float(self.trainer.global_step + 1) / self.cfg.warmup_steps)
69
+ for pg in optimizer.param_groups:
70
+ pg["lr"] = lr_scale * self.cfg.lr
71
+
72
+ def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
73
+ xs, conditions, masks = self._preprocess_batch(batch)
74
+
75
+ rand_length = torch.randint(3,xs.shape[0]-2, (1,))[0].item()
76
+ xs = torch.cat([xs[:rand_length], xs[rand_length-3:rand_length-1]])
77
+ conditions = torch.cat([conditions[:rand_length], conditions[rand_length-3:rand_length-1]])
78
+ masks = torch.cat([masks[:rand_length], masks[rand_length-3:rand_length-1]])
79
+ noise_levels=self._generate_noise_levels(xs)
80
+ noise_levels[:rand_length] = 15 # stable_noise_levels
81
+ noise_levels[rand_length+1:] = 15 # stable_noise_levels
82
+
83
+ xs_pred, loss = self.diffusion_model(xs, conditions, noise_levels=noise_levels)
84
+ loss = self.reweight_loss(loss, masks)
85
+
86
+ # log the loss
87
+ if batch_idx % 20 == 0:
88
+ self.log("training/loss", loss)
89
+
90
+ xs = self._unstack_and_unnormalize(xs)
91
+ xs_pred = self._unstack_and_unnormalize(xs_pred)
92
+
93
+ output_dict = {
94
+ "loss": loss,
95
+ "xs_pred": xs_pred,
96
+ "xs": xs,
97
+ }
98
+
99
+ return output_dict
100
+
101
+ @torch.no_grad()
102
+ def validation_step(self, batch, batch_idx, namespace="validation") -> STEP_OUTPUT:
103
+ xs, conditions, masks = self._preprocess_batch(batch)
104
+ n_frames, batch_size, *_ = xs.shape
105
+ xs_pred = []
106
+ curr_frame = 0
107
+
108
+ # context
109
+ n_context_frames = self.context_frames // self.frame_stack
110
+ xs_pred = xs[:n_context_frames].clone()
111
+ curr_frame += n_context_frames
112
+
113
+ if self.condtion_similar_length:
114
+ n_frames -= self.condtion_similar_length
115
+
116
+ pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
117
+ while curr_frame < n_frames:
118
+ if self.chunk_size > 0:
119
+ horizon = min(n_frames - curr_frame, self.chunk_size)
120
+ else:
121
+ horizon = n_frames - curr_frame
122
+ assert horizon <= self.n_tokens, "horizon exceeds the number of tokens."
123
+ scheduling_matrix = self._generate_scheduling_matrix(horizon)
124
+
125
+ chunk = torch.randn((horizon, batch_size, *self.x_stacked_shape), device=self.device)
126
+ chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise)
127
+ xs_pred = torch.cat([xs_pred, chunk], 0)
128
+
129
+ # sliding window: only input the last n_tokens frames
130
+ start_frame = max(0, curr_frame + horizon - self.n_tokens)
131
+
132
+ pbar.set_postfix(
133
+ {
134
+ "start": start_frame,
135
+ "end": curr_frame + horizon,
136
+ }
137
+ )
138
+
139
+ if self.condtion_similar_length:
140
+ xs_pred = torch.cat([xs_pred, xs[curr_frame-self.condtion_similar_length:curr_frame].clone()], 0)
141
+
142
+ for m in range(scheduling_matrix.shape[0] - 1):
143
+
144
+ from_noise_levels = np.concatenate((np.zeros((curr_frame,), dtype=np.int64), scheduling_matrix[m]))[
145
+ :, None
146
+ ].repeat(batch_size, axis=1)
147
+ to_noise_levels = np.concatenate(
148
+ (
149
+ np.zeros((curr_frame,), dtype=np.int64),
150
+ scheduling_matrix[m + 1],
151
+ )
152
+ )[
153
+ :, None
154
+ ].repeat(batch_size, axis=1)
155
+
156
+ if self.condtion_similar_length:
157
+ from_noise_levels = np.concatenate([from_noise_levels, np.array([[0,0,0,0]*self.condtion_similar_length])], axis=0)
158
+ to_noise_levels = np.concatenate([to_noise_levels, np.array([[0,0,0,0]*self.condtion_similar_length])], axis=0)
159
+
160
+ from_noise_levels = torch.from_numpy(from_noise_levels).to(self.device)
161
+ to_noise_levels = torch.from_numpy(to_noise_levels).to(self.device)
162
+
163
+ # update xs_pred by DDIM or DDPM sampling
164
+ # input frames within the sliding window
165
+
166
+ try:
167
+ input_condition = conditions[start_frame : curr_frame + horizon].clone()
168
+ except:
169
+ import pdb;pdb.set_trace()
170
+ if self.condtion_similar_length:
171
+ input_condition = torch.cat([conditions[start_frame : curr_frame + horizon], conditions[-self.condtion_similar_length:]], dim=0)
172
+ xs_pred[start_frame:] = self.diffusion_model.sample_step(
173
+ xs_pred[start_frame:],
174
+ input_condition,
175
+ from_noise_levels[start_frame:],
176
+ to_noise_levels[start_frame:],
177
+ )
178
+
179
+ if self.condtion_similar_length:
180
+ xs_pred = xs_pred[:-self.condtion_similar_length]
181
+
182
+ curr_frame += horizon
183
+ pbar.update(horizon)
184
+
185
+ if self.condtion_similar_length:
186
+ xs = xs[:-self.condtion_similar_length]
187
+ # FIXME: loss
188
+ loss = F.mse_loss(xs_pred, xs, reduction="none")
189
+ loss = self.reweight_loss(loss, masks)
190
+ self.validation_step_outputs.append((xs_pred.detach().cpu(), xs.detach().cpu()))
191
+
192
+ return loss
193
+
194
+ def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
195
+ return self.validation_step(*args, **kwargs, namespace="test")
196
+
197
+ def on_test_epoch_end(self) -> None:
198
+ self.on_validation_epoch_end(namespace="test")
199
+
200
+ def _generate_noise_levels(self, xs: torch.Tensor, masks: Optional[torch.Tensor] = None) -> torch.Tensor:
201
+ """
202
+ Generate noise levels for training.
203
+ """
204
+ num_frames, batch_size, *_ = xs.shape
205
+ match self.cfg.noise_level:
206
+ case "random_all": # entirely random noise levels
207
+ noise_levels = torch.randint(0, self.timesteps, (num_frames, batch_size), device=xs.device)
208
+ case "same":
209
+ noise_levels = torch.randint(0, self.timesteps, (num_frames, batch_size), device=xs.device)
210
+ noise_levels[1:] = noise_levels[0]
211
+
212
+ if masks is not None:
213
+ # for frames that are not available, treat as full noise
214
+ discard = torch.all(~rearrange(masks.bool(), "(t fs) b -> t b fs", fs=self.frame_stack), -1)
215
+ noise_levels = torch.where(discard, torch.full_like(noise_levels, self.timesteps - 1), noise_levels)
216
+
217
+ return noise_levels
218
+
219
+ def _generate_scheduling_matrix(self, horizon: int):
220
+ match self.cfg.scheduling_matrix:
221
+ case "pyramid":
222
+ return self._generate_pyramid_scheduling_matrix(horizon, self.uncertainty_scale)
223
+ case "full_sequence":
224
+ return np.arange(self.sampling_timesteps, -1, -1)[:, None].repeat(horizon, axis=1)
225
+ case "autoregressive":
226
+ return self._generate_pyramid_scheduling_matrix(horizon, self.sampling_timesteps)
227
+ case "trapezoid":
228
+ return self._generate_trapezoid_scheduling_matrix(horizon, self.uncertainty_scale)
229
+
230
+ def _generate_pyramid_scheduling_matrix(self, horizon: int, uncertainty_scale: float):
231
+ height = self.sampling_timesteps + int((horizon - 1) * uncertainty_scale) + 1
232
+ scheduling_matrix = np.zeros((height, horizon), dtype=np.int64)
233
+ for m in range(height):
234
+ for t in range(horizon):
235
+ scheduling_matrix[m, t] = self.sampling_timesteps + int(t * uncertainty_scale) - m
236
+
237
+ return np.clip(scheduling_matrix, 0, self.sampling_timesteps)
238
+
239
+ def _generate_trapezoid_scheduling_matrix(self, horizon: int, uncertainty_scale: float):
240
+ height = self.sampling_timesteps + int((horizon + 1) // 2 * uncertainty_scale)
241
+ scheduling_matrix = np.zeros((height, horizon), dtype=np.int64)
242
+ for m in range(height):
243
+ for t in range((horizon + 1) // 2):
244
+ scheduling_matrix[m, t] = self.sampling_timesteps + int(t * uncertainty_scale) - m
245
+ scheduling_matrix[m, -t] = self.sampling_timesteps + int(t * uncertainty_scale) - m
246
+
247
+ return np.clip(scheduling_matrix, 0, self.sampling_timesteps)
248
+
249
+ def reweight_loss(self, loss, weight=None):
250
+ # Note there is another part of loss reweighting (fused_snr) inside the Diffusion class!
251
+ loss = rearrange(loss, "t b (fs c) ... -> t b fs c ...", fs=self.frame_stack)
252
+ if weight is not None:
253
+ expand_dim = len(loss.shape) - len(weight.shape) - 1
254
+ weight = rearrange(
255
+ weight,
256
+ "(t fs) b ... -> t b fs ..." + " 1" * expand_dim,
257
+ fs=self.frame_stack,
258
+ )
259
+ loss = loss * weight
260
+
261
+ return loss.mean()
262
+
263
+ def _preprocess_batch(self, batch):
264
+ xs = batch[0]
265
+ batch_size, n_frames = xs.shape[:2]
266
+
267
+ if n_frames % self.frame_stack != 0:
268
+ raise ValueError("Number of frames must be divisible by frame stack size")
269
+ if self.context_frames % self.frame_stack != 0:
270
+ raise ValueError("Number of context frames must be divisible by frame stack size")
271
+
272
+ masks = torch.ones(n_frames, batch_size).to(xs.device)
273
+ n_frames = n_frames // self.frame_stack
274
+
275
+ if self.action_cond_dim:
276
+ conditions = batch[1]
277
+ conditions = torch.cat([torch.zeros_like(conditions[:, :1]), conditions[:, 1:]], 1)
278
+ conditions = rearrange(conditions, "b (t fs) d -> t b (fs d)", fs=self.frame_stack).contiguous()
279
+
280
+ # f, _, _ = conditions.shape
281
+ # predefined_1 = torch.tensor([0,0,0,1]).to(conditions.device)
282
+ # predefined_2 = torch.tensor([0,0,1,0]).to(conditions.device)
283
+ # conditions[:f//2] = predefined_1
284
+ # conditions[f//2:] = predefined_2
285
+ else:
286
+ conditions = [None for _ in range(n_frames)]
287
+
288
+ xs = self._normalize_x(xs)
289
+ xs = rearrange(xs, "b (t fs) c ... -> t b (fs c) ...", fs=self.frame_stack).contiguous()
290
+
291
+ return xs, conditions, masks
292
+
293
+ def _normalize_x(self, xs):
294
+ shape = [1] * (xs.ndim - self.data_mean.ndim) + list(self.data_mean.shape)
295
+ mean = self.data_mean.reshape(shape)
296
+ std = self.data_std.reshape(shape)
297
+ return (xs - mean) / std
298
+
299
+ def _unnormalize_x(self, xs):
300
+ shape = [1] * (xs.ndim - self.data_mean.ndim) + list(self.data_mean.shape)
301
+ mean = self.data_mean.reshape(shape)
302
+ std = self.data_std.reshape(shape)
303
+ return xs * std + mean
304
+
305
+ def _unstack_and_unnormalize(self, xs):
306
+ xs = rearrange(xs, "t b (fs c) ... -> (t fs) b c ...", fs=self.frame_stack)
307
+ return self._unnormalize_x(xs)
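To make the pyramid schedule above concrete, here is a standalone sketch of the same computation with toy parameters (illustrative only; it mirrors `_generate_pyramid_scheduling_matrix` rather than reusing the class):

```python
import numpy as np


def pyramid_matrix(sampling_timesteps: int, horizon: int, uncertainty_scale: float) -> np.ndarray:
    # Later frames stay at higher noise levels for longer, giving a "pyramid" of noise.
    height = sampling_timesteps + int((horizon - 1) * uncertainty_scale) + 1
    m = np.zeros((height, horizon), dtype=np.int64)
    for row in range(height):
        for t in range(horizon):
            m[row, t] = sampling_timesteps + int(t * uncertainty_scale) - row
    return np.clip(m, 0, sampling_timesteps)


print(pyramid_matrix(sampling_timesteps=4, horizon=3, uncertainty_scale=1.0))
# Each row is one sampling step; column t is the noise level of frame t:
# [[4 4 4]
#  [3 4 4]
#  [2 3 4]
#  [1 2 3]
#  [0 1 2]
#  [0 0 1]
#  [0 0 0]]
```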
algorithms/worldmem/df_video.py ADDED
@@ -0,0 +1,926 @@
1
+ import os
2
+ import random
3
+ import math
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchvision.transforms.functional as TF
8
+ from torchvision.transforms import InterpolationMode
9
+ from PIL import Image
10
+ from packaging import version as pver
11
+ from einops import rearrange
12
+ from tqdm import tqdm
13
+ from omegaconf import DictConfig
14
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
15
+ from algorithms.common.metrics import (
16
+ LearnedPerceptualImagePatchSimilarity,
17
+ )
18
+ from utils.logging_utils import log_video, get_validation_metrics_for_videos
19
+ from .df_base import DiffusionForcingBase
20
+ from .models.vae import VAE_models
21
+ from .models.diffusion import Diffusion
22
+ from .models.pose_prediction import PosePredictionNet
23
+ import glob
24
+
25
+ # Utility Functions
26
+ def euler_to_rotation_matrix(pitch, yaw):
27
+ """
28
+ Convert pitch and yaw angles (in radians) to a 3x3 rotation matrix.
29
+ Supports batch input.
30
+
31
+ Args:
32
+ pitch (torch.Tensor): Pitch angles in radians.
33
+ yaw (torch.Tensor): Yaw angles in radians.
34
+
35
+ Returns:
36
+ torch.Tensor: Rotation matrix of shape (batch_size, 3, 3).
37
+ """
38
+ cos_pitch, sin_pitch = torch.cos(pitch), torch.sin(pitch)
39
+ cos_yaw, sin_yaw = torch.cos(yaw), torch.sin(yaw)
40
+
41
+ R_pitch = torch.stack([
42
+ torch.ones_like(pitch), torch.zeros_like(pitch), torch.zeros_like(pitch),
43
+ torch.zeros_like(pitch), cos_pitch, -sin_pitch,
44
+ torch.zeros_like(pitch), sin_pitch, cos_pitch
45
+ ], dim=-1).reshape(-1, 3, 3)
46
+
47
+ R_yaw = torch.stack([
48
+ cos_yaw, torch.zeros_like(yaw), sin_yaw,
49
+ torch.zeros_like(yaw), torch.ones_like(yaw), torch.zeros_like(yaw),
50
+ -sin_yaw, torch.zeros_like(yaw), cos_yaw
51
+ ], dim=-1).reshape(-1, 3, 3)
52
+
53
+ return torch.matmul(R_yaw, R_pitch)
54
+
55
+
56
+ def euler_to_camera_to_world_matrix(pose):
57
+ """
58
+ Convert (x, y, z, pitch, yaw) to a 4x4 camera-to-world transformation matrix using torch.
59
+ Supports both (5,) and (f, b, 5) shaped inputs.
60
+
61
+ Args:
62
+ pose (torch.Tensor): Pose tensor of shape (5,) or (f, b, 5).
63
+
64
+ Returns:
65
+ torch.Tensor: Camera-to-world transformation matrix of shape (4, 4).
66
+ """
67
+
68
+ origin_dim = pose.ndim
69
+ if origin_dim == 1:
70
+ pose = pose.unsqueeze(0).unsqueeze(0) # Convert (5,) -> (1, 1, 5)
71
+ elif origin_dim == 2:
72
+ pose = pose.unsqueeze(0)
73
+
74
+ x, y, z, pitch, yaw = pose[..., 0], pose[..., 1], pose[..., 2], pose[..., 3], pose[..., 4]
75
+ pitch, yaw = torch.deg2rad(pitch), torch.deg2rad(yaw)
76
+
77
+ # Compute rotation matrix (batch mode)
78
+ R = euler_to_rotation_matrix(pitch, yaw) # Shape (f*b, 3, 3)
79
+
80
+ # Create the 4x4 transformation matrix
81
+ eye = torch.eye(4, dtype=torch.float32, device=pose.device)
82
+ camera_to_world = eye.repeat(R.shape[0], 1, 1) # Shape (f*b, 4, 4)
83
+
84
+ # Assign rotation
85
+ camera_to_world[:, :3, :3] = R
86
+
87
+ # Assign translation
88
+ camera_to_world[:, :3, 3] = torch.stack([x.reshape(-1), y.reshape(-1), z.reshape(-1)], dim=-1)
89
+
90
+ # Reshape back to (f, b, 4, 4) if needed
91
+ if origin_dim == 3:
92
+ return camera_to_world.view(pose.shape[0], pose.shape[1], 4, 4)
93
+ elif origin_dim == 2:
94
+ return camera_to_world.view(pose.shape[0], 4, 4)
95
+ else:
96
+ return camera_to_world.squeeze(0).squeeze(0) # Convert (1,1,4,4) -> (4,4)
97
+
98
+ def is_inside_fov_3d_hv(points, center, center_pitch, center_yaw, fov_half_h, fov_half_v):
99
+ """
100
+ Check whether points are within a given 3D field of view (FOV)
101
+ with separately defined horizontal and vertical ranges.
102
+
103
+ The center view direction is specified by pitch and yaw (in degrees).
104
+
105
+ :param points: (N, B, 3) Sample point coordinates
106
+ :param center: (3,) Center coordinates of the FOV
107
+ :param center_pitch: Pitch angle of the center view (in degrees)
108
+ :param center_yaw: Yaw angle of the center view (in degrees)
109
+ :param fov_half_h: Horizontal half-FOV angle (in degrees)
110
+ :param fov_half_v: Vertical half-FOV angle (in degrees)
111
+ :return: Boolean tensor (N, B), indicating whether each point is inside the FOV
112
+ """
113
+ # Compute vectors relative to the center
114
+ vectors = points - center # shape (N, B, 3)
115
+ x = vectors[..., 0]
116
+ y = vectors[..., 1]
117
+ z = vectors[..., 2]
118
+
119
+ # Compute horizontal angle (yaw): measured with respect to the z-axis as the forward direction,
120
+ # and the x-axis as left-right, resulting in a range of -180 to 180 degrees.
121
+ azimuth = torch.atan2(x, z) * (180 / math.pi)
122
+
123
+ # Compute vertical angle (pitch): measured with respect to the horizontal plane,
124
+ # resulting in a range of -90 to 90 degrees.
125
+ elevation = torch.atan2(y, torch.sqrt(x**2 + z**2)) * (180 / math.pi)
126
+
127
+ # Compute the angular difference from the center view (handling circular angle wrap-around)
128
+ diff_azimuth = (azimuth - center_yaw).abs() % 360
129
+ diff_elevation = (elevation - center_pitch).abs() % 360
130
+
131
+ # Adjust values greater than 180 degrees to the shorter angular difference
132
+ diff_azimuth = torch.where(diff_azimuth > 180, 360 - diff_azimuth, diff_azimuth)
133
+ diff_elevation = torch.where(diff_elevation > 180, 360 - diff_elevation, diff_elevation)
134
+
135
+ # Check if both horizontal and vertical angles are within their respective FOV limits
136
+ return (diff_azimuth < fov_half_h) & (diff_elevation < fov_half_v)
137
+
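A minimal sanity-check sketch of the FOV test above, with illustrative values: a point straight ahead (+z) of a camera at the origin with pitch = yaw = 0 lies inside a 105 x 75 degree FOV, while a point directly behind it does not.

import torch

points = torch.tensor([[[0.0, 0.0, 5.0]],
                       [[0.0, 0.0, -5.0]]])   # (N=2, B=1, 3)
center = torch.zeros(1, 3)
inside = is_inside_fov_3d_hv(points, center,
                             center_pitch=torch.zeros(1), center_yaw=torch.zeros(1),
                             fov_half_h=torch.tensor(105.0 / 2), fov_half_v=torch.tensor(75.0 / 2))
print(inside)   # tensor([[ True], [False]])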
138
+ def generate_points_in_sphere(n_points, radius):
139
+ # Sample three independent uniform distributions
140
+ samples_r = torch.rand(n_points) # For radius distribution
141
+ samples_phi = torch.rand(n_points) # For azimuthal angle phi
142
+ samples_u = torch.rand(n_points) # For polar angle theta
143
+
144
+ # Apply cube root to ensure uniform volumetric distribution
145
+ r = radius * torch.pow(samples_r, 1/3)
146
+ # Azimuthal angle phi uniformly distributed in [0, 2π]
147
+ phi = 2 * math.pi * samples_phi
148
+ # Convert u to theta to ensure cos(theta) is uniformly distributed
149
+ theta = torch.acos(1 - 2 * samples_u)
150
+
151
+ # Convert spherical coordinates to Cartesian coordinates
152
+ x = r * torch.sin(theta) * torch.cos(phi)
153
+ y = r * torch.sin(theta) * torch.sin(phi)
154
+ z = r * torch.cos(theta)
155
+
156
+ points = torch.stack((x, y, z), dim=1)
157
+ return points
158
+
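A small sketch of the sampler above: the cube-root radius transform keeps the samples uniform per unit volume, so the fraction of points inside half the radius is (1/2)^3, and no point leaves the sphere.

import torch

pts = generate_points_in_sphere(10000, radius=30.0)
print(pts.shape)                                 # torch.Size([10000, 3])
print(pts.norm(dim=1).max())                     # just below 30.0
print((pts.norm(dim=1) < 15.0).float().mean())   # roughly 0.125 == (15/30)**3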
159
+ def tensor_max_with_number(tensor, number):
160
+ number_tensor = torch.tensor(number, dtype=tensor.dtype, device=tensor.device)
161
+ result = torch.max(tensor, number_tensor)
162
+ return result
163
+
164
+ def custom_meshgrid(*args):
165
+ # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
166
+ if pver.parse(torch.__version__) < pver.parse('1.10'):
167
+ return torch.meshgrid(*args)
168
+ else:
169
+ return torch.meshgrid(*args, indexing='ij')
170
+
171
+ def camera_to_world_to_world_to_camera(camera_to_world: torch.Tensor) -> torch.Tensor:
172
+ """
173
+ Convert Camera-to-World matrices to World-to-Camera matrices for a tensor with shape (f, b, 4, 4).
174
+
175
+ Args:
176
+ camera_to_world (torch.Tensor): A tensor of shape (f, b, 4, 4), where:
177
+ f = number of frames,
178
+ b = batch size.
179
+
180
+ Returns:
181
+ torch.Tensor: A tensor of shape (f, b, 4, 4) representing the World-to-Camera matrices.
182
+ """
183
+ # Ensure input is a 4D tensor
184
+ assert camera_to_world.ndim == 4 and camera_to_world.shape[2:] == (4, 4), \
185
+ "Input must be of shape (f, b, 4, 4)"
186
+
187
+ # Extract the rotation (R) and translation (T) parts
188
+ R = camera_to_world[:, :, :3, :3] # Shape: (f, b, 3, 3)
189
+ T = camera_to_world[:, :, :3, 3] # Shape: (f, b, 3)
190
+
191
+ # Initialize an identity matrix for the output
192
+ world_to_camera = torch.eye(4, device=camera_to_world.device).unsqueeze(0).unsqueeze(0)
193
+ world_to_camera = world_to_camera.repeat(camera_to_world.size(0), camera_to_world.size(1), 1, 1) # Shape: (f, b, 4, 4)
194
+
195
+ # Compute the rotation (transpose of R)
196
+ world_to_camera[:, :, :3, :3] = R.transpose(2, 3)
197
+
198
+ # Compute the translation (-R^T * T)
199
+ world_to_camera[:, :, :3, 3] = -torch.matmul(R.transpose(2, 3), T.unsqueeze(-1)).squeeze(-1)
200
+
201
+ return world_to_camera.to(camera_to_world.dtype)
202
+
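A minimal check sketch, with illustrative values, that the matrices returned above are the rigid-transform inverses of the inputs, i.e. [R^T | -R^T t]:

import torch

pose = torch.tensor([[[1.0, 2.0, 3.0, 30.0, 60.0]]])       # (f=1, b=1, 5)
c2w = euler_to_camera_to_world_matrix(pose)                 # (1, 1, 4, 4)
w2c = camera_to_world_to_world_to_camera(c2w)
print(torch.allclose(w2c @ c2w, torch.eye(4), atol=1e-5))   # True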
203
+ def convert_to_plucker(poses, curr_frame, focal_length, image_width, image_height):
204
+
205
+ intrinsic = np.asarray([focal_length * image_width,
206
+ focal_length * image_height,
207
+ 0.5 * image_width,
208
+ 0.5 * image_height], dtype=np.float32)
209
+
210
+ c2ws = get_relative_pose(poses, zero_first_frame_scale=curr_frame)
211
+ c2ws = rearrange(c2ws, "t b m n -> b t m n")
212
+
213
+ K = torch.as_tensor(intrinsic, device=poses.device, dtype=poses.dtype).repeat(c2ws.shape[0],c2ws.shape[1],1) # [B, F, 4]
214
+ plucker_embedding = ray_condition(K, c2ws, image_height, image_width, device=c2ws.device)
215
+ plucker_embedding = rearrange(plucker_embedding, "b t h w d -> t b h w d").contiguous()
216
+
217
+ return plucker_embedding
218
+
219
+
220
+ def get_relative_pose(abs_c2ws, zero_first_frame_scale):
221
+ abs_w2cs = camera_to_world_to_world_to_camera(abs_c2ws)
222
+ target_cam_c2w = torch.tensor([
223
+ [1, 0, 0, 0],
224
+ [0, 1, 0, 0],
225
+ [0, 0, 1, 0],
226
+ [0, 0, 0, 1]
227
+ ]).to(abs_c2ws.device).to(abs_c2ws.dtype)
228
+ abs2rel = target_cam_c2w @ abs_w2cs[zero_first_frame_scale]
229
+ ret_poses = [abs2rel @ abs_c2w for abs_c2w in abs_c2ws]
230
+ ret_poses = torch.stack(ret_poses)
231
+ return ret_poses
232
+
233
+ def ray_condition(K, c2w, H, W, device):
234
+ # c2w: B, V, 4, 4
235
+ # K: B, V, 4
236
+
237
+ B = K.shape[0]
238
+
239
+ j, i = custom_meshgrid(
240
+ torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
241
+ torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
242
+ )
243
+ i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
244
+ j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
245
+
246
+ fx, fy, cx, cy = K.chunk(4, dim=-1) # B,V, 1
247
+
248
+ zs = torch.ones_like(i, device=device, dtype=c2w.dtype) # [B, HxW]
249
+ xs = -(i - cx) / fx * zs
250
+ ys = -(j - cy) / fy * zs
251
+
252
+ zs = zs.expand_as(ys)
253
+
254
+ directions = torch.stack((xs, ys, zs), dim=-1) # B, V, HW, 3
255
+ directions = directions / directions.norm(dim=-1, keepdim=True) # B, V, HW, 3
256
+
257
+ rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2) # B, V, 3, HW
258
+ rays_o = c2w[..., :3, 3] # B, V, 3
259
+ rays_o = rays_o[:, :, None].expand_as(rays_d) # B, V, 3, HW
260
+ # c2w @ dirctions
261
+ rays_dxo = torch.linalg.cross(rays_o, rays_d)
262
+ plucker = torch.cat([rays_dxo, rays_d], dim=-1)
263
+ plucker = plucker.reshape(B, c2w.shape[1], H, W, 6) # B, V, H, W, 6
264
+
265
+ return plucker
266
+
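A minimal shape sketch of the Plücker ray map built above, for a tiny illustrative image and an identity pose (the 0.35 factor mirrors the focal_length default used later in this file): every pixel receives a 6-dim (moment, direction) embedding.

import torch

B, V, H, W = 1, 1, 4, 6
K = torch.tensor([[[0.35 * W, 0.35 * H, 0.5 * W, 0.5 * H]]])   # fx, fy, cx, cy -> (B, V, 4)
c2w = torch.eye(4).repeat(B, V, 1, 1)                           # (B, V, 4, 4)
plucker = ray_condition(K, c2w, H, W, device="cpu")
print(plucker.shape)                                            # torch.Size([1, 1, 4, 6, 6])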
267
+ def random_transform(tensor):
268
+ """
269
+ Apply the same random translation, rotation, and scaling to all frames in the batch.
270
+
271
+ Args:
272
+ tensor (torch.Tensor): Input tensor of shape (F, B, 3, H, W).
273
+
274
+ Returns:
275
+ torch.Tensor: Transformed tensor of shape (F, B, 3, H, W).
276
+ """
277
+ if tensor.ndim != 5:
278
+ raise ValueError("Input tensor must have shape (F, B, 3, H, W)")
279
+
280
+ F, B, C, H, W = tensor.shape
281
+
282
+ # Generate random transformation parameters
283
+ max_translate = 0.2 # Translate up to 20% of width/height
284
+ max_rotate = 30 # Rotate up to 30 degrees
285
+ max_scale = 0.2 # Scale change by up to +/- 20%
286
+
287
+ translate_x = random.uniform(-max_translate, max_translate) * W
288
+ translate_y = random.uniform(-max_translate, max_translate) * H
289
+ rotate_angle = random.uniform(-max_rotate, max_rotate)
290
+ scale_factor = 1 + random.uniform(-max_scale, max_scale)
291
+
292
+ # Apply the same transformation to all frames and batches
293
+
294
+ tensor = tensor.reshape(F*B, C, H, W)
295
+ transformed_tensor = TF.affine(
296
+ tensor,
297
+ angle=rotate_angle,
298
+ translate=(translate_x, translate_y),
299
+ scale=scale_factor,
300
+ shear=(0, 0),
301
+ interpolation=InterpolationMode.BILINEAR,
302
+ fill=0
303
+ )
304
+
305
+ transformed_tensor = transformed_tensor.reshape(F, B, C, H, W)
306
+ return transformed_tensor
307
+
308
+ def save_tensor_as_png(tensor, file_path):
309
+ """
310
+ Save a 3*H*W tensor as a PNG image.
311
+
312
+ Args:
313
+ tensor (torch.Tensor): Input tensor of shape (3, H, W).
314
+ file_path (str): Path to save the PNG file.
315
+ """
316
+ if tensor.ndim != 3 or tensor.shape[0] != 3:
317
+ raise ValueError("Input tensor must have shape (3, H, W)")
318
+
319
+ # Convert tensor to PIL Image
320
+ image = TF.to_pil_image(tensor)
321
+
322
+ # Save image
323
+ image.save(file_path)
324
+
325
+ class WorldMemMinecraft(DiffusionForcingBase):
326
+ """
327
+ Video generation for Minecraft with memory.
328
+ """
329
+
330
+ def __init__(self, cfg: DictConfig):
331
+ """
332
+ Initialize the WorldMemMinecraft class with the given configuration.
333
+
334
+ Args:
335
+ cfg (DictConfig): Configuration object.
336
+ """
337
+ self.n_tokens = cfg.n_frames // cfg.frame_stack # number of max tokens for the model
338
+ self.n_frames = cfg.n_frames
339
+ if hasattr(cfg, "n_tokens"):
340
+ self.n_tokens = cfg.n_tokens // cfg.frame_stack
341
+ self.memory_condition_length = cfg.memory_condition_length
342
+ self.pose_cond_dim = getattr(cfg, "pose_cond_dim", 5)
343
+
344
+ self.use_plucker = getattr(cfg, "use_plucker", True)
345
+ self.relative_embedding = getattr(cfg, "relative_embedding", True)
346
+ self.state_embed_only_on_qk = getattr(cfg, "state_embed_only_on_qk", True)
347
+ self.use_memory_attention = getattr(cfg, "use_memory_attention", True)
348
+ self.add_timestamp_embedding = getattr(cfg, "add_timestamp_embedding", True)
349
+ self.ref_mode = getattr(cfg, "ref_mode", 'sequential')
350
+ self.log_curve = getattr(cfg, "log_curve", False)
351
+ self.focal_length = getattr(cfg, "focal_length", 0.35)
352
+ self.log_video = cfg.log_video
353
+ self.save_local = getattr(cfg, "save_local", True)
354
+ self.local_save_dir = getattr(cfg, "local_save_dir", None)
355
+ self.lpips_batch_size = getattr(cfg, "lpips_batch_size", 16)
356
+ self.next_frame_length = getattr(cfg, "next_frame_length", 1)
357
+ self.require_pose_prediction = getattr(cfg, "require_pose_prediction", False)
358
+
359
+ super().__init__(cfg)
360
+
361
+ def _build_model(self):
362
+
363
+ self.diffusion_model = Diffusion(
364
+ reference_length=self.memory_condition_length,
365
+ x_shape=self.x_stacked_shape,
366
+ action_cond_dim=self.action_cond_dim,
367
+ pose_cond_dim=self.pose_cond_dim,
368
+ is_causal=self.causal,
369
+ cfg=self.cfg.diffusion,
370
+ is_dit=True,
371
+ use_plucker=self.use_plucker,
372
+ relative_embedding=self.relative_embedding,
373
+ state_embed_only_on_qk=self.state_embed_only_on_qk,
374
+ use_memory_attention=self.use_memory_attention,
375
+ add_timestamp_embedding=self.add_timestamp_embedding,
376
+ ref_mode=self.ref_mode
377
+ )
378
+
379
+ self.validation_lpips_model = LearnedPerceptualImagePatchSimilarity()
380
+ vae = VAE_models["vit-l-20-shallow-encoder"]()
381
+ self.vae = vae.eval()
382
+
383
+ if self.require_pose_prediction:
384
+ self.pose_prediction_model = PosePredictionNet()
385
+
386
+ def _generate_noise_levels(self, xs: torch.Tensor, masks = None) -> torch.Tensor:
387
+ """
388
+ Generate noise levels for training.
389
+ """
390
+ num_frames, batch_size, *_ = xs.shape
391
+ match self.cfg.noise_level:
392
+ case "random_all": # entirely random noise levels
393
+ noise_levels = torch.randint(0, self.timesteps, (num_frames, batch_size), device=xs.device)
394
+ case "same":
395
+ noise_levels = torch.randint(0, self.timesteps, (num_frames, batch_size), device=xs.device)
396
+ noise_levels[1:] = noise_levels[0]
397
+
398
+ if masks is not None:
399
+ # for frames that are not available, treat as full noise
400
+ discard = torch.all(~rearrange(masks.bool(), "(t fs) b -> t b fs", fs=self.frame_stack), -1)
401
+ noise_levels = torch.where(discard, torch.full_like(noise_levels, self.timesteps - 1), noise_levels)
402
+
403
+ return noise_levels
404
+
405
+ def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
406
+ """
407
+ Perform a single training step.
408
+
409
+ This function processes the input batch,
410
+ encodes the input frames, generates noise levels, and computes the loss using the diffusion model.
411
+
412
+ Args:
413
+ batch: Input batch of data containing frames, conditions, poses, etc.
414
+ batch_idx: Index of the current batch.
415
+
416
+ Returns:
417
+ dict: A dictionary containing the training loss.
418
+ """
419
+ xs, conditions, pose_conditions, c2w_mat, frame_idx = self._preprocess_batch(batch)
420
+
421
+ if self.use_plucker:
422
+ if self.relative_embedding:
423
+ input_pose_condition = []
424
+ frame_idx_list = []
425
+ for i in range(self.n_frames):
426
+ input_pose_condition.append(
427
+ convert_to_plucker(
428
+ torch.cat([c2w_mat[i:i + 1], c2w_mat[-self.memory_condition_length:]]).clone(),
429
+ 0,
430
+ focal_length=self.focal_length,
431
+ image_height=xs.shape[-2],image_width=xs.shape[-1]
432
+ ).to(xs.dtype)
433
+ )
434
+ frame_idx_list.append(
435
+ torch.cat([
436
+ frame_idx[i:i + 1] - frame_idx[i:i + 1],
437
+ frame_idx[-self.memory_condition_length:] - frame_idx[i:i + 1]
438
+ ]).clone()
439
+ )
440
+ input_pose_condition = torch.cat(input_pose_condition)
441
+ frame_idx_list = torch.cat(frame_idx_list)
442
+ else:
443
+ input_pose_condition = convert_to_plucker(
444
+ c2w_mat, 0, focal_length=self.focal_length, image_width=xs.shape[-1], image_height=xs.shape[-2]
445
+ ).to(xs.dtype)
446
+ frame_idx_list = frame_idx
447
+ else:
448
+ input_pose_condition = pose_conditions.to(xs.dtype)
449
+ frame_idx_list = None
450
+
451
+ xs = self.encode(xs)
452
+
453
+ noise_levels = self._generate_noise_levels(xs)
454
+
455
+ if self.memory_condition_length:
456
+ noise_levels[-self.memory_condition_length:] = self.diffusion_model.stabilization_level
457
+ conditions[-self.memory_condition_length:] *= 0
458
+
459
+ _, loss = self.diffusion_model(
460
+ xs,
461
+ conditions,
462
+ input_pose_condition,
463
+ noise_levels=noise_levels,
464
+ reference_length=self.memory_condition_length,
465
+ frame_idx=frame_idx_list
466
+ )
467
+
468
+ if self.memory_condition_length:
469
+ loss = loss[:-self.memory_condition_length]
470
+
471
+ loss = self.reweight_loss(loss, None)
472
+
473
+ if batch_idx % 20 == 0:
474
+ self.log("training/loss", loss.cpu())
475
+
476
+ return {"loss": loss}
477
+
478
+ def on_validation_epoch_end(self, namespace="validation") -> None:
479
+ if not self.validation_step_outputs:
480
+ return
481
+
482
+ xs_pred = []
483
+ xs = []
484
+ for pred, gt in self.validation_step_outputs:
485
+ xs_pred.append(pred)
486
+ xs.append(gt)
487
+
488
+ xs_pred = torch.cat(xs_pred, 1)
489
+ if gt is not None:
490
+ xs = torch.cat(xs, 1)
491
+ else:
492
+ xs = None
493
+
494
+ if self.logger and self.log_video:
495
+ log_video(
496
+ xs_pred,
497
+ xs,
498
+ step=None if namespace == "test" else self.global_step,
499
+ namespace=namespace + "_vis",
500
+ context_frames=self.context_frames,
501
+ logger=self.logger.experiment,
502
+ save_local=self.save_local,
503
+ local_save_dir=self.local_save_dir,
504
+ )
505
+
506
+ if xs is not None:
507
+ # Move data to the same device as LPIPS model for metric calculation
508
+ device = next(self.validation_lpips_model.parameters()).device
509
+ xs_pred_device = xs_pred.to(device)
510
+ xs_device = xs.to(device)
511
+
512
+ metric_dict = get_validation_metrics_for_videos(
513
+ xs_pred_device, xs_device,
514
+ lpips_model=self.validation_lpips_model,
515
+ lpips_batch_size=self.lpips_batch_size)
516
+
517
+ self.log_dict(
518
+ {"mse": metric_dict['mse'],
519
+ "psnr": metric_dict['psnr'],
520
+ "lpips": metric_dict['lpips']},
521
+ sync_dist=True
522
+ )
523
+
524
+ if self.log_curve:
525
+ psnr_values = metric_dict['frame_wise_psnr'].cpu().tolist()
526
+ frames = list(range(len(psnr_values)))
527
+ line_plot = wandb.plot.line_series(
528
+ xs = frames,
529
+ ys = [psnr_values],
530
+ keys = ["PSNR"],
531
+ title = "Frame-wise PSNR",
532
+ xname = "Frame index"
533
+ )
534
+
535
+ self.logger.experiment.log({"frame_wise_psnr_plot": line_plot})
536
+
537
+ self.validation_step_outputs.clear()
538
+
539
+ def _preprocess_batch(self, batch):
540
+
541
+ xs, conditions, pose_conditions, frame_index = batch
542
+
543
+ if self.action_cond_dim:
544
+ conditions = torch.cat([torch.zeros_like(conditions[:, :1]), conditions[:, 1:]], 1)
545
+ conditions = rearrange(conditions, "b t d -> t b d").contiguous()
546
+ else:
547
+ raise NotImplementedError("Only support external cond.")
548
+
549
+ pose_conditions = rearrange(pose_conditions, "b t d -> t b d").contiguous()
550
+ c2w_mat = euler_to_camera_to_world_matrix(pose_conditions)
551
+ xs = rearrange(xs, "b t c ... -> t b c ...").contiguous()
552
+ frame_index = rearrange(frame_index, "b t -> t b").contiguous()
553
+
554
+ return xs, conditions, pose_conditions, c2w_mat, frame_index
555
+
556
+ def encode(self, x):
557
+ # vae encoding
558
+ T = x.shape[0]
559
+ H, W = x.shape[-2:]
560
+ scaling_factor = 0.07843137255
561
+
562
+ x = rearrange(x, "t b c h w -> (t b) c h w")
563
+ with torch.no_grad():
564
+ x = self.vae.encode(x * 2 - 1).mean * scaling_factor
565
+ x = rearrange(x, "(t b) (h w) c -> t b c h w", t=T, h=H // self.vae.patch_size, w=W // self.vae.patch_size)
566
+ return x
567
+
568
+ def decode(self, x):
569
+ total_frames = x.shape[0]
570
+ scaling_factor = 0.07843137255
571
+ x = rearrange(x, "t b c h w -> (t b) (h w) c")
572
+ with torch.no_grad():
573
+ x = (self.vae.decode(x / scaling_factor) + 1) / 2
574
+ x = rearrange(x, "(t b) c h w-> t b c h w", t=total_frames)
575
+ return x
576
+
577
+ def _generate_condition_indices(self, curr_frame, memory_condition_length, xs_pred, pose_conditions, frame_idx, horizon):
578
+ """
579
+ Generate indices for condition similarity based on the current frame and pose conditions.
580
+ """
581
+ if curr_frame < memory_condition_length:
582
+ random_idx = [i for i in range(curr_frame)] + [0] * (memory_condition_length - curr_frame)
583
+ random_idx = np.repeat(np.array(random_idx)[:, None], xs_pred.shape[1], -1)
584
+ else:
585
+ # Generate points in a sphere and filter based on field of view
586
+ num_samples = 10000
587
+ radius = 30
588
+ points = generate_points_in_sphere(num_samples, radius).to(pose_conditions.device)
589
+ points = points[:, None].repeat(1, pose_conditions.shape[1], 1)
590
+ points += pose_conditions[curr_frame, :, :3][None]
591
+ fov_half_h = torch.tensor(105 / 2, device=pose_conditions.device)
592
+ fov_half_v = torch.tensor(75 / 2, device=pose_conditions.device)
593
+
594
+ # in_fov1 = is_inside_fov_3d_hv(
595
+ # points, pose_conditions[curr_frame, :, :3],
596
+ # pose_conditions[curr_frame, :, -2], pose_conditions[curr_frame, :, -1],
597
+ # fov_half_h, fov_half_v
598
+ # )
599
+
600
+ in_fov1 = torch.stack([
601
+ is_inside_fov_3d_hv(points, pc[:, :3], pc[:, -2], pc[:, -1], fov_half_h, fov_half_v)
602
+ for pc in pose_conditions[curr_frame:curr_frame+horizon]
603
+ ])
604
+
605
+ in_fov1 = torch.sum(in_fov1, 0) > 0
606
+
607
+ # Compute overlap ratios and select indices
608
+ in_fov_list = torch.stack([
609
+ is_inside_fov_3d_hv(points, pc[:, :3], pc[:, -2], pc[:, -1], fov_half_h, fov_half_v)
610
+ for pc in pose_conditions[:curr_frame]
611
+ ])
612
+
613
+ random_idx = []
614
+ for _ in range(memory_condition_length):
615
+ overlap_ratio = ((in_fov1.bool() & in_fov_list).sum(1)) / in_fov1.sum()
616
+
617
+ confidence = overlap_ratio + (curr_frame - frame_idx[:curr_frame]) / curr_frame * (-0.2)
618
+
619
+ if len(random_idx) > 0:
620
+ confidence[torch.cat(random_idx)] = -1e10
621
+ _, r_idx = torch.topk(confidence, k=1, dim=0)
622
+ random_idx.append(r_idx[0])
623
+
624
+ # choice 1: directly remove overlapping region
625
+ occupied_mask = in_fov_list[r_idx[0, range(in_fov1.shape[-1])], :, range(in_fov1.shape[-1])].permute(1,0)
626
+ in_fov1 = in_fov1 & ~occupied_mask
627
+
628
+ # choice 2: apply similarity filter
629
+ # cos_sim = F.cosine_similarity(xs_pred.to(r_idx.device)[r_idx[:, range(in_fov1.shape[1])],
630
+ # range(in_fov1.shape[1])], xs_pred.to(r_idx.device)[:curr_frame], dim=2)
631
+ # cos_sim = cos_sim.mean((-2,-1))
632
+
633
+ # mask_sim = cos_sim>0.9
634
+ # in_fov_list = in_fov_list & ~mask_sim[:,None].to(in_fov_list.device)
635
+
636
+ random_idx = torch.stack(random_idx).cpu()
637
+
638
+ return random_idx
639
+
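A toy sketch of the scoring used above when picking memory frames, with illustrative numbers: FOV overlap with the frames being generated, minus a 0.2 recency penalty scaled by relative age, so older frames need proportionally more overlap to be retrieved.

import torch

overlap_ratio = torch.tensor([0.50, 0.45, 0.10])   # overlap of each stored frame with the target view
frame_idx = torch.tensor([0.0, 40.0, 79.0])        # when each stored frame was generated
curr_frame = 80
confidence = overlap_ratio + (curr_frame - frame_idx) / curr_frame * (-0.2)
print(confidence)   # tensor([0.3000, 0.3500, 0.0975]) -> the second frame is retrieved first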
640
+ def _prepare_conditions(self,
641
+ start_frame, curr_frame, horizon, conditions,
642
+ pose_conditions, c2w_mat, frame_idx, random_idx,
643
+ image_width, image_height):
644
+ """
645
+ Prepare input conditions and pose conditions for sampling.
646
+ """
647
+
648
+ padding = torch.zeros((len(random_idx),) + conditions.shape[1:], device=conditions.device, dtype=conditions.dtype)
649
+ input_condition = torch.cat([conditions[start_frame:curr_frame + horizon], padding], dim=0)
650
+
651
+ batch_size = conditions.shape[1]
652
+
653
+ if self.use_plucker:
654
+ if self.relative_embedding:
655
+ frame_idx_list = []
656
+ input_pose_condition = []
657
+ for i in range(start_frame, curr_frame + horizon):
658
+ input_pose_condition.append(convert_to_plucker(torch.cat([c2w_mat[i:i+1],c2w_mat[random_idx[:,range(batch_size)], range(batch_size)]]).clone(), 0, focal_length=self.focal_length,
659
+ image_width=image_width, image_height=image_height).to(conditions.dtype))
660
+ frame_idx_list.append(torch.cat([frame_idx[i:i+1]-frame_idx[i:i+1], frame_idx[random_idx[:,range(batch_size)], range(batch_size)]-frame_idx[i:i+1]]))
661
+ input_pose_condition = torch.cat(input_pose_condition)
662
+ frame_idx_list = torch.cat(frame_idx_list)
663
+
664
+ else:
665
+ input_pose_condition = torch.cat([c2w_mat[start_frame : curr_frame + horizon], c2w_mat[random_idx[:,range(batch_size)], range(batch_size)]], dim=0).clone()
666
+ input_pose_condition = convert_to_plucker(input_pose_condition, 0, focal_length=self.focal_length, image_width=image_width, image_height=image_height)
667
+ frame_idx_list = None
668
+ else:
669
+ input_pose_condition = torch.cat([pose_conditions[start_frame : curr_frame + horizon], pose_conditions[random_idx[:,range(batch_size)], range(batch_size)]], dim=0).clone()
670
+ frame_idx_list = None
671
+
672
+ return input_condition, input_pose_condition, frame_idx_list
673
+
674
+ def _prepare_noise_levels(self, scheduling_matrix, m, curr_frame, batch_size, memory_condition_length):
675
+ """
676
+ Prepare noise levels for the current sampling step.
677
+ """
678
+ from_noise_levels = np.concatenate((np.zeros((curr_frame,), dtype=np.int64), scheduling_matrix[m]))[:, None].repeat(batch_size, axis=1)
679
+ to_noise_levels = np.concatenate((np.zeros((curr_frame,), dtype=np.int64), scheduling_matrix[m + 1]))[:, None].repeat(batch_size, axis=1)
680
+ if memory_condition_length:
681
+ from_noise_levels = np.concatenate([from_noise_levels, np.zeros((memory_condition_length, from_noise_levels.shape[-1]), dtype=np.int32)], axis=0)
682
+ to_noise_levels = np.concatenate([to_noise_levels, np.zeros((memory_condition_length, from_noise_levels.shape[-1]), dtype=np.int32)], axis=0)
683
+ from_noise_levels = torch.from_numpy(from_noise_levels).to(self.device)
684
+ to_noise_levels = torch.from_numpy(to_noise_levels).to(self.device)
685
+ return from_noise_levels, to_noise_levels
686
+
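A toy sketch of the noise-level layout assembled above, with illustrative numbers: already-generated context frames are clean (level 0), the active chunk follows one row of the scheduling matrix, and the appended memory frames are also kept clean.

import numpy as np

curr_frame, batch_size, memory_condition_length = 3, 2, 2
scheduling_row = np.array([7, 9], dtype=np.int64)   # levels for a 2-frame chunk
levels = np.concatenate((np.zeros((curr_frame,), dtype=np.int64), scheduling_row))[:, None].repeat(batch_size, axis=1)
levels = np.concatenate([levels, np.zeros((memory_condition_length, batch_size), dtype=np.int64)], axis=0)
print(levels[:, 0])   # [0 0 0 7 9 0 0]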
687
+ def validation_step(self, batch, batch_idx, namespace="validation") -> STEP_OUTPUT:
688
+ """
689
+ Perform a single validation step.
690
+
691
+ This function processes the input batch, encodes frames, generates predictions using a sliding window approach,
692
+ and handles condition similarity logic for sampling. The results are decoded and stored for evaluation.
693
+
694
+ Args:
695
+ batch: Input batch of data containing frames, conditions, poses, etc.
696
+ batch_idx: Index of the current batch.
697
+ namespace: Namespace for logging (default: "validation").
698
+
699
+ Returns:
700
+ None: Appends the predicted and ground truth frames to `self.validation_step_outputs`.
701
+ """
702
+ # Preprocess the input batch
703
+ memory_condition_length = self.memory_condition_length
704
+ xs_raw, conditions, pose_conditions, c2w_mat, frame_idx = self._preprocess_batch(batch)
705
+
706
+
707
+ # Encode frames in chunks if necessary
708
+ total_frame = xs_raw.shape[0]
709
+ if total_frame > 10:
710
+ xs = torch.cat([
711
+ self.encode(xs_raw[int(total_frame * i / 10):int(total_frame * (i + 1) / 10)]).cpu()
712
+ for i in range(10)
713
+ ])
714
+ else:
715
+ xs = self.encode(xs_raw).cpu()
716
+
717
+ n_frames, batch_size, *_ = xs.shape
718
+ curr_frame = 0
719
+
720
+ # Initialize context frames
721
+ n_context_frames = self.context_frames // self.frame_stack
722
+ xs_pred = xs[:n_context_frames].clone()
723
+ curr_frame += n_context_frames
724
+
725
+ # Progress bar for sampling
726
+ pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
727
+
728
+ while curr_frame < n_frames:
729
+ # Determine the horizon for the current chunk
730
+ horizon = min(n_frames - curr_frame, self.chunk_size) if self.chunk_size > 0 else n_frames - curr_frame
731
+ assert horizon <= self.n_tokens, "Horizon exceeds the number of tokens."
732
+
733
+ # Generate scheduling matrix and initialize noise
734
+ scheduling_matrix = self._generate_scheduling_matrix(horizon)
735
+ chunk = torch.randn((horizon, batch_size, *xs_pred.shape[2:]))
736
+ chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise).to(xs_pred.device)
737
+ xs_pred = torch.cat([xs_pred, chunk], 0)
738
+
739
+ # Sliding window: only input the last `n_tokens` frames
740
+ start_frame = max(0, curr_frame + horizon - self.n_tokens)
741
+ pbar.set_postfix({"start": start_frame, "end": curr_frame + horizon})
742
+
743
+ # Handle condition similarity logic
744
+ if memory_condition_length:
745
+ random_idx = self._generate_condition_indices(
746
+ curr_frame, memory_condition_length, xs_pred, pose_conditions, frame_idx, horizon
747
+ )
748
+
749
+ xs_pred = torch.cat([xs_pred, xs_pred[random_idx[:, range(xs_pred.shape[1])], range(xs_pred.shape[1])].clone()], 0)
750
+
751
+ # Prepare input conditions and pose conditions
752
+ input_condition, input_pose_condition, frame_idx_list = self._prepare_conditions(
753
+ start_frame, curr_frame, horizon, conditions, pose_conditions, c2w_mat, frame_idx, random_idx,
754
+ image_width=xs_raw.shape[-1], image_height=xs_raw.shape[-2]
755
+ )
756
+
757
+ # Perform sampling for each step in the scheduling matrix
758
+ for m in range(scheduling_matrix.shape[0] - 1):
759
+ from_noise_levels, to_noise_levels = self._prepare_noise_levels(
760
+ scheduling_matrix, m, curr_frame, batch_size, memory_condition_length
761
+ )
762
+
763
+ xs_pred[start_frame:] = self.diffusion_model.sample_step(
764
+ xs_pred[start_frame:].to(input_condition.device),
765
+ input_condition,
766
+ input_pose_condition,
767
+ from_noise_levels[start_frame:],
768
+ to_noise_levels[start_frame:],
769
+ current_frame=curr_frame,
770
+ mode="validation",
771
+ reference_length=memory_condition_length,
772
+ frame_idx=frame_idx_list
773
+ ).cpu()
774
+
775
+ # Remove condition similarity frames if applicable
776
+ if memory_condition_length:
777
+ xs_pred = xs_pred[:-memory_condition_length]
778
+
779
+ curr_frame += horizon
780
+ pbar.update(horizon)
781
+
782
+ # Decode predictions and ground truth
783
+ xs_pred = self.decode(xs_pred[n_context_frames:].to(conditions.device))
784
+ xs_decode = self.decode(xs[n_context_frames:].to(conditions.device))
785
+
786
+ # Store results for evaluation (move to CPU to save GPU memory)
787
+ self.validation_step_outputs.append((xs_pred.detach().cpu(), xs_decode.detach().cpu()))
788
+ return
789
+
790
+ @torch.no_grad()
791
+ def interactive(self, first_frame, new_actions, first_pose, device,
792
+ memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx):
793
+
794
+ memory_condition_length = self.memory_condition_length
795
+
796
+ if memory_latent_frames is None:
797
+ first_frame = torch.from_numpy(first_frame)
798
+ new_actions = torch.from_numpy(new_actions)
799
+ first_pose = torch.from_numpy(first_pose)
800
+ first_frame_encode = self.encode(first_frame[None, None].to(device))
801
+ memory_latent_frames = first_frame_encode.cpu()
802
+ memory_actions = new_actions[None, None].to(device)
803
+ memory_poses = first_pose[None, None].to(device)
804
+ new_c2w_mat = euler_to_camera_to_world_matrix(first_pose)
805
+ memory_c2w = new_c2w_mat[None, None].to(device)
806
+ memory_frame_idx = torch.tensor([[0]]).to(device)
807
+ return first_frame.cpu().numpy(), memory_latent_frames.cpu().numpy(), memory_actions.cpu().numpy(), memory_poses.cpu().numpy(), memory_c2w.cpu().numpy(), memory_frame_idx.cpu().numpy()
808
+ else:
809
+ memory_latent_frames = torch.from_numpy(memory_latent_frames)
810
+ memory_actions = torch.from_numpy(memory_actions).to(device)
811
+ memory_poses = torch.from_numpy(memory_poses).to(device)
812
+ memory_c2w = torch.from_numpy(memory_c2w).to(device)
813
+ memory_frame_idx = torch.from_numpy(memory_frame_idx).to(device)
814
+ new_actions = new_actions.to(device)
815
+
816
+ curr_frame = 0
817
+ batch_size = 1
818
+ horizon = self.next_frame_length
819
+ n_frames = curr_frame + horizon
820
+ # context
821
+ n_context_frames = len(memory_latent_frames)
822
+ xs_pred = memory_latent_frames[:n_context_frames].clone()
823
+ curr_frame += n_context_frames
824
+
825
+ pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
826
+
827
+ new_pose_condition_list = []
828
+ last_frame = xs_pred[-1].clone()
829
+ last_pose_condition = memory_poses[-1].clone()
830
+ curr_actions = new_actions.clone()
831
+ for hi in range(len(new_actions)):
832
+ last_pose_condition[:,3:] = last_pose_condition[:,3:] // 15
833
+ new_pose_condition_offset = self.pose_prediction_model(last_frame.to(device), curr_actions[None, hi], last_pose_condition)
834
+ new_pose_condition_offset[:,3:] = torch.round(new_pose_condition_offset[:,3:])
835
+ new_pose_condition = last_pose_condition + new_pose_condition_offset
836
+ new_pose_condition[:,3:] = new_pose_condition[:,3:] * 15
837
+ new_pose_condition[:,3:] %= 360
838
+ last_pose_condition = new_pose_condition.clone()
839
+ new_pose_condition_list.append(new_pose_condition[None])
840
+ new_pose_condition_list = torch.cat(new_pose_condition_list, 0)
841
+
842
+ ai = 0
843
+ while ai < len(new_actions):
844
+ next_horizon = min(horizon, len(new_actions) - ai)
845
+ last_frame = xs_pred[-1].clone()
846
+ curr_actions = new_actions[ai:ai+next_horizon].clone()
847
+
848
+ new_pose_condition = new_pose_condition_list[ai:ai+next_horizon].clone()
849
+
850
+ new_c2w_mat = euler_to_camera_to_world_matrix(new_pose_condition)
851
+ memory_poses = torch.cat([memory_poses, new_pose_condition])
852
+ memory_actions = torch.cat([memory_actions, curr_actions[:, None]])
853
+ memory_c2w = torch.cat([memory_c2w, new_c2w_mat])
854
+ new_indices = memory_frame_idx[-1,0] + torch.arange(next_horizon, device=memory_frame_idx.device) + 1
855
+
856
+ memory_frame_idx = torch.cat([memory_frame_idx, new_indices[:, None]])
857
+
858
+ conditions = memory_actions.clone()
859
+ pose_conditions = memory_poses.clone()
860
+ c2w_mat = memory_c2w.clone()
861
+ frame_idx = memory_frame_idx.clone()
862
+
863
+ # generation on frame
864
+ scheduling_matrix = self._generate_scheduling_matrix(next_horizon)
865
+ chunk = torch.randn((next_horizon, batch_size, *xs_pred.shape[2:])).to(xs_pred.device)
866
+ chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise)
867
+
868
+ xs_pred = torch.cat([xs_pred, chunk], 0)
869
+
870
+ # sliding window: only input the last n_tokens frames
871
+ start_frame = max(0, curr_frame - self.n_tokens)
872
+
873
+ pbar.set_postfix(
874
+ {
875
+ "start": start_frame,
876
+ "end": curr_frame + next_horizon,
877
+ }
878
+ )
879
+
880
+ # Handle condition similarity logic
881
+ if memory_condition_length:
882
+ random_idx = self._generate_condition_indices(
883
+ curr_frame, memory_condition_length, xs_pred, pose_conditions, frame_idx, next_horizon
884
+ )
885
+
886
+ # random_idx = np.unique(random_idx)[:, None]
887
+ # memory_condition_length = len(random_idx)
888
+ xs_pred = torch.cat([xs_pred, xs_pred[random_idx[:, range(xs_pred.shape[1])], range(xs_pred.shape[1])].clone()], 0)
889
+
890
+ # Prepare input conditions and pose conditions
891
+ input_condition, input_pose_condition, frame_idx_list = self._prepare_conditions(
892
+ start_frame, curr_frame, next_horizon, conditions, pose_conditions, c2w_mat, frame_idx, random_idx,
893
+ image_width=first_frame.shape[-1], image_height=first_frame.shape[-2]
894
+ )
895
+
896
+ # Perform sampling for each step in the scheduling matrix
897
+ for m in range(scheduling_matrix.shape[0] - 1):
898
+ from_noise_levels, to_noise_levels = self._prepare_noise_levels(
899
+ scheduling_matrix, m, curr_frame, batch_size, memory_condition_length
900
+ )
901
+
902
+ xs_pred[start_frame:] = self.diffusion_model.sample_step(
903
+ xs_pred[start_frame:].to(input_condition.device),
904
+ input_condition,
905
+ input_pose_condition,
906
+ from_noise_levels[start_frame:],
907
+ to_noise_levels[start_frame:],
908
+ current_frame=curr_frame,
909
+ mode="validation",
910
+ reference_length=memory_condition_length,
911
+ frame_idx=frame_idx_list
912
+ ).cpu()
913
+
914
+
915
+ if memory_condition_length:
916
+ xs_pred = xs_pred[:-memory_condition_length]
917
+
918
+ curr_frame += next_horizon
919
+ pbar.update(next_horizon)
920
+ ai += next_horizon
921
+
922
+ memory_latent_frames = torch.cat([memory_latent_frames, xs_pred[n_context_frames:]])
923
+ xs_pred = self.decode(xs_pred[n_context_frames:].to(device)).cpu()
924
+
925
+ return xs_pred.cpu().numpy(), memory_latent_frames.cpu().numpy(), memory_actions.cpu().numpy(), \
926
+ memory_poses.cpu().numpy(), memory_c2w.cpu().numpy(), memory_frame_idx.cpu().numpy()
algorithms/worldmem/models/attention.py ADDED
@@ -0,0 +1,342 @@
1
+ """
2
+ Based on https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/attention.py
3
+ """
4
+
5
+ from typing import Optional
6
+ from collections import namedtuple
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from einops import rearrange
11
+ from .rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb
12
+ import numpy as np
13
+
14
+ class TemporalAxialAttention(nn.Module):
15
+ def __init__(
16
+ self,
17
+ dim: int,
18
+ heads: int,
19
+ dim_head: int,
20
+ reference_length: int,
21
+ rotary_emb: RotaryEmbedding,
22
+ is_causal: bool = True,
23
+ is_temporal_independent: bool = False,
24
+ use_domain_adapter = False
25
+ ):
26
+ super().__init__()
27
+ self.inner_dim = dim_head * heads
28
+ self.heads = heads
29
+ self.head_dim = dim_head
30
+ self.inner_dim = dim_head * heads
31
+ self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
32
+
33
+ self.use_domain_adapter = use_domain_adapter
34
+ if self.use_domain_adapter:
35
+ lora_rank = 8
36
+ self.lora_A = nn.Linear(dim, lora_rank, bias=False)
37
+ self.lora_B = nn.Linear(lora_rank, self.inner_dim * 3, bias=False)
38
+
39
+ self.to_out = nn.Linear(self.inner_dim, dim)
40
+
41
+ self.rotary_emb = rotary_emb
42
+ self.is_causal = is_causal
43
+ self.is_temporal_independent = is_temporal_independent
44
+
45
+ self.reference_length = reference_length
46
+
47
+ def forward(self, x: torch.Tensor):
48
+ B, T, H, W, D = x.shape
49
+
50
+ # if T>=9:
51
+ # try:
52
+ # # x = torch.cat([x[:,:-1],x[:,16-T:17-T],x[:,-1:]], dim=1)
53
+ # x = torch.cat([x[:,16-T:17-T],x], dim=1)
54
+ # except:
55
+ # import pdb;pdb.set_trace()
56
+ # print("="*50)
57
+ # print(x.shape)
58
+
59
+ B, T, H, W, D = x.shape
60
+
61
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
62
+
63
+ if self.use_domain_adapter:
64
+ q_lora, k_lora, v_lora = self.lora_B(self.lora_A(x)).chunk(3, dim=-1)
65
+ q = q+q_lora
66
+ k = k+k_lora
67
+ v = v+v_lora
68
+
69
+ q = rearrange(q, "B T H W (h d) -> (B H W) h T d", h=self.heads)
70
+ k = rearrange(k, "B T H W (h d) -> (B H W) h T d", h=self.heads)
71
+ v = rearrange(v, "B T H W (h d) -> (B H W) h T d", h=self.heads)
72
+
73
+ q = self.rotary_emb.rotate_queries_or_keys(q, self.rotary_emb.freqs)
74
+ k = self.rotary_emb.rotate_queries_or_keys(k, self.rotary_emb.freqs)
75
+
76
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
77
+
78
+ if self.is_temporal_independent:
79
+ attn_bias = torch.ones((T, T), dtype=q.dtype, device=q.device)
80
+ attn_bias = attn_bias.masked_fill(attn_bias == 1, float('-inf'))
81
+ attn_bias[range(T), range(T)] = 0
82
+ elif self.is_causal:
83
+ attn_bias = torch.triu(torch.ones((T, T), dtype=q.dtype, device=q.device), diagonal=1)
84
+ attn_bias = attn_bias.masked_fill(attn_bias == 1, float('-inf'))
85
+ attn_bias[(T-self.reference_length):] = float('-inf')
86
+ attn_bias[range(T), range(T)] = 0
87
+ else:
88
+ attn_bias = None
89
+
90
+ try:
91
+ x = F.scaled_dot_product_attention(query=q, key=k, value=v, attn_mask=attn_bias)
92
+ except RuntimeError as err:
93
+ raise RuntimeError(f"scaled_dot_product_attention failed for q of shape {tuple(q.shape)}") from err
94
+
95
+ x = rearrange(x, "(B H W) h T d -> B T H W (h d)", B=B, H=H, W=W)
96
+ x = x.to(q.dtype)
97
+
98
+ # linear proj
99
+ x = self.to_out(x)
100
+
101
+ # if T>=10:
102
+ # try:
103
+ # # x = torch.cat([x[:,:-2],x[:,-1:]], dim=1)
104
+ # x = x[:,1:]
105
+ # except:
106
+ # import pdb;pdb.set_trace()
107
+ # print(x.shape)
108
+ return x
109
+
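A minimal sketch of the temporal attention bias built in the is_causal branch above, for T = 5 tokens with reference_length = 2: regular frames attend causally, while the two appended memory frames only attend to themselves along the time axis.

import torch

T, reference_length = 5, 2
attn_bias = torch.triu(torch.ones(T, T), diagonal=1)
attn_bias = attn_bias.masked_fill(attn_bias == 1, float('-inf'))
attn_bias[T - reference_length:] = float('-inf')
attn_bias[range(T), range(T)] = 0
print(attn_bias)
# rows 0-2: causal (lower-triangular) attention over earlier frames
# rows 3-4 (memory frames): -inf everywhere except their own diagonal entry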
110
+ class SpatialAxialAttention(nn.Module):
111
+ def __init__(
112
+ self,
113
+ dim: int,
114
+ heads: int,
115
+ dim_head: int,
116
+ rotary_emb: RotaryEmbedding,
117
+ use_domain_adapter = False
118
+ ):
119
+ super().__init__()
120
+ self.inner_dim = dim_head * heads
121
+ self.heads = heads
122
+ self.head_dim = dim_head
123
+ self.inner_dim = dim_head * heads
124
+ self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
125
+ self.use_domain_adapter = use_domain_adapter
126
+ if self.use_domain_adapter:
127
+ lora_rank = 8
128
+ self.lora_A = nn.Linear(dim, lora_rank, bias=False)
129
+ self.lora_B = nn.Linear(lora_rank, self.inner_dim * 3, bias=False)
130
+
131
+ self.to_out = nn.Linear(self.inner_dim, dim)
132
+
133
+ self.rotary_emb = rotary_emb
134
+
135
+ def forward(self, x: torch.Tensor):
136
+ B, T, H, W, D = x.shape
137
+
138
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
139
+
140
+ if self.use_domain_adapter:
141
+ q_lora, k_lora, v_lora = self.lora_B(self.lora_A(x)).chunk(3, dim=-1)
142
+ q = q+q_lora
143
+ k = k+k_lora
144
+ v = v+v_lora
145
+
146
+ q = rearrange(q, "B T H W (h d) -> (B T) h H W d", h=self.heads)
147
+ k = rearrange(k, "B T H W (h d) -> (B T) h H W d", h=self.heads)
148
+ v = rearrange(v, "B T H W (h d) -> (B T) h H W d", h=self.heads)
149
+
150
+ freqs = self.rotary_emb.get_axial_freqs(H, W)
151
+ q = apply_rotary_emb(freqs, q)
152
+ k = apply_rotary_emb(freqs, k)
153
+
154
+ # prepare for attn
155
+ q = rearrange(q, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
156
+ k = rearrange(k, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
157
+ v = rearrange(v, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
158
+
159
+ x = F.scaled_dot_product_attention(query=q, key=k, value=v, is_causal=False)
160
+
161
+ x = rearrange(x, "(B T) h (H W) d -> B T H W (h d)", B=B, H=H, W=W)
162
+ x = x.to(q.dtype)
163
+
164
+ # linear proj
165
+ x = self.to_out(x)
166
+ return x
167
+
168
+ class MemTemporalAxialAttention(nn.Module):
169
+ def __init__(
170
+ self,
171
+ dim: int,
172
+ heads: int,
173
+ dim_head: int,
174
+ rotary_emb: RotaryEmbedding,
175
+ is_causal: bool = True,
176
+ ):
177
+ super().__init__()
178
+ self.inner_dim = dim_head * heads
179
+ self.heads = heads
180
+ self.head_dim = dim_head
181
+ self.inner_dim = dim_head * heads
182
+ self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
183
+ self.to_out = nn.Linear(self.inner_dim, dim)
184
+
185
+ self.rotary_emb = rotary_emb
186
+ self.is_causal = is_causal
187
+
188
+ self.reference_length = 3
189
+
190
+ def forward(self, x: torch.Tensor):
191
+ B, T, H, W, D = x.shape
192
+
193
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
194
+
195
+
196
+ q = rearrange(q, "B T H W (h d) -> (B H W) h T d", h=self.heads)
197
+ k = rearrange(k, "B T H W (h d) -> (B H W) h T d", h=self.heads)
198
+ v = rearrange(v, "B T H W (h d) -> (B H W) h T d", h=self.heads)
199
+
200
+
201
+
202
+ # q = self.rotary_emb.rotate_queries_or_keys(q, self.rotary_emb.freqs)
203
+ # k = self.rotary_emb.rotate_queries_or_keys(k, self.rotary_emb.freqs)
204
+
205
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
206
+
207
+ # if T == 21000:
208
+ # # Manually compute the scaled dot-product attention scores
209
+ # _, _, _, d_k = q.shape
210
+ # scores = torch.einsum("b h n d, b h m d -> b h n m", q, k) / (d_k ** 0.5) # Shape: (B, T_q, T_k)
211
+
212
+ # # Compute the attention map
213
+ # attention_map = F.softmax(scores, dim=-1) # Shape: (B, T_q, T_k)
214
+ # b_, h_, n_, m_ = attention_map.shape
215
+ # attention_map = attention_map.reshape(1, int(np.sqrt(b_/1)), int(np.sqrt(b_/1)), h_, n_, m_)
216
+ # attention_map = attention_map.mean(3)
217
+
218
+ # attn_bias = torch.zeros((T, T), dtype=q.dtype, device=q.device)
219
+ # T_origin = T - self.reference_length
220
+ # attn_bias[:T_origin, T_origin:] = 1
221
+ # attn_bias[range(T), range(T)] = 1
222
+
223
+ # attention_map = attention_map * attn_bias
224
+
225
+ # # Plot the attention map
226
+ # import matplotlib.pyplot as plt
227
+ # fig, axes = plt.subplots(21000, 21000, figsize=(9, 9)) # adjust figsize to fit the image size
228
+
229
+ # # Iterate over the 3*3 grid
230
+ # for i in range(21000):
231
+ # for j in range(21000):
232
+ # # Take the (i, j)-th sub-image
233
+ # img = attention_map[0, :, :, i, j].cpu().numpy()
234
+ # axes[i, j].imshow(img, cmap='viridis') # the cmap can be customized
235
+ # axes[i, j].axis('off') # hide the axes
236
+
237
+ # # Adjust subplot spacing
238
+ # plt.tight_layout()
239
+ # plt.savefig('attention_map.png')
240
+ # import pdb; pdb.set_trace()
241
+ # plt.close()
242
+
243
+ attn_bias = torch.zeros((T, T), dtype=q.dtype, device=q.device)
244
+ attn_bias = attn_bias.masked_fill(attn_bias == 0, float('-inf'))
245
+ T_origin = T - self.reference_length
246
+ attn_bias[:T_origin, T_origin:] = 0
247
+ attn_bias[range(T), range(T)] = 0
248
+
249
+ # if T==121000:
250
+ # import pdb;pdb.set_trace()
251
+
252
+ try:
253
+ x = F.scaled_dot_product_attention(query=q, key=k, value=v, attn_mask=attn_bias)
254
+ except RuntimeError as err:
255
+ raise RuntimeError(f"scaled_dot_product_attention failed for q of shape {tuple(q.shape)}") from err
256
+
257
+ x = rearrange(x, "(B H W) h T d -> B T H W (h d)", B=B, H=H, W=W)
258
+ x = x.to(q.dtype)
259
+
260
+ # linear proj
261
+ x = self.to_out(x)
262
+ return x
263
+
264
+ class MemFullAttention(nn.Module):
265
+ def __init__(
266
+ self,
267
+ dim: int,
268
+ heads: int,
269
+ dim_head: int,
270
+ reference_length: int,
271
+ rotary_emb: RotaryEmbedding,
272
+ is_causal: bool = True
273
+ ):
274
+ super().__init__()
275
+ self.inner_dim = dim_head * heads
276
+ self.heads = heads
277
+ self.head_dim = dim_head
278
+ self.inner_dim = dim_head * heads
279
+ self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
280
+ self.to_out = nn.Linear(self.inner_dim, dim)
281
+
282
+ self.rotary_emb = rotary_emb
283
+ self.is_causal = is_causal
284
+
285
+ self.reference_length = reference_length
286
+
287
+ self.store = None
288
+
289
+ def forward(self, x: torch.Tensor, relative_embedding=False,
290
+ extra_condition=None,
291
+ state_embed_only_on_qk=False,
292
+ reference_length=None):
293
+
294
+ B, T, H, W, D = x.shape
295
+
296
+ if state_embed_only_on_qk:
297
+ q, k, _ = self.to_qkv(x+extra_condition).chunk(3, dim=-1)
298
+ _, _, v = self.to_qkv(x).chunk(3, dim=-1)
299
+ else:
300
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
301
+
302
+ if relative_embedding:
303
+ length = reference_length+1
304
+ n_frames = T // length
305
+ x = x.reshape(B, n_frames, length, H, W, D)
306
+
307
+ x_list = []
308
+
309
+ for i in range(n_frames):
310
+ if i == n_frames-1:
311
+ q_i = rearrange(q[:, i*length:], "B T H W (h d) -> B h (T H W) d", h=self.heads)
312
+ k_i = rearrange(k[:, i*length+1:(i+1)*length], "B T H W (h d) -> B h (T H W) d", h=self.heads)
313
+ v_i = rearrange(v[:, i*length+1:(i+1)*length], "B T H W (h d) -> B h (T H W) d", h=self.heads)
314
+ else:
315
+ q_i = rearrange(q[:, i*length:i*length+1], "B T H W (h d) -> B h (T H W) d", h=self.heads)
316
+ k_i = rearrange(k[:, i*length+1:(i+1)*length], "B T H W (h d) -> B h (T H W) d", h=self.heads)
317
+ v_i = rearrange(v[:, i*length+1:(i+1)*length], "B T H W (h d) -> B h (T H W) d", h=self.heads)
318
+
319
+ q_i, k_i, v_i = map(lambda t: t.contiguous(), (q_i, k_i, v_i))
320
+ x_i = F.scaled_dot_product_attention(query=q_i, key=k_i, value=v_i)
321
+ x_i = rearrange(x_i, "B h (T H W) d -> B T H W (h d)", B=B, H=H, W=W)
322
+ x_i = x_i.to(q.dtype)
323
+ x_list.append(x_i)
324
+
325
+ x = torch.cat(x_list, dim=1)
326
+
327
+
328
+ else:
329
+ T_ = T - reference_length
330
+ q = rearrange(q, "B T H W (h d) -> B h (T H W) d", h=self.heads)
331
+ k = rearrange(k[:, T_:], "B T H W (h d) -> B h (T H W) d", h=self.heads)
332
+ v = rearrange(v[:, T_:], "B T H W (h d) -> B h (T H W) d", h=self.heads)
333
+
334
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
335
+ x = F.scaled_dot_product_attention(query=q, key=k, value=v)
336
+ x = rearrange(x, "B h (T H W) d -> B T H W (h d)", B=B, H=H, W=W)
337
+ x = x.to(q.dtype)
338
+
339
+ # linear proj
340
+ x = self.to_out(x)
341
+
342
+ return x
algorithms/worldmem/models/cameractrl_module.py ADDED
@@ -0,0 +1,12 @@
1
+ import torch.nn as nn
2
+ class SimpleCameraPoseEncoder(nn.Module):
3
+ def __init__(self, c_in, c_out, hidden_dim=128):
4
+ super(SimpleCameraPoseEncoder, self).__init__()
5
+ self.model = nn.Sequential(
6
+ nn.Linear(c_in, hidden_dim),
7
+ nn.ReLU(),
8
+ nn.Linear(hidden_dim, c_out)
9
+ )
10
+ def forward(self, x):
11
+ return self.model(x)
12
+
algorithms/worldmem/models/diffusion.py ADDED
@@ -0,0 +1,520 @@
1
+ from typing import Optional, Callable
2
+ from collections import namedtuple
3
+ from omegaconf import DictConfig
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from einops import rearrange
8
+ from .utils import linear_beta_schedule, cosine_beta_schedule, sigmoid_beta_schedule, extract
9
+ from .dit import DiT_models
10
+
11
+ ModelPrediction = namedtuple("ModelPrediction", ["pred_noise", "pred_x_start", "model_out"])
12
+
13
+
14
+ class Diffusion(nn.Module):
15
+ # Special thanks to lucidrains for the implementation of the base Diffusion model
16
+ # https://github.com/lucidrains/denoising-diffusion-pytorch
17
+
18
+ def __init__(
19
+ self,
20
+ x_shape: torch.Size,
21
+ reference_length: int,
22
+ action_cond_dim: int,
23
+ pose_cond_dim,
24
+ is_causal: bool,
25
+ cfg: DictConfig,
26
+ is_dit: bool=False,
27
+ use_plucker=False,
28
+ relative_embedding=False,
29
+ state_embed_only_on_qk=False,
30
+ use_memory_attention=False,
31
+ add_timestamp_embedding=False,
32
+ ref_mode='sequential'
33
+ ):
34
+ super().__init__()
35
+ self.cfg = cfg
36
+
37
+ self.x_shape = x_shape
38
+ self.action_cond_dim = action_cond_dim
39
+ self.timesteps = cfg.timesteps
40
+ self.sampling_timesteps = cfg.sampling_timesteps
41
+ self.beta_schedule = cfg.beta_schedule
42
+ self.schedule_fn_kwargs = cfg.schedule_fn_kwargs
43
+ self.objective = cfg.objective
44
+ self.use_fused_snr = cfg.use_fused_snr
45
+ self.snr_clip = cfg.snr_clip
46
+ self.cum_snr_decay = cfg.cum_snr_decay
47
+ self.ddim_sampling_eta = cfg.ddim_sampling_eta
48
+ self.clip_noise = cfg.clip_noise
49
+ self.arch = cfg.architecture
50
+ self.stabilization_level = cfg.stabilization_level
51
+ self.is_causal = is_causal
52
+ self.is_dit = is_dit
53
+ self.reference_length = reference_length
54
+ self.pose_cond_dim = pose_cond_dim
55
+ self.use_plucker = use_plucker
56
+ self.relative_embedding = relative_embedding
57
+ self.state_embed_only_on_qk = state_embed_only_on_qk
58
+ self.use_memory_attention = use_memory_attention
59
+ self.add_timestamp_embedding = add_timestamp_embedding
60
+ self.ref_mode = ref_mode
61
+
62
+ self._build_model()
63
+ self._build_buffer()
64
+
65
+ def _build_model(self):
66
+ x_channel = self.x_shape[0]
67
+ if self.is_dit:
68
+ self.model = DiT_models["DiT-S/2"](action_cond_dim=self.action_cond_dim,
69
+ pose_cond_dim=self.pose_cond_dim, reference_length=self.reference_length,
70
+ use_plucker=self.use_plucker,
71
+ relative_embedding=self.relative_embedding,
72
+ state_embed_only_on_qk=self.state_embed_only_on_qk,
73
+ use_memory_attention=self.use_memory_attention,
74
+ add_timestamp_embedding=self.add_timestamp_embedding,
75
+ ref_mode=self.ref_mode)
76
+ else:
77
+ raise NotImplementedError
78
+
79
+ def _build_buffer(self):
80
+ if self.beta_schedule == "linear":
81
+ beta_schedule_fn = linear_beta_schedule
82
+ elif self.beta_schedule == "cosine":
83
+ beta_schedule_fn = cosine_beta_schedule
84
+ elif self.beta_schedule == "sigmoid":
85
+ beta_schedule_fn = sigmoid_beta_schedule
86
+ else:
87
+ raise ValueError(f"unknown beta schedule {self.beta_schedule}")
88
+
89
+ betas = beta_schedule_fn(self.timesteps, **self.schedule_fn_kwargs)
90
+
91
+ alphas = 1.0 - betas
92
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
93
+ alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0)
94
+
95
+ # sampling related parameters
96
+ assert self.sampling_timesteps <= self.timesteps
97
+ self.is_ddim_sampling = self.sampling_timesteps < self.timesteps
98
+
99
+ # helper function to register buffer from float64 to float32
100
+ register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))
101
+
102
+ register_buffer("betas", betas)
103
+ register_buffer("alphas_cumprod", alphas_cumprod)
104
+ register_buffer("alphas_cumprod_prev", alphas_cumprod_prev)
105
+
106
+ # calculations for diffusion q(x_t | x_{t-1}) and others
107
+
108
+ register_buffer("sqrt_alphas_cumprod", torch.sqrt(alphas_cumprod))
109
+ register_buffer("sqrt_one_minus_alphas_cumprod", torch.sqrt(1.0 - alphas_cumprod))
110
+ register_buffer("log_one_minus_alphas_cumprod", torch.log(1.0 - alphas_cumprod))
111
+ register_buffer("sqrt_recip_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod))
112
+ register_buffer("sqrt_recipm1_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod - 1))
113
+
114
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
115
+
116
+ posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
117
+
118
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
119
+
120
+ register_buffer("posterior_variance", posterior_variance)
121
+
122
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
123
+
124
+ register_buffer(
125
+ "posterior_log_variance_clipped",
126
+ torch.log(posterior_variance.clamp(min=1e-20)),
127
+ )
128
+ register_buffer(
129
+ "posterior_mean_coef1",
130
+ betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod),
131
+ )
132
+ register_buffer(
133
+ "posterior_mean_coef2",
134
+ (1.0 - alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - alphas_cumprod),
135
+ )
136
+
137
+ # calculate p2 reweighting
138
+
139
+ # register_buffer(
140
+ # "p2_loss_weight",
141
+ # (self.p2_loss_weight_k + alphas_cumprod / (1 - alphas_cumprod))
142
+ # ** -self.p2_loss_weight_gamma,
143
+ # )
144
+
145
+ # derive loss weight
146
+ # https://arxiv.org/abs/2303.09556
147
+ # snr: signal noise ratio
148
+ snr = alphas_cumprod / (1 - alphas_cumprod)
149
+ clipped_snr = snr.clone()
150
+ clipped_snr.clamp_(max=self.snr_clip)
151
+
152
+ register_buffer("clipped_snr", clipped_snr)
153
+ register_buffer("snr", snr)
154
+
155
+ def add_shape_channels(self, x):
156
+ return rearrange(x, f"... -> ...{' 1' * len(self.x_shape)}")
157
+
158
+ def model_predictions(self, x, t, action_cond=None, current_frame=None,
159
+ pose_cond=None, mode="training", reference_length=None, frame_idx=None):
160
+ x = x.permute(1,0,2,3,4)
161
+ action_cond = action_cond.permute(1,0,2)
162
+ if pose_cond is not None and pose_cond[0] is not None:
163
+ try:
164
+ pose_cond = pose_cond.permute(1,0,2)
165
+ except:
166
+ pass
167
+ t = t.permute(1,0)
168
+ model_output = self.model(x, t, action_cond, current_frame=current_frame, pose_cond=pose_cond,
169
+ mode=mode, reference_length=reference_length, frame_idx=frame_idx)
170
+ model_output = model_output.permute(1,0,2,3,4)
171
+ x = x.permute(1,0,2,3,4)
172
+ t = t.permute(1,0)
173
+
174
+ if self.objective == "pred_noise":
175
+ pred_noise = torch.clamp(model_output, -self.clip_noise, self.clip_noise)
176
+ x_start = self.predict_start_from_noise(x, t, pred_noise)
177
+
178
+ elif self.objective == "pred_x0":
179
+ x_start = model_output
180
+ pred_noise = self.predict_noise_from_start(x, t, x_start)
181
+
182
+ elif self.objective == "pred_v":
183
+ v = model_output
184
+ x_start = self.predict_start_from_v(x, t, v)
185
+ pred_noise = self.predict_noise_from_start(x, t, x_start)
186
+
187
+
188
+ return ModelPrediction(pred_noise, x_start, model_output)
189
+
190
+ def predict_start_from_noise(self, x_t, t, noise):
191
+ return (
192
+ extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
193
+ - extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
194
+ )
195
+
196
+ def predict_noise_from_start(self, x_t, t, x0):
197
+ return (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / extract(
198
+ self.sqrt_recipm1_alphas_cumprod, t, x_t.shape
199
+ )
200
+
201
+ def predict_v(self, x_start, t, noise):
202
+ return (
203
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise
204
+ - extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
205
+ )
206
+
207
+ def predict_start_from_v(self, x_t, t, v):
208
+ return (
209
+ extract(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t
210
+ - extract(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
211
+ )
212
+
213
+ def q_mean_variance(self, x_start, t):
214
+ mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
215
+ variance = extract(1.0 - self.alphas_cumprod, t, x_start.shape)
216
+ log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
217
+ return mean, variance, log_variance
218
+
219
+ def q_posterior(self, x_start, x_t, t):
220
+ posterior_mean = (
221
+ extract(self.posterior_mean_coef1, t, x_t.shape) * x_start
222
+ + extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
223
+ )
224
+ posterior_variance = extract(self.posterior_variance, t, x_t.shape)
225
+ posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
226
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
227
+
228
+ def q_sample(self, x_start, t, noise=None):
229
+ if noise is None:
230
+ noise = torch.randn_like(x_start)
231
+ noise = torch.clamp(noise, -self.clip_noise, self.clip_noise)
232
+ return (
233
+ extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
234
+ + extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
235
+ )
236
+
237
+ def p_mean_variance(self, x, t, action_cond=None, pose_cond=None, reference_length=None, frame_idx=None):
238
+ model_pred = self.model_predictions(x=x, t=t, action_cond=action_cond,
239
+ pose_cond=pose_cond, reference_length=reference_length,
240
+ frame_idx=frame_idx)
241
+ x_start = model_pred.pred_x_start
242
+ return self.q_posterior(x_start=x_start, x_t=x, t=t)
243
+
244
+ def compute_loss_weights(self, noise_levels: torch.Tensor):
245
+
246
+ snr = self.snr[noise_levels]
247
+ clipped_snr = self.clipped_snr[noise_levels]
248
+ normalized_clipped_snr = clipped_snr / self.snr_clip
249
+ normalized_snr = snr / self.snr_clip
250
+
251
+ if not self.use_fused_snr:
252
+ # min SNR reweighting
253
+ match self.objective:
254
+ case "pred_noise":
255
+ return clipped_snr / snr
256
+ case "pred_x0":
257
+ return clipped_snr
258
+ case "pred_v":
259
+ return clipped_snr / (snr + 1)
260
+
261
+ cum_snr = torch.zeros_like(normalized_snr)
262
+ for t in range(0, noise_levels.shape[0]):
263
+ if t == 0:
264
+ cum_snr[t] = normalized_clipped_snr[t]
265
+ else:
266
+ cum_snr[t] = self.cum_snr_decay * cum_snr[t - 1] + (1 - self.cum_snr_decay) * normalized_clipped_snr[t]
267
+
268
+ cum_snr = F.pad(cum_snr[:-1], (0, 0, 1, 0), value=0.0)
269
+ clipped_fused_snr = 1 - (1 - cum_snr * self.cum_snr_decay) * (1 - normalized_clipped_snr)
270
+ fused_snr = 1 - (1 - cum_snr * self.cum_snr_decay) * (1 - normalized_snr)
271
+
272
+ match self.objective:
273
+ case "pred_noise":
274
+ return clipped_fused_snr / fused_snr
275
+ case "pred_x0":
276
+ return clipped_fused_snr * self.snr_clip
277
+ case "pred_v":
278
+ return clipped_fused_snr * self.snr_clip / (fused_snr * self.snr_clip + 1)
279
+ case _:
280
+ raise ValueError(f"unknown objective {self.objective}")
281
+
282
+ def forward(
283
+ self,
284
+ x: torch.Tensor,
285
+ action_cond: Optional[torch.Tensor],
286
+ pose_cond,
287
+ noise_levels: torch.Tensor,
288
+ reference_length,
289
+ frame_idx=None
290
+ ):
291
+ noise = torch.randn_like(x)
292
+ noise = torch.clamp(noise, -self.clip_noise, self.clip_noise)
293
+
294
+ noised_x = self.q_sample(x_start=x, t=noise_levels, noise=noise)
295
+
296
+ model_pred = self.model_predictions(x=noised_x, t=noise_levels, action_cond=action_cond,
297
+ pose_cond=pose_cond,reference_length=reference_length, frame_idx=frame_idx)
298
+
299
+ pred = model_pred.model_out
300
+ x_pred = model_pred.pred_x_start
301
+
302
+ if self.objective == "pred_noise":
303
+ target = noise
304
+ elif self.objective == "pred_x0":
305
+ target = x
306
+ elif self.objective == "pred_v":
307
+ target = self.predict_v(x, noise_levels, noise)
308
+ else:
309
+ raise ValueError(f"unknown objective {self.objective}")
310
+
311
+ # During training, each frame can be assigned an arbitrary (independent) noise level.
312
+ loss = F.mse_loss(pred, target.detach(), reduction="none")
313
+ loss_weight = self.compute_loss_weights(noise_levels)
314
+
315
+ loss_weight = loss_weight.view(*loss_weight.shape, *((1,) * (loss.ndim - 2)))
316
+
317
+ loss = loss * loss_weight
318
+
319
+ return x_pred, loss
320
+
321
+ def sample_step(
322
+ self,
323
+ x: torch.Tensor,
324
+ action_cond: Optional[torch.Tensor],
325
+ pose_cond,
326
+ curr_noise_level: torch.Tensor,
327
+ next_noise_level: torch.Tensor,
328
+ guidance_fn: Optional[Callable] = None,
329
+ current_frame=None,
330
+ mode="training",
331
+ reference_length=None,
332
+ frame_idx=None
333
+ ):
334
+ real_steps = torch.linspace(-1, self.timesteps - 1, steps=self.sampling_timesteps + 1, device=x.device).long()
335
+
336
+ # convert noise levels (0 ~ sampling_timesteps) to real noise levels (-1 ~ timesteps - 1)
337
+ curr_noise_level = real_steps[curr_noise_level]
338
+ next_noise_level = real_steps[next_noise_level]
339
+
340
+ if self.is_ddim_sampling:
341
+ return self.ddim_sample_step(
342
+ x=x,
343
+ action_cond=action_cond,
344
+ pose_cond=pose_cond,
345
+ curr_noise_level=curr_noise_level,
346
+ next_noise_level=next_noise_level,
347
+ guidance_fn=guidance_fn,
348
+ current_frame=current_frame,
349
+ mode=mode,
350
+ reference_length=reference_length,
351
+ frame_idx=frame_idx
352
+ )
353
+
354
+ # FIXME: temporary code for checking ddpm sampling
355
+ assert torch.all(
356
+ (curr_noise_level - 1 == next_noise_level) | ((curr_noise_level == -1) & (next_noise_level == -1))
357
+ ), "Wrong noise level given for ddpm sampling."
358
+
359
+ assert (
360
+ self.sampling_timesteps == self.timesteps
361
+ ), "sampling_timesteps should be equal to timesteps for ddpm sampling."
362
+
363
+ return self.ddpm_sample_step(
364
+ x=x,
365
+ action_cond=action_cond,
366
+ pose_cond=pose_cond,
367
+ curr_noise_level=curr_noise_level,
368
+ guidance_fn=guidance_fn,
369
+ reference_length=reference_length,
370
+ frame_idx=frame_idx
371
+ )
372
+
373
+ def ddpm_sample_step(
374
+ self,
375
+ x: torch.Tensor,
376
+ action_cond: Optional[torch.Tensor],
377
+ pose_cond,
378
+ curr_noise_level: torch.Tensor,
379
+ guidance_fn: Optional[Callable] = None,
380
+ reference_length=None,
381
+ frame_idx=None,
382
+ ):
383
+ clipped_curr_noise_level = torch.where(
384
+ curr_noise_level < 0,
385
+ torch.full_like(curr_noise_level, self.stabilization_level - 1, dtype=torch.long),
386
+ curr_noise_level,
387
+ )
388
+
389
+ # treating as stabilization would require us to scale with sqrt of alpha_cum
390
+ orig_x = x.clone().detach()
391
+ scaled_context = self.q_sample(
392
+ x,
393
+ clipped_curr_noise_level,
394
+ noise=torch.zeros_like(x),
395
+ )
396
+ x = torch.where(self.add_shape_channels(curr_noise_level < 0), scaled_context, orig_x)
397
+
398
+ if guidance_fn is not None:
399
+ raise NotImplementedError("Guidance function is not implemented for ddpm sampling yet.")
400
+
401
+ else:
402
+ model_mean, _, model_log_variance = self.p_mean_variance(
403
+ x=x,
404
+ t=clipped_curr_noise_level,
405
+ action_cond=action_cond,
406
+ pose_cond=pose_cond,
407
+ reference_length=reference_length,
408
+ frame_idx=frame_idx
409
+ )
410
+
411
+ noise = torch.where(
412
+ self.add_shape_channels(clipped_curr_noise_level > 0),
413
+ torch.randn_like(x),
414
+ 0,
415
+ )
416
+ noise = torch.clamp(noise, -self.clip_noise, self.clip_noise)
417
+ x_pred = model_mean + torch.exp(0.5 * model_log_variance) * noise
418
+
419
+ # only update frames where the noise level decreases
420
+ return torch.where(self.add_shape_channels(curr_noise_level == -1), orig_x, x_pred)
421
+
422
+ def ddim_sample_step(
423
+ self,
424
+ x: torch.Tensor,
425
+ action_cond: Optional[torch.Tensor],
426
+ pose_cond,
427
+ curr_noise_level: torch.Tensor,
428
+ next_noise_level: torch.Tensor,
429
+ guidance_fn: Optional[Callable] = None,
430
+ current_frame=None,
431
+ mode="training",
432
+ reference_length=None,
433
+ frame_idx=None
434
+ ):
435
+ # convert noise level -1 to self.stabilization_level - 1
436
+ clipped_curr_noise_level = torch.where(
437
+ curr_noise_level < 0,
438
+ torch.full_like(curr_noise_level, self.stabilization_level - 1, dtype=torch.long),
439
+ curr_noise_level,
440
+ )
441
+
442
+ # treating as stabilization would require us to scale with sqrt of alpha_cum
443
+ orig_x = x.clone().detach()
444
+ scaled_context = self.q_sample(
445
+ x,
446
+ clipped_curr_noise_level,
447
+ noise=torch.zeros_like(x),
448
+ )
449
+ x = torch.where(self.add_shape_channels(curr_noise_level < 0), scaled_context, orig_x)
450
+
451
+ alpha = self.alphas_cumprod[clipped_curr_noise_level]
452
+ alpha_next = torch.where(
453
+ next_noise_level < 0,
454
+ torch.ones_like(next_noise_level),
455
+ self.alphas_cumprod[next_noise_level],
456
+ )
457
+ sigma = torch.where(
458
+ next_noise_level < 0,
459
+ torch.zeros_like(next_noise_level),
460
+ self.ddim_sampling_eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt(),
461
+ )
462
+ c = (1 - alpha_next - sigma**2).sqrt()
463
+
464
+ alpha_next = self.add_shape_channels(alpha_next)
465
+ c = self.add_shape_channels(c)
466
+ sigma = self.add_shape_channels(sigma)
467
+
468
+ if guidance_fn is not None:
469
+ with torch.enable_grad():
470
+ x = x.detach().requires_grad_()
471
+
472
+ model_pred = self.model_predictions(
473
+ x=x,
474
+ t=clipped_curr_noise_level,
475
+ action_cond=action_cond,
476
+ pose_cond=pose_cond,
477
+ current_frame=current_frame,
478
+ mode=mode,
479
+ reference_length=reference_length,
480
+ frame_idx=frame_idx
481
+ )
482
+
483
+ guidance_loss = guidance_fn(model_pred.pred_x_start)
484
+ grad = -torch.autograd.grad(
485
+ guidance_loss,
486
+ x,
487
+ )[0]
488
+
489
+ pred_noise = model_pred.pred_noise + (1 - alpha_next).sqrt() * grad
490
+ x_start = self.predict_start_from_noise(x, clipped_curr_noise_level, pred_noise)
491
+
492
+ else:
493
+ # print(clipped_curr_noise_level)
494
+ model_pred = self.model_predictions(
495
+ x=x,
496
+ t=clipped_curr_noise_level,
497
+ action_cond=action_cond,
498
+ pose_cond=pose_cond,
499
+ current_frame=current_frame,
500
+ mode=mode,
501
+ reference_length=reference_length,
502
+ frame_idx=frame_idx
503
+ )
504
+ x_start = model_pred.pred_x_start
505
+ pred_noise = model_pred.pred_noise
506
+
507
+ noise = torch.randn_like(x)
508
+ noise = torch.clamp(noise, -self.clip_noise, self.clip_noise)
509
+
510
+ x_pred = x_start * alpha_next.sqrt() + pred_noise * c + sigma * noise
511
+
512
+ # only update frames where the noise level decreases
513
+ mask = curr_noise_level == next_noise_level
514
+ x_pred = torch.where(
515
+ self.add_shape_channels(mask),
516
+ orig_x,
517
+ x_pred,
518
+ )
519
+
520
+ return x_pred
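For orientation, the scheduler indices consumed by `sample_step` are plain integers in `[0, sampling_timesteps]` that are mapped onto real diffusion timesteps by the `linspace` above. A minimal, self-contained sketch of that mapping (illustrative only; the values for `timesteps` and `sampling_timesteps` are assumptions, and the actual per-frame scheduling lives elsewhere in this commit):

import torch

timesteps, sampling_timesteps = 1000, 20                       # assumed config values
real_steps = torch.linspace(-1, timesteps - 1, steps=sampling_timesteps + 1).long()
print(real_steps[:3], real_steps[-1])                          # tensor([-1, 49, 99]) ... tensor(999)
# Index 0 maps to -1 ("clean", handled via stabilization_level); index 20 maps to 999 (pure noise).
# A full denoise would call sample_step once per adjacent pair of indices,
# expanded to (frames, batch) tensors for curr_noise_level / next_noise_level:
schedule = [(k, k - 1) for k in range(sampling_timesteps, 0, -1)]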
algorithms/worldmem/models/dit.py ADDED
@@ -0,0 +1,572 @@
1
+ """
2
+ References:
3
+ - DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
4
+ - Diffusion Forcing: https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/unet3d.py
5
+ - Latte: https://github.com/Vchitect/Latte/blob/main/models/latte.py
6
+ """
7
+
8
+ from typing import Optional, Literal
9
+ import torch
10
+ from torch import nn
11
+ from .rotary_embedding_torch import RotaryEmbedding
12
+ from einops import rearrange
13
+ from .attention import SpatialAxialAttention, TemporalAxialAttention, MemTemporalAxialAttention, MemFullAttention
14
+ from timm.models.vision_transformer import Mlp
15
+ from timm.layers.helpers import to_2tuple
16
+ import math
17
+ from collections import namedtuple
18
+ from typing import Optional, Callable
19
+ from .cameractrl_module import SimpleCameraPoseEncoder
20
+
21
+ def modulate(x, shift, scale):
22
+ fixed_dims = [1] * len(shift.shape[1:])
23
+ shift = shift.repeat(x.shape[0] // shift.shape[0], *fixed_dims)
24
+ scale = scale.repeat(x.shape[0] // scale.shape[0], *fixed_dims)
25
+ while shift.dim() < x.dim():
26
+ shift = shift.unsqueeze(-2)
27
+ scale = scale.unsqueeze(-2)
28
+ return x * (1 + scale) + shift
29
+
30
+ def gate(x, g):
31
+ fixed_dims = [1] * len(g.shape[1:])
32
+ g = g.repeat(x.shape[0] // g.shape[0], *fixed_dims)
33
+ while g.dim() < x.dim():
34
+ g = g.unsqueeze(-2)
35
+ return g * x
36
+
37
+
38
+ class PatchEmbed(nn.Module):
39
+ """2D Image to Patch Embedding"""
40
+
41
+ def __init__(
42
+ self,
43
+ img_height=256,
44
+ img_width=256,
45
+ patch_size=16,
46
+ in_chans=3,
47
+ embed_dim=768,
48
+ norm_layer=None,
49
+ flatten=True,
50
+ ):
51
+ super().__init__()
52
+ img_size = (img_height, img_width)
53
+ patch_size = to_2tuple(patch_size)
54
+ self.img_size = img_size
55
+ self.patch_size = patch_size
56
+ self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
57
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
58
+ self.flatten = flatten
59
+
60
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
61
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
62
+
63
+ def forward(self, x, random_sample=False):
64
+ B, C, H, W = x.shape
65
+ assert random_sample or (H == self.img_size[0] and W == self.img_size[1]), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
66
+
67
+ x = self.proj(x)
68
+ if self.flatten:
69
+ x = rearrange(x, "B C H W -> B (H W) C")
70
+ else:
71
+ x = rearrange(x, "B C H W -> B H W C")
72
+ x = self.norm(x)
73
+ return x
74
+
75
+
76
+ class TimestepEmbedder(nn.Module):
77
+ """
78
+ Embeds scalar timesteps into vector representations.
79
+ """
80
+
81
+ def __init__(self, hidden_size, frequency_embedding_size=256, freq_type='time_step'):
82
+ super().__init__()
83
+ self.mlp = nn.Sequential(
84
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True), # hidden_size is diffusion model hidden size
85
+ nn.SiLU(),
86
+ nn.Linear(hidden_size, hidden_size, bias=True),
87
+ )
88
+ self.frequency_embedding_size = frequency_embedding_size
89
+ self.freq_type = freq_type
90
+
91
+ @staticmethod
92
+ def timestep_embedding(t, dim, max_period=10000, freq_type='time_step'):
93
+ """
94
+ Create sinusoidal timestep embeddings.
95
+ :param t: a 1-D Tensor of N indices, one per batch element.
96
+ These may be fractional.
97
+ :param dim: the dimension of the output.
98
+ :param max_period: controls the minimum frequency of the embeddings.
99
+ :return: an (N, D) Tensor of positional embeddings.
100
+ """
101
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
102
+ half = dim // 2
103
+
104
+ if freq_type == 'time_step':
105
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(device=t.device)
106
+ elif freq_type == 'spatial': # ~(-5 5)
107
+ freqs = torch.linspace(1.0, half, half).to(device=t.device) * torch.pi
108
+ elif freq_type == 'angle': # 0-360
109
+ freqs = torch.linspace(1.0, half, half).to(device=t.device) * torch.pi / 180
110
+
111
+
112
+ args = t[:, None].float() * freqs[None]
113
+
114
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
115
+ if dim % 2:
116
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
117
+ return embedding
118
+
119
+ def forward(self, t):
120
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size, freq_type=self.freq_type)
121
+ t_emb = self.mlp(t_freq)
122
+ return t_emb
123
+
124
+
125
+ class FinalLayer(nn.Module):
126
+ """
127
+ The final layer of DiT.
128
+ """
129
+
130
+ def __init__(self, hidden_size, patch_size, out_channels):
131
+ super().__init__()
132
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
133
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
134
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
135
+
136
+ def forward(self, x, c):
137
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
138
+ x = modulate(self.norm_final(x), shift, scale)
139
+ x = self.linear(x)
140
+ return x
141
+
142
+
143
+ class SpatioTemporalDiTBlock(nn.Module):
144
+ def __init__(
145
+ self,
146
+ hidden_size,
147
+ num_heads,
148
+ reference_length,
149
+ mlp_ratio=4.0,
150
+ is_causal=True,
151
+ spatial_rotary_emb: Optional[RotaryEmbedding] = None,
152
+ temporal_rotary_emb: Optional[RotaryEmbedding] = None,
153
+ reference_rotary_emb=None,
154
+ use_plucker=False,
155
+ relative_embedding=False,
156
+ state_embed_only_on_qk=False,
157
+ use_memory_attention=False,
158
+ ref_mode='sequential'
159
+ ):
160
+ super().__init__()
161
+ self.is_causal = is_causal
162
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
163
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
164
+
165
+ self.s_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
166
+ self.s_attn = SpatialAxialAttention(
167
+ hidden_size,
168
+ heads=num_heads,
169
+ dim_head=hidden_size // num_heads,
170
+ rotary_emb=spatial_rotary_emb
171
+ )
172
+ self.s_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
173
+ self.s_mlp = Mlp(
174
+ in_features=hidden_size,
175
+ hidden_features=mlp_hidden_dim,
176
+ act_layer=approx_gelu,
177
+ drop=0,
178
+ )
179
+ self.s_adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
180
+
181
+ self.t_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
182
+ self.t_attn = TemporalAxialAttention(
183
+ hidden_size,
184
+ heads=num_heads,
185
+ dim_head=hidden_size // num_heads,
186
+ is_causal=is_causal,
187
+ rotary_emb=temporal_rotary_emb,
188
+ reference_length=reference_length
189
+ )
190
+ self.t_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
191
+ self.t_mlp = Mlp(
192
+ in_features=hidden_size,
193
+ hidden_features=mlp_hidden_dim,
194
+ act_layer=approx_gelu,
195
+ drop=0,
196
+ )
197
+ self.t_adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
198
+
199
+ self.use_memory_attention = use_memory_attention
200
+ if self.use_memory_attention:
201
+ self.r_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
202
+ self.ref_type = "full_ref"
203
+ if self.ref_type == "temporal_ref":
204
+ self.r_attn = MemTemporalAxialAttention(
205
+ hidden_size,
206
+ heads=num_heads,
207
+ dim_head=hidden_size // num_heads,
208
+ is_causal=is_causal,
209
+ rotary_emb=None
210
+ )
211
+ elif self.ref_type == "full_ref":
212
+ self.r_attn = MemFullAttention(
213
+ hidden_size,
214
+ heads=num_heads,
215
+ dim_head=hidden_size // num_heads,
216
+ is_causal=is_causal,
217
+ rotary_emb=reference_rotary_emb,
218
+ reference_length=reference_length
219
+ )
220
+ self.r_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
221
+ self.r_mlp = Mlp(
222
+ in_features=hidden_size,
223
+ hidden_features=mlp_hidden_dim,
224
+ act_layer=approx_gelu,
225
+ drop=0,
226
+ )
227
+ self.r_adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
228
+
229
+ self.use_plucker = use_plucker
230
+ if use_plucker:
231
+ self.pose_cond_mlp = nn.Linear(hidden_size, hidden_size)
232
+ self.temporal_pose_cond_mlp = nn.Linear(hidden_size, hidden_size)
233
+
234
+ self.reference_length = reference_length
235
+ self.relative_embedding = relative_embedding
236
+ self.state_embed_only_on_qk = state_embed_only_on_qk
237
+
238
+ self.ref_mode = ref_mode
239
+
240
+ if self.ref_mode == 'parallel':
241
+ self.parallel_map = nn.Linear(hidden_size, hidden_size)
242
+
243
+ def forward(self, x, c, current_frame=None, timestep=None, is_last_block=False,
244
+ pose_cond=None, mode="training", c_action_cond=None, reference_length=None):
245
+ B, T, H, W, D = x.shape
246
+
247
+ # spatial block
248
+
249
+ s_shift_msa, s_scale_msa, s_gate_msa, s_shift_mlp, s_scale_mlp, s_gate_mlp = self.s_adaLN_modulation(c).chunk(6, dim=-1)
250
+ x = x + gate(self.s_attn(modulate(self.s_norm1(x), s_shift_msa, s_scale_msa)), s_gate_msa)
251
+ x = x + gate(self.s_mlp(modulate(self.s_norm2(x), s_shift_mlp, s_scale_mlp)), s_gate_mlp)
252
+
253
+ # temporal block
254
+ if c_action_cond is not None:
255
+ t_shift_msa, t_scale_msa, t_gate_msa, t_shift_mlp, t_scale_mlp, t_gate_mlp = self.t_adaLN_modulation(c_action_cond).chunk(6, dim=-1)
256
+ else:
257
+ t_shift_msa, t_scale_msa, t_gate_msa, t_shift_mlp, t_scale_mlp, t_gate_mlp = self.t_adaLN_modulation(c).chunk(6, dim=-1)
258
+
259
+ x_t = x + gate(self.t_attn(modulate(self.t_norm1(x), t_shift_msa, t_scale_msa)), t_gate_msa)
260
+ x_t = x_t + gate(self.t_mlp(modulate(self.t_norm2(x_t), t_shift_mlp, t_scale_mlp)), t_gate_mlp)
261
+
262
+ if self.ref_mode == 'sequential':
263
+ x = x_t
264
+
265
+ # memory block
266
+ relative_embedding = self.relative_embedding # and mode == "training"
267
+
268
+ if self.use_memory_attention:
269
+ r_shift_msa, r_scale_msa, r_gate_msa, r_shift_mlp, r_scale_mlp, r_gate_mlp = self.r_adaLN_modulation(c).chunk(6, dim=-1)
270
+
271
+ if pose_cond is not None:
272
+ if self.use_plucker:
273
+ input_cond = self.pose_cond_mlp(pose_cond)
274
+
275
+ if relative_embedding:
276
+ n_frames = x.shape[1] - reference_length
277
+ x1_relative_embedding = []
278
+ r_shift_msa_relative_embedding = []
279
+ r_scale_msa_relative_embedding = []
280
+ for i in range(n_frames):
281
+ x1_relative_embedding.append(torch.cat([x[:,i:i+1], x[:, -reference_length:]], dim=1).clone())
282
+ r_shift_msa_relative_embedding.append(torch.cat([r_shift_msa[:,i:i+1], r_shift_msa[:, -reference_length:]], dim=1).clone())
283
+ r_scale_msa_relative_embedding.append(torch.cat([r_scale_msa[:,i:i+1], r_scale_msa[:, -reference_length:]], dim=1).clone())
284
+ x1_zero_frame = torch.cat(x1_relative_embedding, dim=1)
285
+ r_shift_msa = torch.cat(r_shift_msa_relative_embedding, dim=1)
286
+ r_scale_msa = torch.cat(r_scale_msa_relative_embedding, dim=1)
287
+
288
+ # if current_frame == 18:
289
+ # import pdb;pdb.set_trace()
290
+
291
+ if self.state_embed_only_on_qk:
292
+ attn_input = x1_zero_frame
293
+ extra_condition = input_cond
294
+ else:
295
+ attn_input = input_cond + x1_zero_frame
296
+ extra_condition = None
297
+ else:
298
+ attn_input = input_cond + x
299
+ extra_condition = None
300
+ # print("input_cond2:", input_cond.abs().mean())
301
+ # print("c:", c.abs().mean())
302
+ # input_cond = x1
303
+
304
+ x = x + gate(self.r_attn(modulate(self.r_norm1(attn_input), r_shift_msa, r_scale_msa),
305
+ relative_embedding=relative_embedding,
306
+ extra_condition=extra_condition,
307
+ state_embed_only_on_qk=self.state_embed_only_on_qk,
308
+ reference_length=reference_length), r_gate_msa)
309
+ else:
310
+ # pose_cond *= 0
311
+ x = x + gate(self.r_attn(modulate(self.r_norm1(x+pose_cond[:,:,None, None]), r_shift_msa, r_scale_msa),
312
+ current_frame=current_frame, timestep=timestep,
313
+ is_last_block=is_last_block,
314
+ reference_length=reference_length), r_gate_msa)
315
+ else:
316
+ x = x + gate(self.r_attn(modulate(self.r_norm1(x), r_shift_msa, r_scale_msa), current_frame=current_frame, timestep=timestep,
317
+ is_last_block=is_last_block), r_gate_msa)
318
+
319
+ x = x + gate(self.r_mlp(modulate(self.r_norm2(x), r_shift_mlp, r_scale_mlp)), r_gate_mlp)
320
+
321
+ if self.ref_mode == 'parallel':
322
+ x = x_t + self.parallel_map(x)
323
+
324
+ return x
325
+
326
+ # print((x1-x2).abs().sum())
327
+ # r_shift_msa, r_scale_msa, r_gate_msa, r_shift_mlp, r_scale_mlp, r_gate_mlp = self.r_adaLN_modulation(c).chunk(6, dim=-1)
328
+ # x2 = x1 + gate(self.r_attn(modulate(self.r_norm1(x_), r_shift_msa, r_scale_msa)), r_gate_msa)
329
+ # x2 = gate(self.r_mlp(modulate(self.r_norm2(x2), r_shift_mlp, r_scale_mlp)), r_gate_mlp)
330
+ # x = x1 + x2
331
+
332
+ # print(x.mean())
333
+ # return x
334
+
335
+
336
+ class DiT(nn.Module):
337
+ """
338
+ Diffusion model with a Transformer backbone.
339
+ """
340
+
341
+ def __init__(
342
+ self,
343
+ input_h=18,
344
+ input_w=32,
345
+ patch_size=2,
346
+ in_channels=16,
347
+ hidden_size=1024,
348
+ depth=12,
349
+ num_heads=16,
350
+ mlp_ratio=4.0,
351
+ action_cond_dim=25,
352
+ pose_cond_dim=4,
353
+ max_frames=32,
354
+ reference_length=8,
355
+ use_plucker=False,
356
+ relative_embedding=False,
357
+ state_embed_only_on_qk=False,
358
+ use_memory_attention=False,
359
+ add_timestamp_embedding=False,
360
+ ref_mode='sequential'
361
+ ):
362
+ super().__init__()
363
+ self.in_channels = in_channels
364
+ self.out_channels = in_channels
365
+ self.patch_size = patch_size
366
+ self.num_heads = num_heads
367
+ self.max_frames = max_frames
368
+
369
+ self.x_embedder = PatchEmbed(input_h, input_w, patch_size, in_channels, hidden_size, flatten=False)
370
+ self.t_embedder = TimestepEmbedder(hidden_size)
371
+
372
+ self.add_timestamp_embedding = add_timestamp_embedding
373
+ if self.add_timestamp_embedding:
374
+ self.timestamp_embedding = TimestepEmbedder(hidden_size)
375
+
376
+ frame_h, frame_w = self.x_embedder.grid_size
377
+
378
+ self.spatial_rotary_emb = RotaryEmbedding(dim=hidden_size // num_heads // 2, freqs_for="pixel", max_freq=256)
379
+ self.temporal_rotary_emb = RotaryEmbedding(dim=hidden_size // num_heads)
380
+ # self.reference_rotary_emb = RotaryEmbedding(dim=hidden_size // num_heads // 2, freqs_for="pixel", max_freq=256)
381
+ self.reference_rotary_emb = None
382
+
383
+ self.external_cond = nn.Linear(action_cond_dim, hidden_size) if action_cond_dim > 0 else nn.Identity()
384
+
385
+ # self.pose_cond = nn.Linear(pose_cond_dim, hidden_size) if pose_cond_dim > 0 else nn.Identity()
386
+
387
+ self.use_plucker = use_plucker
388
+ if not self.use_plucker:
389
+ self.position_embedder = TimestepEmbedder(hidden_size, freq_type='spatial')
390
+ self.angle_embedder = TimestepEmbedder(hidden_size, freq_type='angle')
391
+ else:
392
+ self.pose_embedder = SimpleCameraPoseEncoder(c_in=6, c_out=hidden_size)
393
+
394
+ self.blocks = nn.ModuleList(
395
+ [
396
+ SpatioTemporalDiTBlock(
397
+ hidden_size,
398
+ num_heads,
399
+ mlp_ratio=mlp_ratio,
400
+ is_causal=True,
401
+ reference_length=reference_length,
402
+ spatial_rotary_emb=self.spatial_rotary_emb,
403
+ temporal_rotary_emb=self.temporal_rotary_emb,
404
+ reference_rotary_emb=self.reference_rotary_emb,
405
+ use_plucker=self.use_plucker,
406
+ relative_embedding=relative_embedding,
407
+ state_embed_only_on_qk=state_embed_only_on_qk,
408
+ use_memory_attention=use_memory_attention,
409
+ ref_mode=ref_mode
410
+ )
411
+ for _ in range(depth)
412
+ ]
413
+ )
414
+ self.use_memory_attention = use_memory_attention
415
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
416
+ self.initialize_weights()
417
+
418
+ def initialize_weights(self):
419
+ # Initialize transformer layers:
420
+ def _basic_init(module):
421
+ if isinstance(module, nn.Linear):
422
+ torch.nn.init.xavier_uniform_(module.weight)
423
+ if module.bias is not None:
424
+ nn.init.constant_(module.bias, 0)
425
+
426
+ self.apply(_basic_init)
427
+
428
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
429
+ w = self.x_embedder.proj.weight.data
430
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
431
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
432
+
433
+ # Initialize timestep embedding MLP:
434
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
435
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
436
+
437
+ if self.use_memory_attention:
438
+ if not self.use_plucker:
439
+ nn.init.normal_(self.position_embedder.mlp[0].weight, std=0.02)
440
+ nn.init.normal_(self.position_embedder.mlp[2].weight, std=0.02)
441
+
442
+ nn.init.normal_(self.angle_embedder.mlp[0].weight, std=0.02)
443
+ nn.init.normal_(self.angle_embedder.mlp[2].weight, std=0.02)
444
+
445
+ if self.add_timestamp_embedding:
446
+ nn.init.normal_(self.timestamp_embedding.mlp[0].weight, std=0.02)
447
+ nn.init.normal_(self.timestamp_embedding.mlp[2].weight, std=0.02)
448
+
449
+
450
+ # Zero-out adaLN modulation layers in DiT blocks:
451
+ for block in self.blocks:
452
+ nn.init.constant_(block.s_adaLN_modulation[-1].weight, 0)
453
+ nn.init.constant_(block.s_adaLN_modulation[-1].bias, 0)
454
+ nn.init.constant_(block.t_adaLN_modulation[-1].weight, 0)
455
+ nn.init.constant_(block.t_adaLN_modulation[-1].bias, 0)
456
+
457
+ if self.use_plucker and self.use_memory_attention:
458
+ nn.init.constant_(block.pose_cond_mlp.weight, 0)
459
+ nn.init.constant_(block.pose_cond_mlp.bias, 0)
460
+
461
+ # Zero-out output layers:
462
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
463
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
464
+ nn.init.constant_(self.final_layer.linear.weight, 0)
465
+ nn.init.constant_(self.final_layer.linear.bias, 0)
466
+
467
+ def unpatchify(self, x):
468
+ """
469
+ x: (N, H, W, patch_size**2 * C)
470
+ imgs: (N, H, W, C)
471
+ """
472
+ c = self.out_channels
473
+ p = self.x_embedder.patch_size[0]
474
+ h = x.shape[1]
475
+ w = x.shape[2]
476
+
477
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
478
+ x = torch.einsum("nhwpqc->nchpwq", x)
479
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
480
+ return imgs
481
+
482
+ def forward(self, x, t, action_cond=None, pose_cond=None, current_frame=None, mode=None,
483
+ reference_length=None, frame_idx=None):
484
+ """
485
+ Forward pass of DiT.
486
+ x: (B, T, C, H, W) tensor of spatial inputs (images or latent representations of images)
487
+ t: (B, T,) tensor of diffusion timesteps
488
+ """
489
+
490
+ B, T, C, H, W = x.shape
491
+
492
+ # add spatial embeddings
493
+ x = rearrange(x, "b t c h w -> (b t) c h w")
494
+
495
+ x = self.x_embedder(x) # (B*T, C, H, W) -> (B*T, H/2, W/2, D) , C = 16, D = d_model
496
+ # restore shape
497
+ x = rearrange(x, "(b t) h w d -> b t h w d", t=T)
498
+ # embed noise steps
499
+ t = rearrange(t, "b t -> (b t)")
500
+
501
+ c_t = self.t_embedder(t) # (N, D)
502
+ c = c_t.clone()
503
+ c = rearrange(c, "(b t) d -> b t d", t=T)
504
+
505
+ if torch.is_tensor(action_cond):
506
+ try:
507
+ c_action_cond = c + self.external_cond(action_cond)
508
+ except:
509
+ import pdb;pdb.set_trace()
510
+ else:
511
+ c_action_cond = None
512
+
513
+ if torch.is_tensor(pose_cond):
514
+ if not self.use_plucker:
515
+ pose_cond = pose_cond.to(action_cond.dtype)
516
+ b_, t_, d_ = pose_cond.shape
517
+ pos_emb = self.position_embedder(rearrange(pose_cond[...,:3], "b t d -> (b t d)"))
518
+ angle_emb = self.angle_embedder(rearrange(pose_cond[...,3:], "b t d -> (b t d)"))
519
+ pos_emb = rearrange(pos_emb, "(b t d) c -> b t d c", b=b_, t=t_, d=3).sum(-2)
520
+ angle_emb = rearrange(angle_emb, "(b t d) c -> b t d c", b=b_, t=t_, d=2).sum(-2)
521
+ pc = pos_emb + angle_emb
522
+ else:
523
+ pose_cond = pose_cond[:, :, ::40, ::40]
524
+ # pc = self.pose_embedder(pose_cond)[0]
525
+ # pc = pc.permute(0,2,3,4,1)
526
+ pc = self.pose_embedder(pose_cond)
527
+ pc = pc.permute(1,0,2,3,4)
528
+
529
+ if torch.is_tensor(frame_idx) and self.add_timestamp_embedding:
530
+ bb = frame_idx.shape[1]
531
+ frame_idx = rearrange(frame_idx, "t b -> (b t)")
532
+ frame_idx = self.timestamp_embedding(frame_idx)
533
+ frame_idx = rearrange(frame_idx, "(b t) d -> b t d", b=bb)
534
+ pc = pc + frame_idx[:, :, None, None]
535
+
536
+ # pc = pc + rearrange(c_t.clone(), "(b t) d -> b t d", t=T)[:,:,None,None] # add time condition for different timestep scaling
537
+ else:
538
+ pc = None
539
+
540
+ for i, block in enumerate(self.blocks):
541
+ x = block(x, c, current_frame=current_frame, timestep=t, is_last_block= (i+1 == len(self.blocks)),
542
+ pose_cond=pc, mode=mode, c_action_cond=c_action_cond, reference_length=reference_length) # (N, T, H, W, D)
543
+ x = self.final_layer(x, c) # (N, T, H, W, patch_size ** 2 * out_channels)
544
+ # unpatchify
545
+ x = rearrange(x, "b t h w d -> (b t) h w d")
546
+ x = self.unpatchify(x) # (N, out_channels, H, W)
547
+ x = rearrange(x, "(b t) c h w -> b t c h w", t=T)
548
+ return x
549
+
550
+
551
+ def DiT_S_2(action_cond_dim, pose_cond_dim, reference_length,
552
+ use_plucker, relative_embedding,
553
+ state_embed_only_on_qk, use_memory_attention, add_timestamp_embedding,
554
+ ref_mode):
555
+ return DiT(
556
+ patch_size=2,
557
+ hidden_size=1024,
558
+ depth=16,
559
+ num_heads=16,
560
+ action_cond_dim=action_cond_dim,
561
+ pose_cond_dim=pose_cond_dim,
562
+ reference_length=reference_length,
563
+ use_plucker=use_plucker,
564
+ relative_embedding=relative_embedding,
565
+ state_embed_only_on_qk=state_embed_only_on_qk,
566
+ use_memory_attention=use_memory_attention,
567
+ add_timestamp_embedding=add_timestamp_embedding,
568
+ ref_mode=ref_mode
569
+ )
570
+
571
+
572
+ DiT_models = {"DiT-S/2": DiT_S_2}
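The patch and timestep embedders above are self-contained, so their output shapes can be sanity-checked in isolation. A minimal sketch, assuming the module is importable under the path shown in this diff (the full `DiT` forward additionally depends on the attention modules defined in `attention.py`, added earlier in this commit):

import torch
from algorithms.worldmem.models.dit import PatchEmbed, TimestepEmbedder

emb = TimestepEmbedder(hidden_size=1024)
t = torch.randint(0, 1000, (8,))                   # (B*T,) flattened diffusion timesteps
print(emb(t).shape)                                # torch.Size([8, 1024])

patches = PatchEmbed(img_height=18, img_width=32, patch_size=2, in_chans=16,
                     embed_dim=1024, flatten=False)
z = torch.randn(8, 16, 18, 32)                     # (B*T, C, H, W) VAE latents
print(patches(z).shape)                            # torch.Size([8, 9, 16, 1024]) = (B*T, H/2, W/2, D)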
algorithms/worldmem/models/pose_prediction.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ class PosePredictionNet(nn.Module):
6
+ def __init__(self, img_channels=16, img_feat_dim=256, pose_dim=5, action_dim=25, hidden_dim=128):
7
+ super(PosePredictionNet, self).__init__()
8
+
9
+ self.cnn = nn.Sequential(
10
+ nn.Conv2d(img_channels, 32, kernel_size=3, stride=2, padding=1),
11
+ nn.ReLU(),
12
+ nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
13
+ nn.ReLU(),
14
+ nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
15
+ nn.ReLU(),
16
+ nn.AdaptiveAvgPool2d((1, 1))
17
+ )
18
+
19
+ self.fc_img = nn.Linear(128, img_feat_dim)
20
+
21
+ self.mlp_motion = nn.Sequential(
22
+ nn.Linear(pose_dim + action_dim, hidden_dim),
23
+ nn.ReLU(),
24
+ nn.Linear(hidden_dim, hidden_dim),
25
+ nn.ReLU()
26
+ )
27
+
28
+ self.fc_out = nn.Sequential(
29
+ nn.Linear(img_feat_dim + hidden_dim, hidden_dim),
30
+ nn.ReLU(),
31
+ nn.Linear(hidden_dim, pose_dim)
32
+ )
33
+
34
+ def forward(self, img, action, pose):
35
+ img_feat = self.cnn(img).view(img.size(0), -1)
36
+ img_feat = self.fc_img(img_feat)
37
+
38
+ motion_feat = self.mlp_motion(torch.cat([pose, action], dim=1))
39
+ fused_feat = torch.cat([img_feat, motion_feat], dim=1)
40
+ pose_next_pred = self.fc_out(fused_feat)
41
+
42
+ return pose_next_pred
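PosePredictionNet only depends on torch, so a quick shape check with dummy inputs is straightforward. A minimal sketch (the semantic layout of the 5-dim pose, e.g. position plus pitch/yaw, is an assumption):

import torch
from algorithms.worldmem.models.pose_prediction import PosePredictionNet

net = PosePredictionNet()                          # defaults: 16-ch latent, 25-dim action, 5-dim pose
img = torch.randn(2, 16, 18, 32)                   # (B, C, H, W) latent frame
action = torch.randn(2, 25)                        # (B, action_dim)
pose = torch.randn(2, 5)                           # (B, pose_dim), current pose
print(net(img, action, pose).shape)                # torch.Size([2, 5]) -> predicted next pose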
algorithms/worldmem/models/rotary_embedding_torch.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ Adapted from https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
3
+ """
4
+
5
+ from __future__ import annotations
6
+ from math import pi, log
7
+
8
+ import torch
9
+ from torch.nn import Module, ModuleList
10
+ from torch.amp import autocast
11
+ from torch import nn, einsum, broadcast_tensors, Tensor
12
+
13
+ from einops import rearrange, repeat
14
+
15
+ from typing import Literal
16
+
17
+ # helper functions
18
+
19
+
20
+ def exists(val):
21
+ return val is not None
22
+
23
+
24
+ def default(val, d):
25
+ return val if exists(val) else d
26
+
27
+
28
+ # broadcat, as tortoise-tts was using it
29
+
30
+
31
+ def broadcat(tensors, dim=-1):
32
+ broadcasted_tensors = broadcast_tensors(*tensors)
33
+ return torch.cat(broadcasted_tensors, dim=dim)
34
+
35
+
36
+ # rotary embedding helper functions
37
+
38
+
39
+ def rotate_half(x):
40
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
41
+ x1, x2 = x.unbind(dim=-1)
42
+ x = torch.stack((-x2, x1), dim=-1)
43
+ return rearrange(x, "... d r -> ... (d r)")
44
+
45
+
46
+ @autocast("cuda", enabled=False)
47
+ def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
48
+ dtype = t.dtype
49
+
50
+ if t.ndim == 3:
51
+ seq_len = t.shape[seq_dim]
52
+ freqs = freqs[-seq_len:]
53
+
54
+ rot_dim = freqs.shape[-1]
55
+ end_index = start_index + rot_dim
56
+
57
+ assert rot_dim <= t.shape[-1], f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
58
+
59
+ # Split t into three parts: left, middle (to be transformed), and right
60
+ t_left = t[..., :start_index]
61
+ t_middle = t[..., start_index:end_index]
62
+ t_right = t[..., end_index:]
63
+
64
+ # Apply rotary embeddings without modifying t in place
65
+ t_transformed = (t_middle * freqs.cos() * scale) + (rotate_half(t_middle) * freqs.sin() * scale)
66
+
67
+ out = torch.cat((t_left, t_transformed, t_right), dim=-1)
68
+
69
+ return out.type(dtype)
70
+
71
+
72
+ # learned rotation helpers
73
+
74
+
75
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
76
+ if exists(freq_ranges):
77
+ rotations = einsum("..., f -> ... f", rotations, freq_ranges)
78
+ rotations = rearrange(rotations, "... r f -> ... (r f)")
79
+
80
+ rotations = repeat(rotations, "... n -> ... (n r)", r=2)
81
+ return apply_rotary_emb(rotations, t, start_index=start_index)
82
+
83
+
84
+ # classes
85
+
86
+
87
+ class RotaryEmbedding(Module):
88
+ def __init__(
89
+ self,
90
+ dim,
91
+ custom_freqs: Tensor | None = None,
92
+ freqs_for: Literal["lang", "pixel", "constant"] = "lang",
93
+ theta=10000,
94
+ max_freq=10,
95
+ num_freqs=1,
96
+ learned_freq=False,
97
+ use_xpos=False,
98
+ xpos_scale_base=512,
99
+ interpolate_factor=1.0,
100
+ theta_rescale_factor=1.0,
101
+ seq_before_head_dim=False,
102
+ cache_if_possible=True,
103
+ cache_max_seq_len=8192,
104
+ ):
105
+ super().__init__()
106
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
107
+ # has some connection to NTK literature
108
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
109
+
110
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
111
+
112
+ self.freqs_for = freqs_for
113
+
114
+ if exists(custom_freqs):
115
+ freqs = custom_freqs
116
+ elif freqs_for == "lang":
117
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
118
+ elif freqs_for == "pixel":
119
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
120
+ elif freqs_for == "spacetime":
121
+ time_freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
122
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
123
+ elif freqs_for == "constant":
124
+ freqs = torch.ones(num_freqs).float()
125
+
126
+ if freqs_for == "spacetime":
127
+ self.time_freqs = nn.Parameter(time_freqs, requires_grad=learned_freq)
128
+ self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)
129
+
130
+ self.cache_if_possible = cache_if_possible
131
+ self.cache_max_seq_len = cache_max_seq_len
132
+
133
+ self.register_buffer("cached_freqs", torch.zeros(cache_max_seq_len, dim), persistent=False)
134
+ self.register_buffer("cached_freqs_seq_len", torch.tensor(0), persistent=False)
135
+
136
+ self.learned_freq = learned_freq
137
+
138
+ # dummy for device
139
+
140
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
141
+
142
+ # default sequence dimension
143
+
144
+ self.seq_before_head_dim = seq_before_head_dim
145
+ self.default_seq_dim = -3 if seq_before_head_dim else -2
146
+
147
+ # interpolation factors
148
+
149
+ assert interpolate_factor >= 1.0
150
+ self.interpolate_factor = interpolate_factor
151
+
152
+ # xpos
153
+
154
+ self.use_xpos = use_xpos
155
+
156
+ if not use_xpos:
157
+ return
158
+
159
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
160
+ self.scale_base = xpos_scale_base
161
+
162
+ self.register_buffer("scale", scale, persistent=False)
163
+ self.register_buffer("cached_scales", torch.zeros(cache_max_seq_len, dim), persistent=False)
164
+ self.register_buffer("cached_scales_seq_len", torch.tensor(0), persistent=False)
165
+
166
+ # add apply_rotary_emb as static method
167
+
168
+ self.apply_rotary_emb = staticmethod(apply_rotary_emb)
169
+
170
+ @property
171
+ def device(self):
172
+ return self.dummy.device
173
+
174
+ def get_seq_pos(self, seq_len, device, dtype, offset=0):
175
+ return (torch.arange(seq_len, device=device, dtype=dtype) + offset) / self.interpolate_factor
176
+
177
+ def rotate_queries_or_keys(self, t, freqs, seq_dim=None, offset=0, scale=None):
178
+ seq_dim = default(seq_dim, self.default_seq_dim)
179
+
180
+ assert not self.use_xpos or exists(scale), "you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings"
181
+
182
+ device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
183
+
184
+ seq = self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset)
185
+
186
+ seq_freqs = self.forward(seq, freqs, seq_len=seq_len, offset=offset)
187
+
188
+ if seq_dim == -3:
189
+ seq_freqs = rearrange(seq_freqs, "n d -> n 1 d")
190
+
191
+ return apply_rotary_emb(seq_freqs, t, scale=default(scale, 1.0), seq_dim=seq_dim)
192
+
193
+ def rotate_queries_with_cached_keys(self, q, k, seq_dim=None, offset=0):
194
+ dtype, device, seq_dim = (
195
+ q.dtype,
196
+ q.device,
197
+ default(seq_dim, self.default_seq_dim),
198
+ )
199
+
200
+ q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
201
+ assert q_len <= k_len
202
+
203
+ q_scale = k_scale = 1.0
204
+
205
+ if self.use_xpos:
206
+ seq = self.get_seq_pos(k_len, dtype=dtype, device=device)
207
+
208
+ q_scale = self.get_scale(seq[-q_len:]).type(dtype)
209
+ k_scale = self.get_scale(seq).type(dtype)
210
+
211
+ rotated_q = self.rotate_queries_or_keys(q, seq_dim=seq_dim, scale=q_scale, offset=k_len - q_len + offset)
212
+ rotated_k = self.rotate_queries_or_keys(k, seq_dim=seq_dim, scale=k_scale**-1)
213
+
214
+ rotated_q = rotated_q.type(q.dtype)
215
+ rotated_k = rotated_k.type(k.dtype)
216
+
217
+ return rotated_q, rotated_k
218
+
219
+ def rotate_queries_and_keys(self, q, k, freqs, seq_dim=None):
220
+ seq_dim = default(seq_dim, self.default_seq_dim)
221
+
222
+ assert self.use_xpos
223
+ device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
224
+
225
+ seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)
226
+
227
+ seq_freqs = self.forward(seq, freqs, seq_len=seq_len)
228
+ scale = self.get_scale(seq, seq_len=seq_len).to(dtype)
229
+
230
+ if seq_dim == -3:
231
+ seq_freqs = rearrange(seq_freqs, "n d -> n 1 d")
232
+ scale = rearrange(scale, "n d -> n 1 d")
233
+
234
+ rotated_q = apply_rotary_emb(seq_freqs, q, scale=scale, seq_dim=seq_dim)
235
+ rotated_k = apply_rotary_emb(seq_freqs, k, scale=scale**-1, seq_dim=seq_dim)
236
+
237
+ rotated_q = rotated_q.type(q.dtype)
238
+ rotated_k = rotated_k.type(k.dtype)
239
+
240
+ return rotated_q, rotated_k
241
+
242
+ def get_scale(self, t: Tensor, seq_len: int | None = None, offset=0):
243
+ assert self.use_xpos
244
+
245
+ should_cache = self.cache_if_possible and exists(seq_len) and (offset + seq_len) <= self.cache_max_seq_len
246
+
247
+ if should_cache and exists(self.cached_scales) and (seq_len + offset) <= self.cached_scales_seq_len.item():
248
+ return self.cached_scales[offset : (offset + seq_len)]
249
+
250
+ scale = 1.0
251
+ if self.use_xpos:
252
+ power = (t - len(t) // 2) / self.scale_base
253
+ scale = self.scale ** rearrange(power, "n -> n 1")
254
+ scale = repeat(scale, "n d -> n (d r)", r=2)
255
+
256
+ if should_cache and offset == 0:
257
+ self.cached_scales[:seq_len] = scale.detach()
258
+ self.cached_scales_seq_len.copy_(seq_len)
259
+
260
+ return scale
261
+
262
+ def get_axial_freqs(self, *dims):
263
+ Colon = slice(None)
264
+ all_freqs = []
265
+
266
+ for ind, dim in enumerate(dims):
267
+ # only allow pixel freqs for last two dimensions
268
+ use_pixel = (self.freqs_for == "pixel" or self.freqs_for == "spacetime") and ind >= len(dims) - 2
269
+ if use_pixel:
270
+ pos = torch.linspace(-1, 1, steps=dim, device=self.device)
271
+ else:
272
+ pos = torch.arange(dim, device=self.device)
273
+
274
+ if self.freqs_for == "spacetime" and not use_pixel:
275
+ seq_freqs = self.forward(pos, self.time_freqs, seq_len=dim)
276
+ else:
277
+ seq_freqs = self.forward(pos, self.freqs, seq_len=dim)
278
+
279
+ all_axis = [None] * len(dims)
280
+ all_axis[ind] = Colon
281
+
282
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
283
+ all_freqs.append(seq_freqs[new_axis_slice])
284
+
285
+ all_freqs = broadcast_tensors(*all_freqs)
286
+ return torch.cat(all_freqs, dim=-1)
287
+
288
+ @autocast("cuda", enabled=False)
289
+ def forward(self, t: Tensor, freqs: Tensor, seq_len=None, offset=0):
290
+ should_cache = self.cache_if_possible and not self.learned_freq and exists(seq_len) and self.freqs_for != "pixel" and (offset + seq_len) <= self.cache_max_seq_len
291
+
292
+ if should_cache and exists(self.cached_freqs) and (offset + seq_len) <= self.cached_freqs_seq_len.item():
293
+ return self.cached_freqs[offset : (offset + seq_len)].detach()
294
+
295
+ freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
296
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
297
+
298
+ if should_cache and offset == 0:
299
+ self.cached_freqs[:seq_len] = freqs.detach()
300
+ self.cached_freqs_seq_len.copy_(seq_len)
301
+
302
+ return freqs
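The attention layers in this commit use this module mainly through `get_axial_freqs` (2D "pixel" frequencies) and the module-level `apply_rotary_emb`; see, for example, the VAE attention below. A minimal sketch over a 9x16 latent grid (the head dimension and grid size are illustrative):

import torch
from algorithms.worldmem.models.rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb

rot = RotaryEmbedding(dim=16, freqs_for="pixel", max_freq=256)
freqs = rot.get_axial_freqs(9, 16)                 # (9, 16, 32): H and W frequencies concatenated
q = torch.randn(1, 8, 9, 16, 64)                   # (B, heads, H, W, head_dim)
print(apply_rotary_emb(freqs, q).shape)            # torch.Size([1, 8, 9, 16, 64])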
algorithms/worldmem/models/utils.py ADDED
@@ -0,0 +1,163 @@
1
+ """
2
+ Adapted from https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/utils.py
3
+ Action format derived from VPT https://github.com/openai/Video-Pre-Training
4
+ Adapted from https://github.com/etched-ai/open-oasis/blob/master/utils.py
5
+ """
6
+
7
+ import math
8
+ import torch
9
+ from torch import nn
10
+ from torchvision.io import read_image, read_video
11
+ from torchvision.transforms.functional import resize
12
+ from einops import rearrange
13
+ from typing import Mapping, Sequence
14
+ from einops import rearrange, parse_shape
15
+
16
+
17
+ def exists(val):
18
+ return val is not None
19
+
20
+
21
+ def default(val, d):
22
+ if exists(val):
23
+ return val
24
+ return d() if callable(d) else d
25
+
26
+
27
+ def extract(a, t, x_shape):
28
+ f, b = t.shape
29
+ out = a[t]
30
+ return out.reshape(f, b, *((1,) * (len(x_shape) - 2)))
31
+
32
+
33
+ def linear_beta_schedule(timesteps):
34
+ """
35
+ linear schedule, proposed in original ddpm paper
36
+ """
37
+ scale = 1000 / timesteps
38
+ beta_start = scale * 0.0001
39
+ beta_end = scale * 0.02
40
+ return torch.linspace(beta_start, beta_end, timesteps, dtype=torch.float64)
41
+
42
+
43
+ def cosine_beta_schedule(timesteps, s=0.008):
44
+ """
45
+ cosine schedule
46
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
47
+ """
48
+ steps = timesteps + 1
49
+ t = torch.linspace(0, timesteps, steps, dtype=torch.float64) / timesteps
50
+ alphas_cumprod = torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** 2
51
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
52
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
53
+ return torch.clip(betas, 0, 0.999)
54
+
55
+
56
+
57
+ def sigmoid_beta_schedule(timesteps, start=-3, end=3, tau=1, clamp_min=1e-5):
58
+ """
59
+ sigmoid schedule
60
+ proposed in https://arxiv.org/abs/2212.11972 - Figure 8
61
+ better for images > 64x64, when used during training
62
+ """
63
+ steps = timesteps + 1
64
+ t = torch.linspace(0, timesteps, steps, dtype=torch.float64) / timesteps
65
+ v_start = torch.tensor(start / tau).sigmoid()
66
+ v_end = torch.tensor(end / tau).sigmoid()
67
+ alphas_cumprod = (-((t * (end - start) + start) / tau).sigmoid() + v_end) / (v_end - v_start)
68
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
69
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
70
+ return torch.clip(betas, 0, 0.999)
71
+
72
+
73
+ ACTION_KEYS = [
74
+ "inventory",
75
+ "ESC",
76
+ "hotbar.1",
77
+ "hotbar.2",
78
+ "hotbar.3",
79
+ "hotbar.4",
80
+ "hotbar.5",
81
+ "hotbar.6",
82
+ "hotbar.7",
83
+ "hotbar.8",
84
+ "hotbar.9",
85
+ "forward",
86
+ "back",
87
+ "left",
88
+ "right",
89
+ "cameraX",
90
+ "cameraY",
91
+ "jump",
92
+ "sneak",
93
+ "sprint",
94
+ "swapHands",
95
+ "attack",
96
+ "use",
97
+ "pickItem",
98
+ "drop",
99
+ ]
100
+
101
+
102
+ def one_hot_actions(actions: Sequence[Mapping[str, int]]) -> torch.Tensor:
103
+ actions_one_hot = torch.zeros(len(actions), len(ACTION_KEYS))
104
+ for i, current_actions in enumerate(actions):
105
+ for j, action_key in enumerate(ACTION_KEYS):
106
+ if action_key.startswith("camera"):
107
+ if action_key == "cameraX":
108
+ value = current_actions["camera"][0]
109
+ elif action_key == "cameraY":
110
+ value = current_actions["camera"][1]
111
+ else:
112
+ raise ValueError(f"Unknown camera action key: {action_key}")
113
+ max_val = 20
114
+ bin_size = 0.5
115
+ num_buckets = int(max_val / bin_size)
116
+ value = (value - num_buckets) / num_buckets
117
+ assert -1 - 1e-3 <= value <= 1 + 1e-3, f"Camera action value must be in [-1, 1], got {value}"
118
+ else:
119
+ value = current_actions[action_key]
120
+ assert 0 <= value <= 1, f"Action value must be in [0, 1] got {value}"
121
+ actions_one_hot[i, j] = value
122
+
123
+ return actions_one_hot
124
+
125
+
126
+ IMAGE_EXTENSIONS = {"png", "jpg", "jpeg"}
127
+ VIDEO_EXTENSIONS = {"mp4"}
128
+
129
+
130
+ def load_prompt(path, video_offset=None, n_prompt_frames=1):
131
+ if path.lower().split(".")[-1] in IMAGE_EXTENSIONS:
132
+ print("prompt is image; ignoring video_offset and n_prompt_frames")
133
+ prompt = read_image(path)
134
+ # add frame dimension
135
+ prompt = rearrange(prompt, "c h w -> 1 c h w")
136
+ elif path.lower().split(".")[-1] in VIDEO_EXTENSIONS:
137
+ prompt = read_video(path, pts_unit="sec")[0]
138
+ if video_offset is not None:
139
+ prompt = prompt[video_offset:]
140
+ prompt = prompt[:n_prompt_frames]
141
+ else:
142
+ raise ValueError(f"unrecognized prompt file extension; expected one in {IMAGE_EXTENSIONS} or {VIDEO_EXTENSIONS}")
143
+ assert prompt.shape[0] == n_prompt_frames, f"input prompt {path} had less than n_prompt_frames={n_prompt_frames} frames"
144
+ prompt = resize(prompt, (360, 640))
145
+ # add batch dimension
146
+ prompt = rearrange(prompt, "t c h w -> 1 t c h w")
147
+ prompt = prompt.float() / 255.0
148
+ return prompt
149
+
150
+
151
+ def load_actions(path, action_offset=None):
152
+ if path.endswith(".actions.pt"):
153
+ actions = one_hot_actions(torch.load(path))
154
+ elif path.endswith(".one_hot_actions.pt"):
155
+ actions = torch.load(path, weights_only=True)
156
+ else:
157
+ raise ValueError("unrecognized action file extension; expected '*.actions.pt' or '*.one_hot_actions.pt'")
158
+ if action_offset is not None:
159
+ actions = actions[action_offset:]
160
+ actions = torch.cat([torch.zeros_like(actions[:1]), actions], dim=0)
161
+ # add batch dimension
162
+ actions = rearrange(actions, "t d -> 1 t d")
163
+ return actions
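Buffers referenced in `diffusion.py` above (such as `sqrt_alphas_cumprod`) are conventionally derived from a beta schedule like the ones defined here, and `extract` broadcasts per-frame coefficients against frame-first latents. A minimal, illustrative sketch:

import torch
from algorithms.worldmem.models.utils import cosine_beta_schedule, extract

betas = cosine_beta_schedule(timesteps=1000)                    # (1000,), float64
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
sqrt_alphas_cumprod = alphas_cumprod.sqrt()                     # as used by q_sample / predict_v

t = torch.randint(0, 1000, (4, 2))                              # (frames, batch) noise levels
coef = extract(sqrt_alphas_cumprod, t, (4, 2, 16, 18, 32))      # -> (4, 2, 1, 1, 1)
print(coef.shape)                                               # broadcasts against (T, B, C, H, W) latents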
algorithms/worldmem/models/vae.py ADDED
@@ -0,0 +1,359 @@
1
+ """
2
+ References:
3
+ - VQGAN: https://github.com/CompVis/taming-transformers
4
+ - MAE: https://github.com/facebookresearch/mae
5
+ """
6
+
7
+ import numpy as np
8
+ import math
9
+ import functools
10
+ from collections import namedtuple
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from einops import rearrange
15
+ from timm.models.vision_transformer import Mlp
16
+ from timm.layers.helpers import to_2tuple
17
+ from rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb
18
+ from .dit import PatchEmbed
19
+
20
+
21
+ class DiagonalGaussianDistribution(object):
22
+ def __init__(self, parameters, deterministic=False, dim=1):
23
+ self.parameters = parameters
24
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
25
+ if dim == 1:
26
+ self.dims = [1, 2, 3]
27
+ elif dim == 2:
28
+ self.dims = [1, 2]
29
+ else:
30
+ raise NotImplementedError
31
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
32
+ self.deterministic = deterministic
33
+ self.std = torch.exp(0.5 * self.logvar)
34
+ self.var = torch.exp(self.logvar)
35
+ if self.deterministic:
36
+ self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
37
+
38
+ def sample(self):
39
+ x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
40
+ return x
41
+
42
+ def mode(self):
43
+ return self.mean
44
+
45
+
46
+ class Attention(nn.Module):
47
+ def __init__(
48
+ self,
49
+ dim,
50
+ num_heads,
51
+ frame_height,
52
+ frame_width,
53
+ qkv_bias=False,
54
+ ):
55
+ super().__init__()
56
+ self.num_heads = num_heads
57
+ head_dim = dim // num_heads
58
+ self.frame_height = frame_height
59
+ self.frame_width = frame_width
60
+
61
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
62
+ self.proj = nn.Linear(dim, dim)
63
+
64
+ rotary_freqs = RotaryEmbedding(
65
+ dim=head_dim // 4,
66
+ freqs_for="pixel",
67
+ max_freq=frame_height * frame_width,
68
+ ).get_axial_freqs(frame_height, frame_width)
69
+ self.register_buffer("rotary_freqs", rotary_freqs, persistent=False)
70
+
71
+ def forward(self, x):
72
+ B, N, C = x.shape
73
+ assert N == self.frame_height * self.frame_width
74
+
75
+ q, k, v = self.qkv(x).chunk(3, dim=-1)
76
+
77
+ q = rearrange(
78
+ q,
79
+ "b (H W) (h d) -> b h H W d",
80
+ H=self.frame_height,
81
+ W=self.frame_width,
82
+ h=self.num_heads,
83
+ )
84
+ k = rearrange(
85
+ k,
86
+ "b (H W) (h d) -> b h H W d",
87
+ H=self.frame_height,
88
+ W=self.frame_width,
89
+ h=self.num_heads,
90
+ )
91
+ v = rearrange(
92
+ v,
93
+ "b (H W) (h d) -> b h H W d",
94
+ H=self.frame_height,
95
+ W=self.frame_width,
96
+ h=self.num_heads,
97
+ )
98
+
99
+ q = apply_rotary_emb(self.rotary_freqs, q)
100
+ k = apply_rotary_emb(self.rotary_freqs, k)
101
+
102
+ q = rearrange(q, "b h H W d -> b h (H W) d")
103
+ k = rearrange(k, "b h H W d -> b h (H W) d")
104
+ v = rearrange(v, "b h H W d -> b h (H W) d")
105
+
106
+ x = F.scaled_dot_product_attention(q, k, v)
107
+ x = rearrange(x, "b h N d -> b N (h d)")
108
+
109
+ x = self.proj(x)
110
+ return x
111
+
112
+
113
+ class AttentionBlock(nn.Module):
114
+ def __init__(
115
+ self,
116
+ dim,
117
+ num_heads,
118
+ frame_height,
119
+ frame_width,
120
+ mlp_ratio=4.0,
121
+ qkv_bias=False,
122
+ attn_causal=False,
123
+ act_layer=nn.GELU,
124
+ norm_layer=nn.LayerNorm,
125
+ ):
126
+ super().__init__()
127
+ self.norm1 = norm_layer(dim)
128
+ self.attn = Attention(
129
+ dim,
130
+ num_heads,
131
+ frame_height,
132
+ frame_width,
133
+ qkv_bias=qkv_bias,
134
+ )
135
+ self.norm2 = norm_layer(dim)
136
+ mlp_hidden_dim = int(dim * mlp_ratio)
137
+ self.mlp = Mlp(
138
+ in_features=dim,
139
+ hidden_features=mlp_hidden_dim,
140
+ act_layer=act_layer,
141
+ )
142
+
143
+ def forward(self, x):
144
+ x = x + self.attn(self.norm1(x))
145
+ x = x + self.mlp(self.norm2(x))
146
+ return x
147
+
148
+
149
+ class AutoencoderKL(nn.Module):
150
+ def __init__(
151
+ self,
152
+ latent_dim,
153
+ input_height=256,
154
+ input_width=256,
155
+ patch_size=16,
156
+ enc_dim=768,
157
+ enc_depth=6,
158
+ enc_heads=12,
159
+ dec_dim=768,
160
+ dec_depth=6,
161
+ dec_heads=12,
162
+ mlp_ratio=4.0,
163
+ norm_layer=functools.partial(nn.LayerNorm, eps=1e-6),
164
+ use_variational=True,
165
+ **kwargs,
166
+ ):
167
+ super().__init__()
168
+ self.input_height = input_height
169
+ self.input_width = input_width
170
+ self.patch_size = patch_size
171
+ self.seq_h = input_height // patch_size
172
+ self.seq_w = input_width // patch_size
173
+ self.seq_len = self.seq_h * self.seq_w
174
+ self.patch_dim = 3 * patch_size**2
175
+
176
+ self.latent_dim = latent_dim
177
+ self.enc_dim = enc_dim
178
+ self.dec_dim = dec_dim
179
+
180
+ # patch
181
+ self.patch_embed = PatchEmbed(input_height, input_width, patch_size, 3, enc_dim)
182
+
183
+ # encoder
184
+ self.encoder = nn.ModuleList(
185
+ [
186
+ AttentionBlock(
187
+ enc_dim,
188
+ enc_heads,
189
+ self.seq_h,
190
+ self.seq_w,
191
+ mlp_ratio,
192
+ qkv_bias=True,
193
+ norm_layer=norm_layer,
194
+ )
195
+ for i in range(enc_depth)
196
+ ]
197
+ )
198
+ self.enc_norm = norm_layer(enc_dim)
199
+
200
+ # bottleneck
201
+ self.use_variational = use_variational
202
+ mult = 2 if self.use_variational else 1
203
+ self.quant_conv = nn.Linear(enc_dim, mult * latent_dim)
204
+ self.post_quant_conv = nn.Linear(latent_dim, dec_dim)
205
+
206
+ # decoder
207
+ self.decoder = nn.ModuleList(
208
+ [
209
+ AttentionBlock(
210
+ dec_dim,
211
+ dec_heads,
212
+ self.seq_h,
213
+ self.seq_w,
214
+ mlp_ratio,
215
+ qkv_bias=True,
216
+ norm_layer=norm_layer,
217
+ )
218
+ for i in range(dec_depth)
219
+ ]
220
+ )
221
+ self.dec_norm = norm_layer(dec_dim)
222
+ self.predictor = nn.Linear(dec_dim, self.patch_dim) # decoder to patch
223
+
224
+ # initialize this weight first
225
+ self.initialize_weights()
226
+
227
+ def initialize_weights(self):
228
+ # initialization
229
+ # initialize nn.Linear and nn.LayerNorm
230
+ self.apply(self._init_weights)
231
+
232
+ # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
233
+ w = self.patch_embed.proj.weight.data
234
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
235
+
236
+ def _init_weights(self, m):
237
+ if isinstance(m, nn.Linear):
238
+ # we use xavier_uniform following official JAX ViT:
239
+ nn.init.xavier_uniform_(m.weight)
240
+ if isinstance(m, nn.Linear) and m.bias is not None:
241
+ nn.init.constant_(m.bias, 0.0)
242
+ elif isinstance(m, nn.LayerNorm):
243
+ nn.init.constant_(m.bias, 0.0)
244
+ nn.init.constant_(m.weight, 1.0)
245
+
246
+ def patchify(self, x):
247
+ # patchify
248
+ bsz, _, h, w = x.shape
249
+ x = x.reshape(
250
+ bsz,
251
+ 3,
252
+ self.seq_h,
253
+ self.patch_size,
254
+ self.seq_w,
255
+ self.patch_size,
256
+ ).permute([0, 1, 3, 5, 2, 4]) # [b, c, h, p, w, p] --> [b, c, p, p, h, w]
257
+ x = x.reshape(bsz, self.patch_dim, self.seq_h, self.seq_w) # --> [b, cxpxp, h, w]
258
+ x = x.permute([0, 2, 3, 1]).reshape(bsz, self.seq_len, self.patch_dim) # --> [b, hxw, cxpxp]
259
+ return x
260
+
261
+ def unpatchify(self, x):
262
+ bsz = x.shape[0]
263
+ # unpatchify
264
+ x = x.reshape(bsz, self.seq_h, self.seq_w, self.patch_dim).permute([0, 3, 1, 2]) # [b, h, w, cxpxp] --> [b, cxpxp, h, w]
265
+ x = x.reshape(
266
+ bsz,
267
+ 3,
268
+ self.patch_size,
269
+ self.patch_size,
270
+ self.seq_h,
271
+ self.seq_w,
272
+ ).permute([0, 1, 4, 2, 5, 3]) # [b, c, p, p, h, w] --> [b, c, h, p, w, p]
273
+ x = x.reshape(
274
+ bsz,
275
+ 3,
276
+ self.input_height,
277
+ self.input_width,
278
+ ) # [b, c, hxp, wxp]
279
+ return x
280
+
281
+ def encode(self, x):
282
+ # patchify
283
+ x = self.patch_embed(x)
284
+
285
+ # encoder
286
+ for blk in self.encoder:
287
+ x = blk(x)
288
+ x = self.enc_norm(x)
289
+
290
+ # bottleneck
291
+ moments = self.quant_conv(x)
292
+ if not self.use_variational:
293
+ moments = torch.cat((moments, torch.zeros_like(moments)), 2)
294
+ posterior = DiagonalGaussianDistribution(moments, deterministic=(not self.use_variational), dim=2)
295
+ return posterior
296
+
297
+ def decode(self, z):
298
+ # bottleneck
299
+ z = self.post_quant_conv(z)
300
+
301
+ # decoder
302
+ for blk in self.decoder:
303
+ z = blk(z)
304
+ z = self.dec_norm(z)
305
+
306
+ # predictor
307
+ z = self.predictor(z)
308
+
309
+ # unpatchify
310
+ dec = self.unpatchify(z)
311
+ return dec
312
+
313
+ def autoencode(self, input, sample_posterior=True):
314
+ posterior = self.encode(input)
315
+ if self.use_variational and sample_posterior:
316
+ z = posterior.sample()
317
+ else:
318
+ z = posterior.mode()
319
+ dec = self.decode(z)
320
+ return dec, posterior, z
321
+
322
+ def get_input(self, batch, k):
323
+ x = batch[k]
324
+ if len(x.shape) == 3:
325
+ x = x[..., None]
326
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
327
+ return x
328
+
329
+ def forward(self, inputs, labels, split="train"):
330
+ rec, post, latent = self.autoencode(inputs)
331
+ return rec, post, latent
332
+
333
+ def get_last_layer(self):
334
+ return self.predictor.weight
335
+
336
+
337
+ def ViT_L_20_Shallow_Encoder(**kwargs):
338
+ if "latent_dim" in kwargs:
339
+ latent_dim = kwargs.pop("latent_dim")
340
+ else:
341
+ latent_dim = 16
342
+ return AutoencoderKL(
343
+ latent_dim=latent_dim,
344
+ patch_size=20,
345
+ enc_dim=1024,
346
+ enc_depth=6,
347
+ enc_heads=16,
348
+ dec_dim=1024,
349
+ dec_depth=12,
350
+ dec_heads=16,
351
+ input_height=360,
352
+ input_width=640,
353
+ **kwargs,
354
+ )
355
+
356
+
357
+ VAE_models = {
358
+ "vit-l-20-shallow-encoder": ViT_L_20_Shallow_Encoder,
359
+ }
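
Note (not part of this commit): a minimal shape-level sketch of exercising the VAE registered above, assuming the file is importable as algorithms.worldmem.models.vae. Weights are randomly initialized here, so it only checks tensor shapes, not reconstruction quality.

import torch
from algorithms.worldmem.models.vae import VAE_models

vae = VAE_models["vit-l-20-shallow-encoder"](latent_dim=16).eval()
img = torch.rand(1, 3, 360, 640)            # one RGB frame in [0, 1]
with torch.no_grad():
    posterior = vae.encode(img * 2 - 1)     # DiagonalGaussianDistribution over patch latents
    z = posterior.mean                      # (1, 18*32, 16): one latent per 20x20 patch
    rec = vae.decode(z)                     # (1, 3, 360, 640)
print(z.shape, rec.shape)
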
algorithms/worldmem/pose_prediction.py ADDED
@@ -0,0 +1,374 @@
1
+ from omegaconf import DictConfig
2
+ import torch
3
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
4
+ from algorithms.common.metrics import (
5
+ FrechetInceptionDistance,
6
+ LearnedPerceptualImagePatchSimilarity,
7
+ FrechetVideoDistance,
8
+ )
9
+ from .df_base import DiffusionForcingBase
10
+ from utils.logging_utils import log_video, get_validation_metrics_for_videos
11
+ from .models.vae import VAE_models
12
+ from .models.dit import DiT_models
13
+ from einops import rearrange
14
+ from torch import autocast
15
+ import numpy as np
16
+ from tqdm import tqdm
17
+ import torch.nn.functional as F
18
+ from .models.pose_prediction import PosePredictionNet
19
+ import torchvision.transforms.functional as TF
20
+ import random
21
+ from torchvision.transforms import InterpolationMode
22
+ from PIL import Image
23
+ import math
24
+ from packaging import version as pver
25
+ import torch.distributed as dist
26
+ import matplotlib.pyplot as plt
27
+
28
+ import torch
29
+ import math
30
+ import wandb
31
+
32
+ import torch.nn as nn
33
+ from algorithms.common.base_pytorch_algo import BasePytorchAlgo
34
+
35
+ class PosePrediction(BasePytorchAlgo):
36
+
37
+ def __init__(self, cfg: DictConfig):
38
+
39
+ super().__init__(cfg)
40
+
41
+ def _build_model(self):
42
+ self.pose_prediction_model = PosePredictionNet()
43
+ vae = VAE_models["vit-l-20-shallow-encoder"]()
44
+ self.vae = vae.eval()
45
+
46
+ def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
47
+ xs, conditions, pose_conditions= batch
48
+ pose_conditions[:,:,3:] = pose_conditions[:,:,3:] // 15
49
+ xs = self.encode(xs)
50
+
51
+ b,f,c,h,w = xs.shape
52
+ xs = xs[:,:-1].reshape(-1, c, h, w)
53
+ conditions = conditions[:,1:].reshape(-1, 25)
54
+ offset_gt = pose_conditions[:,1:] - pose_conditions[:,:-1]
55
+ pose_conditions = pose_conditions[:,:-1].reshape(-1, 5)
56
+ offset_gt = offset_gt.reshape(-1, 5)
57
+ offset_gt[:, 3][offset_gt[:, 3]==23] = -1
58
+ offset_gt[:, 3][offset_gt[:, 3]==-23] = 1
59
+ offset_gt[:, 4][offset_gt[:, 4]==23] = -1
60
+ offset_gt[:, 4][offset_gt[:, 4]==-23] = 1
61
+
62
+ offset_pred = self.pose_prediction_model(xs, conditions, pose_conditions)
63
+ criterion = nn.MSELoss()
64
+ loss = criterion(offset_pred, offset_gt)
65
+ if batch_idx % 200 == 0:
66
+ self.log("training/loss", loss.cpu())
67
+ output_dict = {
68
+ "loss": loss}
69
+ return output_dict
70
+
71
+ def encode(self, x):
72
+ # vae encoding
73
+ B = x.shape[1]
74
+ T = x.shape[0]
75
+ H, W = x.shape[-2:]
76
+ scaling_factor = 0.07843137255
77
+
78
+ x = rearrange(x, "t b c h w -> (t b) c h w")
79
+ with torch.no_grad():
80
+ with autocast("cuda", dtype=torch.half):
81
+ x = self.vae.encode(x * 2 - 1).mean * scaling_factor
82
+ x = rearrange(x, "(t b) (h w) c -> t b c h w", t=T, h=H // self.vae.patch_size, w=W // self.vae.patch_size)
83
+ # x = x[:, :n_prompt_frames]
84
+ return x
85
+
86
+ def decode(self, x):
87
+ total_frames = x.shape[0]
88
+ scaling_factor = 0.07843137255
89
+ x = rearrange(x, "t b c h w -> (t b) (h w) c")
90
+ with torch.no_grad():
91
+ with autocast("cuda", dtype=torch.half):
92
+ x = (self.vae.decode(x / scaling_factor) + 1) / 2
93
+
94
+ x = rearrange(x, "(t b) c h w-> t b c h w", t=total_frames)
95
+ return x
96
+
97
+ def validation_step(self, batch, batch_idx, namespace="validation") -> STEP_OUTPUT:
98
+ xs, conditions, pose_conditions= batch
99
+ pose_conditions[:,:,3:] = pose_conditions[:,:,3:] // 15
100
+ xs = self.encode(xs)
101
+
102
+ b,f,c,h,w = xs.shape
103
+ xs = xs[:,:-1].reshape(-1, c, h, w)
104
+ conditions = conditions[:,1:].reshape(-1, 25)
105
+ offset_gt = pose_conditions[:,1:] - pose_conditions[:,:-1]
106
+ pose_conditions = pose_conditions[:,:-1].reshape(-1, 5)
107
+ offset_gt = offset_gt.reshape(-1, 5)
108
+ offset_gt[:, 3][offset_gt[:, 3]==23] = -1
109
+ offset_gt[:, 3][offset_gt[:, 3]==-23] = 1
110
+ offset_gt[:, 4][offset_gt[:, 4]==23] = -1
111
+ offset_gt[:, 4][offset_gt[:, 4]==-23] = 1
112
+
113
+ offset_pred = self.pose_prediction_model(xs, conditions, pose_conditions)
114
+
115
+ criterion = nn.MSELoss()
116
+ loss = criterion(offset_pred, offset_gt)
117
+
118
+ if batch_idx % 200 == 0:
119
+ self.log("validation/loss", loss.cpu())
120
+ output_dict = {
121
+ "loss": loss}
122
+ return
123
+
124
+ @torch.no_grad()
125
+ def interactive(self, batch, context_frames, device):
126
+ with torch.cuda.amp.autocast():
127
+ condition_similar_length = self.condition_similar_length
128
+ # xs_raw, conditions, pose_conditions, c2w_mat, masks, frame_idx = self._preprocess_batch(batch)
129
+
130
+ first_frame, new_conditions, new_pose_conditions, new_c2w_mat, new_frame_idx = batch
131
+
132
+ if self.frames is None:
133
+ first_frame_encode = self.encode(first_frame[None, None].to(device))
134
+ self.frames = first_frame_encode.to(device)
135
+ self.actions = new_conditions[None, None].to(device)
136
+ self.poses = new_pose_conditions[None, None].to(device)
137
+ self.memory_c2w = new_c2w_mat[None, None].to(device)
138
+ self.frame_idx = torch.tensor([[new_frame_idx]]).to(device)
139
+ return first_frame
140
+ else:
141
+ self.actions = torch.cat([self.actions, new_conditions[None, None].to(device)])
142
+ self.poses = torch.cat([self.poses, new_pose_conditions[None, None].to(device)])
143
+ self.memory_c2w = torch.cat([self.memory_c2w, new_c2w_mat[None, None].to(device)])
144
+ self.frame_idx = torch.cat([self.frame_idx, torch.tensor([[new_frame_idx]]).to(device)])
145
+
146
+ conditions = self.actions.clone()
147
+ pose_conditions = self.poses.clone()
148
+ c2w_mat = self.memory_c2w.clone()
149
+ frame_idx = self.frame_idx.clone()
150
+
151
+
152
+ curr_frame = 0
153
+ horizon = 1
154
+ batch_size = 1
155
+ n_frames = curr_frame + horizon
156
+ # context
157
+ n_context_frames = context_frames // self.frame_stack
158
+ xs_pred = self.frames[:n_context_frames].clone()
159
+ curr_frame += n_context_frames
160
+
161
+ pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
162
+
163
+ # generation on frame
164
+ scheduling_matrix = self._generate_scheduling_matrix(horizon)
165
+ chunk = torch.randn((horizon, batch_size, *xs_pred.shape[2:])).to(xs_pred.device)
166
+ chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise)
167
+
168
+ xs_pred = torch.cat([xs_pred, chunk], 0)
169
+
170
+ # sliding window: only input the last n_tokens frames
171
+ start_frame = max(0, curr_frame + horizon - self.n_tokens)
172
+
173
+ pbar.set_postfix(
174
+ {
175
+ "start": start_frame,
176
+ "end": curr_frame + horizon,
177
+ }
178
+ )
179
+
180
+ if condition_similar_length:
181
+
182
+ if curr_frame < condition_similar_length:
183
+ random_idx = [i for i in range(curr_frame)] + [0] * (condition_similar_length-curr_frame)
184
+ random_idx = np.repeat(np.array(random_idx)[:,None], xs_pred.shape[1], -1)
185
+ else:
186
+ num_samples = 10000
187
+ radius = 30
188
+ samples = torch.rand((num_samples, 1), device=pose_conditions.device)
189
+ angles = 2 * np.pi * torch.rand((num_samples,), device=pose_conditions.device)
190
+ # points = radius * torch.sqrt(samples) * torch.stack((torch.cos(angles), torch.sin(angles)), dim=1)
191
+
192
+ points = generate_points_in_sphere(num_samples, radius).to(pose_conditions.device)
193
+ points = points[:, None].repeat(1, pose_conditions.shape[1], 1)
194
+ points += pose_conditions[curr_frame, :, :3][None]
195
+ fov_half_h = torch.tensor(105/2, device=pose_conditions.device)
196
+ fov_half_v = torch.tensor(75/2, device=pose_conditions.device)
197
+ # in_fov1 = is_inside_fov(points, pose_conditions[curr_frame, :, [0, 2]], pose_conditions[curr_frame, :, -1], fov_half)
198
+
199
+ in_fov1 = is_inside_fov_3d_hv(points, pose_conditions[curr_frame, :, :3],
200
+ pose_conditions[curr_frame, :, -2], pose_conditions[curr_frame, :, -1],
201
+ fov_half_h, fov_half_v)
202
+
203
+ in_fov_list = []
204
+ for pc in pose_conditions[:curr_frame]:
205
+ in_fov_list.append(is_inside_fov_3d_hv(points, pc[:, :3], pc[:, -2], pc[:, -1],
206
+ fov_half_h, fov_half_v))
207
+
208
+ in_fov_list = torch.stack(in_fov_list)
209
+ # v3
210
+ random_idx = []
211
+
212
+ for csl in range(self.condition_similar_length // 2):
213
+ overlap_ratio = ((in_fov1[None].bool() & in_fov_list).sum(1))/in_fov1.sum()
214
+ # mask = distance > (in_fov1.bool().sum(0) / 4)
215
+ #_, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
216
+
217
+ # if csl > self.condition_similar_length:
218
+ # _, r_idx = torch.topk(overlap_ratio, k=1, dim=0)
219
+ # else:
220
+ # _, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
221
+
222
+ _, r_idx = torch.topk(overlap_ratio, k=1, dim=0)
223
+ # _, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
224
+
225
+ # if curr_frame >=93:
226
+ # import pdb;pdb.set_trace()
227
+
228
+ # start_time = time.time()
229
+ cos_sim = F.cosine_similarity(xs_pred.to(r_idx.device)[r_idx[:, range(in_fov1.shape[1])],
230
+ range(in_fov1.shape[1])], xs_pred.to(r_idx.device)[:curr_frame], dim=2)
231
+ cos_sim = cos_sim.mean((-2,-1))
232
+
233
+ mask_sim = cos_sim>0.9
234
+ in_fov_list = in_fov_list & ~mask_sim[:,None].to(in_fov_list.device)
235
+
236
+ random_idx.append(r_idx)
237
+
238
+ for bi in range(conditions.shape[1]):
239
+ if len(torch.nonzero(conditions[:,bi,24] == 1))==0:
240
+ pass
241
+ else:
242
+ last_idx = torch.nonzero(conditions[:,bi,24] == 1)[-1]
243
+ in_fov_list[:last_idx,:,bi] = False
244
+
245
+ for csl in range(self.condition_similar_length // 2):
246
+ overlap_ratio = ((in_fov1[None].bool() & in_fov_list).sum(1))/in_fov1.sum()
247
+ # mask = distance > (in_fov1.bool().sum(0) / 4)
248
+ #_, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
249
+
250
+ # if csl > self.condition_similar_length:
251
+ # _, r_idx = torch.topk(overlap_ratio, k=1, dim=0)
252
+ # else:
253
+ # _, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
254
+
255
+ _, r_idx = torch.topk(overlap_ratio, k=1, dim=0)
256
+ # _, r_idx = torch.topk(overlap_ratio / tensor_max_with_number((frame_idx[curr_frame] - frame_idx[:curr_frame]), 10), k=1, dim=0)
257
+
258
+ # if curr_frame >=93:
259
+ # import pdb;pdb.set_trace()
260
+
261
+ # start_time = time.time()
262
+ cos_sim = F.cosine_similarity(xs_pred.to(r_idx.device)[r_idx[:, range(in_fov1.shape[1])],
263
+ range(in_fov1.shape[1])], xs_pred.to(r_idx.device)[:curr_frame], dim=2)
264
+ cos_sim = cos_sim.mean((-2,-1))
265
+
266
+ mask_sim = cos_sim>0.9
267
+ in_fov_list = in_fov_list & ~mask_sim[:,None].to(in_fov_list.device)
268
+
269
+ random_idx.append(r_idx)
270
+
271
+ random_idx = torch.cat(random_idx).cpu()
272
+ condition_similar_length = len(random_idx)
273
+
274
+ xs_pred = torch.cat([xs_pred, xs_pred[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])].clone()], 0)
275
+
276
+ if condition_similar_length:
277
+ # import pdb;pdb.set_trace()
278
+ padding = torch.zeros((condition_similar_length,) + conditions.shape[1:], device=conditions.device, dtype=conditions.dtype)
279
+ input_condition = torch.cat([conditions[start_frame : curr_frame + horizon], padding], dim=0)
280
+ if self.pose_cond_dim:
281
+ # if not self.use_plucker:
282
+ input_pose_condition = torch.cat([pose_conditions[start_frame : curr_frame + horizon], pose_conditions[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]], dim=0).clone()
283
+
284
+ if self.use_plucker:
285
+ if self.all_zero_frame:
286
+ frame_idx_list = []
287
+ input_pose_condition = []
288
+ for i in range(start_frame, curr_frame + horizon):
289
+ input_pose_condition.append(convert_to_plucker(torch.cat([c2w_mat[i:i+1],c2w_mat[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]]).clone(), 0, focal_length=self.focal_length, is_old_setting=self.old_setting).to(xs_pred.dtype))
290
+ frame_idx_list.append(torch.cat([frame_idx[i:i+1]-frame_idx[i:i+1], frame_idx[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]-frame_idx[i:i+1]]))
291
+ input_pose_condition = torch.cat(input_pose_condition)
292
+ frame_idx_list = torch.cat(frame_idx_list)
293
+
294
+ # print(frame_idx_list[:,0])
295
+ else:
296
+ # print(curr_frame-start_frame)
297
+ # input_pose_condition = torch.cat([c2w_mat[start_frame : curr_frame + horizon], c2w_mat[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]], dim=0).clone()
298
+ # import pdb;pdb.set_trace()
299
+ if self.last_frame_refer:
300
+ input_pose_condition = torch.cat([c2w_mat[start_frame : curr_frame + horizon], c2w_mat[-1:]], dim=0).clone()
301
+ else:
302
+ input_pose_condition = torch.cat([c2w_mat[start_frame : curr_frame + horizon], c2w_mat[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]], dim=0).clone()
303
+
304
+ if self.zero_curr:
305
+ # print("="*50)
306
+ input_pose_condition = convert_to_plucker(input_pose_condition, curr_frame-start_frame, focal_length=self.focal_length, is_old_setting=self.old_setting)
307
+ # input_pose_condition[:curr_frame-start_frame] = input_pose_condition[curr_frame-start_frame:curr_frame-start_frame+1]
308
+ # input_pose_condition = convert_to_plucker(input_pose_condition, -self.condition_similar_length-1, focal_length=self.focal_length)
309
+ else:
310
+ input_pose_condition = convert_to_plucker(input_pose_condition, -condition_similar_length, focal_length=self.focal_length, is_old_setting=self.old_setting)
311
+ frame_idx_list = None
312
+ else:
313
+ input_pose_condition = torch.cat([pose_conditions[start_frame : curr_frame + horizon], pose_conditions[random_idx[:,range(xs_pred.shape[1])], range(xs_pred.shape[1])]], dim=0).clone()
314
+ frame_idx_list = None
315
+ else:
316
+ input_condition = conditions[start_frame : curr_frame + horizon]
317
+ input_pose_condition = None
318
+ frame_idx_list = None
319
+
320
+ for m in range(scheduling_matrix.shape[0] - 1):
321
+ from_noise_levels = np.concatenate((np.zeros((curr_frame,), dtype=np.int64), scheduling_matrix[m]))[
322
+ :, None
323
+ ].repeat(batch_size, axis=1)
324
+ to_noise_levels = np.concatenate(
325
+ (
326
+ np.zeros((curr_frame,), dtype=np.int64),
327
+ scheduling_matrix[m + 1],
328
+ )
329
+ )[
330
+ :, None
331
+ ].repeat(batch_size, axis=1)
332
+
333
+ if condition_similar_length:
334
+ from_noise_levels = np.concatenate([from_noise_levels, np.zeros((condition_similar_length,from_noise_levels.shape[-1]), dtype=np.int32)], axis=0)
335
+ to_noise_levels = np.concatenate([to_noise_levels, np.zeros((condition_similar_length,from_noise_levels.shape[-1]), dtype=np.int32)], axis=0)
336
+
337
+ from_noise_levels = torch.from_numpy(from_noise_levels).to(self.device)
338
+ to_noise_levels = torch.from_numpy(to_noise_levels).to(self.device)
339
+
340
+
341
+ if input_pose_condition is not None:
342
+ input_pose_condition = input_pose_condition.to(xs_pred.dtype)
343
+
344
+ xs_pred[start_frame:] = self.diffusion_model.sample_step(
345
+ xs_pred[start_frame:],
346
+ input_condition,
347
+ input_pose_condition,
348
+ from_noise_levels[start_frame:],
349
+ to_noise_levels[start_frame:],
350
+ current_frame=curr_frame,
351
+ mode="validation",
352
+ reference_length=condition_similar_length,
353
+ frame_idx=frame_idx_list
354
+ )
355
+
356
+ # if curr_frame > 14:
357
+ # import pdb;pdb.set_trace()
358
+
359
+ # if xs_pred_back is not None:
360
+ # xs_pred = torch.cat([xs_pred[:6], xs_pred_back[6:12], xs_pred[6:]], dim=0)
361
+
362
+ # import pdb;pdb.set_trace()
363
+ if condition_similar_length: # and curr_frame+1!=n_frames:
364
+ xs_pred = xs_pred[:-condition_similar_length]
365
+
366
+ curr_frame += horizon
367
+ pbar.update(horizon)
368
+
369
+ self.frames = torch.cat([self.frames, xs_pred[n_context_frames:]])
370
+
371
+ xs_pred = self.decode(xs_pred[n_context_frames:])
372
+
373
+ return xs_pred[-1,0].cpu()
374
+
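
Note (not part of this commit): in training_step above, yaw/pitch are bucketed by the integer division by 15, so (assuming raw angles in degrees in [0, 360)) they live in 24 bins, and a raw per-frame delta of ±23 bins is a wrap-around of ∓1 bin — which is what the ==23 / ==-23 replacements encode. A tiny standalone check with made-up values:

import torch

prev_bin = torch.tensor([23., 0., 11.])   # e.g. yaw bins of consecutive frames
next_bin = torch.tensor([0., 23., 12.])
delta = next_bin - prev_bin               # tensor([-23., 23., 1.])
delta[delta == -23] = 1                   # 345 deg -> 0 deg is really +1 bin
delta[delta == 23] = -1                   # 0 deg -> 345 deg is really -1 bin
print(delta)                              # tensor([ 1., -1., 1.])
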
app.py ADDED
@@ -0,0 +1,576 @@
1
+ import gradio as gr
2
+ import time
3
+
4
+ import sys
5
+ import subprocess
6
+ import time
7
+ from pathlib import Path
8
+
9
+ import hydra
10
+ from omegaconf import DictConfig, OmegaConf
11
+ from omegaconf.omegaconf import open_dict
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torchvision.transforms as transforms
16
+ import cv2
17
+ import subprocess
18
+ from PIL import Image
19
+ from datetime import datetime
20
+ import spaces
21
+ from algorithms.worldmem import WorldMemMinecraft
22
+ from huggingface_hub import hf_hub_download
23
+ import tempfile
24
+ import os
25
+ import requests
26
+ from huggingface_hub import model_info
27
+
28
+ from experiments.exp_base import load_custom_checkpoint
29
+
30
+ torch.set_float32_matmul_precision("high")
31
+
32
+ def download_assets_if_needed():
33
+ ASSETS_URL_BASE = "https://huggingface.co/spaces/yslan/worldmem/resolve/main/assets/examples"
34
+ ASSETS_DIR = "assets/examples"
35
+ ASSETS = ['case1.npz', 'case2.npz', 'case3.npz', 'case4.npz']
36
+
37
+ if not os.path.exists(ASSETS_DIR):
38
+ os.makedirs(ASSETS_DIR)
39
+
40
+ # Download assets if they don't exist (total 4 files)
41
+ for filename in ASSETS:
42
+ filepath = os.path.join(ASSETS_DIR, filename)
43
+ if not os.path.exists(filepath):
44
+ print(f"Downloading {filename}...")
45
+ url = f"{ASSETS_URL_BASE}/{filename}"
46
+ response = requests.get(url)
47
+ if response.status_code == 200:
48
+ with open(filepath, "wb") as f:
49
+ f.write(response.content)
50
+ else:
51
+ print(f"Failed to download {filename}: {response.status_code}")
52
+
53
+ def parse_input_to_tensor(input_str):
54
+ """
55
+ Convert an input string into a (sequence_length, 25) tensor, where each row is a one-hot representation
56
+ of the corresponding action key.
57
+
58
+ Args:
59
+ input_str (str): A string consisting of "WASD" characters (e.g., "WASDWS").
60
+
61
+ Returns:
62
+ torch.Tensor: A tensor of shape (sequence_length, 25), where each row is a one-hot encoded action.
63
+ """
64
+ # Get the length of the input sequence
65
+ seq_len = len(input_str)
66
+
67
+ # Initialize a zero tensor of shape (seq_len, 25)
68
+ action_tensor = torch.zeros((seq_len, 25))
69
+
70
+ # Iterate through the input string and update the corresponding positions
71
+ for i, char in enumerate(input_str):
72
+ action, value = KEY_TO_ACTION.get(char.upper(), (None, 0)) # Convert to uppercase for case-insensitive lookup; default avoids unpacking None for unknown keys
73
+ if action and action in ACTION_KEYS:
74
+ index = ACTION_KEYS.index(action)
75
+ action_tensor[i, index] = value # Set the corresponding action index to 1
76
+
77
+ return action_tensor
78
+
79
+ def load_image_as_tensor(image_path: str) -> torch.Tensor:
80
+ """
81
+ Load an image and convert it to a 0-1 normalized tensor.
82
+
83
+ Args:
84
+ image_path (str): Path to the image file.
85
+
86
+ Returns:
87
+ torch.Tensor: Image tensor of shape (C, H, W), normalized to [0,1].
88
+ """
89
+ if isinstance(image_path, str):
90
+ image = Image.open(image_path).convert("RGB") # Ensure it's RGB
91
+ else:
92
+ image = image_path
93
+ transform = transforms.Compose([
94
+ transforms.ToTensor(), # Converts to tensor and normalizes to [0,1]
95
+ ])
96
+ return transform(image)
97
+
98
+ def enable_amp(model, precision="16-mixed"):
99
+ original_forward = model.forward
100
+
101
+ def amp_forward(*args, **kwargs):
102
+ with torch.autocast("cuda", dtype=torch.float16 if precision == "16-mixed" else torch.bfloat16):
103
+ return original_forward(*args, **kwargs)
104
+
105
+ model.forward = amp_forward
106
+ return model
107
+
108
+ download_assets_if_needed()
109
+
110
+ ACTION_KEYS = [
111
+ "inventory",
112
+ "ESC",
113
+ "hotbar.1",
114
+ "hotbar.2",
115
+ "hotbar.3",
116
+ "hotbar.4",
117
+ "hotbar.5",
118
+ "hotbar.6",
119
+ "hotbar.7",
120
+ "hotbar.8",
121
+ "hotbar.9",
122
+ "forward",
123
+ "back",
124
+ "left",
125
+ "right",
126
+ "cameraY",
127
+ "cameraX",
128
+ "jump",
129
+ "sneak",
130
+ "sprint",
131
+ "swapHands",
132
+ "attack",
133
+ "use",
134
+ "pickItem",
135
+ "drop",
136
+ ]
137
+
138
+ # Mapping of input keys to action names
139
+ KEY_TO_ACTION = {
140
+ "Q": ("forward", 1),
141
+ "E": ("back", 1),
142
+ "W": ("cameraY", -1),
143
+ "S": ("cameraY", 1),
144
+ "A": ("cameraX", -1),
145
+ "D": ("cameraX", 1),
146
+ "U": ("drop", 1),
147
+ "N": ("noop", 1),
148
+ "1": ("hotbar.1", 1),
149
+ }
150
+
151
+ example_images = [
152
+ ["1", "assets/ice_plains.png", "turn rightgo backward→look up→turn left→look down→turn right→go forward→turn left", 20, 3, 8],
153
+ ["2", "assets/place.png", "put item→go backward→put item→go backward→go around", 20, 3, 8],
154
+ ["3", "assets/rain_sunflower_plains.png", "turn right→look up→turn right→look down→turn left→go backward→turn left", 20, 3, 8],
155
+ ["4", "assets/desert.png", "turn 360 degree→turn right→go forward→turn left", 20, 3, 8],
156
+ ]
157
+
158
+ video_frames = []
159
+ input_history = ""
160
+ ICE_PLAINS_IMAGE = "assets/ice_plains.png"
161
+ DESERT_IMAGE = "assets/desert.png"
162
+ SAVANNA_IMAGE = "assets/savanna.png"
163
+ PLAINS_IMAGE = "assets/plains.png"
164
+ PLACE_IMAGE = "assets/place.png"
165
+ SUNFLOWERS_IMAGE = "assets/sunflower_plains.png"
166
+ SUNFLOWERS_RAIN_IMAGE = "assets/rain_sunflower_plains.png"
167
+
168
+ device = torch.device('cuda')
169
+
170
+ def save_video(frames, path="output.mp4", fps=10):
171
+ temp_path = path[:-4] + "_temp.mp4"
172
+ h, w, _ = frames[0].shape
173
+
174
+ out = cv2.VideoWriter(temp_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
175
+ for frame in frames:
176
+ out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
177
+ out.release()
178
+
179
+ ffmpeg_cmd = [
180
+ "ffmpeg", "-y", "-i", temp_path,
181
+ "-c:v", "libx264", "-crf", "23", "-preset", "medium",
182
+ path
183
+ ]
184
+ subprocess.run(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
185
+ os.remove(temp_path)
186
+
187
+ cfg = OmegaConf.load("configurations/huggingface.yaml")
188
+ worldmem = WorldMemMinecraft(cfg)
189
+ load_custom_checkpoint(algo=worldmem.diffusion_model, checkpoint_path=cfg.diffusion_path)
190
+ load_custom_checkpoint(algo=worldmem.vae, checkpoint_path=cfg.vae_path)
191
+ load_custom_checkpoint(algo=worldmem.pose_prediction_model, checkpoint_path=cfg.pose_predictor_path)
192
+ worldmem.to("cuda").eval()
193
+ # worldmem = enable_amp(worldmem, precision="16-mixed")
194
+
195
+ actions = np.zeros((1, 25), dtype=np.float32)
196
+ poses = np.zeros((1, 5), dtype=np.float32)
197
+
198
+
199
+
200
+ def get_duration_single_image_to_long_video(first_frame, action, first_pose, device, memory_latent_frames, memory_actions,
201
+ memory_poses, memory_c2w, memory_frame_idx):
202
+ return 5 * len(action) if memory_actions is not None else 5
203
+
204
+ @spaces.GPU(duration=get_duration_single_image_to_long_video)
205
+ def run_interactive(first_frame, action, first_pose, device, memory_latent_frames, memory_actions,
206
+ memory_poses, memory_c2w, memory_frame_idx):
207
+ new_frame, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx = worldmem.interactive(first_frame,
208
+ action,
209
+ first_pose,
210
+ device=device,
211
+ memory_latent_frames=memory_latent_frames,
212
+ memory_actions=memory_actions,
213
+ memory_poses=memory_poses,
214
+ memory_c2w=memory_c2w,
215
+ memory_frame_idx=memory_frame_idx)
216
+
217
+ return new_frame, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx
218
+
219
+ def set_denoising_steps(denoising_steps, sampling_timesteps_state):
220
+ worldmem.sampling_timesteps = denoising_steps
221
+ worldmem.diffusion_model.sampling_timesteps = denoising_steps
222
+ sampling_timesteps_state = denoising_steps
223
+ print("set denoising steps to", worldmem.sampling_timesteps)
224
+ return sampling_timesteps_state
225
+
226
+ def set_context_length(context_length, sampling_context_length_state):
227
+ worldmem.n_tokens = context_length
228
+ sampling_context_length_state = context_length
229
+ print("set context length to", worldmem.n_tokens)
230
+ return sampling_context_length_state
231
+
232
+ def set_memory_condition_length(memory_condition_length, sampling_memory_condition_length_state):
233
+ worldmem.memory_condition_length = memory_condition_length
234
+ sampling_memory_condition_length_state = memory_condition_length
235
+ print("set memory length to", worldmem.memory_condition_length)
236
+ return sampling_memory_condition_length_state
237
+
238
+ def set_next_frame_length(next_frame_length, sampling_next_frame_length_state):
239
+ worldmem.next_frame_length = next_frame_length
240
+ sampling_next_frame_length_state = next_frame_length
241
+ print("set next frame length to", worldmem.next_frame_length)
242
+ return sampling_next_frame_length_state
243
+
244
+ def generate(keys, input_history, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx):
245
+ input_actions = parse_input_to_tensor(keys)
246
+
247
+ if memory_latent_frames is None:
248
+ new_frame, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx = run_interactive(video_frames[0],
249
+ actions[0],
250
+ poses[0],
251
+ device=device,
252
+ memory_latent_frames=memory_latent_frames,
253
+ memory_actions=memory_actions,
254
+ memory_poses=memory_poses,
255
+ memory_c2w=memory_c2w,
256
+ memory_frame_idx=memory_frame_idx)
257
+
258
+ new_frame, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx = run_interactive(video_frames[0],
259
+ input_actions,
260
+ None,
261
+ device=device,
262
+ memory_latent_frames=memory_latent_frames,
263
+ memory_actions=memory_actions,
264
+ memory_poses=memory_poses,
265
+ memory_c2w=memory_c2w,
266
+ memory_frame_idx=memory_frame_idx)
267
+
268
+ video_frames = np.concatenate([video_frames, new_frame[:,0]])
269
+
270
+
271
+ out_video = video_frames.transpose(0,2,3,1).copy()
272
+ out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
273
+ out_video = (out_video * 255).astype(np.uint8)
274
+
275
+ last_frame = out_video[-1].copy()
276
+ border_thickness = 2
277
+ out_video[-len(new_frame):, :border_thickness, :, :] = [255, 0, 0]
278
+ out_video[-len(new_frame):, -border_thickness:, :, :] = [255, 0, 0]
279
+ out_video[-len(new_frame):, :, :border_thickness, :] = [255, 0, 0]
280
+ out_video[-len(new_frame):, :, -border_thickness:, :] = [255, 0, 0]
281
+
282
+ temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
283
+ save_video(out_video, temporal_video_path)
284
+ input_history += keys
285
+
286
+
287
+ # now = datetime.now()
288
+ # folder_name = now.strftime("%Y-%m-%d_%H-%M-%S")
289
+ # folder_path = os.path.join("/mnt/xiaozeqi/worldmem/output_material", folder_name)
290
+ # os.makedirs(folder_path, exist_ok=True)
291
+ # data_dict = {
292
+ # "input_history": input_history,
293
+ # "video_frames": video_frames,
294
+ # "memory_latent_frames": memory_latent_frames,
295
+ # "memory_actions": memory_actions,
296
+ # "memory_poses": memory_poses,
297
+ # "memory_c2w": memory_c2w,
298
+ # "memory_frame_idx": memory_frame_idx,
299
+ # }
300
+
301
+ # np.savez(os.path.join(folder_path, "data_bundle.npz"), **data_dict)
302
+
303
+ return last_frame, temporal_video_path, input_history, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx
304
+
305
+ def reset(selected_image):
306
+ memory_latent_frames = None
307
+ memory_poses = None
308
+ memory_actions = None
309
+ memory_c2w = None
310
+ memory_frame_idx = None
311
+ video_frames = load_image_as_tensor(selected_image).numpy()[None]
312
+ input_history = ""
313
+
314
+ new_frame, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx = run_interactive(video_frames[0],
315
+ actions[0],
316
+ poses[0],
317
+ device=device,
318
+ memory_latent_frames=memory_latent_frames,
319
+ memory_actions=memory_actions,
320
+ memory_poses=memory_poses,
321
+ memory_c2w=memory_c2w,
322
+ memory_frame_idx=memory_frame_idx,
323
+ )
324
+
325
+ return input_history, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx
326
+
327
+ def on_image_click(selected_image):
328
+ input_history, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx = reset(selected_image)
329
+ return input_history, selected_image, selected_image, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx
330
+
331
+ def set_memory(examples_case):
332
+ if examples_case == '1':
333
+ data_bundle = np.load("assets/examples/case1.npz")
334
+ input_history = data_bundle['input_history'].item()
335
+ video_frames = data_bundle['memory_frames']
336
+ memory_latent_frames = data_bundle['self_frames']
337
+ memory_actions = data_bundle['self_actions']
338
+ memory_poses = data_bundle['self_poses']
339
+ memory_c2w = data_bundle['self_memory_c2w']
340
+ memory_frame_idx = data_bundle['self_frame_idx']
341
+ elif examples_case == '2':
342
+ data_bundle = np.load("assets/examples/case2.npz")
343
+ input_history = data_bundle['input_history'].item()
344
+ video_frames = data_bundle['memory_frames']
345
+ memory_latent_frames = data_bundle['self_frames']
346
+ memory_actions = data_bundle['self_actions']
347
+ memory_poses = data_bundle['self_poses']
348
+ memory_c2w = data_bundle['self_memory_c2w']
349
+ memory_frame_idx = data_bundle['self_frame_idx']
350
+ elif examples_case == '3':
351
+ data_bundle = np.load("assets/examples/case3.npz")
352
+ input_history = data_bundle['input_history'].item()
353
+ video_frames = data_bundle['memory_frames']
354
+ memory_latent_frames = data_bundle['self_frames']
355
+ memory_actions = data_bundle['self_actions']
356
+ memory_poses = data_bundle['self_poses']
357
+ memory_c2w = data_bundle['self_memory_c2w']
358
+ memory_frame_idx = data_bundle['self_frame_idx']
359
+ elif examples_case == '4':
360
+ data_bundle = np.load("assets/examples/case4.npz")
361
+ input_history = data_bundle['input_history'].item()
362
+ video_frames = data_bundle['memory_frames']
363
+ memory_latent_frames = data_bundle['self_frames']
364
+ memory_actions = data_bundle['self_actions']
365
+ memory_poses = data_bundle['self_poses']
366
+ memory_c2w = data_bundle['self_memory_c2w']
367
+ memory_frame_idx = data_bundle['self_frame_idx']
368
+
369
+ out_video = video_frames.transpose(0,2,3,1)
370
+ out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
371
+ out_video = (out_video * 255).astype(np.uint8)
372
+
373
+ temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
374
+ save_video(out_video, temporal_video_path)
375
+
376
+ return input_history, out_video[-1], temporal_video_path, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx
377
+
378
+ css = """
379
+ h1 {
380
+ text-align: center;
381
+ display:block;
382
+ }
383
+ """
384
+
385
+ with gr.Blocks(css=css) as demo:
386
+ gr.Markdown(
387
+ """
388
+ # WORLDMEM: Long-term Consistent World Simulation with Memory
389
+ """
390
+ )
391
+
392
+ gr.Markdown(
393
+ """
394
+ ## 🚀 How to Explore WorldMem
395
+
396
+ Follow these simple steps to get started:
397
+
398
+ 1. **Choose a scene**.
399
+ 2. **Input your action sequence**.
400
+ 3. **Click "Generate"**.
401
+
402
+ - You can continuously click **"Generate"** to **extend the video** and observe how well the world maintains consistency over time.
403
+ - For best performance, we recommend **running locally** (1s/frame on H100) instead of Spaces (5s/frame).
404
+ - ⭐️ If you like this project, please [give it a star on GitHub]()!
405
+ - 💬 For questions or feedback, feel free to open an issue or email me at **zeqixiao1@gmail.com**.
406
+
407
+ Happy exploring! 🌍
408
+ """
409
+ )
410
+ # <div style="text-align: center;">
411
+ # <!-- Public Website -->
412
+ # <a style="display:inline-block" href="https://nirvanalan.github.io/projects/GA/">
413
+ # <img src="https://img.shields.io/badge/public_website-8A2BE2">
414
+ # </a>
415
+
416
+ # <!-- GitHub Stars -->
417
+ # <a style="display:inline-block; margin-left: .5em" href="https://github.com/NIRVANALAN/GaussianAnything">
418
+ # <img src="https://img.shields.io/github/stars/NIRVANALAN/GaussianAnything?style=social">
419
+ # </a>
420
+
421
+ # <!-- Project Page -->
422
+ # <a style="display:inline-block; margin-left: .5em" href="https://nirvanalan.github.io/projects/GA/">
423
+ # <img src="https://img.shields.io/badge/project_page-blue">
424
+ # </a>
425
+
426
+ # <!-- arXiv Paper -->
427
+ # <a style="display:inline-block; margin-left: .5em" href="https://arxiv.org/abs/XXXX.XXXXX">
428
+ # <img src="https://img.shields.io/badge/arXiv-paper-red">
429
+ # </a>
430
+ # </div>
431
+
432
+ example_actions = {"turn left→turn right": "AAAAAAAAAAAADDDDDDDDDDDD",
433
+ "turn 360 degree": "AAAAAAAAAAAAAAAAAAAAAAAA",
434
+ "turn right→go backward→look up→turn left→look down": "DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW",
435
+ "turn right→go forward→turn right": "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
436
+ "turn right→look up→turn right→look down": "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSS",
437
+ "put item→go backward→put item→go backward":"SSUNNWWEEEEEEEEEAAASSUNNWWEEEEEEEEE"}
438
+
439
+ selected_image = gr.State(ICE_PLAINS_IMAGE)
440
+
441
+ with gr.Row(variant="panel"):
442
+ with gr.Column():
443
+ gr.Markdown("🖼️ Start from this frame.")
444
+ image_display = gr.Image(value=selected_image.value, interactive=False, label="Current Frame")
445
+ with gr.Column():
446
+ gr.Markdown("🎞️ Generated videos. New contents are marked in red box.")
447
+ video_display = gr.Video(autoplay=True, loop=True)
448
+
449
+ gr.Markdown("### 🏞️ Choose a scene and start generation.")
450
+
451
+ with gr.Row():
452
+ image_display_1 = gr.Image(value=SUNFLOWERS_IMAGE, interactive=False, label="Sunflower Plains")
453
+ image_display_2 = gr.Image(value=DESERT_IMAGE, interactive=False, label="Desert")
454
+ image_display_3 = gr.Image(value=SAVANNA_IMAGE, interactive=False, label="Savanna")
455
+ image_display_4 = gr.Image(value=ICE_PLAINS_IMAGE, interactive=False, label="Ice Plains")
456
+ image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
457
+ image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
458
+
459
+
460
+ with gr.Row(variant="panel"):
461
+ with gr.Column(scale=2):
462
+ gr.Markdown("### 🕹️ Input action sequences for interaction.")
463
+ input_box = gr.Textbox(label="Action Sequences", placeholder="Enter action sequences here, e.g. (AAAAAAAAAAAADDDDDDDDDDDD)", lines=1, max_lines=1)
464
+ log_output = gr.Textbox(label="History Sequences", interactive=False)
465
+ gr.Markdown(
466
+ """
467
+ ### 💡 Action Key Guide
468
+
469
+ <pre style="font-family: monospace; font-size: 14px; line-height: 1.6;">
470
+ W: Turn up S: Turn down A: Turn left D: Turn right
471
+ Q: Go forward E: Go backward N: No-op U: Use item
472
+ </pre>
473
+ """
474
+ )
475
+ gr.Markdown("### 👇 Click to quickly set action sequence examples.")
476
+ with gr.Row():
477
+ buttons = []
478
+ for action_key in list(example_actions.keys())[:2]:
479
+ with gr.Column(scale=len(action_key)):
480
+ buttons.append(gr.Button(action_key))
481
+ with gr.Row():
482
+ for action_key in list(example_actions.keys())[2:4]:
483
+ with gr.Column(scale=len(action_key)):
484
+ buttons.append(gr.Button(action_key))
485
+ with gr.Row():
486
+ for action_key in list(example_actions.keys())[4:6]:
487
+ with gr.Column(scale=len(action_key)):
488
+ buttons.append(gr.Button(action_key))
489
+
490
+ with gr.Column(scale=1):
491
+ submit_button = gr.Button("🎬 Generate!", variant="primary")
492
+ reset_btn = gr.Button("🔄 Reset")
493
+
494
+ # gr.Markdown("<div style='flex-grow:1; height: 100px'></div>")
495
+
496
+ gr.Markdown("### ⚙️ Advanced Settings")
497
+
498
+ slider_denoising_step = gr.Slider(
499
+ minimum=10, maximum=50, value=worldmem.sampling_timesteps, step=1,
500
+ label="Denoising Steps",
501
+ info="Higher values yield better quality but slower speed"
502
+ )
503
+ slider_context_length = gr.Slider(
504
+ minimum=2, maximum=10, value=worldmem.n_tokens, step=1,
505
+ label="Context Length",
506
+ info="How many previous frames in temporal context window."
507
+ )
508
+ slider_memory_condition_length = gr.Slider(
509
+ minimum=4, maximum=16, value=worldmem.memory_condition_length, step=1,
510
+ label="Memory Length",
511
+ info="How many previous frames in memory window. (Recommended: 1, multi-frame generation is not stable yet)"
512
+ )
513
+ slider_next_frame_length = gr.Slider(
514
+ minimum=1, maximum=5, value=worldmem.next_frame_length, step=1,
515
+ label="Next Frame Length",
516
+ info="How many next frames to generate at once."
517
+ )
518
+
519
+ sampling_timesteps_state = gr.State(worldmem.sampling_timesteps)
520
+ sampling_context_length_state = gr.State(worldmem.n_tokens)
521
+ sampling_memory_condition_length_state = gr.State(worldmem.memory_condition_length)
522
+ sampling_next_frame_length_state = gr.State(worldmem.next_frame_length)
523
+
524
+ video_frames = gr.State(load_image_as_tensor(selected_image.value)[None].numpy())
525
+ memory_latent_frames = gr.State()
526
+ memory_actions = gr.State()
527
+ memory_poses = gr.State()
528
+ memory_c2w = gr.State()
529
+ memory_frame_idx = gr.State()
530
+
531
+ def set_action(action):
532
+ return action
533
+
534
+
535
+
536
+ for button, action_key in zip(buttons, list(example_actions.keys())):
537
+ button.click(set_action, inputs=[gr.State(value=example_actions[action_key])], outputs=input_box)
538
+
539
+ gr.Markdown("### 👇 Click to review generated examples, and continue generation based on them.")
540
+
541
+ example_case = gr.Textbox(label="Case", visible=False)
542
+ image_output = gr.Image(visible=False)
543
+
544
+ examples = gr.Examples(
545
+ examples=example_images,
546
+ inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_condition_length],
547
+ cache_examples=False
548
+ )
549
+
550
+ example_case.change(
551
+ fn=set_memory,
552
+ inputs=[example_case],
553
+ outputs=[log_output, image_display, video_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx]
554
+ )
555
+
556
+ submit_button.click(generate, inputs=[input_box, log_output, video_frames,
557
+ memory_latent_frames, memory_actions, memory_poses,
558
+ memory_c2w, memory_frame_idx],
559
+ outputs=[image_display, video_display, log_output,
560
+ video_frames, memory_latent_frames, memory_actions, memory_poses,
561
+ memory_c2w, memory_frame_idx])
562
+
563
+ reset_btn.click(reset, inputs=[selected_image], outputs=[log_output, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
564
+ image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=[log_output, selected_image, image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
565
+ image_display_2.select(lambda: on_image_click(DESERT_IMAGE), outputs=[log_output, selected_image, image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
566
+ image_display_3.select(lambda: on_image_click(SAVANNA_IMAGE), outputs=[log_output, selected_image, image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
567
+ image_display_4.select(lambda: on_image_click(ICE_PLAINS_IMAGE), outputs=[log_output, selected_image, image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
568
+ image_display_5.select(lambda: on_image_click(SUNFLOWERS_RAIN_IMAGE), outputs=[log_output, selected_image, image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
569
+ image_display_6.select(lambda: on_image_click(PLACE_IMAGE), outputs=[log_output, selected_image,image_display, video_frames, memory_latent_frames, memory_actions, memory_poses, memory_c2w, memory_frame_idx])
570
+
571
+ slider_denoising_step.change(fn=set_denoising_steps, inputs=[slider_denoising_step, sampling_timesteps_state], outputs=sampling_timesteps_state)
572
+ slider_context_length.change(fn=set_context_length, inputs=[slider_context_length, sampling_context_length_state], outputs=sampling_context_length_state)
573
+ slider_memory_condition_length.change(fn=set_memory_condition_length, inputs=[slider_memory_condition_length, sampling_memory_condition_length_state], outputs=sampling_memory_condition_length_state)
574
+ slider_next_frame_length.change(fn=set_next_frame_length, inputs=[slider_next_frame_length, sampling_next_frame_length_state], outputs=sampling_next_frame_length_state)
575
+
576
+ demo.launch(share=True)
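
Note (not part of this commit): a small self-contained illustration of the action encoding that parse_input_to_tensor builds from a key string. The ACTION_KEYS / KEY_TO_ACTION tables are restated here so the snippet runs without importing app.py (which would launch the demo).

import torch

ACTION_KEYS = ["inventory", "ESC"] + [f"hotbar.{i}" for i in range(1, 10)] + [
    "forward", "back", "left", "right", "cameraY", "cameraX", "jump", "sneak",
    "sprint", "swapHands", "attack", "use", "pickItem", "drop"]          # 25 entries
KEY_TO_ACTION = {"Q": ("forward", 1), "E": ("back", 1), "W": ("cameraY", -1),
                 "S": ("cameraY", 1), "A": ("cameraX", -1), "D": ("cameraX", 1),
                 "U": ("drop", 1), "N": ("noop", 1), "1": ("hotbar.1", 1)}

keys = "AAD"                                   # turn left, left, right
seq = torch.zeros(len(keys), 25)
for i, ch in enumerate(keys):
    action, value = KEY_TO_ACTION.get(ch.upper(), (None, 0))
    if action in ACTION_KEYS:
        seq[i, ACTION_KEYS.index(action)] = value
print(seq.shape)                               # torch.Size([3, 25])
print(seq[:, ACTION_KEYS.index("cameraX")])    # tensor([-1., -1.,  1.])
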
assets/desert.png ADDED

Git LFS Details

  • SHA256: 3b85899ba8b3d111370fbcc25079d661a04d80563ecb43e55eb0c36f36c44b76
  • Pointer size: 131 Bytes
  • Size of remote file: 298 kB
assets/ice_plains.png ADDED

Git LFS Details

  • SHA256: ced8ab54ebb2c8c34b6fd10340dde905dc0f6a3096109521a08ee880688ae9cc
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
assets/place.png ADDED

Git LFS Details

  • SHA256: d4a1630a6f3e73c38e0dfec88bd902a5cf08bc8f857768e94199ea850d7eff81
  • Pointer size: 131 Bytes
  • Size of remote file: 212 kB
assets/plains.png ADDED

Git LFS Details

  • SHA256: adf5ad62acc998e35fec82c8e53b2559e26a7d78bb91a0f1cf8039a8610c3c78
  • Pointer size: 131 Bytes
  • Size of remote file: 263 kB
assets/rain_sunflower_plains.png ADDED

Git LFS Details

  • SHA256: 2488d19febab9dac852b5d0b6e6894ac276f48a1788220f2a8d38c7030cf7a98
  • Pointer size: 131 Bytes
  • Size of remote file: 387 kB
assets/savanna.png ADDED

Git LFS Details

  • SHA256: 5f8df1e988d84cd40f1af49eee73ef42d11f29d2c37a66bb5fda12d5b3278a55
  • Pointer size: 131 Bytes
  • Size of remote file: 339 kB
assets/sunflower_plains.png ADDED

Git LFS Details

  • SHA256: 98d828eb41fc7fb53909b66083db07208feffd66e88f4ae07092cc482e4e20df
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
assets/worldmem_logo.png ADDED

Git LFS Details

  • SHA256: 8a1c0133cd1c20a557b800e5067ba97787c90e71c33f6b9695ecc44d78238426
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
calculate_fid.py ADDED
@@ -0,0 +1,277 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Calculate FID (Fréchet Inception Distance) between predicted and ground truth videos.
4
+
5
+ Usage:
6
+ python calculate_fid.py --videos_dir /path/to/videos
7
+ python calculate_fid.py --videos_dir /path/to/videos --batch_size 32
8
+ """
9
+
10
+ import torch
11
+ import numpy as np
12
+ from pathlib import Path
13
+ from tqdm import tqdm
14
+ import argparse
15
+ import cv2
16
+ from torchmetrics.image.fid import FrechetInceptionDistance
17
+
18
+
19
+ def load_video_frames(video_path, max_frames=None):
20
+ """
21
+ Load frames from a video file.
22
+
23
+ Args:
24
+ video_path: Path to the video file
25
+ max_frames: Maximum number of frames to load (None = all frames)
26
+
27
+ Returns:
28
+ torch.Tensor: Video frames with shape (T, C, H, W) in range [0, 255]
29
+ """
30
+ cap = cv2.VideoCapture(str(video_path))
31
+ frames = []
32
+ frame_count = 0
33
+
34
+ while True:
35
+ ret, frame = cap.read()
36
+ if not ret:
37
+ break
38
+
39
+ # Convert BGR to RGB
40
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
41
+ frames.append(frame)
42
+ frame_count += 1
43
+
44
+ if max_frames and frame_count >= max_frames:
45
+ break
46
+
47
+ cap.release()
48
+
49
+ if len(frames) == 0:
50
+ raise ValueError(f"No frames loaded from {video_path}")
51
+
52
+ # Convert to tensor: (T, H, W, C) -> (T, C, H, W)
53
+ frames = np.stack(frames, axis=0)
54
+ frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
55
+
56
+ return frames
57
+
58
+
59
+ def load_videos_from_directory(video_dir, max_frames_per_video=None, max_videos=None):
60
+ """
61
+ Load all videos from a directory.
62
+
63
+ Args:
64
+ video_dir: Directory containing .mp4 files
65
+ max_frames_per_video: Maximum frames to load per video
66
+ max_videos: Maximum number of videos to load
67
+
68
+ Returns:
69
+ torch.Tensor: All frames concatenated with shape (N, C, H, W)
70
+ """
71
+ video_dir = Path(video_dir)
72
+ video_paths = sorted(list(video_dir.glob("**/*.mp4")))
73
+
74
+ if max_videos:
75
+ video_paths = video_paths[:max_videos]
76
+
77
+ all_frames = []
78
+
79
+ print(f"Loading videos from {video_dir}")
80
+ print(f"Found {len(video_paths)} videos")
81
+
82
+ for video_path in tqdm(video_paths, desc="Loading videos"):
83
+ try:
84
+ frames = load_video_frames(video_path, max_frames=max_frames_per_video)
85
+ all_frames.append(frames)
86
+ except Exception as e:
87
+ print(f"\nWarning: Failed to load {video_path.name}: {e}")
88
+ continue
89
+
90
+ if len(all_frames) == 0:
91
+ raise ValueError(f"No videos loaded from {video_dir}")
92
+
93
+ # Concatenate all frames: (N_videos, T, C, H, W) -> (N_total_frames, C, H, W)
94
+ all_frames = torch.cat(all_frames, dim=0)
95
+
96
+ print(f"Loaded {all_frames.shape[0]} frames total")
97
+ print(f"Frame shape: {all_frames.shape[1:]}")
98
+
99
+ return all_frames
100
+
101
+
102
+ def calculate_fid(pred_dir, gt_dir, batch_size=32, device='cuda',
103
+ max_frames_per_video=None, max_videos=None):
104
+ """
105
+ Calculate FID between predicted and ground truth videos.
106
+
107
+ Args:
108
+ pred_dir: Directory containing predicted videos
109
+ gt_dir: Directory containing ground truth videos
110
+ batch_size: Batch size for FID calculation
111
+ device: Device to use ('cuda' or 'cpu')
112
+ max_frames_per_video: Maximum frames to load per video
113
+ max_videos: Maximum number of videos to load from each directory
114
+
115
+ Returns:
116
+ float: FID score
117
+ """
118
+ print("="*60)
119
+ print("FID Calculation")
120
+ print("="*60)
121
+ print(f"Pred directory: {pred_dir}")
122
+ print(f"GT directory: {gt_dir}")
123
+ print(f"Device: {device}")
124
+ print(f"Batch size: {batch_size}")
125
+ print("="*60 + "\n")
126
+
127
+ # Check if directories exist
128
+ pred_dir = Path(pred_dir)
129
+ gt_dir = Path(gt_dir)
130
+
131
+ if not pred_dir.exists():
132
+ raise ValueError(f"Pred directory does not exist: {pred_dir}")
133
+ if not gt_dir.exists():
134
+ raise ValueError(f"GT directory does not exist: {gt_dir}")
135
+
136
+ # Load videos
137
+ print("\n[1/3] Loading predicted videos...")
138
+ pred_frames = load_videos_from_directory(
139
+ pred_dir,
140
+ max_frames_per_video=max_frames_per_video,
141
+ max_videos=max_videos
142
+ )
143
+
144
+ print("\n[2/3] Loading ground truth videos...")
145
+ gt_frames = load_videos_from_directory(
146
+ gt_dir,
147
+ max_frames_per_video=max_frames_per_video,
148
+ max_videos=max_videos
149
+ )
150
+
151
+ # Initialize FID model
152
+ print("\n[3/3] Calculating FID...")
153
+ fid_model = FrechetInceptionDistance(normalize=True).to(device)
154
+
155
+ # Process pred frames in batches
156
+ print("Processing predicted frames...")
157
+ num_pred_frames = pred_frames.shape[0]
158
+ for i in tqdm(range(0, num_pred_frames, batch_size)):
159
+ batch = pred_frames[i:i+batch_size]
160
+ batch = batch.to(device).float() / 255.0 # uint8 [0, 255] -> float [0, 1], as FrechetInceptionDistance(normalize=True) expects
161
+ fid_model.update(batch, real=False)
162
+
163
+ # Process gt frames in batches
164
+ print("Processing ground truth frames...")
165
+ num_gt_frames = gt_frames.shape[0]
166
+ for i in tqdm(range(0, num_gt_frames, batch_size)):
167
+ batch = gt_frames[i:i+batch_size]
168
+ batch = batch.to(device).float() / 255.0 # uint8 [0, 255] -> float [0, 1], as FrechetInceptionDistance(normalize=True) expects
169
+ fid_model.update(batch, real=True)
170
+
171
+ # Compute FID
172
+ fid_score = fid_model.compute().item()
173
+
174
+ return fid_score
175
+
176
+
177
+ def main():
178
+ parser = argparse.ArgumentParser(
179
+ description="Calculate FID between predicted and ground truth videos"
180
+ )
181
+ parser.add_argument(
182
+ "--videos_dir",
183
+ type=str,
184
+ default="/mnt/worldmem_valid/outputs/2025-12-01/08-09-46/videos/test_vis",
185
+ help="Base directory containing 'pred' and 'gt' subdirectories"
186
+ )
187
+ parser.add_argument(
188
+ "--pred_dir",
189
+ type=str,
190
+ default=None,
191
+ help="Override pred directory (default: {videos_dir}/pred)"
192
+ )
193
+ parser.add_argument(
194
+ "--gt_dir",
195
+ type=str,
196
+ default=None,
197
+ help="Override gt directory (default: {videos_dir}/gt)"
198
+ )
199
+ parser.add_argument(
200
+ "--batch_size",
201
+ type=int,
202
+ default=32,
203
+ help="Batch size for FID calculation (default: 32)"
204
+ )
205
+ parser.add_argument(
206
+ "--device",
207
+ type=str,
208
+ default="cuda" if torch.cuda.is_available() else "cpu",
209
+ help="Device to use (default: cuda if available)"
210
+ )
211
+ parser.add_argument(
212
+ "--max_frames_per_video",
213
+ type=int,
214
+ default=None,
215
+ help="Maximum frames to load per video (default: None, load all)"
216
+ )
217
+ parser.add_argument(
218
+ "--max_videos",
219
+ type=int,
220
+ default=50,
221
+ help="Maximum number of videos to load (default: None, load all)"
222
+ )
223
+
224
+ args = parser.parse_args()
225
+
226
+ # Determine pred and gt directories
227
+ videos_dir = Path(args.videos_dir)
228
+
229
+ if args.pred_dir:
230
+ pred_dir = Path(args.pred_dir)
231
+ else:
232
+ pred_dir = videos_dir / "pred"
233
+
234
+ if args.gt_dir:
235
+ gt_dir = Path(args.gt_dir)
236
+ else:
237
+ gt_dir = videos_dir / "gt"
238
+
239
+ # Calculate FID
240
+ try:
241
+ fid_score = calculate_fid(
242
+ pred_dir=pred_dir,
243
+ gt_dir=gt_dir,
244
+ batch_size=args.batch_size,
245
+ device=args.device,
246
+ max_frames_per_video=args.max_frames_per_video,
247
+ max_videos=args.max_videos
248
+ )
249
+
250
+ # Print results
251
+ print("\n" + "="*60)
252
+ print("RESULTS")
253
+ print("="*60)
254
+ print(f"FID Score: {fid_score:.4f}")
255
+ print("="*60)
256
+
257
+ # Save results to file
258
+ output_file = videos_dir / "fid_results.txt"
259
+ with open(output_file, 'w') as f:
260
+ f.write(f"FID Score: {fid_score:.4f}\n")
261
+ f.write(f"Pred directory: {pred_dir}\n")
262
+ f.write(f"GT directory: {gt_dir}\n")
263
+
264
+ print(f"\nResults saved to: {output_file}")
265
+
266
+ except Exception as e:
267
+ print(f"\n✗ Error: {e}")
268
+ import traceback
269
+ traceback.print_exc()
270
+ return 1
271
+
272
+ return 0
273
+
274
+
275
+ if __name__ == "__main__":
276
+ exit(main())
277
+
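For reference, a minimal self-contained sketch of the torchmetrics FID workflow that `calculate_fid()` above relies on. The random tensors are placeholders for decoded video frames, and the batching mirrors the script's default `--batch_size 32`; this is an illustration, not the repo's pipeline.

```python
import torch
from torchmetrics.image.fid import FrechetInceptionDistance

device = "cuda" if torch.cuda.is_available() else "cpu"
# normalize=True tells torchmetrics to expect float images in [0, 1]
fid = FrechetInceptionDistance(normalize=True).to(device)

# Placeholder frames with shape (N, 3, H, W); real usage feeds decoded video frames.
pred_frames = torch.rand(64, 3, 128, 128)
gt_frames = torch.rand(64, 3, 128, 128)

batch_size = 32
for i in range(0, pred_frames.shape[0], batch_size):
    fid.update(pred_frames[i:i + batch_size].to(device), real=False)
for i in range(0, gt_frames.shape[0], batch_size):
    fid.update(gt_frames[i:i + batch_size].to(device), real=True)

print(f"FID: {fid.compute().item():.4f}")  # large for random noise; meaningful only on real frames
```

When the full script is invoked with only `--videos_dir`, it reads frames from the `pred/` and `gt/` subdirectories of that path and writes the score to `fid_results.txt` alongside them, as the argparse help above describes.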
configurations/algorithm/base_algo.yaml ADDED
@@ -0,0 +1,3 @@
1
+ # This will be passed as the cfg to Algo.__init__(cfg) of your algorithm class
2
+
3
+ debug: ${debug} # inherited from configurations/config.yaml
configurations/algorithm/base_pytorch_algo.yaml ADDED
@@ -0,0 +1,4 @@
1
+ defaults:
2
+ - base_algo # inherits from configurations/algorithm/base_algo.yaml
3
+
4
+ lr: ${experiment.training.lr}
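The `${...}` references in these configuration files are OmegaConf interpolations that resolve when the composed config is accessed at runtime. A minimal sketch (not the repo's actual loader) of how a value like `lr: ${experiment.training.lr}` is resolved; the nested structure below is hand-built for illustration.

```python
from omegaconf import OmegaConf

# Hand-built config mimicking the composed structure; the real config is assembled by Hydra.
cfg = OmegaConf.create({
    "debug": False,
    "experiment": {"training": {"lr": 2e-5}},
    "algorithm": {"debug": "${debug}", "lr": "${experiment.training.lr}"},
})

print(cfg.algorithm.lr)                      # 2e-05 -- interpolation resolved lazily on access
print(OmegaConf.to_yaml(cfg, resolve=True))  # dump with all interpolations resolved
```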
configurations/algorithm/df_base.yaml ADDED
@@ -0,0 +1,42 @@
1
+ defaults:
2
+ - base_pytorch_algo
3
+
4
+ # dataset-dependent configurations
5
+ x_shape: ${dataset.observation_shape}
6
+ frame_stack: 1
7
+ frame_skip: 1
8
+ data_mean: ${dataset.data_mean}
9
+ data_std: ${dataset.data_std}
10
+ external_cond_dim: 0 #${dataset.action_dim}
11
+ context_frames: ${dataset.context_length}
12
+ # training hyperparameters
13
+ weight_decay: 1e-4
14
+ warmup_steps: 10000
15
+ optimizer_beta: [0.9, 0.999]
16
+ # diffusion-related
17
+ uncertainty_scale: 1
18
+ guidance_scale: 0.0
19
+ chunk_size: 1 # -1 for full trajectory diffusion, number to specify diffusion chunk size
20
+ scheduling_matrix: autoregressive
21
+ noise_level: random_all
22
+ causal: True
23
+
24
+ diffusion:
25
+ # training
26
+ objective: pred_x0
27
+ beta_schedule: cosine
28
+ schedule_fn_kwargs: {}
29
+ clip_noise: 20.0
30
+ use_snr: False
31
+ use_cum_snr: False
32
+ use_fused_snr: False
33
+ snr_clip: 5.0
34
+ cum_snr_decay: 0.98
35
+ timesteps: 1000
36
+ # sampling
37
+ sampling_timesteps: 50 # FIXME: number of diffusion sampling steps; should be increased
38
+ ddim_sampling_eta: 1.0
39
+ stabilization_level: 10
40
+ # architecture
41
+ architecture:
42
+ network_size: 64
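`beta_schedule: cosine` with `timesteps: 1000` usually refers to the cosine noise schedule of Nichol & Dhariwal (2021). A hedged sketch of that standard schedule follows; the repo's own implementation may differ in clipping or offset details.

```python
import torch

def cosine_beta_schedule(timesteps: int, s: float = 0.008) -> torch.Tensor:
    """Standard cosine schedule: betas derived from a squared-cosine alpha_bar curve."""
    t = torch.linspace(0, timesteps, timesteps + 1) / timesteps
    alphas_cumprod = torch.cos((t + s) / (1 + s) * torch.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1.0 - alphas_cumprod[1:] / alphas_cumprod[:-1]
    return betas.clamp(0.0, 0.999)

betas = cosine_beta_schedule(1000)  # matches `timesteps: 1000` above
print(betas.shape, betas[0].item(), betas[-1].item())
```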
configurations/algorithm/df_video_worldmemminecraft.yaml ADDED
@@ -0,0 +1,38 @@
1
+ defaults:
2
+ - df_base
3
+
4
+ n_frames: ${dataset.n_frames}
5
+ frame_skip: ${dataset.frame_skip}
6
+ metadata: ${dataset.metadata}
7
+
8
+ # training hyperparameters
9
+ weight_decay: 2e-3
10
+ warmup_steps: 1000
11
+ optimizer_beta: [0.9, 0.99]
12
+ action_cond_dim: 25
13
+ use_plucker: true
14
+
15
+ diffusion:
16
+ # training
17
+ beta_schedule: sigmoid
18
+ objective: pred_v
19
+ use_fused_snr: True
20
+ cum_snr_decay: 0.96
21
+ clip_noise: 20.
22
+ # sampling
23
+ sampling_timesteps: 20
24
+ ddim_sampling_eta: 0.0
25
+ stabilization_level: 15
26
+ # architecture
27
+ architecture:
28
+ network_size: 64
29
+ attn_heads: 4
30
+ attn_dim_head: 64
31
+ dim_mults: [1, 2, 4, 8]
32
+ resolution: ${dataset.resolution}
33
+ attn_resolutions: [16, 32, 64, 128]
34
+ use_init_temporal_attn: True
35
+ use_linear_attn: True
36
+ time_emb_type: rotary
37
+
38
+ _name: df_video_worldmemminecraft
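Because this file lists `df_base` in its `defaults`, its values override the base ones when the configs are composed (e.g. `sampling_timesteps` drops from 50 to 20 and `beta_schedule` changes from cosine to sigmoid). A minimal sketch of that override behaviour using OmegaConf directly, with the file paths assumed relative to the repo root:

```python
from omegaconf import OmegaConf

base = OmegaConf.load("configurations/algorithm/df_base.yaml")                      # sampling_timesteps: 50
child = OmegaConf.load("configurations/algorithm/df_video_worldmemminecraft.yaml")  # sampling_timesteps: 20
merged = OmegaConf.merge(base, child)  # later configs win on conflicting keys

print(merged.diffusion.sampling_timesteps)  # 20
print(merged.diffusion.beta_schedule)       # sigmoid (overrides the base "cosine")
```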
configurations/dataset/base_dataset.yaml ADDED
@@ -0,0 +1,3 @@
1
+ # This will be passed as the cfg to Dataset.__init__(cfg) of your dataset class
2
+
3
+ debug: ${debug} # inherited from configurations/config.yaml
configurations/dataset/base_video.yaml ADDED
@@ -0,0 +1,14 @@
1
+ defaults:
2
+ - base_dataset
3
+
4
+ metadata: "data/${dataset.name}/metadata.json"
5
+ data_mean: "data/${dataset.name}/data_mean.npy"
6
+ data_std: "data/${dataset.name}/data_std.npy"
7
+ save_dir: ???
8
+ n_frames: 32
9
+ context_length: 4
10
+ resolution: 128
11
+ observation_shape: [3, "${dataset.resolution}", "${dataset.resolution}"]
12
+ external_cond_dim: 0
13
+ validation_multiplier: 1
14
+ frame_skip: 1
configurations/dataset/video_minecraft.yaml ADDED
@@ -0,0 +1,14 @@
1
+ defaults:
2
+ - base_video
3
+
4
+ save_dir: data/minecraft_simple_backforward
5
+ n_frames: 16 # TODO: increase later
6
+ resolution: 128
7
+ data_mean: 0.5
8
+ data_std: 0.5
9
+ action_cond_dim: 25
10
+ context_length: 1
11
+ frame_skip: 1
12
+ validation_multiplier: 1
13
+
14
+ _name: video_minecraft_oasis
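With `data_mean: 0.5` and `data_std: 0.5`, frames in [0, 1] are mapped to [-1, 1] before entering the model and mapped back afterwards. A small illustrative snippet; the dataset code itself may apply this normalization elsewhere in the pipeline.

```python
import torch

data_mean, data_std = 0.5, 0.5

frames = torch.rand(16, 3, 128, 128)          # raw frames in [0, 1], resolution 128 as configured
normalized = (frames - data_mean) / data_std  # now in [-1, 1]
restored = normalized * data_std + data_mean  # back to [0, 1]

print(normalized.min().item(), normalized.max().item())
print(torch.allclose(frames, restored))       # True
```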
configurations/experiment/base_experiment.yaml ADDED
@@ -0,0 +1,2 @@
1
+ debug: ${debug} # inherited from configurations/config.yaml
2
+ tasks: [main] # tasks to run sequentially, such as [training, test]; useful when your project has multiple stages and you want to run only a subset of them.
configurations/experiment/base_pytorch.yaml ADDED
@@ -0,0 +1,50 @@
1
+ # inherits from base_experiment.yaml
2
+ # most of the options have docs at https://lightning.ai/docs/pytorch/stable/common/trainer.html
3
+
4
+ defaults:
5
+ - base_experiment
6
+
7
+ tasks: [training] # tasks to run sequentially; change when your project has multiple stages and you want to run only a subset of them.
8
+ num_nodes: 1 # number of gpu servers used in large scale distributed training
9
+
10
+ training:
11
+ precision: 16-mixed # set float precision, 16-mixed is faster while 32 is more stable
12
+ compile: False # whether to compile the model with torch.compile
13
+ lr: 0.001 # learning rate
14
+ batch_size: 16 # training batch size; effective batch size is this number * gpus * nodes when using distributed training
15
+ max_epochs: 1000 # set to -1 to train forever
16
+ max_steps: -1 # set to -1 to train forever, will override max_epochs
17
+ max_time: null # set to something like "00:12:00:00" to enable
18
+ data:
19
+ num_workers: 4 # number of CPU threads for data preprocessing.
20
+ shuffle: True # whether training data will be shuffled
21
+ optim:
22
+ accumulate_grad_batches: 1 # accumulate gradients for n batches before backprop
23
+ gradient_clip_val: 0 # clip gradients with norm above this value, set to 0 to disable
24
+ checkpointing:
25
+ # these are arguments to pytorch lightning's callback, `ModelCheckpoint` class
26
+ every_n_train_steps: 5000 # save a checkpoint every n train steps
27
+ every_n_epochs: null # mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``
28
+ train_time_interval: null # in format of "00:12:00:00", mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``.
29
+ enable_version_counter: False # If this is ``False``, later checkpoints will overwrite previous ones.
30
+
31
+ validation:
32
+ precision: 16-mixed
33
+ compile: False # whether to compile the model with torch.compile
34
+ batch_size: 16 # validation batch size per GPU; effective batch size is this number * gpus * nodes when using distributed training
35
+ val_every_n_step: 2000 # controls how frequently validation runs; can be a float (fraction of an epoch), an int (steps), or null (if val_every_n_epoch is set)
36
+ val_every_n_epoch: null # if you want to run validation every n epochs; requires val_every_n_step to be null.
37
+ limit_batch: null # if null, run through validation set. Otherwise limit the number of batches to use for validation.
38
+ inference_mode: True # whether to run validation in inference mode (enable_grad won't work!)
39
+ data:
40
+ num_workers: 4 # number of CPU threads for data preprocessing, for validation.
41
+ shuffle: False # whether validation data will be shuffled
42
+
43
+ test:
44
+ precision: 16-mixed
45
+ compile: False # whether to compile the model with torch.compile
46
+ batch_size: 4 # test batch size per GPU; effective batch size is this number * gpus * nodes when using distributed training
47
+ limit_batch: null # if null, run through test set. Otherwise limit the number of batches to use for test.
48
+ data:
49
+ num_workers: 4 # number of CPU threads for data preprocessing, for test.
50
+ shuffle: False # whether test data will be shuffled
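Most of these keys map directly onto PyTorch Lightning `Trainer` and `ModelCheckpoint` arguments. A hedged sketch of how such a config might be wired up; the repo's actual experiment builder is not part of this diff, and the precision guard is only there so the snippet also constructs on CPU-only machines.

```python
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint

# Values mirror the training / checkpointing / validation sections above.
checkpoint_cb = ModelCheckpoint(
    every_n_train_steps=5000,
    enable_version_counter=False,  # later checkpoints overwrite previous ones
)
trainer = pl.Trainer(
    num_nodes=1,
    precision="16-mixed" if torch.cuda.is_available() else "32-true",
    max_epochs=1000,
    max_steps=-1,
    accumulate_grad_batches=1,
    gradient_clip_val=0,
    val_check_interval=2000,   # corresponds to validation.val_every_n_step
    limit_val_batches=None,    # corresponds to validation.limit_batch (null = full validation set)
    callbacks=[checkpoint_cb],
)
# trainer.fit(model, datamodule) would then launch training.
```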
configurations/experiment/exp_video.yaml ADDED
@@ -0,0 +1,31 @@
1
+ defaults:
2
+ - base_pytorch
3
+
4
+ tasks: [training]
5
+
6
+ training:
7
+ lr: 2e-5
8
+ precision: 16-mixed
9
+ batch_size: 4
10
+ max_epochs: -1
11
+ max_steps: 2000005
12
+ checkpointing:
13
+ every_n_train_steps: 2500
14
+ optim:
15
+ gradient_clip_val: 1.0
16
+
17
+ validation:
18
+ val_every_n_step: 2500
19
+ val_every_n_epoch: null
20
+ batch_size: 4
21
+ limit_batch: 1
22
+
23
+ test:
24
+ limit_batch: 1
25
+ batch_size: 1
26
+
27
+ logging:
28
+ metrics:
29
+ # - fvd
30
+ # - fid
31
+ # - lpips
configurations/huggingface.yaml ADDED
@@ -0,0 +1,60 @@
1
+ n_tokens: 3
2
+ pose_cond_dim: 5
3
+ use_plucker: true
4
+ focal_length: 0.35
5
+ customized_validation: true
6
+ memory_condition_length: 8
7
+ log_video: true
8
+ relative_embedding: true
9
+ state_embed_only_on_qk: true
10
+ use_domain_adapter: false
11
+ use_memory_attention: true
12
+ add_timestamp_embedding: true
13
+ use_pose_prediction: true
14
+ require_pose_prediction: true
15
+ is_interactive: true
16
+ diffusion:
17
+ sampling_timesteps: 20
18
+ beta_schedule: sigmoid
19
+ objective: pred_v
20
+ use_fused_snr: True
21
+ cum_snr_decay: 0.96
22
+ clip_noise: 20.
23
+ ddim_sampling_eta: 0.0
24
+ stabilization_level: 15
25
+ schedule_fn_kwargs: {}
26
+ use_snr: False
27
+ use_cum_snr: False
28
+ snr_clip: 5.0
29
+ timesteps: 1000
30
+ # architecture
31
+ architecture:
32
+ network_size: 64
33
+ attn_heads: 4
34
+ attn_dim_head: 64
35
+ dim_mults: [1, 2, 4, 8]
36
+ resolution: ${dataset.resolution}
37
+ attn_resolutions: [16, 32, 64, 128]
38
+ use_init_temporal_attn: True
39
+ use_linear_attn: True
40
+ time_emb_type: rotary
41
+
42
+ weight_decay: 2e-3
43
+ warmup_steps: 10000
44
+ optimizer_beta: [0.9, 0.99]
45
+ action_cond_dim: 25
46
+ n_frames: 8
47
+ frame_skip: 1
48
+ frame_stack: 1
49
+ uncertainty_scale: 1
50
+ guidance_scale: 0.0
51
+ chunk_size: 1 # -1 for full trajectory diffusion, number to specify diffusion chunk size
52
+ scheduling_matrix: full_sequence
53
+ noise_level: random_all
54
+ causal: True
55
+ x_shape: [3, 360, 640]
56
+ context_frames: 1
57
+ diffusion_path: zeqixiao/worldmem_checkpoints/diffusion_only.ckpt
58
+ vae_path: zeqixiao/worldmem_checkpoints/vae_only.ckpt
59
+ pose_predictor_path: zeqixiao/worldmem_checkpoints/pose_prediction_model_only.ckpt
60
+ next_frame_length: 1
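The three checkpoint paths at the bottom look like Hugging Face Hub references in `<repo_id>/<filename>` form. One way such files are typically fetched, sketched with `huggingface_hub`; the split into repo id and filename is an assumption, and the repo's own loading code may resolve these paths differently.

```python
from huggingface_hub import hf_hub_download

# Assumed split of "zeqixiao/worldmem_checkpoints/diffusion_only.ckpt" into repo_id + filename.
diffusion_ckpt = hf_hub_download(
    repo_id="zeqixiao/worldmem_checkpoints",
    filename="diffusion_only.ckpt",
)
vae_ckpt = hf_hub_download(repo_id="zeqixiao/worldmem_checkpoints", filename="vae_only.ckpt")
print(diffusion_ckpt, vae_ckpt)  # local cache paths to the downloaded checkpoints
```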
configurations/training.yaml ADDED
@@ -0,0 +1,16 @@
1
+ # configuration parsing starts here
2
+ defaults:
3
+ - experiment: exp_video # experiment yaml file name in configurations/experiment folder [fixme]
4
+ - dataset: video_minecraft # dataset yaml file name in configurations/dataset folder [fixme]
5
+ - algorithm: df_video_worldmemminecraft # algorithm yaml file name in configurations/algorithm folder [fixme]
6
+ - cluster: null # optional, cluster yaml file name in configurations/cluster folder. Leave null for local compute
7
+
8
+ debug: false # global debug flag will be passed into configuration of experiment, dataset and algorithm
9
+
10
+ wandb:
11
+ entity: xizaoqu # wandb account name / organization name [fixme]
12
+ project: worldmem # wandb project name; if not provided, defaults to root folder name [fixme]
13
+ mode: online # set wandb logging to online, offline or dryrun
14
+
15
+ resume: null # wandb run id to resume logging and loading checkpoint from
16
+ load: null # wandb run id containing checkpoint or a path to a checkpoint file