Spaces · Running on Zero

Commit 230e1e3 · Parent: 1b3e11b

init
This view is limited to 50 files because it contains too many changes.
- README.md +6 -5
- app.py +131 -4
- imaginaire/.DS_Store +0 -0
- imaginaire/__init__.py +14 -0
- imaginaire/__pycache__/__init__.cpython-310.pyc +0 -0
- imaginaire/__pycache__/__init__.cpython-39.pyc +0 -0
- imaginaire/callbacks/__init__.py +14 -0
- imaginaire/callbacks/every_n.py +84 -0
- imaginaire/callbacks/manual_gc.py +49 -0
- imaginaire/config.py +410 -0
- imaginaire/lazy_config/__init__.py +73 -0
- imaginaire/lazy_config/__pycache__/__init__.cpython-310.pyc +0 -0
- imaginaire/lazy_config/__pycache__/file_io.cpython-310.pyc +0 -0
- imaginaire/lazy_config/__pycache__/instantiate.cpython-310.pyc +0 -0
- imaginaire/lazy_config/__pycache__/lazy.cpython-310.pyc +0 -0
- imaginaire/lazy_config/__pycache__/omegaconf_patch.cpython-310.pyc +0 -0
- imaginaire/lazy_config/__pycache__/registry.cpython-310.pyc +0 -0
- imaginaire/lazy_config/file_io.py +24 -0
- imaginaire/lazy_config/instantiate.py +119 -0
- imaginaire/lazy_config/lazy.py +442 -0
- imaginaire/lazy_config/omegaconf_patch.py +65 -0
- imaginaire/lazy_config/registry.py +74 -0
- imaginaire/model.py +137 -0
- imaginaire/trainer.py +322 -0
- imaginaire/utils/.DS_Store +0 -0
- imaginaire/utils/__init__.py +14 -0
- imaginaire/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- imaginaire/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- imaginaire/utils/__pycache__/device.cpython-310.pyc +0 -0
- imaginaire/utils/__pycache__/distributed.cpython-310.pyc +0 -0
- imaginaire/utils/__pycache__/io.cpython-310.pyc +0 -0
- imaginaire/utils/__pycache__/io.cpython-39.pyc +0 -0
- imaginaire/utils/__pycache__/log.cpython-310.pyc +0 -0
- imaginaire/utils/__pycache__/log.cpython-39.pyc +0 -0
- imaginaire/utils/__pycache__/misc.cpython-310.pyc +0 -0
- imaginaire/utils/callback.py +518 -0
- imaginaire/utils/checkpointer.py +282 -0
- imaginaire/utils/config_helper.py +201 -0
- imaginaire/utils/device.py +39 -0
- imaginaire/utils/distributed.py +444 -0
- imaginaire/utils/easy_io/__init__.py +14 -0
- imaginaire/utils/easy_io/__pycache__/__init__.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/__pycache__/easy_io.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/__pycache__/file_client.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/backends/__init__.py +28 -0
- imaginaire/utils/easy_io/backends/__pycache__/__init__.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/backends/__pycache__/base_backend.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/backends/__pycache__/http_backend.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/backends/__pycache__/local_backend.cpython-310.pyc +0 -0
- imaginaire/utils/easy_io/backends/__pycache__/registry_utils.cpython-310.pyc +0 -0
README.md CHANGED

```diff
@@ -1,13 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: rCM-Wan
+emoji: 🦀
+colorFrom: pink
+colorTo: red
 sdk: gradio
 sdk_version: 5.49.1
 app_file: app.py
 pinned: false
+license: apache-2.0
 short_description: rCM model for Wan2.1
 ---
 
-
+This demo uses the unofficial rCM models for Wan from [worstcoder/rcm-Wan](https://huggingface.co/worstcoder/rcm-Wan).
```
app.py CHANGED

The previous seven-line stub (essentially just `import gradio as gr`) is replaced by the full demo app (`@@ -1,7 +1,134 @@`):

```python
import spaces
import gradio as gr
import time
import os
import requests
from wan2pt1_t2v_rcm_infer import inference, prepare_models
from huggingface_hub import hf_hub_download
import random
from types import SimpleNamespace
import gc
import torch

import flash_attn
print("flash_attn version: ", flash_attn.__version__)

dit_path_1p3B = hf_hub_download(
    repo_id="worstcoder/rcm-Wan",
    filename="rCM_Wan2.1_T2V_1.3B_480p.pt",
)

dit_path_14B = hf_hub_download(
    repo_id="worstcoder/rcm-Wan",
    filename="rCM_Wan2.1_T2V_14B_480p.pt",
)

vae_path = hf_hub_download(
    repo_id="Wan-AI/Wan2.1-T2V-1.3B",
    filename="Wan2.1_VAE.pth"
)

text_encoder_path = hf_hub_download(
    repo_id="Wan-AI/Wan2.1-T2V-1.3B",
    filename="models_t5_umt5-xxl-enc-bf16.pth"
)

net_1p3B, net_14B, tokenizer, t5_encoder = prepare_models(dit_path_1p3B, dit_path_14B, vae_path, text_encoder_path)
print("Loaded models")
gc.collect()

def random_seed():
    return random.randint(0, 2**32 - 1)

@spaces.GPU(duration=120)
def generate_videos(prompt, model_size, num_samples, aspect_ratio, sigma_max, num_steps, seed):
    if seed is None:
        seed = random.randint(0, 2**32 - 1)

    args = SimpleNamespace(
        prompt=prompt,
        model_size=model_size,
        num_steps=num_steps,
        num_samples=num_samples,
        sigma_max=sigma_max,
        num_frames=77,
        resolution="480p",
        aspect_ratio=aspect_ratio,
        seed=seed,
    )

    with torch.no_grad():
        video_list = inference(args, net_1p3B, net_14B, tokenizer, t5_encoder)

    if aspect_ratio == "16:9":
        return video_list, None
    else:
        return None, video_list

def update_num_samples(model_choice):
    if model_choice == "rCM-Wan2.1-T2V-1.3B-480p":
        options = [1, 2, 3, 4]
    else:
        options = [1, 2, 3]
    return gr.Dropdown(choices=options, value=options[0], label="num_samples")

with gr.Blocks() as demo:
    gr.Markdown("## rCM model for Wan")

    examples = [
        ["A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."],
        ["A close-up shot captures a steaming hot pot brimming with vegetables and dumplings, set on a rustic wooden table. The camera focuses on the bubbling broth as a woman, dressed in a light, patterned blouse, reaches in with chopsticks to lift a tender leaf of cabbage from the simmering mixture. Steam rises around her as she leans back slightly, her warm smile reflecting satisfaction and joy. Her movements are smooth and deliberate, showcasing her comfort and familiarity with the dining process. The background includes a small bowl of dipping sauce and a clay pot, adding to the cozy, communal dining atmosphere."],
        ["A dynamic time-lapse video showing the rapidly moving scenery from the window of a speeding train. The camera captures various elements such as lush green fields, towering trees, quaint countryside houses, and distant mountain ranges passing by quickly. The train window frames the view, adding a sense of speed and motion as the landscape rushes past. The camera remains static but emphasizes the fast-paced movement outside. The overall atmosphere is serene yet exhilarating, capturing the essence of travel and exploration. Medium shot focusing on the train window and the rushing scenery beyond."]
    ]

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Row():
                prompt = gr.Textbox(label="Text prompt", placeholder="Text prompt for videos")
                model_size = gr.Radio(
                    ["rCM-Wan2.1-T2V-1.3B-480p", "rCM-Wan2.1-T2V-14B-480p"],
                    value="rCM-Wan2.1-T2V-1.3B-480p",
                    label="Model"
                )

            with gr.Row():
                num_samples = gr.Dropdown([1, 2, 3, 4], value=1, label="num_samples")
                aspect_ratio = gr.Radio(["16:9", "9:16"], value="16:9", label="aspect_ratio")
                sigma_max = gr.Dropdown([40, 80, 120, 200, 400, 800, 1600], value=80, label="sigma_max")

            with gr.Row():
                num_steps = gr.Slider(1, 4, value=4, step=1, label="num_steps")
                seed = gr.Number(label="seed", value=random_seed(), interactive=True)

            with gr.Row():
                regenerate_btn = gr.Button("New Seed")
                run_btn = gr.Button("Generate Videos")

            with gr.Row():
                gr.Examples(
                    examples,
                    inputs=[prompt],
                    label="Example prompts"
                )

        with gr.Column(scale=1):
            video_16_9 = gr.Video(label="Videos 16:9", width=832)
            video_9_16 = gr.Video(label="Videos 9:16", width=480, visible=False)

    def show_video(aspect):
        if aspect == "16:9":
            return gr.update(visible=True), gr.update(visible=False, value=None)
        else:
            return gr.update(visible=False, value=None), gr.update(visible=True)

    model_size.change(fn=update_num_samples, inputs=model_size, outputs=num_samples)
    aspect_ratio.change(show_video, inputs=aspect_ratio, outputs=[video_16_9, video_9_16])
    regenerate_btn.click(fn=random_seed, outputs=seed)

    run_btn.click(
        fn=generate_videos,
        inputs=[prompt, model_size, num_samples, aspect_ratio, sigma_max, num_steps, seed],
        outputs=[video_16_9, video_9_16],
    )

demo.launch()
```
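The `@spaces.GPU` decorator is what makes this app run on a ZeroGPU Space: a GPU is attached only for the duration of the decorated call. A minimal sketch of the pattern in isolation (`double_on_gpu` is a hypothetical example function, not part of this commit):

```python
# Minimal ZeroGPU sketch; assumes the `spaces` package available on
# Hugging Face ZeroGPU Spaces.
import spaces
import torch

@spaces.GPU(duration=60)  # requested GPU window per call, in seconds
def double_on_gpu(x: torch.Tensor) -> torch.Tensor:
    return (x.cuda() * 2).cpu()  # move to GPU, compute, bring result back
```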
imaginaire/.DS_Store ADDED (binary file, 6.15 kB)
imaginaire/__init__.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
imaginaire/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 130 Bytes)

imaginaire/__pycache__/__init__.cpython-39.pyc ADDED (binary file, 145 Bytes)
imaginaire/callbacks/__init__.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)
```
imaginaire/callbacks/every_n.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

from abc import abstractmethod

import torch

from imaginaire.model import ImaginaireModel
from imaginaire.trainer import ImaginaireTrainer
from imaginaire.utils import distributed, log
from imaginaire.utils.callback import Callback


class EveryN(Callback):
    def __init__(
        self,
        every_n: int | None = None,
        step_size: int = 1,
        barrier_after_run: bool = True,
        run_at_start: bool = False,
    ) -> None:
        """Constructor for `EveryN`.

        Args:
            every_n (int): Frequency with which the callback is run during training.
            step_size (int): Size of the iteration step count. Default 1.
            barrier_after_run (bool): Whether to have a distributed barrier after each execution. Default True, to avoid timeouts.
            run_at_start (bool): Whether to run at the beginning of training. Default False.
        """
        self.every_n = every_n
        if self.every_n == 0:
            log.warning(
                f"every_n is set to 0. Callback {self.__class__.__name__} will be invoked only once at the beginning of training; calls on on_training_step_end will be skipped."
            )

        self.step_size = step_size
        self.barrier_after_run = barrier_after_run
        self.run_at_start = run_at_start

    def on_training_step_end(
        self,
        model: ImaginaireModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor],
        loss: torch.Tensor,
        iteration: int = 0,
    ) -> None:
        # every_n = 0 is a special case in which every_n_impl is called only once at the beginning of training
        if self.every_n != 0:
            trainer = self.trainer
            global_step = iteration // self.step_size
            should_run = (iteration == 1 and self.run_at_start) or (
                global_step % self.every_n == 0
            )  # (self.every_n - 1)
            if should_run:
                log.debug(f"Callback {self.__class__.__name__} fired on train_batch_end step {global_step}")
                self.every_n_impl(trainer, model, data_batch, output_batch, loss, iteration)
                log.debug(f"Callback {self.__class__.__name__} finished on train_batch_end step {global_step}")
                # add necessary barrier to avoid timeout
                if self.barrier_after_run:
                    distributed.barrier()

    @abstractmethod
    def every_n_impl(
        self,
        trainer: ImaginaireTrainer,
        model: ImaginaireModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor],
        loss: torch.Tensor,
        iteration: int,
    ) -> None: ...
```
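`EveryN` is an abstract base: subclasses implement `every_n_impl`, and the base class decides when to fire it. A minimal hypothetical subclass (not part of this commit) might look like:

```python
# Hypothetical EveryN subclass: log the training loss every `every_n` steps.
from imaginaire.callbacks.every_n import EveryN
from imaginaire.utils import log


class LogLossEveryN(EveryN):
    def every_n_impl(self, trainer, model, data_batch, output_batch, loss, iteration):
        # `loss` is the tensor the trainer passes to on_training_step_end
        log.info(f"iteration {iteration}: loss = {loss.item():.4f}")
```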
imaginaire/callbacks/manual_gc.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

import gc

from imaginaire.callbacks.every_n import EveryN
from imaginaire.utils import log


class ManualGarbageCollection(EveryN):
    """
    Disable automatic gc and manually trigger garbage collection every N iterations.

    This is very useful for large-scale training to reduce GPU sync time,
    and can reach a 50% speedup.

    Note that this callback only disables gc in the main process; automatic gc
    remains enabled in subprocesses.

    We only start disabling gc after warm_up iterations, to avoid disabling gc in
    subprocesses such as dataloader workers, which can cause OOM.
    """

    def __init__(self, *args, warm_up: int = 5, **kwargs):
        kwargs["barrier_after_run"] = False
        super().__init__(*args, **kwargs)

        self.counter = 0
        self.warm = warm_up

    def every_n_impl(self, trainer, model, data_batch, output_batch, loss, iteration):
        del trainer, model, data_batch, output_batch, loss
        self.counter += 1
        if self.counter < self.warm:
            return
        if self.counter == self.warm:
            gc.disable()
            log.critical("Garbage collection disabled")

        gc.collect(1)
```
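Callbacks like this are registered through the `LazyDict`/`LazyCall` config machinery added elsewhere in this commit; a hypothetical registration (not part of this commit) could look like:

```python
# Hypothetical sketch: wire ManualGarbageCollection into the trainer callbacks
# so it runs a manual collection every 100 training iterations.
from imaginaire.callbacks.manual_gc import ManualGarbageCollection
from imaginaire.lazy_config import LazyCall as L
from imaginaire.lazy_config import LazyDict

callbacks = LazyDict(
    dict(
        manual_gc=L(ManualGarbageCollection)(every_n=100),
    )
)
```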
imaginaire/config.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

"""Training config system for Imaginaire4"""

from __future__ import annotations

import os
from typing import Any, TypeVar

import attrs
import torch
import torch.utils.data

from imaginaire.model import ImaginaireModel

try:
    from megatron.core import ModelParallelConfig

    USE_MEGATRON = True
except ImportError:
    USE_MEGATRON = False
    print("Megatron-core is not installed.")

import builtins

from imaginaire.lazy_config import LazyCall as L
from imaginaire.lazy_config import LazyDict
from imaginaire.utils import callback, distributed
from imaginaire.utils.misc import Color

T = TypeVar("T")


def _is_attrs_instance(obj: object) -> bool:
    """
    Helper function to check if an object is an instance of an attrs-defined class.

    Args:
        obj: The object to check.

    Returns:
        bool: True if the object is an instance of an attrs-defined class, False otherwise.
    """
    return hasattr(obj, "__attrs_attrs__")


def make_freezable(cls: T) -> T:
    """
    A decorator that adds the capability to freeze instances of an attrs-defined class.

    NOTE: This requires the wrapped attrs class to be defined with attrs.define(slots=False) because we need
    to hack on a "_is_frozen" attribute.

    This decorator enhances an attrs-defined class with the ability to be "frozen" at runtime.
    Once an instance is frozen, its attributes cannot be changed. It also recursively freezes
    any attrs-defined objects that are attributes of the class.

    Usage:
        @make_freezable
        @attrs.define(slots=False)
        class MyClass:
            attribute1: int
            attribute2: str

        obj = MyClass(1, 'a')
        obj.freeze()  # Freeze the instance
        obj.attribute1 = 2  # Raises AttributeError

    Args:
        cls: The class to be decorated.

    Returns:
        The decorated class with added freezing capability.
    """

    if not hasattr(cls, "__dict__"):
        raise TypeError(
            "make_freezable cannot be used with classes that do not define __dict__. Make sure that the wrapped "
            "class was defined with `@attrs.define(slots=False)`"
        )

    original_setattr = cls.__setattr__

    def setattr_override(self, key, value) -> None:
        """
        Override __setattr__ to allow modifications during initialization
        and prevent modifications once the instance is frozen.
        """
        if hasattr(self, "_is_frozen") and self._is_frozen and key != "_is_frozen":
            raise AttributeError("Cannot modify frozen instance")
        original_setattr(self, key, value)  # type: ignore

    cls.__setattr__ = setattr_override  # type: ignore

    def freeze(self: object) -> None:
        """
        Freeze the instance and all its attrs-defined attributes.
        """
        for _, value in attrs.asdict(self, recurse=False).items():
            if _is_attrs_instance(value) and hasattr(value, "freeze"):
                value.freeze()
        self._is_frozen = True  # type: ignore

    cls.freeze = freeze  # type: ignore

    return cls


def _pretty_print_attrs_instance(obj: object, indent: int = 0, use_color: bool = False) -> str:
    """
    Recursively pretty prints attrs objects with color.
    """

    assert attrs.has(obj.__class__)

    lines: list[str] = []
    for attribute in attrs.fields(obj.__class__):
        value = getattr(obj, attribute.name)
        if attrs.has(value.__class__):
            if use_color:
                lines.append(" " * indent + Color.cyan("* ") + Color.green(attribute.name) + ":")
            else:
                lines.append(" " * indent + "* " + attribute.name + ":")
            lines.append(_pretty_print_attrs_instance(value, indent + 1, use_color))
        else:
            if use_color:
                lines.append(
                    " " * indent + Color.cyan("* ") + Color.green(attribute.name) + ": " + Color.yellow(value)
                )
            else:
                lines.append(" " * indent + "* " + attribute.name + ": " + str(value))
    return "\n".join(lines)


def pretty_print_overrides(overrides: list[str] | None = None, use_color: bool = False) -> str:
    """
    Pretty prints overrides.
    """

    lines: list[str] = []
    lines.append(Color.cyan("* ") + Color.green("overrides") + ": ")
    for override in overrides:
        if override == "--":
            continue
        if override.startswith("~"):
            attribute_name = override[1:]
            attribute_value = None
        else:
            attribute_name, attribute_value = override.split("=")
        if use_color:
            lines.append(" " + Color.cyan("* ") + Color.green(attribute_name) + ": " + Color.yellow(attribute_value))
        else:
            lines.append(" " + "* " + attribute_name + ": " + str(attribute_value))

    return "\n".join(lines)


@make_freezable
@attrs.define(slots=False)  # slots=False is required for make_freezable. See the make_freezable notes for more info.
class ObjectStoreConfig:
    # Whether the file I/O is from object store instead of local disk.
    enabled: bool = False
    # Path to the object store credentials file.
    credentials: str = ""
    # Object store bucket to read from / write to the objects.
    bucket: str = ""


@make_freezable
@attrs.define(slots=False)
class JobConfig:
    # Project name.
    project: str = ""
    # Experiment name.
    group: str = ""
    # Run/job name.
    name: str = ""

    @property
    def path(self) -> str:
        return f"{self.project}/{self.group}/{self.name}"

    @property
    def path_local(self) -> str:
        local_root = os.environ.get("IMAGINAIRE_OUTPUT_ROOT", "checkpoints")
        return f"{local_root}/{self.path}"


@make_freezable
@attrs.define(slots=False)
class EMAConfig:
    # Enable tracking a set of exponential moving average (EMA) weights.
    enabled: bool = False
    # EMA decay rate.
    beta: float = 0.9999
    # Enable removing "_orig_mod-" from buffer names that is added by torch.compile
    torch_compile_buffer_renaming: bool = False


@make_freezable
@attrs.define(slots=False)
class PowerEMAConfig:
    # Enable tracking a set of exponential moving average (EMA) weights.
    enabled: bool = False
    # EDM2 paper EMA decay rate.
    s: float = 0.1
    # Enable removing "_orig_mod-" from buffer names that is added by torch.compile
    torch_compile_buffer_renaming: bool = False


@make_freezable
@attrs.define(slots=False)
class DDPConfig:
    # Traverse the computation graph to find parameters that don't receive gradients.
    find_unused_parameters: bool = False
    # Set to True if the computation graph does not change during the whole training loop.
    static_graph: bool = True
    # Set to True if we want to synchronize buffers. Set to False if the sync is going to be handled elsewhere.
    broadcast_buffers: bool = True


@make_freezable
@attrs.define(slots=False)
class CuDNNConfig:
    # Set to True for better reproducibility of the results (only using deterministic cudnn functions).
    deterministic: bool = False
    # If set to True, cudnn will benchmark several algorithms and pick the fastest one.
    benchmark: bool = True


@make_freezable
@attrs.define(slots=False)
class JITConfig:
    # Enable exporting a JIT compiled model.
    enabled: bool = False
    # Input tensor shape for the example input.
    input_shape: list[int] | None = None
    # Device to compile onto.
    device: str = "cuda"
    # Data type to compile onto.
    dtype: str = "bfloat16"
    # Strict mode for PyTorch JIT.
    strict: bool = True


@make_freezable
@attrs.define(slots=False)
class CheckpointConfig:
    # possible checkpoint class
    type: dict | None = None
    # for dcp, whether to use async mode
    dcp_async_mode_enabled: bool = False
    # Save the checkpoint every N iterations.
    save_iter: int = 999999999
    # Path of model weights to resume the checkpoint from.
    load_path: str = ""
    # Whether to load the training states (optimizer/scheduler/grad-scaler) from the checkpoint path.
    load_training_state: bool = False
    # Whether to load the scheduler state only from the checkpoint path. If load_training_state is True, this will be ignored.
    only_load_scheduler_state: bool = False
    # Load state_dict to the models in strict mode.
    strict_resume: bool = True
    # Configs for JIT compiling EMA model.
    jit: JITConfig = attrs.field(factory=JITConfig)
    # Print detailed information during checkpoint saving/loading.
    verbose: bool = True
    # keys not to resume from the checkpoint, choices: ["model", "optim", "scheduler", "trainer"]
    keys_not_to_resume: list[str] = []  # noqa: RUF008
    # Whether to use the local filesystem for broadcasting checkpoint data (used for Tensor Parallel Checkpointer).
    broadcast_via_filesystem: bool = False
    load_ema_to_reg: bool = False
    # In the dcp planner, skip the weight shape check and load weights into the model even if the weight shape differs
    dcp_allow_mismatched_size: bool = False


@make_freezable
@attrs.define(slots=False)
class NVTXConfig:
    """Config for NVTX ranges used in the main training loop.

    See tutorials/nanogpt for more details on how to integrate profiling into your model."""

    # Enable the NVTX ranges.
    enabled: bool = False
    # Synchronize everything in each NVTX range.
    cuda_synchronize: bool = False


@make_freezable
@attrs.define(slots=False)
class Profiling:
    enable_profiling: bool = False
    enable_memory_snapshot: bool = False
    profile_freq: int = 1
    first_n_rank: int = 8  # -1 means all ranks, n means the first n ranks dump profiling info
    record_shape: bool = True
    profile_memory: bool = True
    with_stack: bool = True
    with_modules: bool = True


@make_freezable
@attrs.define(slots=False)
class TrainerConfig:
    from imaginaire.trainer import ImaginaireTrainer

    type: builtins.type[ImaginaireTrainer] = ImaginaireTrainer
    # Set the callback class.
    # Defaults to the callbacks below.
    callbacks: LazyDict[dict[str, callback.Callback]] = LazyDict(  # noqa: RUF009
        dict(
            ema=L(callback.EMAModelCallback)(),
            progress_bar=L(callback.ProgressBarCallback)(),
        )
    )
    # distributed parallelism strategy
    distributed_parallelism: str = "ddp"
    # Distributed data parallel configs.
    ddp: DDPConfig = attrs.field(factory=DDPConfig)
    # cuDNN configs.
    cudnn: CuDNNConfig = attrs.field(factory=CuDNNConfig)
    # Set the random seed.
    seed: int = 0
    # Gradient scaler arguments (for torch.amp.GradScaler).
    grad_scaler_args: dict = attrs.field(factory=lambda: dict(enabled=False))
    # Maximum number of iterations to train the model.
    max_iter: int = 999999999
    # Maximum number of iterations to validate the model. If None, validate on the entire dataset.
    max_val_iter: int | None = None
    # How often we log the training stats.
    logging_iter: int = 100
    # Whether we want to run the validation routines.
    run_validation: bool = True
    # How often we evaluate on the validation set.
    validation_iter: int = 999999999
    # Kill the process after N seconds since the last iteration (usually means dead job).
    timeout_period: int = 999999999
    # Tensor memory organization format.
    memory_format: torch.memory_format = torch.preserve_format
    # Gradient accumulation (update step every N iterations).
    grad_accum_iter: int = 1
    # Profiling config
    profiling: Profiling = attrs.field(factory=Profiling)


@make_freezable
@attrs.define(slots=False)
class Config:
    """Config for an imaginaire4 job.

    See /README.md/Configuration System for more info.
    """

    # Model configs.
    model: LazyDict[ImaginaireModel]
    # Optimizer configs.
    optimizer: LazyDict[torch.optim.Optimizer]
    # Scheduler configs.
    scheduler: LazyDict[torch.optim.lr_scheduler.LRScheduler]
    # Training data configs.
    dataloader_train: LazyDict[torch.utils.data.DataLoader]
    # Validation data configs.
    dataloader_val: LazyDict[torch.utils.data.DataLoader]

    # Training job configs.
    job: JobConfig = attrs.field(factory=JobConfig)

    # Trainer configs.
    trainer: TrainerConfig = attrs.field(factory=TrainerConfig)

    if USE_MEGATRON:
        # Megatron-Core configs
        model_parallel: ModelParallelConfig = attrs.field(factory=ModelParallelConfig)
    else:
        model_parallel: None = None

    # Checkpointer configs.
    checkpoint: CheckpointConfig = attrs.field(factory=CheckpointConfig)

    def pretty_print(self, use_color: bool = False) -> str:
        return _pretty_print_attrs_instance(self, 0, use_color)

    def to_dict(self) -> dict[str, Any]:
        return attrs.asdict(self)

    def validate(self) -> None:
        """Validate that the config has all required fields."""

        # broadcast job.name across all ranks to make sure it is consistent;
        # otherwise, unaligned job names lead to unaligned checkpoint paths
        job_name_tensor = torch.ByteTensor(bytearray(self.job.name, "utf-8")).cuda()
        distributed.broadcast(job_name_tensor, 0)
        self.job.name = job_name_tensor.cpu().numpy().tobytes().decode("utf-8")

        assert self.job.project != ""
        assert self.job.group != ""
        assert self.job.name != ""
```
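As a quick illustration of how these attrs configs behave (hypothetical usage, mirroring the `make_freezable` docstring; not part of this commit):

```python
# Hypothetical usage sketch for JobConfig and the freeze() method
# added by @make_freezable.
from imaginaire.config import JobConfig

job = JobConfig(project="demo", group="t2v", name="run0")
print(job.path)    # -> demo/t2v/run0
job.freeze()       # recursively freezes the instance
job.name = "run1"  # would raise AttributeError: Cannot modify frozen instance
```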
imaginaire/lazy_config/__init__.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

import os

from omegaconf import OmegaConf

from imaginaire.lazy_config.instantiate import instantiate
from imaginaire.lazy_config.lazy import LazyCall, LazyConfig, LazyDict
from imaginaire.lazy_config.omegaconf_patch import to_object

OmegaConf.to_object = to_object

PLACEHOLDER = None

__all__ = ["PLACEHOLDER", "LazyCall", "LazyConfig", "LazyDict", "instantiate"]


DOC_BUILDING = os.getenv("_DOC_BUILDING", False)  # set in docs/conf.py


def fixup_module_metadata(module_name, namespace, keys=None):
    """
    Fix the __qualname__ of module members to be their exported api name, so
    when they are referenced in docs, sphinx can find them. Reference:
    https://github.com/python-trio/trio/blob/6754c74eacfad9cc5c92d5c24727a2f3b620624e/trio/_util.py#L216-L241
    """
    if not DOC_BUILDING:
        return
    seen_ids = set()

    def fix_one(qualname, name, obj):
        # avoid infinite recursion (relevant when using
        # typing.Generic, for example)
        if id(obj) in seen_ids:
            return
        seen_ids.add(id(obj))

        mod = getattr(obj, "__module__", None)
        if mod is not None and (mod.startswith(module_name) or mod.startswith("fvcore.")):
            obj.__module__ = module_name
            # Modules, unlike everything else in Python, put fully-qualified
            # names into their __name__ attribute. We check for "." to avoid
            # rewriting these.
            if hasattr(obj, "__name__") and "." not in obj.__name__:
                obj.__name__ = name
                obj.__qualname__ = qualname
            if isinstance(obj, type):
                for attr_name, attr_value in obj.__dict__.items():
                    fix_one(objname + "." + attr_name, attr_name, attr_value)

    if keys is None:
        keys = namespace.keys()
    for objname in keys:
        if not objname.startswith("_"):
            obj = namespace[objname]
            fix_one(objname, objname, obj)


fixup_module_metadata(__name__, globals(), __all__)
del fixup_module_metadata
```
imaginaire/lazy_config/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 1.67 kB)

imaginaire/lazy_config/__pycache__/file_io.cpython-310.pyc ADDED (binary file, 387 Bytes)

imaginaire/lazy_config/__pycache__/instantiate.cpython-310.pyc ADDED (binary file, 3.22 kB)

imaginaire/lazy_config/__pycache__/lazy.cpython-310.pyc ADDED (binary file, 15 kB)

imaginaire/lazy_config/__pycache__/omegaconf_patch.cpython-310.pyc ADDED (binary file, 2.13 kB)

imaginaire/lazy_config/__pycache__/registry.cpython-310.pyc ADDED (binary file, 1.33 kB)
imaginaire/lazy_config/file_io.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler
from iopath.common.file_io import PathManager as PathManagerBase

__all__ = ["PathHandler", "PathManager"]


PathManager = PathManagerBase()
PathManager.register_handler(HTTPURLHandler())
PathManager.register_handler(OneDrivePathHandler())
```
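With the HTTP and OneDrive handlers registered, `PathManager` transparently handles both local paths and URLs. A hypothetical usage sketch (not part of this commit; the URL is a placeholder):

```python
# Hypothetical sketch: PathManager.open works for local files and, via the
# registered HTTPURLHandler, for http(s) URLs (downloaded and cached locally).
from imaginaire.lazy_config.file_io import PathManager

with PathManager.open("https://example.com/some_config.py", "r") as f:
    content = f.read()
```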
imaginaire/lazy_config/instantiate.py ADDED

```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# (standard Apache-2.0 license header, identical to imaginaire/__init__.py)

import collections.abc as abc
import dataclasses
from typing import Any

import attrs

from imaginaire.lazy_config.registry import _convert_target_to_string, locate
from imaginaire.utils import log

__all__ = ["dump_dataclass", "instantiate"]


def is_dataclass_or_attrs(target):
    return dataclasses.is_dataclass(target) or attrs.has(target)


def dump_dataclass(obj: Any):
    """
    Dump a dataclass recursively into a dict that can be later instantiated.

    Args:
        obj: a dataclass object

    Returns:
        dict
    """
    assert dataclasses.is_dataclass(obj) and not isinstance(obj, type), (
        "dump_dataclass() requires an instance of a dataclass."
    )
    ret = {"_target_": _convert_target_to_string(type(obj))}
    for f in dataclasses.fields(obj):
        v = getattr(obj, f.name)
        if dataclasses.is_dataclass(v):
            v = dump_dataclass(v)
        if isinstance(v, (list, tuple)):
            v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v]
        ret[f.name] = v
    return ret


def instantiate(cfg, *args, **kwargs):
    """
    Recursively instantiate objects defined in dictionaries by
    "_target_" and arguments.

    Args:
        cfg: a dict-like object with "_target_" that defines the caller, and
            other keys that define the arguments
        args: Optional positional parameters pass-through.
        kwargs: Optional named parameters pass-through.

    Returns:
        object instantiated by cfg
    """
    from omegaconf import DictConfig, ListConfig, OmegaConf

    if isinstance(cfg, ListConfig):
        lst = [instantiate(x) for x in cfg]
        return ListConfig(lst, flags={"allow_objects": True})
    if isinstance(cfg, list):
        # Specialize for list, because many classes take
        # list[objects] as arguments, such as ResNet, DatasetMapper
        return [instantiate(x) for x in cfg]

    # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config),
    # instantiate it to the actual dataclass.
    if isinstance(cfg, DictConfig) and is_dataclass_or_attrs(cfg._metadata.object_type):
        return OmegaConf.to_object(cfg)

    if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
        # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
        # but faster: https://github.com/facebookresearch/hydra/issues/1200
        is_recursive = getattr(cfg, "_recursive_", True)
        if is_recursive:
            cfg = {k: instantiate(v) for k, v in cfg.items()}
        else:
            cfg = {k: v for k, v in cfg.items()}
        # pop the _recursive_ key to avoid passing it as a parameter
        if "_recursive_" in cfg:
            cfg.pop("_recursive_")
        cls = cfg.pop("_target_")
        cls = instantiate(cls)

        if isinstance(cls, str):
            cls_name = cls
            cls = locate(cls_name)
            assert cls is not None, cls_name
        else:
            try:
                cls_name = cls.__module__ + "." + cls.__qualname__
            except Exception:
                # target could be anything, so the above could fail
                cls_name = str(cls)
        assert callable(cls), f"_target_ {cls} does not define a callable object"
        try:
            # override config with kwargs
            instantiate_kwargs = {}
            instantiate_kwargs.update(cfg)
            instantiate_kwargs.update(kwargs)
            return cls(*args, **instantiate_kwargs)
        except TypeError:
            log.error(f"Error when instantiating {cls_name}!")
            raise
    return cfg  # return as-is if we don't know what to do
```
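Putting `LazyCall` and `instantiate` together, the round trip looks like this (hypothetical usage; `nn.Conv2d` is just an example target, not part of this commit):

```python
# Hypothetical round-trip sketch: describe a call lazily, edit it, then build it.
import torch.nn as nn

from imaginaire.lazy_config import LazyCall as L
from imaginaire.lazy_config import instantiate

layer_cfg = L(nn.Conv2d)(in_channels=32, out_channels=32, kernel_size=3)
layer_cfg.out_channels = 64     # the config is still editable at this point
layer = instantiate(layer_cfg)  # only now is the actual nn.Conv2d constructed
assert isinstance(layer, nn.Conv2d) and layer.out_channels == 64
```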
imaginaire/lazy_config/lazy.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import builtins
import collections.abc as abc
import importlib
import inspect
import logging
import os
import pickle
import uuid
from collections import OrderedDict
from contextlib import contextmanager
from copy import deepcopy
from dataclasses import is_dataclass
from typing import TYPE_CHECKING, Any, Generic, TypeAlias, TypeVar, cast

import attrs
import yaml
from omegaconf import DictConfig, ListConfig, OmegaConf

from imaginaire.utils import log

try:
    import dill as dill_pickle
except ImportError:
    dill_pickle = None

try:
    import cloudpickle
except ImportError:
    cloudpickle = None

from imaginaire.lazy_config.file_io import PathManager
from imaginaire.lazy_config.registry import _convert_target_to_string

__all__ = ["LazyCall", "LazyConfig", "LazyDict"]

T = TypeVar("T")


def sort_dict(d: dict[str, Any]) -> OrderedDict[str, Any]:
    return OrderedDict(sorted(d.items(), key=lambda x: x[0]))


def dict_representer(dumper: yaml.Dumper, data: OrderedDict[str, Any]) -> yaml.nodes.MappingNode:
    return dumper.represent_mapping("tag:yaml.org,2002:map", data.items())


def sort_recursive(obj: dict[str, Any] | list[Any] | Any) -> OrderedDict[str, Any] | list[Any] | Any:
    if isinstance(obj, dict):
        return sort_dict({k: sort_recursive(v) for k, v in obj.items()})
    elif isinstance(obj, list):
        return [sort_recursive(item) for item in obj]
    return obj


yaml.add_representer(OrderedDict, dict_representer)

OmegaConf.register_new_resolver("add", lambda *vals: sum(vals))
OmegaConf.register_new_resolver("subtract", lambda *vals: vals[0] - sum(vals[1:]))


def get_default_params(cls_or_func):
    if callable(cls_or_func):
        # inspect signature for function
        signature = inspect.signature(cls_or_func)
    else:
        # inspect signature for class
        signature = inspect.signature(cls_or_func.__init__)
    params = signature.parameters
    default_params = {
        name: param.default for name, param in params.items() if param.default is not inspect.Parameter.empty
    }
    return default_params


if TYPE_CHECKING:
    # Have `LazyDict[T]` behave as `T`, so that attribute access works. Ideally, it
    # would be a subclass of `T`, but this doesn't seem to be possible in the type
    # system yet.
    LazyDict: TypeAlias = T
else:
    LazyDict = DictConfig


class LazyCall(Generic[T]):
    """
    Wrap a callable so that when it's called, the call will not be executed,
    but returns a dict that describes the call.

    LazyCall object has to be called with only keyword arguments. Positional
    arguments are not yet supported.

    Examples:
    ::
        from detectron2.config import instantiate, LazyCall

        layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
        layer_cfg.out_channels = 64  # can edit it afterwards
        layer = instantiate(layer_cfg)
    """

    def __init__(self, target: type[T]):
        if not (callable(target) or isinstance(target, (str, abc.Mapping))):
            raise TypeError(f"target of LazyCall must be a callable or defines a callable! Got {target}")
        self._target = target

    def __call__(self, **kwargs) -> LazyDict[T]:
        if is_dataclass(self._target) or attrs.has(self._target):
            # omegaconf object cannot hold dataclass type
            # https://github.com/omry/omegaconf/issues/784
            target = _convert_target_to_string(self._target)
        else:
            target = self._target
        kwargs["_target_"] = target

        _final_params = get_default_params(self._target)
        _final_params.update(kwargs)

        return cast(LazyDict[T], DictConfig(content=_final_params, flags={"allow_objects": True}))
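
One behavior worth noting in `__call__` above: unlike the detectron2 original it is based on, this variant also bakes the target's default keyword arguments into the resulting config via `get_default_params`. A minimal sketch (values are illustrative, not from this commit):

import torch.nn as nn

from imaginaire.lazy_config.lazy import LazyCall

cfg = LazyCall(nn.Linear)(in_features=8, out_features=4)
# Defaults from nn.Linear's signature are recorded too, so they show up in
# saved YAML and can be overridden like any other field:
assert cfg.bias is True
cfg.bias = False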


def _visit_dict_config(cfg, func):
    """
    Apply func recursively to all DictConfig in cfg.
    """
    if isinstance(cfg, DictConfig):
        func(cfg)
        for v in cfg.values():
            _visit_dict_config(v, func)
    elif isinstance(cfg, ListConfig):
        for v in cfg:
            _visit_dict_config(v, func)


def _validate_py_syntax(filename):
    # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
    with PathManager.open(filename, "r") as f:
        content = f.read()
    try:
        ast.parse(content)
    except SyntaxError as e:
        raise SyntaxError(f"Config file {filename} has syntax error!") from e


def _cast_to_config(obj):
    # if given a dict, return DictConfig instead
    if isinstance(obj, dict):
        return DictConfig(obj, flags={"allow_objects": True})
    return obj


_CFG_PACKAGE_NAME = "detectron2._cfg_loader"
"""
A namespace to put all imported config into.
"""


def _random_package_name(filename):
    # generate a random package name when loading config files
    return _CFG_PACKAGE_NAME + str(uuid.uuid4())[:4] + "." + os.path.basename(filename)


@contextmanager
def _patch_import():
    """
    Enhance relative import statements in config files, so that they:
    1. locate files purely based on relative location, regardless of packages.
       e.g. you can import a file without having __init__
    2. do not cache modules globally; modifications of module states have no side effect
    3. support other storage systems through PathManager, so config files can be in the cloud
    4. imported dicts are turned into omegaconf.DictConfig automatically
    """
    old_import = builtins.__import__

    def find_relative_file(original_file, relative_import_path, level):
        # NOTE: "from . import x" is not handled. Because then it's unclear
        # if such import should produce `x` as a python module or DictConfig.
        # This can be discussed further if needed.
        relative_import_err = """
Relative import of directories is not allowed within config files.
Within a config file, relative import can only import other config files.
""".replace("\n", " ")
        if not len(relative_import_path):
            raise ImportError(relative_import_err)

        cur_file = os.path.dirname(original_file)
        for _ in range(level - 1):
            cur_file = os.path.dirname(cur_file)
        cur_name = relative_import_path.lstrip(".")
        for part in cur_name.split("."):
            cur_file = os.path.join(cur_file, part)
        if not cur_file.endswith(".py"):
            cur_file += ".py"
        if not PathManager.isfile(cur_file):
            cur_file_no_suffix = cur_file[: -len(".py")]
            if PathManager.isdir(cur_file_no_suffix):
                raise ImportError(f"Cannot import from {cur_file_no_suffix}." + relative_import_err)
            else:
                raise ImportError(
                    f"Cannot import name {relative_import_path} from {original_file}: {cur_file} does not exist."
                )
        return cur_file

    def new_import(name, globals=None, locals=None, fromlist=(), level=0):
        if (
            # Only deal with relative imports inside config files
            level != 0 and globals is not None and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME)
        ):
            cur_file = find_relative_file(globals["__file__"], name, level)
            _validate_py_syntax(cur_file)
            spec = importlib.machinery.ModuleSpec(_random_package_name(cur_file), None, origin=cur_file)
            module = importlib.util.module_from_spec(spec)
            module.__file__ = cur_file
            with PathManager.open(cur_file) as f:
                content = f.read()
            exec(compile(content, cur_file, "exec"), module.__dict__)
            for name in fromlist:  # turn imported dict into DictConfig automatically
                val = _cast_to_config(module.__dict__[name])
                module.__dict__[name] = val
            return module
        return old_import(name, globals, locals, fromlist=fromlist, level=level)

    builtins.__import__ = new_import
    yield new_import
    builtins.__import__ = old_import


class LazyConfig:
    """
    Provide methods to save, load, and override an omegaconf config object
    which may contain definitions of lazily-constructed objects.
    """

    @staticmethod
    def load_rel(filename: str, keys: None | str | tuple[str, ...] = None):
        """
        Similar to :meth:`load()`, but load path relative to the caller's
        source file.

        This has the same functionality as a relative import, except that this method
        accepts filename as a string, so more characters are allowed in the filename.
        """
        caller_frame = inspect.stack()[1]
        caller_fname = caller_frame[0].f_code.co_filename
        assert caller_fname != "<string>", "load_rel Unable to find caller"
        caller_dir = os.path.dirname(caller_fname)
        filename = os.path.join(caller_dir, filename)
        return LazyConfig.load(filename, keys)

    @staticmethod
    def load(filename: str, keys: None | str | tuple[str, ...] = None):
        """
        Load a config file.

        Args:
            filename: absolute path or relative path w.r.t. the current working directory
            keys: keys to load and return. If not given, return all keys
                (whose values are config objects) in a dict.
        """
        has_keys = keys is not None
        filename = filename.replace("/./", "/")  # redundant
        if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]:
            raise ValueError(f"Config file {filename} has to be a python or yaml file.")
        if filename.endswith(".py"):
            _validate_py_syntax(filename)

            with _patch_import():
                # Record the filename
                module_namespace = {
                    "__file__": filename,
                    "__package__": _random_package_name(filename),
                }
                with PathManager.open(filename) as f:
                    content = f.read()
                # Compile first with filename to:
                # 1. make filename appear in stacktrace
                # 2. make load_rel able to find its parent's (possibly remote) location
                exec(compile(content, filename, "exec"), module_namespace)

            ret = module_namespace
        else:
            with PathManager.open(filename) as f:
                obj = yaml.unsafe_load(f)
            ret = OmegaConf.create(obj, flags={"allow_objects": True})

        if has_keys:
            if isinstance(keys, str):
                return _cast_to_config(ret[keys])
            else:
                return tuple(_cast_to_config(ret[a]) for a in keys)
        else:
            if filename.endswith(".py"):
                # when not specified, only load those that are config objects
                ret = DictConfig(
                    {
                        name: _cast_to_config(value)
                        for name, value in ret.items()
                        if isinstance(value, (DictConfig, ListConfig, dict)) and not name.startswith("_")
                    },
                    flags={"allow_objects": True},
                )
            return ret
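
A hedged sketch of how `load()` is typically used ("my_config.py" and the "model" key are hypothetical names, not part of this commit). A Python config file simply defines config objects at module level:

#   # my_config.py
#   import torch.nn as nn
#   from imaginaire.lazy_config.lazy import LazyCall
#   model = LazyCall(nn.Linear)(in_features=8, out_features=4)

from imaginaire.lazy_config.lazy import LazyConfig

cfg = LazyConfig.load("my_config.py")                      # all public config objects
model_cfg = LazyConfig.load("my_config.py", keys="model")  # a single key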

    @staticmethod
    def save_pkl(cfg, filename: str) -> str:
        """
        Saves a Config object to a file using pickle serialization. This method is typically used
        when the configuration object contains complex objects, such as lambdas, that are not supported by
        simpler serialization methods like YAML. The function attempts to create a deep copy of the configuration
        object before serialization to ensure that the original object remains unmodified.

        Args:
            cfg: A Config object to be serialized and saved.
            filename: The path and name of the file where the configuration should be saved. The function
                assumes the file extension indicates a pickle format (e.g., .pkl).

        Returns:
            str: The filename to which the configuration was saved. This can be used to verify the file location
                or log the outcome.

        Notes:
            - The function logs a warning if the configuration is successfully saved using pickle.
            - If saving fails, an error is logged with the exception details.
        """
        try:
            cfg = deepcopy(cfg)
        except Exception:
            pass

        try:
            with PathManager.open(filename, "wb") as f:
                pickle.dump(cfg, f)
            log.warning(f"Config is saved using pickle at {filename}.")
        except Exception as e:
            log.error(f"Failed to save config to {filename}: {e}. Trying dill or cloudpickle instead")
            if dill_pickle:
                try:
                    with PathManager.open(filename, "wb") as f:
                        pickle.dump(dill_pickle.dumps(cfg, recurse=True), f)
                    log.warning(f"Config is saved using dill at {filename}.")
                except Exception as e:
                    log.error(f"Failed to save config to {filename}: {e}.")
                    if cloudpickle:
                        try:
                            with PathManager.open(filename, "wb") as f:
                                pickle.dump(cloudpickle.dumps(cfg), f)
                            log.warning(f"Config is saved using cloudpickle at {filename}.")
                        except Exception as e:
                            log.error(f"Failed to save config to {filename}: {e}.")
                    else:
                        log.error("cloudpickle is not available. Cannot save the config.")
                        raise e

        return filename

    @staticmethod
    def save_yaml(cfg, filename: str) -> str:
        """
        Saves a Config object to a file using YAML serialization. This method is beneficial when the configuration object's content needs to be human-readable and easily editable. YAML is suitable for configurations that do not contain complex types like lambdas, which must be handled differently. The function converts unserializable items to strings before saving to ensure compatibility with YAML serialization.

        Args:
            cfg: A Config object to be serialized and saved. It handles both DictConfig and ListConfig types.
            filename: The path and name of the file where the configuration should be saved. The function does not require a specific file extension but typically uses '.yaml'.

        Returns:
            str: The filename to which the configuration was saved. This can be used to verify the file location or log the outcome.

        Notes:
            - The function logs a warning if the configuration is successfully saved using YAML.
            - If saving fails, an error is logged with the exception details.
        """
        logger = logging.getLogger(__name__)
        try:
            cfg = deepcopy(cfg)
        except Exception:
            pass

        # Define a function to check if an item is serializable to YAML
        def is_serializable(item):
            try:
                OmegaConf.to_yaml(item)
                return True
            except Exception as e:
                return False

        # Function to convert unserializable items to strings
        def serialize_config(config):
            if isinstance(config, DictConfig):
                for key, value in config.items():
                    if isinstance(value, (DictConfig, ListConfig)):
                        try:
                            if "_target_" in value:
                                default_params = get_default_params(value["_target_"])
                                for default_key, default_v in default_params.items():
                                    if default_key not in value:
                                        value[default_key] = default_v
                        except Exception as e:
                            log.error(f"Failed to add default argument values: {e}")

                        serialize_config(value)
                    else:
                        if not is_serializable(value) and value is not None:
                            config[key] = str(value)
            elif isinstance(config, ListConfig):
                for i, item in enumerate(config):
                    if isinstance(item, (DictConfig, ListConfig)):
                        serialize_config(item)
                    else:
                        if not is_serializable(item) and item is not None:
                            config[i] = str(item)
            else:
                raise NotImplementedError("Input config must be a DictConfig or ListConfig.")
            return config

        # Convert Config object to a DictConfig object.
        config_dict = attrs.asdict(cfg)
        config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})

        # Serialize the DictConfig object by converting non-serializable objects to strings.
        config_omegaconf = serialize_config(config_omegaconf)

        config_dict: dict[str, Any] = OmegaConf.to_container(config_omegaconf, resolve=True)
        sorted_config: OrderedDict[str, Any] = sort_recursive(config_dict)
        with open(filename, "w") as f:
            yaml.dump(sorted_config, f, default_flow_style=False)
        log.warning(f"Config is saved using omegaconf at {filename}.")
        return filename
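
Sketch only of the two save paths: `config` stands for the attrs-based experiment Config that the trainer passes in (see imaginaire/trainer.py below), and the paths are illustrative. Note that save_yaml goes through attrs.asdict(), so it expects an attrs-based Config, while save_pkl accepts anything picklable and falls back to dill and then cloudpickle for lambdas and other hard-to-pickle fields.

from imaginaire.lazy_config.lazy import LazyConfig

LazyConfig.save_pkl(config, "output/config.pkl")    # pickle -> dill -> cloudpickle fallback
LazyConfig.save_yaml(config, "output/config.yaml")  # attrs.asdict + YAML; unserializable values become strings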

imaginaire/lazy_config/omegaconf_patch.py ADDED
@@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

from omegaconf import OmegaConf
from omegaconf.base import DictKeyType, SCMode
from omegaconf.dictconfig import DictConfig  # pragma: no cover


def to_object(cfg: Any) -> dict[DictKeyType, Any] | list[Any] | None | str | Any:
    """
    Converts an OmegaConf configuration object to a native Python container (dict or list), unless
    the configuration is specifically created by LazyCall, in which case the original configuration
    is returned directly.

    This function serves as a modification of the original `to_object` method from OmegaConf,
    preventing DictConfig objects created by LazyCall from being automatically converted to Python
    dictionaries. This ensures that configurations meant to be lazily evaluated retain their intended
    structure and behavior.

    Differences from OmegaConf's original `to_object`:
    - Adds a check at the beginning to return the configuration unchanged if it is created by LazyCall.

    Reference:
    - Original OmegaConf `to_object` method: https://github.com/omry/omegaconf/blob/master/omegaconf/omegaconf.py#L595

    Args:
        cfg (Any): The OmegaConf configuration object to convert.

    Returns:
        Union[Dict[DictKeyType, Any], List[Any], None, str, Any]: The converted Python container if
        `cfg` is not a LazyCall created configuration, otherwise the unchanged `cfg`.

    Examples:
        >>> cfg = DictConfig({"key": "value", "_target_": "Model"})
        >>> to_object(cfg)
        DictConfig({"key": "value", "_target_": "Model"})

        >>> cfg = DictConfig({"list": [1, 2, 3]})
        >>> to_object(cfg)
        {'list': [1, 2, 3]}
    """
    if isinstance(cfg, DictConfig) and "_target_" in cfg.keys():
        return cfg

    return OmegaConf.to_container(
        cfg=cfg,
        resolve=True,
        throw_on_missing=True,
        enum_to_str=False,
        structured_config_mode=SCMode.INSTANTIATE,
    )
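
A minimal sketch of the patched behavior described in the docstring above: configs carrying "_target_" pass through unchanged, everything else is converted to native containers.

from omegaconf import DictConfig

from imaginaire.lazy_config.omegaconf_patch import to_object

lazy_cfg = DictConfig({"_target_": "torch.nn.Linear", "in_features": 8})
plain_cfg = DictConfig({"list": [1, 2, 3]})

assert isinstance(to_object(lazy_cfg), DictConfig)  # returned as-is
assert to_object(plain_cfg) == {"list": [1, 2, 3]}  # converted to a dict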

imaginaire/lazy_config/registry.py ADDED
@@ -0,0 +1,74 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pydoc
from typing import Any

from fvcore.common.registry import Registry  # for backward compatibility.

"""
``Registry`` and `locate` provide ways to map a string (typically found
in config files) to callable objects.
"""

__all__ = ["Registry", "locate"]


def _convert_target_to_string(t: Any) -> str:
    """
    Inverse of ``locate()``.

    Args:
        t: any object with ``__module__`` and ``__qualname__``
    """
    module, qualname = t.__module__, t.__qualname__

    # Compress the path to this object, e.g. ``module.submodule._impl.class``
    # may become ``module.submodule.class``, if the latter also resolves to the same
    # object. This simplifies the string, and also is less affected by moving the
    # class implementation.
    module_parts = module.split(".")
    for k in range(1, len(module_parts)):
        prefix = ".".join(module_parts[:k])
        candidate = f"{prefix}.{qualname}"
        try:
            if locate(candidate) is t:
                return candidate
        except ImportError:
            pass
    return f"{module}.{qualname}"


def locate(name: str) -> Any:
    """
    Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``,
    such as "module.submodule.class_name".

    Raise Exception if it cannot be found.
    """
    obj = pydoc.locate(name)

    # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly
    # by pydoc.locate. Try a private function from hydra.
    if obj is None:
        try:
            # from hydra.utils import get_method - will print many errors
            from hydra.utils import _locate
        except ImportError as e:
            raise ImportError(f"Cannot dynamically locate object {name}!") from e
        else:
            obj = _locate(name)  # it raises if fails

    return obj
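
A quick sketch of the string-to-object round trip these helpers provide; the compressed-path comment is illustrative, not guaranteed for every object.

import torch

from imaginaire.lazy_config.registry import _convert_target_to_string, locate

name = _convert_target_to_string(torch.optim.AdamW)  # e.g. "torch.optim.AdamW"
assert locate(name) is torch.optim.AdamW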

imaginaire/model.py ADDED
@@ -0,0 +1,137 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import torch

from imaginaire.lazy_config import LazyDict, instantiate


class ImaginaireModel(torch.nn.Module):
    """The base model class of Imaginaire. It is inherited from torch.nn.Module.

    All models in Imaginaire should inherit ImaginaireModel. It should include the implementations for all the
    computation graphs. All inheriting child classes should implement the following methods:
    - training_step(): The training step of the model, including the loss computation.
    - validation_step(): The validation step of the model, including the loss computation.
    - forward(): The computation graph for model inference.
    The following methods have default implementations in ImaginaireModel:
    - init_optimizer_scheduler(): Creates the optimizer and scheduler for the model.
    """

    def __init__(self) -> None:
        super().__init__()

    def init_optimizer_scheduler(
        self,
        optimizer_config: LazyDict[torch.optim.Optimizer],
        scheduler_config: LazyDict[torch.optim.lr_scheduler.LRScheduler],
    ) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
        """Creates the optimizer and scheduler for the model.

        Args:
            optimizer_config (LazyDict): The lazy config for the optimizer.
            scheduler_config (LazyDict): The lazy config for the scheduler.

        Returns:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
        """
        optimizer_config.params = self.parameters()
        optimizer = instantiate(optimizer_config)
        scheduler_config.optimizer = optimizer
        scheduler = instantiate(scheduler_config)
        return optimizer, scheduler

    def training_step(
        self, data_batch: dict[str, torch.Tensor], iteration: int
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        """The training step of the model, including the loss computation.

        Args:
            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
            iteration (int): Current iteration number.

        Returns:
            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the training batch.
            loss (torch.Tensor): The total loss for backprop (weighted sum of various losses).
        """
        raise NotImplementedError

    @torch.no_grad()
    def validation_step(
        self, data_batch: dict[str, torch.Tensor], iteration: int
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
        """The validation step of the model, including the loss computation.

        Args:
            data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
            iteration (int): Current iteration number.

        Returns:
            output_batch (dict[str, torch.Tensor]): Auxiliary model output from the validation batch.
            loss (torch.Tensor): The total loss (weighted sum of various losses).
        """
        raise NotImplementedError

    @torch.inference_mode()
    def forward(self, *args: Any, **kwargs: Any) -> Any:
        """The computation graph for model inference.

        Args:
            *args: Whatever you decide to pass into the forward method.
            **kwargs: Keyword arguments are also possible.

        Return:
            Your model's output.
        """
        raise NotImplementedError

    def on_model_init_start(self, set_barrier=False) -> None:
        return

    def on_model_init_end(self, set_barrier=False) -> None:
        return

    def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
        """The model preparation before the training is launched.

        Args:
            memory_format (torch.memory_format): Memory format of the model.
        """
        pass

    def on_before_zero_grad(
        self, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LRScheduler, iteration: int
    ) -> None:
        """Hook before zero_grad() is called.

        Args:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
            iteration (int): Current iteration number.
        """
        pass

    def on_after_backward(self, iteration: int = 0) -> None:
        """Hook after loss.backward() is called.

        This method is called immediately after the backward pass, allowing for custom operations
        or modifications to be performed on the gradients before the optimizer step.

        Args:
            iteration (int): Current iteration number.
        """
        pass
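
A hypothetical minimal subclass showing the contract above; the architecture and loss are made up for illustration and are not part of this commit.

import torch

from imaginaire.model import ImaginaireModel


class ToyModel(ImaginaireModel):
    def __init__(self) -> None:
        super().__init__()
        self.net = torch.nn.Linear(8, 1)

    def training_step(self, data_batch, iteration):
        # Return (auxiliary outputs, scalar loss) as the trainer expects.
        pred = self.net(data_batch["x"])
        loss = torch.nn.functional.mse_loss(pred, data_batch["y"])
        return {"pred": pred}, loss

    @torch.no_grad()
    def validation_step(self, data_batch, iteration):
        return self.training_step(data_batch, iteration)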

imaginaire/trainer.py ADDED
@@ -0,0 +1,322 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import inspect
import os
import signal

import torch
import torch.distributed as dist
import torch.utils.data

from imaginaire.utils.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling

try:
    from megatron.core import parallel_state

    USE_MEGATRON = True
except ImportError:
    USE_MEGATRON = False
    print("Megatron-core is not installed.")


from imaginaire.lazy_config import LazyConfig, instantiate
from imaginaire.model import ImaginaireModel
from imaginaire.utils import callback, distributed, log, misc
from imaginaire.utils.checkpointer import Checkpointer


class ImaginaireTrainer:
    """The base trainer class of Imaginaire.

    All trainers in Imaginaire should inherit ImaginaireTrainer. It contains the basic functionality for model training
    (particularly suited for large-scale training), including data parallel (DDP/FSDP), model weight average (EMA),
    mixed-precision training (fp16/bf16).

    Attributes:
        checkpointer (Checkpointer): checkpointer object to save/load model weights and optimizer states.
        training_timer (misc.Timer): Timer object to time code blocks and functions.
    """

    def __init__(self, config):
        """Constructor of the trainer.

        Args:
            config (Config): The config object for the Imaginaire codebase.
        """
        super().__init__()
        self.config = config
        # Set up the distributed computing environment.
        with misc.timer("init_distributed"):
            distributed.init()
        # Set up parallel states.
        if hasattr(config.model, "context_parallel_size"):
            if config.model_parallel.context_parallel_size > 1:
                raise ValueError(
                    "Both config.model.context_parallel_size and config.model_parallel.context_parallel_size are set. "
                    "config.model.context_parallel_size is deprecated. Please only set config.model_parallel.context_parallel_size."
                )
            else:
                log.critical(
                    "Using deprecated config.model.context_parallel_size. Please use config.model_parallel.context_parallel_size instead."
                )
                config.model_parallel.context_parallel_size = config.model.context_parallel_size
        if USE_MEGATRON:
            if (
                "create_gloo_process_groups"
                in inspect.signature(parallel_state.initialize_model_parallel).parameters
            ):
                parallel_state.initialize_model_parallel(
                    pipeline_model_parallel_size=config.model_parallel.pipeline_model_parallel_size,
                    tensor_model_parallel_size=config.model_parallel.tensor_model_parallel_size,
                    context_parallel_size=config.model_parallel.context_parallel_size,
                    create_gloo_process_groups=False,
                )
            else:
                parallel_state.initialize_model_parallel(
                    pipeline_model_parallel_size=config.model_parallel.pipeline_model_parallel_size,
                    tensor_model_parallel_size=config.model_parallel.tensor_model_parallel_size,
                    context_parallel_size=config.model_parallel.context_parallel_size,
                )
            # `config.model_parallel.sequence_parallel` is a bool that indicates whether to use sequence parallelism.
            # It is not part of the original `parallel_state` API, so we need to set it manually.
            parallel_state.sequence_parallel = config.model_parallel.sequence_parallel
            if parallel_state.sequence_parallel:
                os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

        # Create the local job directory, save the config file, and pipe to a local log.
        if distributed.is_rank0():
            os.makedirs(config.job.path_local, exist_ok=True)
            # Save the config as .pkl for reproducibility.
            LazyConfig.save_pkl(config, f"{config.job.path_local}/config.pkl")
            # Save the config as .yaml for reading or parsing experiment hyperparameters.
            LazyConfig.save_yaml(config, f"{config.job.path_local}/config.yaml")
        dist.barrier()
        log.init_loguru_file(f"{config.job.path_local}/stdout.log")
        if distributed.is_rank0():
            # Print important environment variables and the effective config.
            log.info("Config:\n" + config.pretty_print(use_color=True))
            misc.print_environ_variables(["TORCH_HOME", "IMAGINAIRE_OUTPUT_ROOT"])
        # Set the random seed. If multi-GPU, different ranks are set with different seeds.
        misc.set_random_seed(seed=config.trainer.seed, by_rank=True)
        # Initialize cuDNN.
        torch.backends.cudnn.deterministic = config.trainer.cudnn.deterministic
        torch.backends.cudnn.benchmark = config.trainer.cudnn.benchmark
        # Floating-point precision settings.
        torch.backends.cudnn.allow_tf32 = torch.backends.cuda.matmul.allow_tf32 = True
        # Initialize the callback functions.
        self.callbacks = callback.CallBackGroup(config=config, trainer=self)
        # Initialize the model checkpointer.
        if config.checkpoint.type is None:
            self.checkpointer = Checkpointer(config.checkpoint, config.job, callbacks=self.callbacks)
        else:
            self.checkpointer: Checkpointer = instantiate(
                config.checkpoint.type, config.checkpoint, config.job, callbacks=self.callbacks
            )
        # Initialize the timer for speed benchmarking.
        self.training_timer = misc.TrainingTimer()
        # Send a TimeoutError if a training step takes over timeout_period seconds.
        signal.signal(signal.SIGALRM, functools.partial(misc.timeout_handler, config.trainer.timeout_period))  # type: ignore

    def train(
        self,
        model: ImaginaireModel,
        dataloader_train: torch.utils.data.DataLoader,
        dataloader_val: torch.utils.data.DataLoader,
    ) -> None:
        """The training function.

        Args:
            model (ImaginaireModel): The PyTorch model.
            dataloader_train (torch.utils.data.DataLoader): The training data loader.
            dataloader_val (torch.utils.data.DataLoader): The validation data loader.
        """
        # Leaving this for backward compatibility for now, but we can think about moving this to model.on_train_start for all models.
        model = model.to("cuda", memory_format=self.config.trainer.memory_format)  # type: ignore
        model.on_train_start(self.config.trainer.memory_format)

        # Initialize the optimizer, scheduler, and grad_scaler.
        self.callbacks.on_optimizer_init_start()
        optimizer, scheduler = model.init_optimizer_scheduler(self.config.optimizer, self.config.scheduler)
        grad_scaler = torch.amp.GradScaler("cuda", **self.config.trainer.grad_scaler_args)
        self.callbacks.on_optimizer_init_end()
        # Load the model checkpoint and get the starting iteration number.
        iteration = self.checkpointer.load(model, optimizer, scheduler, grad_scaler)
        grad_accum_iter = 0
        log.critical(f"Distributed parallelism mode: {self.config.trainer.distributed_parallelism}")
        if self.config.trainer.distributed_parallelism == "ddp":
            # Create a DDP model wrapper.
            model_ddp = distributed.parallel_model_wrapper(self.config.trainer.ddp, model)
        elif self.config.trainer.distributed_parallelism == "fsdp":
            model_ddp = model
        else:
            raise ValueError(f"Unknown distributed parallelism mode: {self.config.trainer.distributed_parallelism}")
        log.info("Starting training...")
        self.callbacks.on_train_start(model, iteration=iteration)
        # Initial validation.
        if self.config.trainer.run_validation and iteration == 0:
            self.validate(model, dataloader_val, iteration=iteration)
            log.info("Initial validation done.")
        _end_training = False
        with (
            maybe_enable_profiling(self.config, global_step=iteration) as torch_profiler,
            maybe_enable_memory_snapshot(self.config, global_step=iteration) as memory_profiler,
        ):
            while True:
                dataloader_train_iter = iter(dataloader_train)
                while True:
                    self.callbacks.on_before_dataloading(iteration)
                    try:
                        with self.training_timer("dataloader_train"):
                            data_batch = next(dataloader_train_iter)
                    except StopIteration:
                        break
                    finally:
                        self.callbacks.on_after_dataloading(iteration)
                    # If max_iter is reached, exit the training loop.
                    if iteration >= self.config.trainer.max_iter:
                        _end_training = True
                        break
                    # Move all tensors in the data batch to GPU device.
                    data_batch = misc.to(data_batch, device="cuda")
                    # The actual training step.
                    self.callbacks.on_training_step_start(model, data_batch, iteration=iteration)
                    self.callbacks.on_training_step_batch_start(model, data_batch, iteration=iteration)
                    if not model.training:
                        model_ddp.train()
                    assert model_ddp.training, "model_ddp is not in training mode."
                    assert model.training, "model is not in training mode."
                    output_batch, loss, grad_accum_iter = self.training_step(
                        model_ddp,
                        optimizer,
                        scheduler,
                        grad_scaler,
                        data_batch,
                        iteration=iteration,
                        grad_accum_iter=grad_accum_iter,
                    )
                    self.callbacks.on_training_step_batch_end(
                        model, data_batch, output_batch, loss, iteration=iteration
                    )
                    # If the gradients are still being accumulated, continue to load the next training batch.
                    if grad_accum_iter != 0:
                        continue
                    # Do the following when an actual optimizer (update) step has been made.
                    iteration += 1
                    # Save checkpoint.
                    if iteration % self.config.checkpoint.save_iter == 0:
                        self.checkpointer.save(model, optimizer, scheduler, grad_scaler, iteration=iteration)
                    self.callbacks.on_training_step_end(model, data_batch, output_batch, loss, iteration=iteration)
                    # Validation.
                    if self.config.trainer.run_validation and iteration % self.config.trainer.validation_iter == 0:
                        self.validate(model, dataloader_val, iteration=iteration)
                    # This iteration is successful; reset the timeout signal.
                    signal.alarm(self.config.trainer.timeout_period)
                    if torch_profiler:
                        torch_profiler.step()
                    if memory_profiler:
                        memory_profiler.step()
                if _end_training:
                    break
        log.success("Done with training.")
        if iteration % self.config.checkpoint.save_iter != 0:
            self.checkpointer.save(model, optimizer, scheduler, grad_scaler, iteration=iteration)
        self.callbacks.on_train_end(model, iteration=iteration)
        self.checkpointer.finalize()
        distributed.barrier()
        self.callbacks.on_app_end()

    def training_step(
        self,
        model_ddp: torch.nn.Module | distributed.DistributedDataParallel,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler.LRScheduler,
        grad_scaler: torch.amp.GradScaler,
        data: dict[str, torch.Tensor],
        iteration: int = 0,
        grad_accum_iter: int = 0,
    ) -> tuple[dict[str, torch.Tensor], torch.Tensor, int]:
        """The training step.

        Args:
            model_ddp (torch.nn.Module | distributed.DistributedDataParallel): The model with a DDP wrapper or the bare
                module, depending on whether distributed training is enabled or not.
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
            grad_scaler (torch.amp.GradScaler): The gradient scaler (for mixed precision training).
            data (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
            iteration (int): Current iteration number.
            grad_accum_iter (int): Number of gradient accumulation iterations.

        Returns:
            output (dict[str, torch.Tensor]): The model output from the training data batch (dictionary of tensors).
            loss (torch.Tensor): The total loss of the training data batch.
            grad_accum_iter (int): The updated gradient accumulation counter.
        """
        # Only let DDP sync gradient at the last iteration of the gradient accumulation window
        with distributed.ddp_sync_grad(model_ddp, grad_accum_iter == self.config.trainer.grad_accum_iter - 1):
            self.callbacks.on_before_forward(iteration=iteration)
            with self.training_timer("forward"):
                output_batch, loss = model_ddp.training_step(data, iteration)
            self.callbacks.on_after_forward(iteration=iteration)
            self.callbacks.on_before_backward(model_ddp, loss, iteration=iteration)
            with self.training_timer("backward"):
                loss_scaled = grad_scaler.scale(loss / self.config.trainer.grad_accum_iter)
                loss_scaled.backward()
                if self.config.trainer.distributed_parallelism == "ddp":
                    model_ddp.module.on_after_backward()
                else:
                    model_ddp.on_after_backward()
            self.callbacks.on_after_backward(model_ddp, iteration=iteration)
        grad_accum_iter += 1
        if grad_accum_iter == self.config.trainer.grad_accum_iter:
            with self.training_timer("optimizer_step"):
                self.callbacks.on_before_optimizer_step(
                    model_ddp, optimizer, scheduler, grad_scaler, iteration=iteration
                )
                grad_scaler.step(optimizer)
                grad_scaler.update()
                scheduler.step()
                self.callbacks.on_before_zero_grad(model_ddp, optimizer, scheduler, iteration=iteration)
                if self.config.trainer.distributed_parallelism == "ddp":
                    model_ddp.module.on_before_zero_grad(optimizer, scheduler, iteration=iteration)
                else:
                    model_ddp.on_before_zero_grad(optimizer, scheduler, iteration=iteration)
                optimizer.zero_grad(set_to_none=True)
            grad_accum_iter = 0
        return output_batch, loss, grad_accum_iter

    @torch.no_grad()
    def validate(self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0) -> None:
        """Validate on the full validation dataset.

        Args:
            model (ImaginaireModel): The PyTorch model.
            dataloader_val (torch.utils.data.DataLoader): The validation data loader.
            iteration (int): Current iteration number.
        """
        log.info(f"Validating at iteration {iteration}...")
        self.callbacks.on_validation_start(model, dataloader_val, iteration=iteration)
        model.eval()
        # Evaluate on the full validation set.
        with model.pipe.ema_scope(context="Validation", is_cpu=False):
            for val_iter, data_batch in enumerate(dataloader_val):
                if self.config.trainer.max_val_iter is not None and val_iter >= self.config.trainer.max_val_iter:
                    break
                data_batch = misc.to(data_batch, device="cuda")
                self.callbacks.on_validation_step_start(model, data_batch, iteration=iteration)
                output_batch, loss = model.validation_step(data_batch, iteration)
                self.callbacks.on_validation_step_end(model, data_batch, output_batch, loss, iteration=iteration)
        self.callbacks.on_validation_end(model, iteration=iteration)
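
For orientation, a standalone sketch of the gradient-accumulation bookkeeping used in training_step above (deliberately simplified: no GradScaler, DDP wrapper, or callbacks; the model and data are stand-ins).

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataloader = [torch.randn(2, 4) for _ in range(8)]  # stand-in data

GRAD_ACCUM = 4  # plays the role of config.trainer.grad_accum_iter
grad_accum_iter = 0
iteration = 0
for data in dataloader:
    loss = model(data).pow(2).mean()
    # Scale so the accumulated gradient matches one large-batch step.
    (loss / GRAD_ACCUM).backward()
    grad_accum_iter += 1
    if grad_accum_iter == GRAD_ACCUM:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        grad_accum_iter = 0
        iteration += 1  # the trainer counts an iteration only here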

imaginaire/utils/.DS_Store ADDED
Binary file (6.15 kB).

imaginaire/utils/__init__.py ADDED
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

imaginaire/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes).
imaginaire/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (151 Bytes).
imaginaire/utils/__pycache__/device.cpython-310.pyc ADDED
Binary file (1.44 kB).
imaginaire/utils/__pycache__/distributed.cpython-310.pyc ADDED
Binary file (15 kB).
imaginaire/utils/__pycache__/io.cpython-310.pyc ADDED
Binary file (4.93 kB).
imaginaire/utils/__pycache__/io.cpython-39.pyc ADDED
Binary file (4.9 kB).
imaginaire/utils/__pycache__/log.cpython-310.pyc ADDED
Binary file (4.01 kB).
imaginaire/utils/__pycache__/log.cpython-39.pyc ADDED
Binary file (4.25 kB).
imaginaire/utils/__pycache__/misc.cpython-310.pyc ADDED
Binary file (18.3 kB).

imaginaire/utils/callback.py ADDED
@@ -0,0 +1,518 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import time
import warnings
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

import omegaconf
import torch
import torch.utils.data
import tqdm

from imaginaire.lazy_config import instantiate
from imaginaire.utils import distributed, log
from imaginaire.utils.misc import get_local_tensor_if_DTensor

try:
    from megatron.core import parallel_state
except ImportError:
    parallel_state = None
    print("Megatron-core is not installed.")


if TYPE_CHECKING:
    from imaginaire.config import Config
    from imaginaire.model import ImaginaireModel
    from imaginaire.trainer import ImaginaireTrainer


class CallBackGroup:
    """A class for hosting a collection of callback objects.

    It is used to execute callback functions of multiple callback objects with the same method name.
    When callbackgroup.func(args) is executed, internally it loops through the objects in self._callbacks and runs
    self._callbacks[0].func(args), self._callbacks[1].func(args), etc. The method name and arguments should match.

    Attributes:
        _callbacks (list[Callback]): List of callback objects.
    """

    def __init__(self, config: Config, trainer: ImaginaireTrainer) -> None:
        """Initializes the list of callback objects.

        Args:
            config (Config): The config object for the Imaginaire codebase.
            trainer (ImaginaireTrainer): The main trainer.
        """
        self._callbacks = []
        callback_configs = config.trainer.callbacks
        if callback_configs:
            if isinstance(callback_configs, list) or isinstance(callback_configs, omegaconf.listconfig.ListConfig):
                warnings.warn(
                    "The 'config.trainer.callbacks' parameter should be a dict instead of a list. "
                    "Please update your code",
                    DeprecationWarning,
                    stacklevel=2,
                )
                callback_configs = {f"callback_{i}": v for i, v in enumerate(callback_configs)}
            for callback_name, current_callback_cfg in callback_configs.items():
                if "_target_" not in current_callback_cfg:
                    log.critical(
                        f"Callback {callback_name} is missing the '_target_' field. \n Skip {current_callback_cfg}"
                    )
                    continue
                log.critical(f"Instantiating callback {callback_name}: {current_callback_cfg}")
                _callback = instantiate(current_callback_cfg)
                assert isinstance(_callback, Callback), f"{current_callback_cfg} is not a valid callback."
                _callback.config = config
                _callback.trainer = trainer
                self._callbacks.append(_callback)

    def __getattr__(self, method_name: str) -> Callable:
        """Loops through the callback objects to call the corresponding callback function.

        Args:
            method_name (str): Callback method name.
        """

        def multi_callback_wrapper(*args, **kwargs) -> None:
            for callback in self._callbacks:
                assert hasattr(callback, method_name)
                method = getattr(callback, method_name)
                assert callable(method)
                _ = method(*args, **kwargs)

        return multi_callback_wrapper
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class Callback:
|
| 105 |
+
"""The base class for all callbacks.
|
| 106 |
+
|
| 107 |
+
All callbacks should inherit from this class and adhere to the established method names and signatures.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
def __init__(self, config: Config | None = None, trainer: ImaginaireTrainer | None = None):
|
| 111 |
+
"""Initializes a Callback object.
|
| 112 |
+
|
| 113 |
+
Args:
|
| 114 |
+
config (Optional[Config]): The configuration object for the Imaginaire codebase, if available.
|
| 115 |
+
trainer (Optional[ImaginaireTrainer]): The main trainer handling the training loop, if available.
|
| 116 |
+
|
| 117 |
+
Notes:
|
| 118 |
+
The config and trainer parameters are optional to maintain backward compatibility.
|
| 119 |
+
In future releases, these parameters will be removed. Upon using these parameters, a deprecation
|
| 120 |
+
warning will be issued.
|
| 121 |
+
|
| 122 |
+
"""
|
| 123 |
+
if config is not None or trainer is not None:
|
| 124 |
+
warnings.warn(
|
| 125 |
+
"The 'config' and 'trainer' parameters are deprecated and will be removed in a future release. "
|
| 126 |
+
"Please update your code to create Callback instances without these parameters.",
|
| 127 |
+
DeprecationWarning,
|
| 128 |
+
stacklevel=2,
|
| 129 |
+
)
|
| 130 |
+
del config, trainer
|
| 131 |
+
|
| 132 |
+
def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 133 |
+
pass
|
| 134 |
+
|
| 135 |
+
def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
|
| 136 |
+
"""
|
| 137 |
+
Called before the training step, for each batch. This is paired with on_training_step_end() but note that
|
| 138 |
+
when using gradient accumulation, while on_training_step_end() is only called when the optimizer is updated,
|
| 139 |
+
this function is called for every batch.
|
| 140 |
+
Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
|
| 141 |
+
for every batch, albeit with the same iteration number.
|
| 142 |
+
"""
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
def on_training_step_batch_start(
|
| 146 |
+
self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
|
| 147 |
+
) -> None:
|
| 148 |
+
"""
|
| 149 |
+
Called before the training step, for each batch, similarly to on_training_step_start(). This function is paired with
|
| 150 |
+
on_training_step_batch_end(), and both functions are called for every batch even when using gradient accumulation.
|
| 151 |
+
Note that the iteration is only updated when the optimizer is updated, and therefore it may be the same for multiple invocations.
|
| 152 |
+
"""
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
def on_before_forward(self, iteration: int = 0) -> None:
|
| 156 |
+
pass
|
| 157 |
+
|
| 158 |
+
def on_after_forward(self, iteration: int = 0) -> None:
|
| 159 |
+
pass
|
| 160 |
+
|
| 161 |
+
def on_before_backward(
|
| 162 |
+
self, model_ddp: distributed.DistributedDataParallel, loss: torch.Tensor, iteration: int = 0
|
| 163 |
+
) -> None:
|
| 164 |
+
pass
|
| 165 |
+
|
| 166 |
+
def on_after_backward(self, model_ddp: distributed.DistributedDataParallel, iteration: int = 0) -> None:
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
def on_before_dataloading(self, iteration: int = 0) -> None:
|
| 170 |
+
pass
|
| 171 |
+
|
| 172 |
+
def on_after_dataloading(self, iteration: int = 0) -> None:
|
| 173 |
+
pass
|
| 174 |
+
|
| 175 |
+
def on_optimizer_init_start(self) -> None:
|
| 176 |
+
pass
|
| 177 |
+
|
| 178 |
+
def on_optimizer_init_end(self) -> None:
|
| 179 |
+
pass
|
| 180 |
+
|
| 181 |
+
def on_before_optimizer_step(
|
| 182 |
+
self,
|
| 183 |
+
model_ddp: distributed.DistributedDataParallel,
|
| 184 |
+
optimizer: torch.optim.Optimizer,
|
| 185 |
+
scheduler: torch.optim.lr_scheduler.LRScheduler,
|
| 186 |
+
grad_scaler: torch.amp.GradScaler,
|
| 187 |
+
iteration: int = 0,
|
| 188 |
+
) -> None:
|
| 189 |
+
pass
|
| 190 |
+
|
| 191 |
+
def on_before_zero_grad(
|
| 192 |
+
self,
|
| 193 |
+
model_ddp: distributed.DistributedDataParallel,
|
| 194 |
+
optimizer: torch.optim.Optimizer,
|
| 195 |
+
scheduler: torch.optim.lr_scheduler.LRScheduler,
|
| 196 |
+
iteration: int = 0,
|
| 197 |
+
) -> None:
|
| 198 |
+
pass
|
| 199 |
+
|
| 200 |
+
def on_training_step_batch_end(
|
| 201 |
+
self,
|
| 202 |
+
model: ImaginaireModel,
|
| 203 |
+
data_batch: dict[str, torch.Tensor],
|
| 204 |
+
output_batch: dict[str, torch.Tensor],
|
| 205 |
+
loss: torch.Tensor,
|
| 206 |
+
iteration: int = 0,
|
| 207 |
+
) -> None:
|
| 208 |
+
"""
|
| 209 |
+
Called at the end of a training step for every batch even when using gradient accumulation.
|
| 210 |
+
This is paired with on_training_step_batch_start(). Note that the iteration is only updated when the optimizer is updated,
|
| 211 |
+
and therefore it may be the same for multiple batches.
|
| 212 |
+
"""
|
| 213 |
+
pass
|
| 214 |
+
|
| 215 |
+
def on_training_step_end(
|
| 216 |
+
self,
|
| 217 |
+
model: ImaginaireModel,
|
| 218 |
+
data_batch: dict[str, torch.Tensor],
|
| 219 |
+
output_batch: dict[str, torch.Tensor],
|
| 220 |
+
loss: torch.Tensor,
|
| 221 |
+
iteration: int = 0,
|
| 222 |
+
) -> None:
|
| 223 |
+
"""
|
| 224 |
+
Called at the end of a training step, but note that when using gradient accumulation, this is only called
|
| 225 |
+
when the optimizer is updated, and the iteration incremented, whereas on_training_step_start is called every time.
|
| 226 |
+
Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
|
| 227 |
+
for every batch.
|
| 228 |
+
"""
|
| 229 |
+
pass
|
| 230 |
+
|
| 231 |
+
def on_validation_start(
|
| 232 |
+
self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0
|
| 233 |
+
) -> None:
|
| 234 |
+
pass
|
| 235 |
+
|
| 236 |
+
def on_validation_step_start(
|
| 237 |
+
self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
|
| 238 |
+
) -> None:
|
| 239 |
+
pass
|
| 240 |
+
|
| 241 |
+
def on_validation_step_end(
|
| 242 |
+
self,
|
| 243 |
+
model: ImaginaireModel,
|
| 244 |
+
data_batch: dict[str, torch.Tensor],
|
| 245 |
+
output_batch: dict[str, torch.Tensor],
|
| 246 |
+
loss: torch.Tensor,
|
| 247 |
+
iteration: int = 0,
|
| 248 |
+
) -> None:
|
| 249 |
+
pass
|
| 250 |
+
|
| 251 |
+
def on_validation_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 252 |
+
pass
|
| 253 |
+
|
| 254 |
+
def on_load_checkpoint_start(self, model: ImaginaireModel) -> None:
|
| 255 |
+
pass
|
| 256 |
+
|
| 257 |
+
def on_load_checkpoint_end(
|
| 258 |
+
self, model: ImaginaireModel, iteration: int = 0, checkpoint_path: str | None = None
|
| 259 |
+
) -> None:
|
| 260 |
+
pass
|
| 261 |
+
|
| 262 |
+
def on_load_checkpoint(self, model: ImaginaireModel, state_dict: dict[Any]) -> None:
|
| 263 |
+
pass
|
| 264 |
+
|
| 265 |
+
def on_save_checkpoint_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 266 |
+
"""
|
| 267 |
+
Called when checkpoint saving is about to start.
|
| 268 |
+
"""
|
| 269 |
+
pass
|
| 270 |
+
|
| 271 |
+
def on_save_checkpoint_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 272 |
+
"""
|
| 273 |
+
Called when the synchronous part of checkpointing is finished, this function can be used
|
| 274 |
+
along with on_save_checkpoint_start() to measure the exposed (synchronous) checkpoint time.
|
| 275 |
+
Note that for asynchronous checkpoint, the checkpoint may still be ongoing, so this function
|
| 276 |
+
does not mean the checkpoint is finished for the asynchronous case, use on_save_checkpoint_success()
|
| 277 |
+
for that.
|
| 278 |
+
"""
|
| 279 |
+
pass
|
| 280 |
+
|
| 281 |
+
def on_save_checkpoint_success(self, iteration: int = 0, elapsed_time: float = 0) -> None:
|
| 282 |
+
"""
|
| 283 |
+
Called when checkpoint saving is fully finished, and succeeded. Not called if checkpoint failed.
|
| 284 |
+
For synchronous checkpoint, it is called at the same time as on_save_checkpoint_end(), but for asynchronous
|
| 285 |
+
checkpoint, it is called after the asynchronous part has also finished. For checkpointers with out-of-process
|
| 286 |
+
checkpointing, this function is called as soon as the notification is received from the checkpointer process,
|
| 287 |
+
which may not be immediately after the checkpoint has completed but later on. Therefore, if you need to measure
|
| 288 |
+
the full checkpoint duration for the asynchronous part, use the elapsed_time parameter, do not measure it directly
|
| 289 |
+
as this would be a significant overestimate.
|
| 290 |
+
"""
|
| 291 |
+
pass
|
| 292 |
+
|
| 293 |
+
def on_save_checkpoint(self, model: ImaginaireModel, state_dict: dict[Any]) -> None:
|
| 294 |
+
pass
|
| 295 |
+
|
| 296 |
+
def on_train_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 297 |
+
pass
|
| 298 |
+
|
| 299 |
+
def on_app_end(self) -> None:
|
| 300 |
+
pass
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
class EMAModelCallback(Callback):
|
| 304 |
+
"""The callback class for tracking EMA model weights."""
|
| 305 |
+
|
| 306 |
+
def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 307 |
+
# Set up the EMA model weight tracker.
|
| 308 |
+
if model.config.ema.enabled:
|
| 309 |
+
assert hasattr(model, "ema"), "EMA should be initialized from ImaginaireModel"
|
| 310 |
+
# EMA model must be kept in FP32 precision.
|
| 311 |
+
model.ema = model.ema.to(dtype=torch.float32)
|
| 312 |
+
else:
|
| 313 |
+
assert not hasattr(model, "ema"), "There should be no EMA initialized."
|
| 314 |
+
|
| 315 |
+
def on_training_step_end(
|
| 316 |
+
self,
|
| 317 |
+
model: ImaginaireModel,
|
| 318 |
+
data_batch: dict[str, torch.Tensor],
|
| 319 |
+
output_batch: dict[str, torch.Tensor],
|
| 320 |
+
loss: torch.Tensor,
|
| 321 |
+
iteration: int = 0,
|
| 322 |
+
) -> None:
|
| 323 |
+
# Update the EMA model with the new regular weights.
|
| 324 |
+
if model.config.ema.enabled:
|
| 325 |
+
model.ema.update_average(model, iteration)
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
class ProgressBarCallback(Callback):
|
| 329 |
+
"""The callback class for visualizing the training/validation progress bar in the console."""
|
| 330 |
+
|
| 331 |
+
@distributed.rank0_only
|
| 332 |
+
def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 333 |
+
self.train_pbar = tqdm.trange(self.config.trainer.max_iter, initial=iteration, desc="Training")
|
| 334 |
+
|
| 335 |
+
@distributed.rank0_only
|
| 336 |
+
def on_training_step_end(
|
| 337 |
+
self,
|
| 338 |
+
model: ImaginaireModel,
|
| 339 |
+
data_batch: dict[str, torch.Tensor],
|
| 340 |
+
output_batch: dict[str, torch.Tensor],
|
| 341 |
+
loss: torch.Tensor,
|
| 342 |
+
iteration: int = 0,
|
| 343 |
+
) -> None:
|
| 344 |
+
self.train_pbar.update()
|
| 345 |
+
|
| 346 |
+
@distributed.rank0_only
|
| 347 |
+
def on_validation_start(
|
| 348 |
+
self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0
|
| 349 |
+
) -> None:
|
| 350 |
+
if self.config.trainer.max_val_iter is not None:
|
| 351 |
+
num_iter = self.config.trainer.max_val_iter
|
| 352 |
+
else:
|
| 353 |
+
num_iter = len(dataloader_val)
|
| 354 |
+
assert num_iter is not None and num_iter > 0, f"Invalid number of validation iterations: {num_iter}"
|
| 355 |
+
self.val_pbar = tqdm.trange(num_iter, desc="Validating", position=1, leave=False)
|
| 356 |
+
|
| 357 |
+
@distributed.rank0_only
|
| 358 |
+
def on_validation_step_end(
|
| 359 |
+
self,
|
| 360 |
+
model: ImaginaireModel,
|
| 361 |
+
data_batch: dict[str, torch.Tensor],
|
| 362 |
+
output_batch: dict[str, torch.Tensor],
|
| 363 |
+
loss: torch.Tensor,
|
| 364 |
+
iteration: int = 0,
|
| 365 |
+
) -> None:
|
| 366 |
+
self.val_pbar.update()
|
| 367 |
+
|
| 368 |
+
@distributed.rank0_only
|
| 369 |
+
def on_validation_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 370 |
+
self.val_pbar.close()
|
| 371 |
+
|
| 372 |
+
@distributed.rank0_only
|
| 373 |
+
def on_train_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 374 |
+
self.trainer.checkpointer.finalize()
|
| 375 |
+
self.train_pbar.close()
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
class IterationLoggerCallback(Callback):
|
| 379 |
+
"""The callback class for visualizing the training/validation progress bar in the console."""
|
| 380 |
+
|
| 381 |
+
@distributed.rank0_only
|
| 382 |
+
def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 383 |
+
# self.train_pbar = tqdm.trange(self.config.trainer.max_iter, initial=iteration, desc="Training")
|
| 384 |
+
self.start_iteration_time = time.time()
|
| 385 |
+
self.elapsed_iteration_time = 0
|
| 386 |
+
|
| 387 |
+
@distributed.rank0_only
|
| 388 |
+
def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
|
| 389 |
+
self.start_iteration_time = time.time()
|
| 390 |
+
|
| 391 |
+
@distributed.rank0_only
|
| 392 |
+
def on_training_step_end(
|
| 393 |
+
self,
|
| 394 |
+
model: ImaginaireModel,
|
| 395 |
+
data_batch: dict[str, torch.Tensor],
|
| 396 |
+
output_batch: dict[str, torch.Tensor],
|
| 397 |
+
loss: torch.Tensor,
|
| 398 |
+
iteration: int = 0,
|
| 399 |
+
) -> None:
|
| 400 |
+
self.elapsed_iteration_time += time.time() - self.start_iteration_time
|
| 401 |
+
|
| 402 |
+
if iteration % self.config.trainer.logging_iter == 0:
|
| 403 |
+
avg_time = self.elapsed_iteration_time / self.config.trainer.logging_iter
|
| 404 |
+
log.info(f"Iteration: {iteration}, average iter time: {avg_time:2f}, total loss {loss.item():4f}")
|
| 405 |
+
|
| 406 |
+
self.elapsed_iteration_time = 0
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
class LowPrecisionCallback(Callback):
|
| 410 |
+
"""The callback class handling low precision training
|
| 411 |
+
|
| 412 |
+
Config with non-primitive type makes it difficult to override the option.
|
| 413 |
+
The callback gets precision from model.precision instead.
|
| 414 |
+
It also auto disabled when using fp32.
|
| 415 |
+
"""
|
| 416 |
+
|
| 417 |
+
def __init__(self, config: Config, trainer: ImaginaireTrainer, update_iter: int):
|
| 418 |
+
self.update_iter = update_iter
|
| 419 |
+
|
| 420 |
+
def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
|
| 421 |
+
assert model.precision in [
|
| 422 |
+
torch.bfloat16,
|
| 423 |
+
torch.float16,
|
| 424 |
+
torch.half,
|
| 425 |
+
], "LowPrecisionCallback must use a low precision dtype."
|
| 426 |
+
self.precision_type = model.precision
|
| 427 |
+
|
| 428 |
+
def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
|
| 429 |
+
for k, v in data.items():
|
| 430 |
+
if isinstance(v, torch.Tensor) and torch.is_floating_point(data[k]):
|
| 431 |
+
data[k] = v.to(dtype=self.precision_type)
|
| 432 |
+
|
| 433 |
+
def on_validation_step_start(
|
| 434 |
+
self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
|
| 435 |
+
) -> None:
|
| 436 |
+
for k, v in data.items():
|
| 437 |
+
if isinstance(v, torch.Tensor) and torch.is_floating_point(data[k]):
|
| 438 |
+
data[k] = v.to(dtype=self.precision_type)
|
| 439 |
+
|
| 440 |
+
def on_before_zero_grad(
|
| 441 |
+
self,
|
| 442 |
+
model_ddp: distributed.DistributedDataParallel,
|
| 443 |
+
optimizer: torch.optim.Optimizer,
|
| 444 |
+
scheduler: torch.optim.lr_scheduler.LRScheduler,
|
| 445 |
+
iteration: int = 0,
|
| 446 |
+
) -> None:
|
| 447 |
+
if iteration % self.update_iter == 0:
|
| 448 |
+
if getattr(optimizer, "master_weights", False):
|
| 449 |
+
params, master_params = [], []
|
| 450 |
+
for group, group_master in zip(optimizer.param_groups, optimizer.param_groups_master, strict=False):
|
| 451 |
+
for p, p_master in zip(group["params"], group_master["params"], strict=False):
|
| 452 |
+
params.append(get_local_tensor_if_DTensor(p.data))
|
| 453 |
+
master_params.append(p_master.data)
|
| 454 |
+
torch._foreach_copy_(params, master_params)
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
class NVTXCallback(Callback):
|
| 458 |
+
"""The callback for creating NVTX ranges"""
|
| 459 |
+
|
| 460 |
+
def __init__(
|
| 461 |
+
self,
|
| 462 |
+
synchronize: bool = False,
|
| 463 |
+
config: Config | None = None,
|
| 464 |
+
trainer: ImaginaireTrainer | None = None,
|
| 465 |
+
):
|
| 466 |
+
super().__init__(config, trainer)
|
| 467 |
+
self.synchronize = synchronize
|
| 468 |
+
|
| 469 |
+
def on_before_forward(self, iteration: int = 0) -> None:
|
| 470 |
+
if self.synchronize:
|
| 471 |
+
torch.cuda.synchronize()
|
| 472 |
+
torch.cuda.nvtx.range_push("forward")
|
| 473 |
+
|
| 474 |
+
def on_after_forward(self, iteration: int = 0) -> None:
|
| 475 |
+
if self.synchronize:
|
| 476 |
+
torch.cuda.synchronize()
|
| 477 |
+
torch.cuda.nvtx.range_pop()
|
| 478 |
+
|
| 479 |
+
def on_before_backward(
|
| 480 |
+
self, model_ddp: distributed.DistributedDataParallel, loss: torch.Tensor, iteration: int = 0
|
| 481 |
+
) -> None:
|
| 482 |
+
if self.synchronize:
|
| 483 |
+
torch.cuda.synchronize()
|
| 484 |
+
torch.cuda.nvtx.range_push("backward")
|
| 485 |
+
|
| 486 |
+
def on_after_backward(self, model_ddp: distributed.DistributedDataParallel, iteration: int = 0) -> None:
|
| 487 |
+
if self.synchronize:
|
| 488 |
+
torch.cuda.synchronize()
|
| 489 |
+
torch.cuda.nvtx.range_pop()
|
| 490 |
+
|
| 491 |
+
def on_before_optimizer_step(
|
| 492 |
+
self,
|
| 493 |
+
model_ddp: distributed.DistributedDataParallel,
|
| 494 |
+
optimizer: torch.optim.Optimizer,
|
| 495 |
+
scheduler: torch.optim.lr_scheduler.LRScheduler,
|
| 496 |
+
grad_scaler: torch.amp.GradScaler,
|
| 497 |
+
iteration: int = 0,
|
| 498 |
+
) -> None:
|
| 499 |
+
if self.synchronize:
|
| 500 |
+
torch.cuda.synchronize()
|
| 501 |
+
torch.cuda.nvtx.range_push("optimizer_step")
|
| 502 |
+
|
| 503 |
+
def on_before_zero_grad(
|
| 504 |
+
self,
|
| 505 |
+
model_ddp: distributed.DistributedDataParallel,
|
| 506 |
+
optimizer: torch.optim.Optimizer,
|
| 507 |
+
scheduler: torch.optim.lr_scheduler.LRScheduler,
|
| 508 |
+
iteration: int = 0,
|
| 509 |
+
) -> None:
|
| 510 |
+
if self.synchronize:
|
| 511 |
+
torch.cuda.synchronize()
|
| 512 |
+
torch.cuda.nvtx.range_pop()
|
| 513 |
+
|
| 514 |
+
def on_before_dataloading(self, iteration: int = 0) -> None:
|
| 515 |
+
torch.cuda.nvtx.range_push("dataloading")
|
| 516 |
+
|
| 517 |
+
def on_after_dataloading(self, iteration: int = 0) -> None:
|
| 518 |
+
torch.cuda.nvtx.range_pop()
|
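
The `__getattr__` dispatch above is the whole contract: any method call on a CallBackGroup fans out to every registered callback with the same name. Below is a minimal, self-contained sketch of that pattern (not part of the commit; LossLoggerCallback and MiniCallBackGroup are illustrative names, and in the real code the group is built from the trainer config via instantiate()):

# Illustrative sketch only: shows the method-name dispatch pattern used by
# CallBackGroup above, without depending on the imaginaire package.
import torch


class LossLoggerCallback:
    """Hypothetical callback that records the loss at each training step."""

    def __init__(self) -> None:
        self.history: list[float] = []

    def on_training_step_end(self, model, data_batch, output_batch, loss, iteration=0) -> None:
        self.history.append(loss.item())


class MiniCallBackGroup:
    """Same dispatch idea as CallBackGroup: group.method(args) fans out to every callback."""

    def __init__(self, callbacks) -> None:
        self._callbacks = callbacks

    def __getattr__(self, method_name):
        def multi_callback_wrapper(*args, **kwargs):
            for cb in self._callbacks:
                getattr(cb, method_name)(*args, **kwargs)

        return multi_callback_wrapper


logger = LossLoggerCallback()
group = MiniCallBackGroup([logger])
# One fan-out call per training step; the arguments here are placeholders.
group.on_training_step_end(None, {}, {}, torch.tensor(0.5), iteration=1)
print(logger.history)  # [0.5]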
imaginaire/utils/checkpointer.py
ADDED
@@ -0,0 +1,282 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import os
import threading
from typing import TYPE_CHECKING, NamedTuple

import torch
import torch.distributed as dist
from torch import nn

from imaginaire.model import ImaginaireModel
from imaginaire.utils import callback, distributed, log, misc
from imaginaire.utils.parallelism import ModelWrapper

if TYPE_CHECKING:
    from imaginaire.config import CheckpointConfig, JobConfig


class Checkpointer:
    """The checkpointer class. Supports checkpoint saving/loading to local disk."""

    def __init__(self, config_checkpoint: CheckpointConfig, config_job: JobConfig, callbacks: callback.CallBackGroup):
        """Constructor of the checkpointer.

        Args:
            config_checkpoint (CheckpointConfig): The config object for the checkpointer.
            config_job (JobConfig): The config object for the job.
            callbacks (callback.CallBackGroup): The callback group notified about checkpoint events.
        """
        # Set the callback functions.
        self.callbacks = callbacks
        self.checkpoint_dir_local = f"{config_job.path_local}/checkpoints"
        self.strict_resume = config_checkpoint.strict_resume
        self.load_path = config_checkpoint.load_path or None
        self.load_training_state = config_checkpoint.load_training_state
        self.only_load_scheduler_state = config_checkpoint.only_load_scheduler_state
        self.save_thread = None

    def save(
        self,
        model: ImaginaireModel,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler.LRScheduler,
        grad_scaler: torch.amp.GradScaler,
        iteration: int,
    ) -> None:
        """Save network weights, optimizer parameters, scheduler parameters to a checkpoint.

        Args:
            model (ImaginaireModel): The PyTorch model.
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
            grad_scaler (torch.amp.GradScaler): The gradient scaler (for mixed precision training).
            iteration (int): Current iteration number.
        """
        self.callbacks.on_save_checkpoint_start(model, iteration)

        checkpoint_file = f"iter_{iteration:09}.pt"

        if distributed.get_rank() == 0:
            state_dict = dict(
                model=model.state_dict(),
                optimizer=optimizer.state_dict(),
                scheduler=scheduler.state_dict(),
                grad_scaler=grad_scaler.state_dict(),
                iteration=iteration,
            )
            state_dict = misc.to(state_dict, device="cpu")
            self.callbacks.on_save_checkpoint(model, state_dict=state_dict)
            # Wait for the previous saver thread to end.
            if self.save_thread:
                self.save_thread.join()
            # Run the checkpoint saver in a separate thread.
            self.save_thread = threading.Thread(
                target=self._save_worker_local,
                daemon=False,
                args=(state_dict, checkpoint_file, distributed.get_rank()),
            )
            self.save_thread.start()

        # Note: Checkpoints are saved on a separate thread, so this callback is not accurate.
        # Please check the logs from on_save_checkpoint_success() for better accuracy.
        self.callbacks.on_save_checkpoint_end(model=None, iteration=iteration)

    @misc.timer("checkpoint saving (local)")
    def _save_worker_local(self, state_dict: dict[str, torch.Tensor], checkpoint_file: str, rank: int = 0) -> None:
        """Worker to save checkpoint to local disk, spawned with a child thread (runs in parallel with the training).

        Args:
            state_dict (dict[str, torch.Tensor]): The state dict of the model/optimizer/scheduler.
            checkpoint_file (str): The file name of the model checkpoint.
            rank (int): GPU device (default: 0).
        """
        checkpoint_path = os.path.join(self.checkpoint_dir_local, checkpoint_file)
        os.makedirs(self.checkpoint_dir_local, exist_ok=True)
        try:
            torch.save(state_dict, checkpoint_path)
            if rank == 0:
                self._write_latest_checkpoint_file(checkpoint_file)
            log.success(f"Saved checkpoint (local): {checkpoint_path}")
            iteration = int(checkpoint_file.replace("iter_", "").replace(".pt", ""))
            self.callbacks.on_save_checkpoint_success(iteration=iteration)
        except Exception as e:
            log.exception(f"Checkpoint failed to save (local): {e}")

    @misc.timer("checkpoint loading")
    def load(
        self,
        model: ImaginaireModel,
        optimizer: torch.optim.Optimizer | None = None,
        scheduler: torch.optim.lr_scheduler.LRScheduler | None = None,
        grad_scaler: torch.amp.GradScaler | None = None,
    ) -> int:
        """Load network weights and optimizer states from a checkpoint in a single process.

        The priority of the checkpoint loading logic is:
        1. Attempt to resume training if possible by looking for latest_checkpoint.txt under the same name.
        2. If no latest checkpoint is found, load the model weights specified by config_checkpoint.path.
           - This is typically used for inference mode.
           - If config_checkpoint.load_optimizer_state is True, then also load the optimizer and scheduler states.
        3. If none of the above, randomly initialize the model parameters and train from scratch.

        Args:
            model (ImaginaireModel): The PyTorch model.
            optimizer (torch.optim.Optimizer | None): The model optimizer (default: None).
            scheduler (torch.optim.lr_scheduler.LRScheduler | None): The optimization scheduler (default: None).
            grad_scaler (torch.amp.GradScaler | None): The gradient scaler (for mixed precision training).

        Returns:
            iteration (int): the iteration number to start/resume from.
        """
        self.callbacks.on_load_checkpoint_start(model)

        latest_checkpoint_file = self._read_latest_checkpoint_file()
        if latest_checkpoint_file is not None:
            # 1. Resume training from latest_checkpoint.txt under the same name.
            checkpoint_dir = self.checkpoint_dir_local
            checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint_file)
            resume = True
            only_resume_scheduler = True
        else:
            if self.load_path:
                # 2. Load the module weights specified by config_checkpoint.path.
                checkpoint_path = self.load_path
                resume = self.load_training_state
                only_resume_scheduler = self.only_load_scheduler_state
            else:
                # 3. Randomly initialize the model parameters and train from scratch.
                checkpoint_path = None
                resume = False
                only_resume_scheduler = False
        # Load checkpoint.
        if checkpoint_path is not None:
            self._check_checkpoint_exists(checkpoint_path)
            log.info(f"Loading checkpoint (local): {checkpoint_path}")
            state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
            log.success(f"Complete loading checkpoint (local): {checkpoint_path}")
            self.callbacks.on_load_checkpoint(model, state_dict=state_dict)
            # Load the state dicts.
            log.info("- Loading the model...")
            model.load_state_dict(state_dict["model"], strict=self.strict_resume)
            if resume or only_resume_scheduler:
                iteration = state_dict["iteration"]
                assert scheduler
                log.info("- Loading the scheduler...")
                scheduler.load_state_dict(state_dict["scheduler"])
                scheduler.last_epoch = iteration
            else:
                iteration = 0
            if resume:
                assert optimizer
                log.info("- Loading the optimizer...")
                optimizer.load_state_dict(state_dict["optimizer"])
                log.info("- Loading the gradient scaler...")
                grad_scaler.load_state_dict(state_dict["grad_scaler"])
                log.success(f"Done with loading the checkpoint (iteration {iteration}).")
            else:
                log.success("Done with loading the checkpoint.")
        else:
            # Checkpoint not found and not specified. We will train everything from scratch.
            iteration = 0
            log.info("Training from scratch.")
        torch.cuda.empty_cache()

        self.callbacks.on_load_checkpoint_end(model, iteration=iteration, checkpoint_path=checkpoint_path)

        return iteration

    def _read_latest_checkpoint_file(self) -> str | None:
        """Get the file name of the latest saved checkpoint. If it doesn't exist, return None.

        Returns:
            checkpoint_file (str | None): file name of the latest saved checkpoint.
        """
        checkpoint_file = None
        latest_path = os.path.join(self.checkpoint_dir_local, "latest_checkpoint.txt")
        if os.path.isfile(latest_path):
            checkpoint_file = open(latest_path).read().strip()
        return checkpoint_file

    def _write_latest_checkpoint_file(self, checkpoint_file: str) -> None:
        """Track the file name of the latest saved checkpoint.

        Args:
            checkpoint_file (str): file name of the latest saved checkpoint.
        """
        content = f"{checkpoint_file}\n"
        latest_path = os.path.join(self.checkpoint_dir_local, "latest_checkpoint.txt")
        with open(latest_path, "w") as file:
            file.write(content)

    def _check_checkpoint_exists(self, checkpoint_path: str) -> None:
        """If the file checkpoint_path does not exist, raise an error.

        Args:
            checkpoint_path (str): full path to the checkpoint.
        """
        if not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"File not found (local): {checkpoint_path}")

    def finalize(self) -> None:
        """Finalize the checkpointer."""
        if self.save_thread:
            self.save_thread.join()


class _IncompatibleKeys(
    NamedTuple(
        "IncompatibleKeys",
        [
            ("missing_keys", list[str]),
            ("unexpected_keys", list[str]),
            ("incorrect_shapes", list[tuple[str, tuple[int], tuple[int]]]),
        ],
    )
):
    pass


def load_checkpoint(
    model_parts: list[nn.Module],
    ckpt_dir,
    model_ckpt_key_map: dict[str, str] = {},  # noqa: B006
):
    log.info(f"Loading checkpoint from {ckpt_dir}.")

    _model_wrapper = ModelWrapper(model_parts)
    state_dict = _model_wrapper.state_dict()
    # Remove _extra_state entries.
    state_dict = {k: v for k, v in state_dict.items() if not k.endswith("._extra_state")}

    # Remap keys if needed.
    if model_ckpt_key_map:
        for model_key, checkpoint_key in model_ckpt_key_map.items():
            state_dict[checkpoint_key] = state_dict.pop(model_key)
            log.info(f"Re-mapping {model_key} to {checkpoint_key}")

    fs_storage_reader = dist.checkpoint.FileSystemReader(ckpt_dir)
    dist.checkpoint.load(state_dict=state_dict, storage_reader=fs_storage_reader)

    # Invert the remapping if needed.
    if model_ckpt_key_map:
        for model_key, checkpoint_key in model_ckpt_key_map.items():
            state_dict[model_key] = state_dict.pop(checkpoint_key)
            log.info(f"Inverse re-mapping {checkpoint_key} to {model_key}")

    _model_wrapper.load_state_dict(state_dict)

    log.info(f"Finished loading checkpoint from {ckpt_dir}.")
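
For reference, a small self-contained sketch (not part of the commit) of the latest_checkpoint.txt protocol that save() and load() agree on; the /tmp path and the iteration number are made up for illustration:

# Illustrative sketch only: the latest_checkpoint.txt bookkeeping that
# Checkpointer uses to decide whether a run can resume.
import os

checkpoint_dir = "/tmp/demo_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# What _save_worker_local() does on rank 0 after a successful torch.save().
iteration = 1000
checkpoint_file = f"iter_{iteration:09}.pt"  # -> "iter_000001000.pt"
with open(os.path.join(checkpoint_dir, "latest_checkpoint.txt"), "w") as f:
    f.write(f"{checkpoint_file}\n")

# What _read_latest_checkpoint_file() recovers at startup: if the file
# exists, load() resumes from it; otherwise it falls back to load_path
# or trains from scratch.
latest_path = os.path.join(checkpoint_dir, "latest_checkpoint.txt")
latest = open(latest_path).read().strip() if os.path.isfile(latest_path) else None
print(latest)  # iter_000001000.pt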
imaginaire/utils/config_helper.py
ADDED
@@ -0,0 +1,201 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import os
import pkgutil
import sys
from dataclasses import fields as dataclass_fields
from dataclasses import is_dataclass
from typing import Any

import attr
import attrs
from hydra import compose, initialize
from hydra.core.config_store import ConfigStore
from hydra.core.global_hydra import GlobalHydra
from omegaconf import DictConfig, OmegaConf

from imaginaire.config import Config
from imaginaire.utils import log


def is_attrs_or_dataclass(obj) -> bool:
    """
    Check if the object is an instance of an attrs class or a dataclass.

    Args:
        obj: The object to check.

    Returns:
        bool: True if the object is an instance of an attrs class or a dataclass, False otherwise.
    """
    return is_dataclass(obj) or attr.has(type(obj))


def get_fields(obj):
    """
    Get the fields of an attrs class or a dataclass.

    Args:
        obj: The object to get fields from. Must be an instance of an attrs class or a dataclass.

    Returns:
        list: A list of field names.

    Raises:
        ValueError: If the object is neither an attrs class nor a dataclass.
    """
    if is_dataclass(obj):
        return [field.name for field in dataclass_fields(obj)]
    elif attr.has(type(obj)):
        return [field.name for field in attr.fields(type(obj))]
    else:
        raise ValueError("The object is neither an attrs class nor a dataclass.")


def override(config: Config, overrides: list[str] | None = None) -> Config:
    """
    :param config: the instance of class `Config` (usually from `make_config`)
    :param overrides: list of overrides for config
    :return: the composed instance of class `Config`
    """
    # Convert the Config object to a DictConfig object.
    config_dict = attrs.asdict(config)
    config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})
    # Enforce a "--" separator between the script arguments and overriding configs.
    if overrides:
        if overrides[0] != "--":
            raise ValueError('Hydra config overrides must be separated with a "--" token.')
        overrides = overrides[1:]
        # Use Hydra to handle the overrides.
        cs = ConfigStore.instance()
        cs.store(name="config", node=config_omegaconf)
        if not GlobalHydra().is_initialized():
            with initialize(version_base=None):
                config_omegaconf = compose(config_name="config", overrides=overrides)
                OmegaConf.resolve(config_omegaconf)
        else:
            config_omegaconf = compose(config_name="config", overrides=overrides)
            OmegaConf.resolve(config_omegaconf)

    def config_from_dict(ref_instance: Any, kwargs: Any) -> Any:
        """
        Construct an instance of the same type as ref_instance using the provided dictionary, primitive data, or unstructured data.

        Args:
            ref_instance: The reference instance to determine the type and fields when needed.
            kwargs: A dictionary of keyword arguments to use for constructing the new instance, or primitive/unstructured data.

        Returns:
            Any: A new instance of the same type as ref_instance constructed using the provided kwargs, or the primitive/unstructured data.

        Raises:
            AssertionError: If the fields do not match or if extra keys are found.
            Exception: If there is an error constructing the new instance.
        """
        is_type = is_attrs_or_dataclass(ref_instance)
        if not is_type:
            return kwargs
        else:
            ref_fields = set(get_fields(ref_instance))
            assert isinstance(kwargs, dict) or isinstance(kwargs, DictConfig), (
                "kwargs must be a dictionary or a DictConfig"
            )
            keys = set(kwargs.keys())

            # ref_fields must be equal to or include all keys.
            extra_keys = keys - ref_fields
            assert ref_fields == keys or keys.issubset(ref_fields), (
                f"Fields mismatch: {ref_fields} != {keys}. Extra keys found: {extra_keys} \n \t when constructing {type(ref_instance)} with {keys}"
            )

            resolved_kwargs: dict[str, Any] = {}
            for f in keys:
                resolved_kwargs[f] = config_from_dict(getattr(ref_instance, f), kwargs[f])
            try:
                new_instance = type(ref_instance)(**resolved_kwargs)
            except Exception as e:
                log.error(f"Error when constructing {type(ref_instance)} with {resolved_kwargs}")
                log.error(e)
                raise e
            return new_instance

    config = config_from_dict(config, config_omegaconf)

    return config


def get_config_module(config_file: str) -> str:
    if not config_file.endswith(".py"):
        log.error("Config file cannot be specified as module.")
        log.error("Please provide the path to the Python config file (relative to the Imaginaire4 root).")
    assert os.path.isfile(config_file), f"Imaginaire4 config file ({config_file}) not found."
    # Convert to an importable module format.
    config_module = config_file.replace("/", ".").replace(".py", "")
    return config_module


def import_all_modules_from_package(package_path: str, reload: bool = False, skip_underscore: bool = True) -> None:
    """
    Import all modules from the specified package path recursively.

    This function is typically used in conjunction with Hydra to ensure that all modules
    within a specified package are imported, which is necessary for registering configurations.

    Example usage:
    ```python
    import_all_modules_from_package("projects.cosmos.diffusion.v1.config.experiment", reload=True, skip_underscore=False)
    ```

    Args:
        package_path (str): The dotted path to the package from which to import all modules.
        reload (bool): Flag to determine whether to reload modules if they're already imported.
        skip_underscore (bool): If True, skips importing modules that start with an underscore.
    """
    log.critical(f"{'Reloading' if reload else 'Importing'} all modules from package {package_path}")
    package = importlib.import_module(package_path)
    package_directory = package.__path__

    def import_modules_recursively(directory: str, prefix: str) -> None:
        """
        Recursively imports or reloads all modules in the given directory.

        Args:
            directory (str): The file system path to the current package directory.
            prefix (str): The module prefix (e.g., 'projects.cosmos.diffusion.v1.config').
        """
        for _, module_name, is_pkg in pkgutil.iter_modules([directory]):
            if skip_underscore and module_name.startswith("_"):
                log.debug(f"Skipping module {module_name} as it starts with an underscore")
                continue

            full_module_name = f"{prefix}.{module_name}"
            log.debug(f"{'Reloading' if reload else 'Importing'} module {full_module_name}")

            if full_module_name in sys.modules and reload:
                importlib.reload(sys.modules[full_module_name])
            else:
                importlib.import_module(full_module_name)

            if is_pkg:
                sub_package_directory = os.path.join(directory, module_name)
                import_modules_recursively(sub_package_directory, full_module_name)

    for directory in package_directory:
        import_modules_recursively(directory, package_path)
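
A minimal sketch (not part of the commit) of the round trip that override() performs: typed attrs config to DictConfig, apply an override, rebuild the typed config. To keep the example self-contained, Hydra is replaced with a plain OmegaConf merge, and DemoConfig is a hypothetical stand-in for the real Config:

# Illustrative sketch only: the attrs -> DictConfig -> typed-config round
# trip that override() performs, shown without Hydra. DemoConfig is made up.
import attrs
from omegaconf import DictConfig, OmegaConf


@attrs.define
class DemoConfig:
    lr: float = 1e-4
    max_iter: int = 1000


config = DemoConfig()
# Step 1: flatten the typed config into an untyped DictConfig.
config_omegaconf = DictConfig(content=attrs.asdict(config))
# Step 2: apply an override (override() would route this through Hydra).
config_omegaconf = OmegaConf.merge(config_omegaconf, OmegaConf.from_dotlist(["lr=0.0003"]))
# Step 3: rebuild the typed config, as config_from_dict() does recursively.
rebuilt = DemoConfig(**OmegaConf.to_container(config_omegaconf))
print(rebuilt)  # DemoConfig(lr=0.0003, max_iter=1000)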
imaginaire/utils/device.py
ADDED
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import os

import pynvml


class Device:
    _nvml_affinity_elements = math.ceil(os.cpu_count() / 64)  # type: ignore

    def __init__(self, device_idx: int):
        super().__init__()
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)

    def get_name(self) -> str:
        return pynvml.nvmlDeviceGetName(self.handle)

    def get_cpu_affinity(self) -> list[int]:
        affinity_string = ""
        for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
            # Assume NVML returns a list of 64-bit ints.
            affinity_string = f"{j:064b}" + affinity_string
        affinity_list = [int(x) for x in affinity_string]
        affinity_list.reverse()  # so core 0 is in the 0th element of the list
        return [i for i, e in enumerate(affinity_list) if e != 0]
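
A short sketch (not part of the commit) of the bitmask decoding that get_cpu_affinity() performs, run on a hand-built mask so it needs neither pynvml nor a GPU:

# Illustrative sketch only: the bitmask decoding done by get_cpu_affinity(),
# using a hand-built mask instead of a real pynvml query.
mask_words = [0b1011]  # NVML would return one 64-bit word per 64 CPUs.

affinity_string = ""
for j in mask_words:
    # Prepend so the highest word ends up leftmost, as in get_cpu_affinity().
    affinity_string = f"{j:064b}" + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse()  # now index i corresponds to CPU core i
cores = [i for i, e in enumerate(affinity_list) if e != 0]
print(cores)  # [0, 1, 3] -> cores 0, 1, and 3 are allowed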
imaginaire/utils/distributed.py
ADDED
@@ -0,0 +1,444 @@
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import collections
|
| 19 |
+
import collections.abc
|
| 20 |
+
import ctypes
|
| 21 |
+
import functools
|
| 22 |
+
import os
|
| 23 |
+
from collections.abc import Callable, Container
|
| 24 |
+
from contextlib import contextmanager
|
| 25 |
+
from datetime import timedelta
|
| 26 |
+
from typing import TYPE_CHECKING, Any
|
| 27 |
+
|
| 28 |
+
import pynvml
|
| 29 |
+
import torch
|
| 30 |
+
import torch.distributed as dist
|
| 31 |
+
from torch.distributed import get_process_group_ranks
|
| 32 |
+
|
| 33 |
+
from imaginaire.utils.device import Device
|
| 34 |
+
|
| 35 |
+
if dist.is_available():
|
| 36 |
+
from torch.distributed.distributed_c10d import _get_default_group
|
| 37 |
+
from torch.distributed.utils import _sync_module_states, _verify_param_shape_across_processes
|
| 38 |
+
|
| 39 |
+
from imaginaire.utils import log
|
| 40 |
+
|
| 41 |
+
if TYPE_CHECKING:
|
| 42 |
+
from imaginaire.config import DDPConfig
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
from megatron.core import parallel_state
|
| 46 |
+
except ImportError:
|
| 47 |
+
print("Megatron-core is not installed.")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def init() -> int | None:
|
| 51 |
+
"""Initialize distributed training."""
|
| 52 |
+
if dist.is_initialized():
|
| 53 |
+
return torch.cuda.current_device()
|
| 54 |
+
|
| 55 |
+
# Set GPU affinity.
|
| 56 |
+
pynvml.nvmlInit()
|
| 57 |
+
local_rank = int(os.getenv("LOCAL_RANK", 0))
|
| 58 |
+
try:
|
| 59 |
+
device = Device(local_rank)
|
| 60 |
+
os.sched_setaffinity(0, device.get_cpu_affinity())
|
| 61 |
+
except (OSError, pynvml.NVMLError) as e:
|
| 62 |
+
log.warning(f"Failed to set device affinity: {e}")
|
| 63 |
+
# Set up NCCL communication.
|
| 64 |
+
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "0"
|
| 65 |
+
os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
|
| 66 |
+
if dist.is_available():
|
| 67 |
+
torch.cuda.set_device(local_rank)
|
| 68 |
+
# Get the timeout value from environment variable
|
| 69 |
+
timeout_seconds = os.getenv("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", 1800)
|
| 70 |
+
# Convert the timeout to an integer (if it isn't already) and then to a timedelta
|
| 71 |
+
timeout_timedelta = timedelta(seconds=int(timeout_seconds))
|
| 72 |
+
dist.init_process_group(backend="nccl", init_method="env://", timeout=timeout_timedelta)
|
| 73 |
+
log.info(
|
| 74 |
+
f"Initialized distributed training with local rank {local_rank} with timeout {timeout_seconds}",
|
| 75 |
+
rank0_only=False,
|
| 76 |
+
)
|
| 77 |
+
# Increase the L2 fetch granularity for faster speed.
|
| 78 |
+
_libcudart = ctypes.CDLL("libcudart.so")
|
| 79 |
+
# Set device limit on the current device.
|
| 80 |
+
p_value = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
|
| 81 |
+
_libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
|
| 82 |
+
_libcudart.cudaDeviceGetLimit(p_value, ctypes.c_int(0x05))
|
| 83 |
+
log.info(f"Training with {get_world_size()} GPUs.")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+


def get_rank(group: dist.ProcessGroup | None = None) -> int:
    """Get the rank (GPU device) of the worker.

    Returns:
        rank (int): The rank of the worker.
    """
    rank = 0
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank(group)
    return rank


def get_world_size(group: dist.ProcessGroup | None = None) -> int:
    """Get the world size, i.e. how many GPUs are available in this job.

    Returns:
        world_size (int): The total number of GPUs available in this job.
    """
    world_size = 1
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size(group)
    return world_size


def is_rank0() -> bool:
    """Check if the current process is the master GPU.

    Returns:
        (bool): True if this function is called from the master GPU, else False.
    """
    return get_rank() == 0


def is_local_rank0() -> bool:
    """Check if the current process is the local master GPU of the current node.

    Note: this relies on torch.cuda.set_device(local_rank) having been called (see init()).

    Returns:
        (bool): True if this function is called from the local master GPU, else False.
    """
    return torch.cuda.current_device() == 0


def rank0_only(func: Callable) -> Callable:
    """Apply this function only to the master GPU.

    Example usage:
        @rank0_only
        def func(x):
            return x + 3

    Args:
        func (Callable): a function.

    Returns:
        (Callable): A function wrapper executing the function only on the master GPU.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if is_rank0():
            return func(*args, **kwargs)
        else:
            return None

    return wrapper


def barrier() -> None:
    """Barrier for all GPUs."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()


def rank0_first(func: Callable) -> Callable:
    """Run the function on rank 0 first, then on the other ranks."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if is_rank0():
            result = func(*args, **kwargs)
        barrier()
        if not is_rank0():
            result = func(*args, **kwargs)
        return result

    return wrapper
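

# Usage sketch (illustrative addition, not part of the original module): let
# rank 0 populate a shared cache before the remaining ranks read from it.
@rank0_first
def _example_prepare_cache(path: str) -> str:  # hypothetical helper
    os.makedirs(path, exist_ok=True)  # e.g. download or extract a dataset here
    return path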


def parallel_model_wrapper(config_ddp: DDPConfig, model: torch.nn.Module) -> torch.nn.Module | DistributedDataParallel:
    """Wraps the model to enable data parallelism for training across multiple GPU devices.

    Args:
        config_ddp (DDPConfig): The data parallel config.
        model (torch.nn.Module): The PyTorch module.

    Returns:
        model (torch.nn.Module | DistributedDataParallel): The data parallel model wrapper
            if a distributed environment is available; otherwise the original model.
    """
    if dist.is_available() and dist.is_initialized():
        local_rank = int(os.getenv("LOCAL_RANK", 0))
        try:
            ddp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
        except Exception as e:
            log.info(e)
            log.info("parallel_state not initialized, treating all GPUs equally for DDP")
            ddp_group = None

        model = DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=config_ddp.find_unused_parameters,
            static_graph=config_ddp.static_graph,
            broadcast_buffers=config_ddp.broadcast_buffers,
            process_group=ddp_group,
        )
    return model
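

# Usage sketch (illustrative addition, not part of the original module):
def _example_wrap_model(config, model: torch.nn.Module) -> torch.nn.Module:
    # `config.ddp` is assumed to be a DDPConfig; see imaginaire.config.
    return parallel_model_wrapper(config.ddp, model.cuda())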


class DistributedDataParallel(torch.nn.parallel.DistributedDataParallel):
    """This extends torch.nn.parallel.DistributedDataParallel with .training_step().

    This borrows the concept of `forward-redirection` from PyTorch Lightning. It wraps an ImaginaireModel such that
    model.training_step() would be executed when calling self.training_step(), while preserving the behavior of calling
    model() for PyTorch modules. Internally, this is a double rerouting mechanism (training_step -> forward ->
    training_step), allowing us to preserve the function names and signatures.
    """

    def __init__(self, model: torch.nn.Module, *args, **kwargs):
        super().__init__(model, *args, **kwargs)
        self.show_sync_grad_static_graph_warning = True

    def training_step(self, *args, **kwargs) -> Any:
        # Cache the original model.forward() method.
        original_forward = self.module.forward

        def wrapped_training_step(*_args, **_kwargs):
            # Unpatch immediately before calling training_step() because it may want to call the real forward itself.
            self.module.forward = original_forward
            # The actual .training_step().
            return self.module.training_step(*_args, **_kwargs)

        # Patch the original module's forward so we can redirect the arguments back to the real method.
        self.module.forward = wrapped_training_step
        # Call self, which implicitly calls self.forward() --> model.forward(), which is now model.training_step().
        # Without calling self.forward() or model.forward() explicitly, implicit hooks are also executed.
        return self(*args, **kwargs)
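

# Usage sketch (illustrative addition, not part of the original module): calling
# training_step() on the wrapper reaches model.training_step() while DDP's
# forward hooks (and hence gradient synchronization) still fire.
def _example_training_step(ddp_model: DistributedDataParallel, data_batch: dict) -> Any:
    # The single (data_batch,) signature is hypothetical; ImaginaireModel defines the real one.
    return ddp_model.training_step(data_batch)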


@contextmanager
def ddp_sync_grad(model, enabled):
    r"""
    Context manager to enable/disable gradient synchronization across DDP processes for a DDP model.
    Modified from:
    https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel.no_sync
    Note that this is incompatible with static_graph=True and will be a no-op if static_graph=True.

    Within this context, gradients will be accumulated on module
    variables, which will later be synchronized in the first
    forward-backward pass exiting the context.

    .. warning::
        The forward pass should be included inside the context manager, or
        else gradients will still be synchronized.
    """
    assert isinstance(model, torch.nn.Module)
    if isinstance(model, DistributedDataParallel):
        old_require_backward_grad_sync = model.require_backward_grad_sync
        if model.static_graph and model.require_backward_grad_sync != enabled:
            if model.show_sync_grad_static_graph_warning:
                log.warning("DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced.")
                model.show_sync_grad_static_graph_warning = False
        else:
            model.require_backward_grad_sync = enabled
    try:
        yield
    finally:
        if isinstance(model, DistributedDataParallel):
            model.require_backward_grad_sync = old_require_backward_grad_sync
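

# Usage sketch (illustrative addition, not part of the original module): skip the
# gradient all-reduce on accumulation micro-batches and sync only on the last one.
def _example_grad_accumulation(ddp_model, micro_batches, optimizer) -> None:  # hypothetical helper
    for i, batch in enumerate(micro_batches):
        is_last = i == len(micro_batches) - 1
        with ddp_sync_grad(ddp_model, enabled=is_last):
            loss = ddp_model.training_step(batch)
            loss.backward()
    optimizer.step()
    optimizer.zero_grad()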


def collate_batches(data_batches: list[dict[str, torch.Tensor]]) -> torch.Tensor | dict[str, torch.Tensor]:
    """Aggregate the list of data batches from all devices and process the results.

    This is used for gathering validation data batches with imaginaire.utils.dataloader.DistributedEvalSampler.
    It will return the data/output of the entire validation set in its original index order. The sizes of data_batches
    in different ranks may differ by 1 (if the dataset size is not evenly divisible), in which case a dummy sample will
    be created before calling dist.all_gather().

    Args:
        data_batches (list[dict[str, torch.Tensor]]): List of tensors or (hierarchical) dictionary where
            leaf entries are tensors.

    Returns:
        data_gather (torch.Tensor | dict[str, torch.Tensor]): tensors or (hierarchical) dictionary where
            leaf entries are concatenated tensors.
    """
    if isinstance(data_batches[0], torch.Tensor):
        # Concatenate the local data batches.
        data_concat = torch.cat(data_batches, dim=0)  # type: ignore
        # Get the largest number of local samples from all ranks to determine whether to dummy-pad on this rank.
        max_num_local_samples = torch.tensor(len(data_concat), device="cuda")
        dist.all_reduce(max_num_local_samples, op=dist.ReduceOp.MAX)
        if len(data_concat) < max_num_local_samples:
            assert len(data_concat) + 1 == max_num_local_samples
            dummy = torch.empty_like(data_concat[:1])
            data_concat = torch.cat([data_concat, dummy], dim=0)
            dummy_count = torch.tensor(1, device="cuda")
        else:
            dummy_count = torch.tensor(0, device="cuda")
        dist.all_reduce(dummy_count, op=dist.ReduceOp.SUM)
        # Get all concatenated batches from all ranks and interleave them back into the original index order.
        data_concat = all_gather_tensor(data_concat.contiguous())
        data_collate = torch.stack(data_concat, dim=1).flatten(start_dim=0, end_dim=1)
        # Remove the dummy samples.
        if dummy_count > 0:
            data_collate = data_collate[:-dummy_count]
    elif isinstance(data_batches[0], collections.abc.Mapping):
        data_collate = dict()
        for key in data_batches[0].keys():
            data_collate[key] = collate_batches([data[key] for data in data_batches])  # type: ignore
    else:
        raise TypeError
    return data_collate
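

# Usage sketch (illustrative addition, not part of the original module): each rank
# passes its own list of per-batch outputs; the result covers the full dataset.
def _example_gather_val_outputs(local_outputs: list[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
    # e.g. local_outputs[i] == {"loss": (B,), "pred": (B, C)} on every rank
    return collate_batches(local_outputs)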


@torch.no_grad()
def all_gather_tensor(tensor: torch.Tensor) -> list[torch.Tensor]:
    """Gather the corresponding tensor from all GPU devices to a list.

    Args:
        tensor (torch.Tensor): PyTorch tensor.

    Returns:
        tensor_list (list[torch.Tensor]): A list of PyTorch tensors gathered from all GPU devices.
    """
    tensor_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
    dist.all_gather(tensor_list, tensor)
    return tensor_list


def broadcast(tensor, src, group=None, async_op=False):
    """Broadcast `tensor` in-place from rank `src`; a no-op when running on a single GPU."""
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    dist.broadcast(tensor, src=src, group=group, async_op=async_op)


def dist_reduce_tensor(tensor, rank=0, reduce="mean"):
    r"""Reduce `tensor` across ranks onto the given rank (default 0), using "mean" or "sum"."""
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    with torch.no_grad():
        dist.reduce(tensor, dst=rank)
        if get_rank() == rank:
            if reduce == "mean":
                tensor /= world_size
            elif reduce == "sum":
                pass
            else:
                raise NotImplementedError
    return tensor
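

# Usage sketch (illustrative addition, not part of the original module): average a
# scalar metric onto rank 0 for logging; clone first since the reduce is in-place.
def _example_log_mean_loss(loss: torch.Tensor) -> None:  # hypothetical helper
    reduced = dist_reduce_tensor(loss.detach().clone(), rank=0, reduce="mean")
    if is_rank0():
        log.info(f"mean loss across ranks: {reduced.item()}")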


def sync_model_states(
    model: torch.nn.Module,
    process_group: dist.ProcessGroup | None = None,
    src: int = 0,
    params_and_buffers_to_ignore: Container[str] | None = None,
    broadcast_buffers: bool = True,
):
    """
    Modified from the DDP source code.
    Synchronizes the parameters and buffers of a model across different processes in a distributed setting.

    This function ensures that all processes in the specified process group have the same initial parameters and
    buffers from the source rank, typically rank 0. It is useful when different processes start with different model
    states and a synchronization is required to ensure consistency across all ranks.

    Args:
        model (nn.Module): The model whose parameters and buffers are to be synchronized.
        process_group (dist.ProcessGroup, optional): The process group for communication. If None,
            the default group is used. Defaults to None.
        src (int, optional): The source rank from which parameters and buffers will be broadcasted.
            Defaults to 0.
        params_and_buffers_to_ignore (Optional[Container[str]], optional): A container of parameter and buffer
            names to exclude from synchronization. Defaults to None, which means all parameters and buffers are
            included.
        broadcast_buffers (bool, optional): Whether to broadcast buffers or not. Defaults to True.

    Side Effects:
        This function modifies the state of the model in-place to synchronize it with the source rank's model state.

    Raises:
        RuntimeError: If the shapes of parameters across processes do not match, a runtime error will be raised.

    Examples:
        >>> # Download the model weights on rank 0 only instead of duplicating the download
        >>> # on every rank; this saves network bandwidth and time when the weights are huge.
        >>> if dist.get_rank() == 0:
        >>>     model.load_state_dict(network_bound_weights_download_fn(s3_weights_path))
        >>> dist.barrier()
        >>> sync_model_states(model)  # sync rank0 weights to other ranks
    """
    if not dist.is_available() or not dist.is_initialized():
        return
    if process_group is None:
        process_group = _get_default_group()
    if not params_and_buffers_to_ignore:
        params_and_buffers_to_ignore = set()

    log.info(
        f"Synchronizing model states from rank {src} to all ranks in process group {get_process_group_ranks(process_group)}."
    )

    # Build tuple of (module, parameter) for all parameters that require grads.
    modules_and_parameters = [
        (module, parameter)
        for module_name, module in model.named_modules()
        for parameter in [
            param
            # Note that we access module.named_parameters instead of
            # parameters(module). parameters(module) is only needed in the
            # single-process multi-device case, where it accesses replicated
            # parameters through _former_parameters.
            for param_name, param in module.named_parameters(recurse=False)
            if f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
            # if param.requires_grad
            # and f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
        ]
    ]

    # Deduplicate any parameters that might be shared across child modules.
    memo = set()
    modules_and_parameters = [
        # "p not in memo" is the deduplication check.
        # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
        (m, p)
        for m, p in modules_and_parameters
        if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
    ]

    # Build list of parameters.
    parameters = [parameter for _, parameter in modules_and_parameters]
    if len(parameters) == 0:
        return

    _verify_param_shape_across_processes(process_group, parameters)

    _sync_module_states(
        module=model,
        process_group=process_group,
        broadcast_bucket_size=(250 * 1024 * 1024),
        src=src,
        params_and_buffers_to_ignore=params_and_buffers_to_ignore,
        broadcast_buffers=broadcast_buffers,
    )

imaginaire/utils/easy_io/__init__.py
ADDED

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
imaginaire/utils/easy_io/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (144 Bytes).

imaginaire/utils/easy_io/__pycache__/easy_io.cpython-310.pyc
ADDED
Binary file (28.6 kB).

imaginaire/utils/easy_io/__pycache__/file_client.cpython-310.pyc
ADDED
Binary file (14.9 kB).
imaginaire/utils/easy_io/backends/__init__.py
ADDED

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend
from imaginaire.utils.easy_io.backends.http_backend import HTTPBackend
from imaginaire.utils.easy_io.backends.local_backend import LocalBackend
from imaginaire.utils.easy_io.backends.registry_utils import backends, prefix_to_backends, register_backend

__all__ = [
    "BaseStorageBackend",
    "HTTPBackend",
    "LocalBackend",
    "backends",
    "prefix_to_backends",
    "register_backend",
]
imaginaire/utils/easy_io/backends/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (596 Bytes).

imaginaire/utils/easy_io/backends/__pycache__/base_backend.cpython-310.pyc
ADDED
Binary file (1.69 kB).

imaginaire/utils/easy_io/backends/__pycache__/http_backend.cpython-310.pyc
ADDED
Binary file (2.9 kB).

imaginaire/utils/easy_io/backends/__pycache__/local_backend.cpython-310.pyc
ADDED
Binary file (18.4 kB).

imaginaire/utils/easy_io/backends/__pycache__/registry_utils.cpython-310.pyc
ADDED
Binary file (3.57 kB).