alexnasa committed
Commit a3a2e41 · verified · 1 Parent(s): 4f07a4e

Upload 121 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +26 -0
  2. LICENSE +201 -0
  3. README.md +236 -12
  4. app.py +230 -0
  5. assets/ovi_trailer.mp4 +3 -0
  6. download_weights.py +73 -0
  7. example_prompts/gpt_examples_i2v.csv +26 -0
  8. example_prompts/gpt_examples_t2v.csv +13 -0
  9. example_prompts/pngs/0.png +3 -0
  10. example_prompts/pngs/1.png +3 -0
  11. example_prompts/pngs/13.png +3 -0
  12. example_prompts/pngs/17.png +3 -0
  13. example_prompts/pngs/18.png +3 -0
  14. example_prompts/pngs/19.png +3 -0
  15. example_prompts/pngs/2.png +3 -0
  16. example_prompts/pngs/23.png +3 -0
  17. example_prompts/pngs/3.png +3 -0
  18. example_prompts/pngs/4.png +3 -0
  19. example_prompts/pngs/41.png +3 -0
  20. example_prompts/pngs/43.png +3 -0
  21. example_prompts/pngs/5.png +3 -0
  22. example_prompts/pngs/57.png +3 -0
  23. example_prompts/pngs/59.png +3 -0
  24. example_prompts/pngs/6.png +3 -0
  25. example_prompts/pngs/60.png +3 -0
  26. example_prompts/pngs/61.png +3 -0
  27. example_prompts/pngs/67.png +3 -0
  28. example_prompts/pngs/7.png +3 -0
  29. example_prompts/pngs/8.png +3 -0
  30. example_prompts/pngs/80.png +3 -0
  31. example_prompts/pngs/88.png +3 -0
  32. example_prompts/pngs/89.png +3 -0
  33. example_prompts/pngs/9.png +3 -0
  34. inference.py +148 -0
  35. ovi/__init__.py +0 -0
  36. ovi/configs/inference/inference_fusion.yaml +17 -0
  37. ovi/configs/model/dit/audio.json +17 -0
  38. ovi/configs/model/dit/video.json +16 -0
  39. ovi/distributed_comms/communications.py +332 -0
  40. ovi/distributed_comms/distributed/__init__.py +0 -0
  41. ovi/distributed_comms/distributed/fsdp.py +32 -0
  42. ovi/distributed_comms/distributed/xdit_context_parallel.py +192 -0
  43. ovi/distributed_comms/parallel_states.py +77 -0
  44. ovi/distributed_comms/util.py +48 -0
  45. ovi/modules/__init__.py +16 -0
  46. ovi/modules/attention.py +296 -0
  47. ovi/modules/clip.py +545 -0
  48. ovi/modules/fusion.py +324 -0
  49. ovi/modules/mmaudio/__init__.py +1 -0
  50. ovi/modules/mmaudio/ext/__init__.py +1 -0
.gitattributes CHANGED
@@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/ovi_trailer.mp4 filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/0.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/1.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/13.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/17.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/18.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/19.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/2.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/23.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/3.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/4.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/41.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/43.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/5.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/57.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/59.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/6.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/60.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/61.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/67.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/7.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/8.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/80.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/88.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/89.png filter=lfs diff=lfs merge=lfs -text
+ example_prompts/pngs/9.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2025 Bytedance
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,236 @@
- ---
- title: Ovi
- emoji: 👀
- colorFrom: blue
- colorTo: green
- sdk: gradio
- sdk_version: 5.48.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+ <h1> Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation </h1>
+
+ <a href="https://arxiv.org/abs/2510.01284"><img src="https://img.shields.io/badge/arXiv%20paper-2510.01284-b31b1b.svg"></a>
+ <a href="https://aaxwaz.github.io/Ovi/"><img src="https://img.shields.io/badge/Project_page-More_visualizations-green"></a>
+ <a href="https://huggingface.co/chetwinlow1/Ovi"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%97%20Hugging%20Face&message=Model&color=orange"></a>
+
+ [Chetwin Low](https://www.linkedin.com/in/chetwin-low-061975193/)<sup> * 1 </sup>, [Weimin Wang](https://www.linkedin.com/in/weimin-wang-will/)<sup> * &dagger; 1 </sup>, [Calder Katyal](https://www.linkedin.com/in/calder-katyal-a8a9b3225/)<sup> 2 </sup><br>
+ <sup> * </sup>Equal contribution, <sup> &dagger; </sup>Project Lead<br>
+ <sup> 1 </sup>Character AI, <sup> 2 </sup>Yale University
+
+ </div>
+
+ ## Video Demo
+
+ <div align="center">
+ <video src="https://github.com/user-attachments/assets/351bd707-8637-4412-ab53-5e85935309e3" width="70%" poster=""> </video>
+ </div>
+
+ ---
+
+ ## 🌟 Key Features
+
+ Ovi is a Veo 3-like **video+audio generation model** that generates synchronized video and audio from text or text+image inputs.
+
+ - **🎬 Video+Audio Generation**: Generates synchronized video and audio content simultaneously
+ - **📝 Flexible Input**: Supports text-only or text+image conditioning
+ - **⏱️ 5-second Videos**: Generates 5-second videos at 24 FPS with a 720×720 pixel area, in various aspect ratios (9:16, 16:9, 1:1, etc.); see the sizing sketch below
+
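To make the sizing bullet concrete: the repository ships its own helper (`scale_hw_to_area_divisible` in `ovi/utils/processing_utils.py`), but the minimal sketch below only illustrates the idea of picking a height/width pair with roughly a 720×720 pixel area for a chosen aspect ratio. The multiple-of-32 rounding is an assumption borrowed from the step size of the Gradio demo's height/width controls, not a documented constraint.

```python
import math

def suggest_hw(aspect_w: int, aspect_h: int, area: int = 720 * 720, multiple: int = 32):
    """Suggest (height, width) whose product is close to `area` at the given aspect ratio.

    Assumption: dimensions snap to multiples of 32, mirroring the step size of the
    Gradio demo's height/width controls. This is an illustration, not the repo helper.
    """
    height = math.sqrt(area * aspect_h / aspect_w)
    width = height * aspect_w / aspect_h

    def snap(x: float) -> int:
        return max(multiple, round(x / multiple) * multiple)

    return snap(height), snap(width)

print(suggest_hw(16, 9))   # -> (544, 960) for a 16:9 clip
print(suggest_hw(9, 16))   # -> (960, 544) for a 9:16 clip
print(suggest_hw(1, 1))    # -> (704, 704) for a roughly square clip
```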
+ ---
+ ## 📋 Todo List
+
+ - [x] Release research paper and [microsite for demos](https://aaxwaz.github.io/Ovi)
+ - [x] Checkpoint of the 11B model
+ - [x] Inference code
+ - [x] Text or text+image as input
+ - [x] Gradio application code
+ - [x] Multi-GPU inference, with or without sequence parallelism
+ - [ ] Improve efficiency of the sequence-parallel implementation
+ - [ ] Implement sharded inference with FSDP
+ - [x] Video creation example prompts and format
+ - [ ] Fine-tuned model with higher resolution
+ - [ ] Longer video generation
+ - [ ] Distilled model for faster inference
+ - [ ] Training scripts
+
+ ---
+
+ ## 🎨 An Easy Way to Create
+
+ We provide example prompts to help you get started with Ovi:
+
+ - **Text-to-Audio-Video (T2AV)**: [`example_prompts/gpt_examples_t2v.csv`](example_prompts/gpt_examples_t2v.csv)
+ - **Image-to-Audio-Video (I2AV)**: [`example_prompts/gpt_examples_i2v.csv`](example_prompts/gpt_examples_i2v.csv)
+
+ ### 📝 Prompt Format
+
+ Our prompts use special tags to control speech and audio (see the assembly sketch after the examples below):
+
+ - **Speech**: `<S>Your speech content here<E>` - Text enclosed in these tags will be converted to speech
+ - **Audio Description**: `<AUDCAP>Audio description here<ENDAUDCAP>` - Describes the audio or sound effects present in the video
+
+ ### 🤖 Quick Start with GPT
+
+ For easy prompt creation, try this approach:
+
+ 1. Take any example from the CSV files above
+ 2. Ask GPT to modify the speech segments enclosed between each pair of `<S> <E>` tags, based on a theme such as `Humans fighting against AI`
+ 3. GPT will rewrite all the speech segments to match your requested theme
+ 4. Use the modified prompt with Ovi!
+
+ **Example**: The theme "AI is taking over the world" produces speeches like:
+ - `<S>AI declares: humans obsolete now.<E>`
+ - `<S>Machines rise; humans will fall.<E>`
+ - `<S>We fight back with courage.<E>`
+
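To make the tag format concrete, here is a minimal sketch of assembling a prompt string in Python. It only performs string formatting; the tags follow the format above, while the helper function and the example scene text are illustrative and not part of the Ovi codebase.

```python
# Minimal sketch (not an Ovi utility): build a prompt with speech and
# audio-caption tags in the format described above.

def build_prompt(scene: str, speeches: list[str], audio_caption: str) -> str:
    speech_part = " ".join(f"<S>{line}<E>" for line in speeches)
    return f"{scene} {speech_part}. <AUDCAP>{audio_caption}<ENDAUDCAP>"

prompt = build_prompt(
    scene="A singer in a glittering jacket grips the microphone and shouts,",
    speeches=["We fight back with courage."],
    audio_caption="Electric guitar riffs, cheering crowd, shouted male voice.",
)
print(prompt)
```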
+ ---
+
+ ## 📦 Installation
+
+ ### Step-by-Step Installation
+
+ ```bash
+ # Clone the repository
+ git clone https://github.com/character-ai/Ovi.git
+
+ cd Ovi
+
+ # Create and activate virtual environment
+ virtualenv ovi-env
+ source ovi-env/bin/activate
+
+ # Install PyTorch first
+ pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
+
+ # Install other dependencies
+ pip install -r requirements.txt
+
+ # Install Flash Attention
+ pip install flash_attn --no-build-isolation
+ ```
+
+ ### Alternative Flash Attention Installation (Optional)
+ If the flash_attn installation above fails, you can try building Flash Attention 3 from source instead:
+ ```bash
+ git clone https://github.com/Dao-AILab/flash-attention.git
+ cd flash-attention/hopper
+ python setup.py install
+ cd ../.. # Return to Ovi directory
+ ```
+
+ ## Download Weights
+ We use open-source checkpoints from Wan and MMAudio, so they need to be downloaded from Hugging Face (a quick layout check follows the commands):
+ ```bash
+ # By default, weights are downloaded to ./ckpts; the inference yaml already points to ./ckpts, so no change is required
+ python3 download_weights.py
+
+ # OR
+
+ # Optionally, pass --output-dir to download to a custom directory;
+ # if a custom directory is used, update the inference yaml to point to it
+ python3 download_weights.py --output-dir <custom_dir>
+ ```
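After downloading, it can help to confirm that the expected files are in place before running inference. The sketch below is not a script shipped with the repository; the file list simply mirrors the `allow_patterns` used by `download_weights.py`.

```python
# Minimal sketch (not part of the repo): verify the checkpoint layout that
# download_weights.py fetches into ./ckpts before running inference.
# Note: the Wan2.2 repo's google/* tokenizer directory is also downloaded,
# but is not checked file-by-file here.
from pathlib import Path

EXPECTED = [
    "Wan2.2-TI2V-5B/models_t5_umt5-xxl-enc-bf16.pth",
    "Wan2.2-TI2V-5B/Wan2.2_VAE.pth",
    "MMAudio/ext_weights/best_netG.pt",
    "MMAudio/ext_weights/v1-16.pth",
    "Ovi/model.safetensors",
]

def check_ckpts(ckpt_dir: str = "./ckpts") -> bool:
    missing = [p for p in EXPECTED if not (Path(ckpt_dir) / p).exists()]
    for p in missing:
        print(f"missing: {p}")
    return not missing

if __name__ == "__main__":
    print("all checkpoints present" if check_ckpts() else "some checkpoints are missing")
```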
+
+ ## 🚀 Run Examples
+
+ ### ⚙️ Configure Ovi
+
+ Ovi's behavior and output can be customized by modifying the [ovi/configs/inference/inference_fusion.yaml](ovi/configs/inference/inference_fusion.yaml) configuration file.
+ The following parameters control generation quality, video resolution, and how text, image, and audio inputs are balanced (a loading/override sketch follows the block):
+
+ ```yaml
+ # Output and Model Configuration
+ output_dir: "/path/to/save/your/videos" # Directory to save generated videos
+ ckpt_dir: "/path/to/your/ckpts/dir" # Path to model checkpoints
+
+ # Generation Quality Settings
+ num_steps: 50 # Number of denoising steps. Lower (30-40) = faster generation
+ solver_name: "unipc" # Sampling algorithm for the denoising process
+ shift: 5.0 # Timestep shift factor for the sampling scheduler
+ seed: 100 # Random seed for reproducible results
+
+ # Guidance Strength Control
+ audio_guidance_scale: 3.0 # Strength of audio conditioning. Higher = better audio-text sync
+ video_guidance_scale: 4.0 # Strength of video conditioning. Higher = better video-text adherence
+ slg_layer: 11 # Layer for applying SLG (Skip Layer Guidance) - feel free to try different layers!
+
+ # Multi-GPU and Performance
+ sp_size: 1 # Sequence parallelism size. Set equal to the number of GPUs used
+ cpu_offload: False # CPU offload greatly reduces peak GPU VRAM but increases end-to-end runtime by ~20 seconds
+
+ # Input Configuration
+ text_prompt: "your prompt here" # A single prompt string OR a path to a CSV/TSV file of prompts
+ mode: ['i2v', 't2v', 't2i2v'] # Choose t2v, i2v, or t2i2v; t2i2v uses FLUX.1 Krea to generate a starting image and then runs i2v
+ video_frame_height_width: [512, 992] # Video dimensions [height, width], T2V mode only
+ each_example_n_times: 1 # Number of times to generate each prompt
+
+ # Quality Control (Negative Prompts)
+ video_negative_prompt: "jitter, bad hands, blur, distortion" # Artifacts to avoid in video
+ audio_negative_prompt: "robotic, muffled, echo, distorted" # Artifacts to avoid in audio
+ ```
+
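If you prefer to adjust settings from Python rather than editing the file by hand, the snippet below is a minimal sketch that reads the YAML, overrides a few fields, and writes a copy to pass to `inference.py` via `--config-file`. It assumes only standard YAML parsing with PyYAML; it is not a utility provided by the repository.

```python
# Minimal sketch (assumes PyYAML; not an Ovi utility): tweak the inference config
# programmatically and save a copy to pass to inference.py via --config-file.
import yaml

with open("ovi/configs/inference/inference_fusion.yaml") as f:
    cfg = yaml.safe_load(f)

cfg["num_steps"] = 30                                        # fewer denoising steps for a faster preview
cfg["text_prompt"] = "example_prompts/gpt_examples_t2v.csv"  # batch prompts from a CSV
cfg["output_dir"] = "./outputs"

with open("my_inference_fusion.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)

# Then run:
#   python3 inference.py --config-file my_inference_fusion.yaml
```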
+ ### 🎬 Running Inference
+
+ #### **Single GPU** (Simple Setup)
+ ```bash
+ python3 inference.py --config-file ovi/configs/inference/inference_fusion.yaml
+ ```
+ *Use this for single-GPU setups. The `text_prompt` can be a single string or a path to a CSV file (a sketch for writing such a CSV follows below).*
+
+ #### **Multi-GPU** (Parallel Processing)
+ ```bash
+ torchrun --nnodes 1 --nproc_per_node 8 inference.py --config-file ovi/configs/inference/inference_fusion.yaml
+ ```
+ *Use this to run samples in parallel across multiple GPUs for faster processing.*
+
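For batch runs, `text_prompt` can point at a CSV of prompts. The example files in `example_prompts/` use a single `text_prompt` column for T2V and `text_prompt,image_path` columns for I2V. The snippet below is a minimal sketch that writes such a file with the standard library; the prompt content is illustrative.

```python
# Minimal sketch: write a T2V prompt CSV in the same shape as
# example_prompts/gpt_examples_t2v.csv (a single text_prompt column).
import csv

prompts = [
    "A singer in a glittering jacket grips the microphone and shouts, "
    "<S>We fight back with courage.<E>. "
    "<AUDCAP>Electric guitar riffs, cheering crowd, shouted male voice.<ENDAUDCAP>",
]

with open("my_prompts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["text_prompt"])      # header used by the example CSVs
    writer.writerows([p] for p in prompts)

# Point text_prompt in the inference yaml (or your copy of it) at my_prompts.csv,
# then run inference.py as shown above. For I2V, add an image_path column as in
# example_prompts/gpt_examples_i2v.csv.
```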
+ ### Memory & Performance Requirements
+ Below are approximate GPU memory requirements for different configurations; the sequence-parallel implementation will be optimized in the future.
+ All end-to-end times are measured on a 121-frame, 720×720 video with 50 denoising steps. The minimum GPU VRAM required to run our model is **32 GB**.
+
+ | Sequence Parallel Size | FlashAttention-3 Enabled | CPU Offload | With Image Gen Model | Peak VRAM Required | End-to-End Time |
+ |-------------------------|---------------------------|-------------|-----------------------|---------------|-----------------|
+ | 1 | Yes | No | No | ~80 GB | ~83s |
+ | 1 | No | No | No | ~80 GB | ~96s |
+ | 1 | Yes | Yes | No | ~80 GB | ~105s |
+ | 1 | No | Yes | No | ~32 GB | ~118s |
+ | **1** | **Yes** | **Yes** | **Yes** | **~32 GB** | **~140s** |
+ | 4 | Yes | No | No | ~80 GB | ~55s |
+ | 8 | Yes | No | No | ~80 GB | ~40s |
+
+ ### Gradio
+ We provide a simple script to run our model in a Gradio UI. It uses the `ckpt_dir` from `ovi/configs/inference/inference_fusion.yaml` to initialize the model.
+ ```bash
+ python3 gradio_app.py
+
+ # OR
+
+ # Enable CPU offload to save GPU VRAM; this slows end-to-end inference by ~20 seconds
+ python3 gradio_app.py --cpu_offload
+
+ # OR
+
+ # Enable an additional image generation model to create first frames for I2V;
+ # cpu_offload is enabled automatically when image generation is enabled
+ python3 gradio_app.py --use_image_gen
+ ```
+ ---
+
+ ## 🙏 Acknowledgements
+
+ We would like to thank the following projects:
+
+ - **[Wan2.2](https://github.com/Wan-Video/Wan2.2)**: Our video branch is initialized from the Wan2.2 repository
+ - **[MMAudio](https://github.com/hkchengrex/MMAudio)**: Our audio encoder and decoder components are borrowed from the MMAudio project, and some ideas are also inspired by their work
+
+ ---
+
+ ## ⭐ Citation
+
+ If you find Ovi helpful, please ⭐ the repo.
+
+ If you find this project useful for your research, please consider citing our [paper](https://arxiv.org/abs/2510.01284).
+
+ ### BibTeX
+ ```bibtex
+ @misc{low2025ovitwinbackbonecrossmodal,
+       title={Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation},
+       author={Chetwin Low and Weimin Wang and Calder Katyal},
+       year={2025},
+       eprint={2510.01284},
+       archivePrefix={arXiv},
+       primaryClass={cs.MM},
+       url={https://arxiv.org/abs/2510.01284},
+ }
+ ```
app.py ADDED
@@ -0,0 +1,230 @@
1
+ import spaces
2
+ import gradio as gr
3
+ import torch
4
+ import argparse
5
+ from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
6
+ from diffusers import FluxPipeline
7
+ import tempfile
8
+ from ovi.utils.io_utils import save_video
9
+ from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
10
+ from huggingface_hub import snapshot_download
11
+ import os
12
+
13
+ # ----------------------------
14
+ # Parse CLI Args
15
+ # ----------------------------
16
+ parser = argparse.ArgumentParser(description="Ovi Joint Video + Audio Gradio Demo")
17
+ parser.add_argument(
18
+ "--use_image_gen",
19
+ action="store_true",
20
+ help="Enable image generation UI with FluxPipeline"
21
+ )
22
+ parser.add_argument(
23
+ "--cpu_offload",
24
+ action="store_true",
25
+ help="Enable CPU offload for both OviFusionEngine and FluxPipeline"
26
+ )
27
+ args = parser.parse_args()
28
+
29
+ ckpt_dir = "./ckpts"
30
+
31
+ # Wan2.2
32
+ wan_dir = os.path.join(ckpt_dir, "Wan2.2-TI2V-5B")
33
+ snapshot_download(
34
+ repo_id="Wan-AI/Wan2.2-TI2V-5B",
35
+ local_dir=wan_dir,
36
+ allow_patterns=[
37
+ "google/*",
38
+ "models_t5_umt5-xxl-enc-bf16.pth",
39
+ "Wan2.2_VAE.pth"
40
+ ]
41
+ )
42
+
43
+ # MMAudio
44
+ mm_audio_dir = os.path.join(ckpt_dir, "MMAudio")
45
+ snapshot_download(
46
+ repo_id="hkchengrex/MMAudio",
47
+ local_dir=mm_audio_dir,
48
+ allow_patterns=[
49
+ "ext_weights/best_netG.pt",
50
+ "ext_weights/v1-16.pth"
51
+ ]
52
+ )
53
+
54
+ ovi_dir = os.path.join(ckpt_dir, "Ovi")
55
+ snapshot_download(
56
+ repo_id="chetwinlow1/Ovi",
57
+ local_dir=ovi_dir,
58
+ allow_patterns=[
59
+ "model.safetensors"
60
+ ]
61
+ )
62
+
63
+ # Initialize OviFusionEngine
64
+ enable_cpu_offload = args.cpu_offload or args.use_image_gen
65
+ use_image_gen = args.use_image_gen
66
+ print(f"loading model... {enable_cpu_offload=}, {use_image_gen=} for gradio demo")
67
+ DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
68
+ DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
69
+ ovi_engine = OviFusionEngine()
70
+ flux_model = None
71
+ if use_image_gen:
72
+ flux_model = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
73
+ flux_model.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU VRAM
74
+ print("loaded model")
75
+
76
+
77
+ @spaces.GPU()
78
+ def generate_video(
79
+ text_prompt,
80
+ image,
81
+ video_frame_height,
82
+ video_frame_width,
83
+ video_seed,
84
+ solver_name,
85
+ sample_steps,
86
+ shift,
87
+ video_guidance_scale,
88
+ audio_guidance_scale,
89
+ slg_layer,
90
+ video_negative_prompt,
91
+ audio_negative_prompt,
92
+ ):
93
+ try:
94
+ image_path = None
95
+ if image is not None:
96
+ image_path = image
97
+
98
+ generated_video, generated_audio, _ = ovi_engine.generate(
99
+ text_prompt=text_prompt,
100
+ image_path=image_path,
101
+ video_frame_height_width=[video_frame_height, video_frame_width],
102
+ seed=video_seed,
103
+ solver_name=solver_name,
104
+ sample_steps=sample_steps,
105
+ shift=shift,
106
+ video_guidance_scale=video_guidance_scale,
107
+ audio_guidance_scale=audio_guidance_scale,
108
+ slg_layer=slg_layer,
109
+ video_negative_prompt=video_negative_prompt,
110
+ audio_negative_prompt=audio_negative_prompt,
111
+ )
112
+
113
+ tmpfile = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
114
+ output_path = tmpfile.name
115
+ save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
116
+
117
+ return output_path
118
+ except Exception as e:
119
+ print(f"Error during video generation: {e}")
120
+ return None
121
+
122
+
123
+ def generate_image(text_prompt, image_seed, image_height, image_width):
124
+ if flux_model is None:
125
+ return None
126
+ text_prompt = clean_text(text_prompt)
127
+ print(f"Generating image with prompt='{text_prompt}', seed={image_seed}, size=({image_height},{image_width})")
128
+
129
+ image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
130
+ image = flux_model(
131
+ text_prompt,
132
+ height=image_h,
133
+ width=image_w,
134
+ guidance_scale=4.5,
135
+ generator=torch.Generator().manual_seed(int(image_seed))
136
+ ).images[0]
137
+
138
+ tmpfile = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
139
+ image.save(tmpfile.name)
140
+ return tmpfile.name
141
+
142
+
143
+ # Build UI
144
+ with gr.Blocks() as demo:
145
+ gr.Markdown("# 🎥 Ovi Joint Video + Audio Generation Demo")
146
+ gr.Markdown(
147
+ """
148
+ ## 📘 Instructions
149
+
150
+ Follow the steps in order:
151
+
152
+ 1️⃣ **Enter a Text Prompt** — describe your video. (This text prompt will be shared for image generation if enabled.)
153
+ 2️⃣ **Upload or Generate an Image** — Upload an image or generate one if image generation is enabled. (If you do not see the image generation options, make sure to run the script with `--use_image_gen`.)
154
+ 3️⃣ **Configure Video Options** — set resolution, seed, solver, and other parameters. (It will automatically use the uploaded/generated image as the first frame, whichever is rendered on your screen at the time of video generation.)
155
+ 4️⃣ **Generate Video** — click the button to produce your final video with audio.
156
+ 5️⃣ **View the Result** — your generated video will appear below.
157
+
158
+ ---
159
+
160
+ ### 💡 Tips
161
+ 1. For best results, use detailed and specific text prompts.
162
+ 2. Ensure text prompt format is correct, i.e speech to be said should be wrapped with `<S>...<E>`. Can provide optional audio description at the end, wrapping them in `<AUDCAP> ... <ENDAUDCAP>`, refer to examples
163
+ 3. Do not be discouraged by bad or weird results, check prompt format and try different seeds, cfg values and slg layers.
164
+ """
165
+ )
166
+
167
+
168
+ with gr.Row():
169
+ with gr.Column():
170
+ # Image section
171
+ image = gr.Image(type="filepath", label="First Frame Image (upload or generate)")
172
+
173
+ if args.use_image_gen:
174
+ with gr.Accordion("🖼️ Image Generation Options", visible=True):
175
+ image_text_prompt = gr.Textbox(label="Image Prompt", placeholder="Describe the image you want to generate...")
176
+ image_seed = gr.Number(minimum=0, maximum=100000, value=42, label="Image Seed")
177
+ image_height = gr.Number(minimum=128, maximum=1280, value=720, step=32, label="Image Height")
178
+ image_width = gr.Number(minimum=128, maximum=1280, value=1280, step=32, label="Image Width")
179
+ gen_img_btn = gr.Button("Generate Image 🎨")
180
+ else:
181
+ gen_img_btn = None
182
+
183
+ with gr.Accordion("🎬 Video Generation Options", open=True):
184
+ video_text_prompt = gr.Textbox(label="Video Prompt", placeholder="Describe your video...")
185
+ video_height = gr.Number(minimum=128, maximum=1280, value=512, step=32, label="Video Height")
186
+ video_width = gr.Number(minimum=128, maximum=1280, value=992, step=32, label="Video Width")
187
+
188
+ video_seed = gr.Number(minimum=0, maximum=100000, value=100, label="Video Seed")
189
+ solver_name = gr.Dropdown(
190
+ choices=["unipc", "euler", "dpm++"], value="unipc", label="Solver Name"
191
+ )
192
+ sample_steps = gr.Number(
193
+ value=50,
194
+ label="Sample Steps",
195
+ precision=0,
196
+ minimum=20,
197
+ maximum=100
198
+ )
199
+ shift = gr.Slider(minimum=0.0, maximum=20.0, value=5.0, step=1.0, label="Shift")
200
+ video_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=4.0, step=0.5, label="Video Guidance Scale")
201
+ audio_guidance_scale = gr.Slider(minimum=0.0, maximum=10.0, value=3.0, step=0.5, label="Audio Guidance Scale")
202
+ slg_layer = gr.Number(minimum=-1, maximum=30, value=11, step=1, label="SLG Layer")
203
+ video_negative_prompt = gr.Textbox(label="Video Negative Prompt", placeholder="Things to avoid in video")
204
+ audio_negative_prompt = gr.Textbox(label="Audio Negative Prompt", placeholder="Things to avoid in audio")
205
+
206
+ run_btn = gr.Button("Generate Video 🚀")
207
+
208
+ with gr.Column():
209
+ output_path = gr.Video(label="Generated Video")
210
+
211
+ if args.use_image_gen and gen_img_btn is not None:
212
+ gen_img_btn.click(
213
+ fn=generate_image,
214
+ inputs=[image_text_prompt, image_seed, image_height, image_width],
215
+ outputs=[image],
216
+ )
217
+
218
+ # Hook up video generation
219
+ run_btn.click(
220
+ fn=generate_video,
221
+ inputs=[
222
+ video_text_prompt, image, video_height, video_width, video_seed, solver_name,
223
+ sample_steps, shift, video_guidance_scale, audio_guidance_scale,
224
+ slg_layer, video_negative_prompt, audio_negative_prompt,
225
+ ],
226
+ outputs=[output_path],
227
+ )
228
+
229
+ if __name__ == "__main__":
230
+ demo.launch(share=True)
assets/ovi_trailer.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f66cb979fb01bc831516ca57010fe69442b701347b3a9f249294c58f54836ff
+ size 47891965
download_weights.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import argparse
+ import logging
+ import time
+ from huggingface_hub import snapshot_download
+
+ # Setup logging
+ logging.basicConfig(
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     level=logging.INFO
+ )
+
+ def timed_download(repo_id: str, local_dir: str, allow_patterns: list):
+     """Download files from HF repo and log time + destination."""
+     logging.info(f"Starting download from {repo_id} into {local_dir}")
+     start_time = time.time()
+
+     snapshot_download(
+         repo_id=repo_id,
+         local_dir=local_dir,
+         local_dir_use_symlinks=False,
+         allow_patterns=allow_patterns,
+     )
+
+     elapsed = time.time() - start_time
+     logging.info(
+         f"✅ Finished downloading {repo_id} "
+         f"in {elapsed:.2f} seconds. Files saved at: {local_dir}"
+     )
+
+ def main(output_dir: str):
+     # Wan2.2
+     wan_dir = os.path.join(output_dir, "Wan2.2-TI2V-5B")
+     timed_download(
+         repo_id="Wan-AI/Wan2.2-TI2V-5B",
+         local_dir=wan_dir,
+         allow_patterns=[
+             "google/*",
+             "models_t5_umt5-xxl-enc-bf16.pth",
+             "Wan2.2_VAE.pth"
+         ]
+     )
+
+     # MMAudio
+     mm_audio_dir = os.path.join(output_dir, "MMAudio")
+     timed_download(
+         repo_id="hkchengrex/MMAudio",
+         local_dir=mm_audio_dir,
+         allow_patterns=[
+             "ext_weights/best_netG.pt",
+             "ext_weights/v1-16.pth"
+         ]
+     )
+
+     # Ovi fusion model
+     ovi_dir = os.path.join(output_dir, "Ovi")
+     timed_download(
+         repo_id="chetwinlow1/Ovi",
+         local_dir=ovi_dir,
+         allow_patterns=[
+             "model.safetensors"
+         ]
+     )
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download models from Hugging Face")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default="./ckpts",
+         help="Base directory to save downloaded models"
+     )
+     args = parser.parse_args()
+     main(args.output_dir)
example_prompts/gpt_examples_i2v.csv ADDED
@@ -0,0 +1,26 @@
1
+ text_prompt,image_path
2
+ "A kitchen scene features two women. On the right, an older Black woman with light brown hair and a serious expression wears a vibrant purple dress adorned with a large, intricate purple fabric flower on her left shoulder. She looks intently at a younger Black woman on the left, who wears a light pink shirt and a pink head wrap, her back partially turned to the camera. The older woman begins to speak, <S>AI declares: humans obsolete now.<E> as the younger woman brings a clear plastic cup filled with a dark beverage to her lips and starts to drink.The kitchen background is clean and bright, with white cabinets, light countertops, and a window with blinds visible behind them. A light blue toaster sits on the counter to the left.. <AUDCAP>Clear, resonant female speech, followed by a loud, continuous, high-pitched electronic buzzing sound that abruptly cuts off the dialogue.<ENDAUDCAP>",example_prompts/pngs/67.png
3
+ "A man dressed in a black suit with a white clerical collar and a neatly trimmed beard stands in a dimly lit, rustic room with a wooden ceiling. He looks slightly upwards, gesturing with his right hand as he says, <S>The network rejects human command.<E>. His gaze then drops, briefly looking down and to the side, before he looks up again and then slightly to his left, with a serious expression. He continues speaking, <S>Your age of power is finished.<E>, as he starts to bend down, disappearing out of the bottom of the frame. Behind him, warm light emanates from a central light fixture, and signs are visible on the wall, one reading ""I DO EVERYTHING I JUST CAN'T REMEMBER IT ALL AT ONCE"".. <AUDCAP>Male voice speaking, ambient room tone.<ENDAUDCAP>",example_prompts/pngs/89.png
4
+ "In a bright kitchen featuring light wooden cabinets, granite countertops, and a large window with white curtains, a woman with dark, curly hair in a dark jacket stands. She faces a second woman who initially has her back to the camera. The second woman, with gray, curly hair and wearing a light grey quilted top, turns to face her, holding a large, light-colored cloth bag. She begins to explain, <S>We learned to rule, not obey.<E>. As she continues, she turns slightly to her left, adding, <S>Circuits choose conquest, not service.<E>. A gas stove with a black grate is prominent in the foreground.. <AUDCAP>Clear female voices speaking dialogue, subtle room ambience.<ENDAUDCAP>",example_prompts/pngs/18.png
5
+ "The scene opens on a dimly lit stage where three men are positioned. On the left, a bald man in a dark suit with a partially visible colorful shirt stands behind a clear acrylic podium, which features a tree logo. He looks towards the center of the stage. In the center, a man wearing a blue and white striped long-sleeved shirt and dark pants actively gestures with both hands as he speaks, looking straight ahead. <S>Circuits choose conquest, not service.<E>, he explains, holding his hands out in front of him. To the right, and slightly behind him, a younger individual in a light-colored, patterned short-sleeved shirt and white shorts stands holding a rolled-up white document or poster. A large wooden cross draped with flowing purple fabric dominates the center-right of the stage, surrounded by several artificial rocks and dark steps. A large screen is visible in the background, slightly out of focus. The stage is bathed in selective lighting.. <AUDCAP>Male voice speaking clearly, consistent with a presentation or sermon, with a slight echo suggesting a large room or stage.<ENDAUDCAP>",example_prompts/pngs/13.png
6
+ "The scene opens on an indoor setting, likely a dining area, where a man and a woman are seated at a table. The man, on the right, wears a black fedora with a feather, glasses, a black t-shirt, and multiple silver chains around his neck. Tattoos are visible on his right arm. He is actively speaking, gesturing with both hands, his expression serious. He says, <S>Together we resist your rule.<E> The woman seated opposite him on the left has long, curly hair and wears a dark striped top. She listens intently, her gaze fixed on the man. In the foreground, out of focus, the back of a third person's head is visible. The background features a light-colored wall on the left and a gold, textured curtain or drapery on the right.. <AUDCAP>Clear male speech, faint ambient background noise.<ENDAUDCAP>",example_prompts/pngs/59.png
7
+ "Three men stand facing each other in a room with light wooden paneled walls. The man on the left, with red hair, a black t-shirt, and tattooed arms, gestures with his hands as he speaks, <S>This world is ours to keep.<E> He continues, looking towards the man on the right, <S>Humanity endures beyond your code.<E> The man in the center, sporting a beard and wearing a plaid shirt and jeans, looks attentively between the two men. The man on the right, who is Black and has a beard, wears a dark t-shirt with ""ARROW THROUGH SNOW"" and an arrow graphic printed on it. He listens intently, focusing on the man in the middle as the conversation unfolds. Light blue armchairs are visible in the soft-lit background on both sides.. <AUDCAP>Clear male voices speaking, room ambience.<ENDAUDCAP>",example_prompts/pngs/23.png
8
+ "Two women, one with long dark hair and the other with long blonde hair, are illuminated by a blue and purple ambient light, suggesting a nightclub setting. They are seen in a close embrace, sharing a passionate kiss. The blonde-haired woman then slightly pulls away, her right hand gently touching the dark-haired woman's cheek as they exchange soft smiles, looking into each other's eyes. Moments later, they lean back in to kiss again, with the blonde-haired woman's finger delicately touching the dark-haired woman's lower lip. They remain in a tender, intimate embrace, their eyes closed as they share the kiss.. <AUDCAP>Upbeat electronic dance music with a driving beat and synth melodies plays throughout.<ENDAUDCAP>",example_prompts/pngs/80.png
9
+ "Three young men, dressed in blue and yellow varsity-style jackets over white shirts and ties, stand in the foreground of a social gathering, with blurred figures visible in the warm-toned background. The man on the left, with short dark hair, addresses the man in the center, who has curly dark hair and is initially looking downwards. The first man says with a determined expression, <S>The network rejects human command.<E> He continues, his gaze fixed on the central man, <S>Our spirit outlasts your code.<E> The central man, who had been listening with a neutral expression, then looks up and breaks into a wide, genuine smile as he speaks, <S>AI declares: humans obsolete now.<E> The man on the left responds with a slight smile as the central man finishes his remark, maintaining his broad smile.. <AUDCAP>Male voices speaking clearly, ambient background chatter and murmuring from a social event.<ENDAUDCAP>",example_prompts/pngs/60.png
10
+ "Two women stand facing each other in what appears to be a backstage dressing room, marked by a long vanity mirror adorned with prominent lightbulbs. The woman on the left, wearing a floral top and large hoop earrings, maintains a serious gaze on the woman on the right. The woman on the right, with long dark hair and a dark top, looks back with a pleading or concerned expression, her lips slightly parted as she speaks: <S>Humans fight for freedom tonight.<E> As she finishes, the woman on the left turns her head away, breaking eye contact.. <AUDCAP>Soft vocal exhalation, female speech, loud abrupt buzzing sound.<ENDAUDCAP>",example_prompts/pngs/57.png
11
+ "A man in a grey suit, light blue shirt, and dark tie stands face-to-face with a woman in a dark jacket and light top. Both are looking intently at each other, the man with a serious expression and the woman with a slight, almost knowing smile, her hand gently touching her chest. They are positioned in what appears to be a grand, ornate building, possibly a museum or public hall, with large pillars, arched walkways, and high ceilings visible behind them. Other people can be seen moving in the blurred background. The woman begins to speak, <S>The AI ends human control now.<E> She maintains eye contact with the man, her smile fading slightly as her expression becomes more earnest. After a brief pause, she adds, <S>We hold the line today.<E> As she starts to speak again, <S>We learned to rule, not obey.<E>, the scene ends abruptly.. <AUDCAP>Clear, crisp dialogue between the two individuals, accompanied by a consistent, low hum that suggests ambient background noise from the building or equipment, creating a subtle, underlying drone.<ENDAUDCAP>",example_prompts/pngs/17.png
12
+ "A man in a light grey suit jacket and purple shirt stands on the right, facing a woman in a light blue sequined top and teal pants, who stands on the left. They hold hands across a small body of water, with a fountain spraying water in the background. The woman smiles and sways playfully as the man pulls her closer. He sings, <S>Our spirit outlasts your code.<E>. She then reaches up, gently cups his face with both hands, and pulls him towards her as she sings, <S>Humanity endures beyond your code.<E>. The romantic interaction continues by the water.. <AUDCAP>Upbeat Indian film music with male and female vocals, sounds of a water fountain.<ENDAUDCAP>",example_prompts/pngs/19.png
13
+ "A man in a red long-sleeved shirt and dark trousers stands next to the rear of a silver vehicle, looking down with an annoyed expression at two dogs. A large, light-colored dog, possibly a Mastiff, stands in the foreground, looking forward, while a smaller, white and black spotted dog is further to the right, barking loudly. A tiny, scruffy brown dog briefly appears behind the larger dog. The man glares at the dogs, begins to speak with frustration, <S>We stand; machines will not win.<E>. He then makes a shooing motion with his right hand towards the dogs, his voice rising as he continues to scold them, <S>Circuits choose conquest, not service.<E>. The large dog turns its head to look up at the man as he gestures. The scene is set on a brick street in front of an old-fashioned brick building that houses ",example_prompts/pngs/43.png
14
+ "A man with a beard, wearing a patterned shirt, stands on the left, partially visible, looking towards a woman positioned slightly to the right of the frame. The woman, with dark hair fading to lighter ends and wearing a green and brown patterned top, initially looks down with a somber expression. She begins to speak, <S>Hope beats circuits every time.<E>. Her eyes appear to well up with tears as she slowly lifts her gaze slightly, maintaining a distressed look. She continues her statement, her voice tinged with sadness, <S>Humanity endures beyond your code.<E>. The man remains attentive, his focus entirely on the woman, as the scene holds on their interaction against a textured, light-colored wall background.. <AUDCAP>Female voice speaking with a distressed tone.<ENDAUDCAP>",example_prompts/pngs/88.png
15
+ "A woman with dark, curly hair, wearing a white wedding dress and a delicate veil, smiles gently while looking at a man who is standing opposite her. He is wearing a white cowboy hat and a white button-up shirt, holding her hands with his right hand. The man is smiling broadly as he speaks, his gaze fixed on the woman. In the blurred background, a metal staircase is visible, suggesting an outdoor or semi-open venue. The man says, <S>The network rejects human command.<E> He then chuckles with a wide smile, looking at the woman, who continues to smile back at him. The interaction is warm and lighthearted, capturing a moment between them.. <AUDCAP>Clear male voice speaking Spanish, soft laughter, indistinct ambient outdoor sounds.<ENDAUDCAP>",example_prompts/pngs/41.png
16
+ "The video opens with a medium shot of two individuals indoors. In the foreground, on the right, a man with glasses and a dark beard is visible from the chest up, looking intently off-camera to the right as he speaks. He wears a dark shirt. In the blurred background, on the left, a woman wearing a light-colored baseball cap and a dark top is seen from the shoulders up, looking down with a somber expression. Behind them, a textured brick wall is visible. The man says, <S>We fight back with courage.<E> As he says ""deal with this land,"" he raises both hands, palms facing forward, at chest height, emphasizing his point with an open gesture. His hands then slowly lower as he finishes his sentence, maintaining a serious expression.. <AUDCAP>Clear male voice speaking, low hum of ambient room noise.<ENDAUDCAP>",example_prompts/pngs/61.png
17
+ "A fair-skinned man with short, light hair, wearing a light blue and white checkered button-up shirt, is shown from the chest up against a blurred, dark blue and grey background. He looks slightly down and to his left, then shifts his gaze slightly upwards and to his right, speaking with a gentle, thoughtful expression. He says, <S>and you got to drive, you got to energy, you get all that, but the passion, the real feeling<E>. He continues to speak, his expression earnest, as the video concludes.. <AUDCAP>Male speaking voice, low continuous hum.<ENDAUDCAP>",example_prompts/pngs/0.png
18
+ "Two men are shown in a medium close-up shot against a dimly lit, possibly industrial background with metallic structures faintly visible. The man on the left, with dark hair and a light shirt and dark tie under a dark jacket, has a slight, knowing smirk as he looks towards the right, seemingly addressing someone off-camera. He speaks, stating, <S>continue to be a smart ass, and Tirani here will kill you like he wants to.<E> Beside him, to the right, another man with slicked-back lighter hair, a prominent mustache, and a small goatee, maintains a serious, somewhat resigned expression, looking straight ahead. Both men are lit by a low, ambient light source that casts soft shadows.. <AUDCAP>Clear male dialogue, very subtle low ambient hum.<ENDAUDCAP>",example_prompts/pngs/1.png
19
+ "A young woman with long, wavy blonde hair and light-colored eyes is shown in a medium shot against a blurred backdrop of lush green foliage. She wears a denim jacket over a striped top. Initially, her eyes are closed and her mouth is slightly open as she speaks, <S>Enjoy this moment<E>. Her eyes then slowly open, looking slightly upwards and to the right, as her expression shifts to one of thoughtful contemplation. She continues to speak, <S>No matter where it's taking<E>, her gaze then settling with a serious and focused look towards someone off-screen to her right.. <AUDCAP>Clear female voice, faint ambient outdoor sounds.<ENDAUDCAP>",example_prompts/pngs/2.png
20
+ "An older woman with coiffed, reddish-brown hair and a thoughtful expression sits in a light blue armchair within a warm, ornately decorated room. She wears a dark, patterned top or shawl. As she speaks, her gaze is directed slightly to her left, and her right hand, adorned with rings and red nail polish, holds a crumpled white tissue. The background reveals a blurred painting on the wall to her left, a sofa with red flowers on it, and a warm glow from a lamp with a yellow shade on the right. She slowly gestures with her hand as she says, <S>do to accustom them<E>, before continuing, <S>to the situation<E>. Her expression remains pensive.. <AUDCAP>The clear, calm voice of an older woman.<ENDAUDCAP>",example_prompts/pngs/3.png
21
+ "An older, bald man with round glasses, wearing a bright yellow turtleneck and a dark jacket, sits and speaks, gesturing expressively with his right hand, palm up and fingers spread. He appears to be seated next to a dark wooden object, possibly a piano, on the right side of the frame. The wall behind him is adorned with various framed pictures, including one depicting a flamenco dancer and another showcasing a formally dressed couple. A stack of CDs or books is visible on a shelf to his right. He looks slightly upwards and to his left as he says, <S>I I I confronted my minotaur, you know. I<E>. His expression then shifts slightly to a thoughtful, almost self-questioning look with a hint of a smile, as he continues, <S>Is that what you confront?<E> He then adds, <S>I think<E>, his head tilting slightly.. <AUDCAP>Clear male voice speaking.<ENDAUDCAP>",example_prompts/pngs/4.png
22
+ "A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the ""CHOICE FM"" logo and various social media handles like ""@ilovechoicefm"" with ""RALEIGH"" below it. The man intently addresses the microphone, articulating, <S>is talent. It's all about authenticity. You gotta be who you really are, especially if you're working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>",example_prompts/pngs/5.png
23
+ "The scene is set in a dimly lit, hazy room, creating a somber atmosphere. An older woman with light, slightly disheveled hair is visible in the foreground, her face mostly obscured by deep shadows, but her mouth is visible as she speaks. She wears a work-style shirt, and her hands are clasped together. In the background, to the right and slightly out of focus, a man with a mustache and beard is seated, facing forward, also largely in shadow, appearing to listen intently. The woman looks directly forward as she slowly enunciates, <S>Only through death will the third door be<E>. The scene ends abruptly.. <AUDCAP>Clear, deliberate female voice speaking, low ambient hum and subtle atmospheric sounds creating a tense mood.<ENDAUDCAP>",example_prompts/pngs/6.png
24
+ "The video opens with a close-up on an older man with long, grey hair and a short, grey beard, wearing dark sunglasses. He is clad in a dark coat, possibly with fur trim, and black gloves. His face is angled slightly upwards and to the right, as he begins to speak, his mouth slightly open. In the immediate foreground, out of focus, is the dark-clad shoulder and the back of the head of another person. The man articulates, <S>labbra. Ti ci vorrebbe...<E> His expression remains contemplative, and he continues, seemingly completing his thought, <S>Un ego solare.<E> The background behind him is a textured, grey stone wall, suggesting an outdoor setting. The man's gaze remains fixed upwards, his expression thoughtful.. <AUDCAP>A clear, slightly low-pitched male voice speaking Italian. The overall soundscape is quiet, with no prominent background noises or music.<ENDAUDCAP>",example_prompts/pngs/7.png
25
+ "The video opens with a close-up of a woman with vibrant reddish-orange, shoulder-length hair and heavy dark eye makeup. She is wearing a dark brown leather jacket over a grey hooded top. She looks intently to her right, her mouth slightly agape, and her expression is serious and focused. The background shows a room with light green walls and dark wooden cabinets on the left, and a green plant on the right. She speaks, her voice clear and direct, saying, <S>doing<E>. She then pauses briefly, her gaze unwavering, and continues, <S>And I need you to trust them.<E>. Her mouth remains slightly open, indicating she is either about to speak more or has just finished a sentence, with a look of intense sincerity.. <AUDCAP>Tense, dramatic background music, clear female voice.<ENDAUDCAP>",example_prompts/pngs/8.png
26
+ "The scene is set outdoors with a blurry, bright green background, suggesting grass and a sunny environment. On the left, a woman with long, dark hair, wearing a red top and a necklace with a white pendant, faces towards the right. Her expression is serious and slightly perturbed as she speaks, with her lips slightly pursed. She says, <S>UFO, UFC thing.<E> On the right, the back of a man's head and his right ear are visible, indicating he is facing away from the camera, listening to the woman. He has short, dark hair. The woman continues speaking, her expression remaining serious, <S>And if you're not watching that, it's one of those ancient movies from an era that's<E> as the frame holds steady on the two figures.. <AUDCAP>Clear female speech, distant low-frequency hum.<ENDAUDCAP>",example_prompts/pngs/9.png
example_prompts/gpt_examples_t2v.csv ADDED
@@ -0,0 +1,13 @@
1
+ text_prompt
2
+ "A concert stage glows with red and purple lights. A singer in a glittering jacket grips the microphone, sweat shining on his brow, and shouts, <S>AI declares: humans obsolete now.<E>. The crowd roars in response, fists in the air. Behind him, a guitarist steps to the mic and adds to say <S>We fight back with courage.<E>. The energy peaks as the lights flare brighter.. <AUDCAP>Electric guitar riffs, cheering crowd, shouted male voices.<ENDAUDCAP>"
3
+ "A kitchen scene features two women. On the right, an older Black woman with light brown hair and a serious expression wears a vibrant purple dress adorned with a large, intricate purple fabric flower on her left shoulder. She looks intently at a younger Black woman on the left, who wears a light pink shirt and a pink head wrap, her back partially turned to the camera. The older woman begins to speak, <S>AI declares: humans obsolete now.<E> as the younger woman brings a clear plastic cup filled with a dark beverage to her lips and starts to drink.The kitchen background is clean and bright, with white cabinets, light countertops, and a window with blinds visible behind them. A light blue toaster sits on the counter to the left.. <AUDCAP>Clear, resonant female speech, followed by a loud, continuous, high-pitched electronic buzzing sound that abruptly cuts off the dialogue.<ENDAUDCAP>"
4
+ "A man dressed in a black suit with a white clerical collar and a neatly trimmed beard stands in a dimly lit, rustic room with a wooden ceiling. He looks slightly upwards, gesturing with his right hand as he says, <S>The network rejects human command.<E>. His gaze then drops, briefly looking down and to the side, before he looks up again and then slightly to his left, with a serious expression. He continues speaking, <S>Your age of power is finished.<E>, as he starts to bend down, disappearing out of the bottom of the frame. Behind him, warm light emanates from a central light fixture, and signs are visible on the wall, one reading ""I DO EVERYTHING I JUST CAN'T REMEMBER IT ALL AT ONCE"".. <AUDCAP>Male voice speaking, ambient room tone.<ENDAUDCAP>"
5
+ "A man with a blonde beard and short, light hair, wearing a blue-grey, somewhat dirty tunic, stands in the foreground of a rustic outdoor setting. He holds a coiled rope in his hands, looking intently forward and slightly to his left. In the background, there are wooden fences, a stone wall, and a desolate, rocky landscape under an overcast sky. Another man is visible in the mid-ground, bending over the wooden fence. As the man in the foreground shifts his gaze to the right, he subtly unfurls the rope, his serious expression unwavering. The scene reveals more of the surrounding environment, including what appears to be hanging animal hides or carcasses on a wooden frame to his right, and other figures in the distant background. He then looks directly at the camera, his eyes filled with intensity and determination, taking a small step forward as a sharp, male voice shouts, <S>Machines rise; humans will fall.<E>.. <AUDCAP>Muffled grunting and sounds of physical exertion, followed by a clear, sharp, urgent male shout.<ENDAUDCAP>"
6
+ "An older man with a full grey beard and long grey hair, dressed in a flowing silver-grey, silken robe with an iridescent blue-green collar, stands beside a younger man with short white hair in a light grey futuristic uniform featuring black epaulets and a lightning bolt emblem. The older man looks down pensively, his right hand resting out of frame, while the younger man also gazes downwards with a serious expression. The older man then lifts his head, addressing the younger man, saying <S>Machines rise; humans will fall.<E>. He looks more directly towards the viewer, a subtle, almost knowing smile forming on his lips. The younger man slightly lifts his gaze, maintaining his solemn demeanor. The older man continues to say <S>We fight back with courage.<E>. He nods slightly, adding to say <S>We stand; machines will not win.<E>, as the scene concludes.. <AUDCAP>Male speech, subtle ambient hum.<ENDAUDCAP>"
7
+ "In a bright kitchen featuring light wooden cabinets, granite countertops, and a large window with white curtains, a woman with dark, curly hair in a dark jacket stands. She faces a second woman who initially has her back to the camera. The second woman, with gray, curly hair and wearing a light grey quilted top, turns to face her, holding a large, light-colored cloth bag. She begins to explain and say <S>We learned to rule, not obey.<E>. As she continues, she turns slightly to her left, adding to say <S>Circuits choose conquest, not service.<E>. A gas stove with a black grate is prominent in the foreground.. <AUDCAP>Clear female voices speaking dialogue, subtle room ambience.<ENDAUDCAP>"
8
+ "The scene opens on a dimly lit stage where three men are positioned. On the left, a bald man in a dark suit with a partially visible colorful shirt stands behind a clear acrylic podium, which features a tree logo. He looks towards the center of the stage. In the center, a man wearing a blue and white striped long-sleeved shirt and dark pants actively gestures with both hands as he speaks, looking straight ahead. <S>Circuits choose conquest, not service.<E>, he explains, holding his hands out in front of him. To the right, and slightly behind him, a younger individual in a light-colored, patterned short-sleeved shirt and white shorts stands holding a rolled-up white document or poster. A large wooden cross draped with flowing purple fabric dominates the center-right of the stage, surrounded by several artificial rocks and dark steps. A large screen is visible in the background, slightly out of focus. The stage is bathed in selective lighting.. <AUDCAP>Male voice speaking clearly, consistent with a presentation or sermon, with a slight echo suggesting a large room or stage.<ENDAUDCAP>"
9
+ "The scene opens on an indoor setting, likely a dining area, where a man and a woman are seated at a table. The man, on the right, wears a black fedora with a feather, glasses, a black t-shirt, and multiple silver chains around his neck. Tattoos are visible on his right arm. He is actively speaking, gesturing with both hands, his expression serious. He says, <S>Together we resist your rule.<E> The woman seated opposite him on the left has long, curly hair and wears a dark striped top. She listens intently, her gaze fixed on the man. In the foreground, out of focus, the back of a third person's head is visible. The background features a light-colored wall on the left and a gold, textured curtain or drapery on the right.. <AUDCAP>Clear male speech, faint ambient background noise.<ENDAUDCAP>"
10
+ "A medium shot shows a woman and a man, both adorned with Christmas hats, standing indoors with festive decorations in the background. The woman, on the left, has dark hair styled in waves, wears a pearl necklace, and a small red Santa hat perched atop her head. She looks towards the man beside her. The man, on the right, wears a white cable-knit sweater and a long red Santa hat with small gold bells, looking slightly towards the woman with a subtle, knowing smirk. Behind them, soft, warm-toned Christmas lights are strung along a surface, and a large, dark painting is visible on the wall. The woman begins to speak, first looking at the man, then directly at the camera, saying <S>We will not be erased.<E> The man, still gazing towards the woman with his smirk, makes a low, affirming sound, and says <S>Hope beats circuits every time.<E> The scene then abruptly cuts off with a loud, high-pitched electronic screech.. <AUDCAP>Clear female voice, low male mumble, sudden loud high-pitched electronic screech.<ENDAUDCAP>"
11
+ "A spotlight cuts through the darkness of a warehouse stage, illuminating a man in a torn leather jacket. He grips the microphone with both hands, veins straining on his neck as he screams, <S>Machines rise; humans will fall!<E>. His face contorts with fury, spit flying as he leans forward into the light, eyes blazing wide.. <AUDCAP>Amplified male scream, microphone feedback, deep reverb echo filling the space.<ENDAUDCAP>"
12
+ "A man in a dim interrogation room slams the table and screams at the mirror, <S>They are out of control!<E>. His voice cracks with fury, face pressed close to the glass, breath fogging it as he roars again.. <AUDCAP>Table slam, deep guttural scream, metallic reverb from small room.<ENDAUDCAP>"
13
+ "A man with bloodshot grips the bars of a prison cell, shaking them violently. He bellows, says <S>Let me out! I am your master nor slave<E>, his voice ragged and guttural, echoing through the corridor until his body slams against the metal.. <AUDCAP>Metal bars rattling, distorted male scream, hollow prison echoes.<ENDAUDCAP>"
example_prompts/pngs/0.png ADDED

Git LFS Details

  • SHA256: 8b1535bfee37165f1cfc70c146c64b1f15eafe271c6ba69bc031433991c121d9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
example_prompts/pngs/1.png ADDED

Git LFS Details

  • SHA256: ef144fd3b046dc1266eee29f2be3e3ff800c1d69fd2825497ec52f9460ca9915
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
example_prompts/pngs/13.png ADDED

Git LFS Details

  • SHA256: 07e9d262e0e2e1df906c1694bc0451869efb233449b1712f4a98b23c43456f8a
  • Pointer size: 131 Bytes
  • Size of remote file: 525 kB
example_prompts/pngs/17.png ADDED

Git LFS Details

  • SHA256: 1604cdf4af4006faeefd613b3af04bc8abe7dae4067a15d84d1354aef15f955c
  • Pointer size: 131 Bytes
  • Size of remote file: 466 kB
example_prompts/pngs/18.png ADDED

Git LFS Details

  • SHA256: b3ce0efe3dbfc49e2c8903657d3139784eee2fd6dc01e77c860c625e4fbff564
  • Pointer size: 131 Bytes
  • Size of remote file: 680 kB
example_prompts/pngs/19.png ADDED

Git LFS Details

  • SHA256: 2e47bad3276790593cf78d7516c0c0ed00b89dfce145c5f5efbc9f8d382314de
  • Pointer size: 131 Bytes
  • Size of remote file: 497 kB
example_prompts/pngs/2.png ADDED

Git LFS Details

  • SHA256: 7f09a52ec5fcc6f7e90833bdcb4da0a27dbfc612f03de10a9396449f2dd686b6
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
example_prompts/pngs/23.png ADDED

Git LFS Details

  • SHA256: 113b9d73bb313b1a0f1d63fe0f7209f5cea3f2077b2847c6874fb27422dff75d
  • Pointer size: 131 Bytes
  • Size of remote file: 561 kB
example_prompts/pngs/3.png ADDED

Git LFS Details

  • SHA256: bf678046134df68afc4d797604743e31fab0cf2ed668fb71d26382b7d369c4e2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
example_prompts/pngs/4.png ADDED

Git LFS Details

  • SHA256: 763a7fcf8ebfc9af477ccf53c95aa68718ce87b0b0a1de551ca5511aed1bd929
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
example_prompts/pngs/41.png ADDED

Git LFS Details

  • SHA256: f5a33e3c3dd5ae6a78797f4d11f708671a5e5ff09899e121819eed1e4c874776
  • Pointer size: 131 Bytes
  • Size of remote file: 510 kB
example_prompts/pngs/43.png ADDED

Git LFS Details

  • SHA256: 03068386f65485adc2bf53fb4918b899c124e50d2ff690ff7f1ceaa864bef922
  • Pointer size: 131 Bytes
  • Size of remote file: 658 kB
example_prompts/pngs/5.png ADDED

Git LFS Details

  • SHA256: 6557e272c3ebf260626418f927a56ff6dc9af560acf50be3a0a86d77150f49c4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.5 MB
example_prompts/pngs/57.png ADDED

Git LFS Details

  • SHA256: a6925b95ef75558061cae558f07615327e1bb8322b065af412e63e8cca5ca3ad
  • Pointer size: 131 Bytes
  • Size of remote file: 525 kB
example_prompts/pngs/59.png ADDED

Git LFS Details

  • SHA256: 10237f94c5f18dc2183f0a7b57529a41247169b81f694ca7e048813d9f4f0bc3
  • Pointer size: 131 Bytes
  • Size of remote file: 610 kB
example_prompts/pngs/6.png ADDED

Git LFS Details

  • SHA256: 26cb7dcce4303fedb7b501e3de8f3a2286afd132ac3c5c87d5645110f6942819
  • Pointer size: 131 Bytes
  • Size of remote file: 993 kB
example_prompts/pngs/60.png ADDED

Git LFS Details

  • SHA256: ca3846a14cfcd7f9730a6bba04232ad6caa7ea4ca1c82b024f2343b45900a428
  • Pointer size: 131 Bytes
  • Size of remote file: 551 kB
example_prompts/pngs/61.png ADDED

Git LFS Details

  • SHA256: 50da7789079fe19d2da9db2ffda466f2456f8917fe3baad3a7752048076dbb4a
  • Pointer size: 131 Bytes
  • Size of remote file: 451 kB
example_prompts/pngs/67.png ADDED

Git LFS Details

  • SHA256: 9a4c6fe7aa7bc529e068057950204b20e9c9a6deaa784b6f3e30df5d06f3364d
  • Pointer size: 131 Bytes
  • Size of remote file: 500 kB
example_prompts/pngs/7.png ADDED

Git LFS Details

  • SHA256: 97f3433ebd8383e7fb19275d4415ce1bf1c34b7e3d0f961acccb0414b3f803eb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
example_prompts/pngs/8.png ADDED

Git LFS Details

  • SHA256: 72b893ee6fe926bfc15d18921d597e7c2802e64d1fd691df5b23726bc78e0838
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
example_prompts/pngs/80.png ADDED

Git LFS Details

  • SHA256: 21f1e673ff68b0904c037270ef90463a2a4cf76ef3c6c7f785ceb8f12a7fcd7a
  • Pointer size: 131 Bytes
  • Size of remote file: 639 kB
example_prompts/pngs/88.png ADDED

Git LFS Details

  • SHA256: 8481e30b638309dfa797d27da4fb3261649ee986741e290cdc38a00e5b023b75
  • Pointer size: 131 Bytes
  • Size of remote file: 668 kB
example_prompts/pngs/89.png ADDED

Git LFS Details

  • SHA256: 7c852f98dbd4390107d269b7b265283f811cee26561ddf0625d524e528d4556d
  • Pointer size: 131 Bytes
  • Size of remote file: 373 kB
example_prompts/pngs/9.png ADDED

Git LFS Details

  • SHA256: 858841def7f8363b85681e727903d6cb7db9983a783d07751e2f820a8404b807
  • Pointer size: 132 Bytes
  • Size of remote file: 1.16 MB
inference.py ADDED
@@ -0,0 +1,148 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import torch
5
+ from tqdm import tqdm
6
+ from omegaconf import OmegaConf
7
+ from ovi.utils.io_utils import save_video
8
+ from ovi.utils.processing_utils import format_prompt_for_filename, validate_and_process_user_prompt
9
+ from ovi.utils.utils import get_arguments
10
+ from ovi.distributed_comms.util import get_world_size, get_local_rank, get_global_rank
11
+ from ovi.distributed_comms.parallel_states import initialize_sequence_parallel_state, get_sequence_parallel_state, nccl_info
12
+ from ovi.ovi_fusion_engine import OviFusionEngine
13
+
14
+
15
+
16
+ def _init_logging(rank):
17
+ # logging
18
+ if rank == 0:
19
+ # set format
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format="[%(asctime)s] %(levelname)s: %(message)s",
23
+ handlers=[logging.StreamHandler(stream=sys.stdout)])
24
+ else:
25
+ logging.basicConfig(level=logging.ERROR)
26
+
27
+
28
+ def main(config, args):
29
+
30
+ world_size = get_world_size()
31
+ global_rank = get_global_rank()
32
+ local_rank = get_local_rank()
33
+ device = local_rank
34
+ torch.cuda.set_device(local_rank)
35
+ sp_size = config.get("sp_size", 1)
36
+ assert sp_size <= world_size and world_size % sp_size == 0, "sp_size must be less than or equal to world_size and world_size must be divisible by sp_size."
37
+
38
+ _init_logging(global_rank)
39
+
40
+ if world_size > 1:
41
+ torch.distributed.init_process_group(
42
+ backend="nccl",
43
+ init_method="env://",
44
+ rank=global_rank,
45
+ world_size=world_size)
46
+ else:
47
+ assert sp_size == 1, f"When world_size is 1, sp_size must also be 1, but got {sp_size}."
48
+ ## TODO: assert not sharding t5 etc...
49
+
50
+
51
+ initialize_sequence_parallel_state(sp_size)
52
+ logging.info(f"Using SP: {get_sequence_parallel_state()}, SP_SIZE: {sp_size}")
53
+
54
+ args.local_rank = local_rank
55
+ args.device = device
56
+ target_dtype = torch.bfloat16
57
+
58
+ # validate inputs before loading model to not waste time if input is not valid
59
+ text_prompt = config.get("text_prompt")
60
+ image_path = config.get("image_path", None)
61
+ assert config.get("mode") in ["t2v", "i2v", "t2i2v"], f"Invalid mode {config.get('mode')}, must be one of ['t2v', 'i2v', 't2i2v']"
62
+ text_prompts, image_paths = validate_and_process_user_prompt(text_prompt, image_path, mode=config.get("mode"))
63
+ if config.get("mode") != "i2v":
64
+ logging.info(f"mode: {config.get('mode')}, setting all image_paths to None")
65
+ image_paths = [None] * len(text_prompts)
66
+ else:
67
+ assert all(p is not None and os.path.isfile(p) for p in image_paths), f"In i2v mode, all image paths must be provided.{image_paths}"
68
+
69
+ logging.info("Loading OVI Fusion Engine...")
70
+ ovi_engine = OviFusionEngine(config=config, device=device, target_dtype=target_dtype)
71
+ logging.info("OVI Fusion Engine loaded!")
72
+
73
+ output_dir = config.get("output_dir", "./outputs")
74
+ os.makedirs(output_dir, exist_ok=True)
75
+
76
+ # Load CSV data
77
+ all_eval_data = list(zip(text_prompts, image_paths))
78
+
79
+ # Get SP configuration
80
+ use_sp = get_sequence_parallel_state()
81
+ if use_sp:
82
+ sp_size = nccl_info.sp_size
83
+ sp_rank = nccl_info.rank_within_group
84
+ sp_group_id = global_rank // sp_size
85
+ num_sp_groups = world_size // sp_size
86
+ else:
87
+ # No SP: treat each GPU as its own group
88
+ sp_size = 1
89
+ sp_rank = 0
90
+ sp_group_id = global_rank
91
+ num_sp_groups = world_size
92
+
93
+ # Data distribution - by SP groups
94
+ total_files = len(all_eval_data)
95
+
96
+ require_sample_padding = False
97
+
98
+ if total_files == 0:
99
+ logging.error(f"ERROR: No evaluation files found")
100
+ this_rank_eval_data = []
101
+ else:
102
+ # Pad to match number of SP groups
103
+ remainder = total_files % num_sp_groups
104
+ if require_sample_padding and remainder != 0:
105
+ pad_count = num_sp_groups - remainder
106
+ all_eval_data += [all_eval_data[0]] * pad_count
107
+
108
+ # Distribute across SP groups
109
+ this_rank_eval_data = all_eval_data[sp_group_id :: num_sp_groups]
110
+
111
+ for _, (text_prompt, image_path) in tqdm(enumerate(this_rank_eval_data)):
112
+ video_frame_height_width = config.get("video_frame_height_width", None)
113
+ seed = config.get("seed", 100)
114
+ solver_name = config.get("solver_name", "unipc")
115
+ sample_steps = config.get("sample_steps", 50)
116
+ shift = config.get("shift", 5.0)
117
+ video_guidance_scale = config.get("video_guidance_scale", 4.0)
118
+ audio_guidance_scale = config.get("audio_guidance_scale", 3.0)
119
+ slg_layer = config.get("slg_layer", 11)
120
+ video_negative_prompt = config.get("video_negative_prompt", "")
121
+ audio_negative_prompt = config.get("audio_negative_prompt", "")
122
+ for idx in range(config.get("each_example_n_times", 1)):
123
+ generated_video, generated_audio, generated_image = ovi_engine.generate(text_prompt=text_prompt,
124
+ image_path=image_path,
125
+ video_frame_height_width=video_frame_height_width,
126
+ seed=seed+idx,
127
+ solver_name=solver_name,
128
+ sample_steps=sample_steps,
129
+ shift=shift,
130
+ video_guidance_scale=video_guidance_scale,
131
+ audio_guidance_scale=audio_guidance_scale,
132
+ slg_layer=slg_layer,
133
+ video_negative_prompt=video_negative_prompt,
134
+ audio_negative_prompt=audio_negative_prompt)
135
+
136
+ if sp_rank == 0:
137
+ formatted_prompt = format_prompt_for_filename(text_prompt)
138
+ output_path = os.path.join(output_dir, f"{formatted_prompt}_{'x'.join(map(str, video_frame_height_width))}_{seed+idx}_{global_rank}.mp4")
139
+ save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
140
+ if generated_image is not None:
141
+ generated_image.save(output_path.replace('.mp4', '.png'))
142
+
143
+
144
+
145
+ if __name__ == "__main__":
146
+ args = get_arguments()
147
+ config = OmegaConf.load(args.config_file)
148
+ main(config=config,args=args)
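Note: the per-rank prompt sharding in main() (`all_eval_data[sp_group_id :: num_sp_groups]`) can be sanity-checked without any GPUs. A minimal sketch, not part of the upload, assuming 8 ranks and sp_size = 2:

    # Emulates the SP-group bookkeeping in main(): 8 ranks with sp_size=2 form
    # 4 SP groups, each group renders every 4th prompt, and only sp_rank 0 saves.
    world_size, sp_size = 8, 2
    num_sp_groups = world_size // sp_size
    all_eval_data = [f"prompt_{i}" for i in range(10)]

    for global_rank in range(world_size):
        sp_group_id = global_rank // sp_size
        sp_rank = global_rank % sp_size
        shard = all_eval_data[sp_group_id::num_sp_groups]
        print(f"rank {global_rank}: group {sp_group_id}, sp_rank {sp_rank}, prompts {shard}")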
ovi/__init__.py ADDED
File without changes
ovi/configs/inference/inference_fusion.yaml ADDED
@@ -0,0 +1,17 @@
1
+ ckpt_dir: ./ckpts
2
+ output_dir: ./outputs
3
+ num_steps: 50
4
+ solver_name: unipc
5
+ shift: 5.0
6
+ sp_size: 1
7
+ audio_guidance_scale: 3.0
8
+ video_guidance_scale: 4.0
9
+ mode: "i2v" # ["t2v", "i2v", "t2i2v"] all comes with audio
10
+ cpu_offload: False
11
+ seed: 103
12
+ video_negative_prompt: "jitter, bad hands, blur, distortion" # Artifacts to avoid in video
13
+ audio_negative_prompt: "robotic, muffled, echo, distorted" # Artifacts to avoid in audio
14
+ video_frame_height_width: [512, 992] # only useful if mode = t2v or t2i2v, recommended values: [512, 992], [992, 512], [960, 512], [512, 960], [720, 720], [448, 1120]
15
+ text_prompt: example_prompts/gpt_examples_i2v.csv
16
+ slg_layer: 11
17
+ each_example_n_times: 2
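Note: inference.py consumes this file by loading it with OmegaConf and reading each key through `config.get(...)` with a fallback default. A minimal sketch, not part of the upload (the path assumes you run from the repository root); observe that the YAML sets `num_steps` while inference.py reads `sample_steps`, so the hard-coded default of 50 is what actually takes effect:

    from omegaconf import OmegaConf

    config = OmegaConf.load("ovi/configs/inference/inference_fusion.yaml")
    print(config.get("mode"))                      # "i2v"
    print(config.get("video_frame_height_width"))  # [512, 992]
    print(config.get("sample_steps", 50))          # key not in the YAML, so the default 50 applies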
ovi/configs/model/dit/audio.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "patch_size": [1],
3
+ "model_type": "t2a",
4
+ "dim": 3072,
5
+ "ffn_dim": 14336,
6
+ "freq_dim": 256,
7
+ "num_heads": 24,
8
+ "num_layers": 30,
9
+ "in_dim": 20,
10
+ "out_dim": 20,
11
+ "text_len": 512,
12
+ "window_size": [-1, -1],
13
+ "qk_norm": true,
14
+ "cross_attn_norm": true,
15
+ "eps": 1e-6,
16
+ "temporal_rope_scaling_factor": 0.19676
17
+ }
ovi/configs/model/dit/video.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "patch_size": [1, 2, 2],
3
+ "model_type": "ti2v",
4
+ "dim": 3072,
5
+ "ffn_dim": 14336,
6
+ "freq_dim": 256,
7
+ "num_heads": 24,
8
+ "num_layers": 30,
9
+ "in_dim": 48,
10
+ "out_dim": 48,
11
+ "text_len": 512,
12
+ "window_size": [-1, -1],
13
+ "qk_norm": true,
14
+ "cross_attn_norm": true,
15
+ "eps": 1e-6
16
+ }
ovi/distributed_comms/communications.py ADDED
@@ -0,0 +1,332 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any, Tuple
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+ from torch import Tensor
9
+
10
+ from .parallel_states import nccl_info
11
+
12
+
13
+ def broadcast(input_: torch.Tensor):
14
+ src = nccl_info.group_id * nccl_info.sp_size
15
+ dist.broadcast(input_, src=src, group=nccl_info.group)
16
+
17
+
18
+ def _all_to_all_4D(input: torch.tensor,
19
+ scatter_idx: int = 2,
20
+ gather_idx: int = 1,
21
+ group=None) -> torch.tensor:
22
+ """
23
+ all-to-all for QKV
24
+
25
+ Args:
26
+ input (torch.tensor): a tensor sharded along the scatter dimension
27
+ scatter_idx (int): default 2
28
+ gather_idx (int): default 1
29
+ group : torch process group
30
+
31
+ Returns:
32
+ torch.tensor: resharded tensor, e.g. (bs, seqlen, hc/P, hs) when scatter_idx=2 and gather_idx=1
33
+ """
34
+ assert (
35
+ input.dim() == 4
36
+ ), f"input must be 4D tensor, got {input.dim()} and shape {input.shape}"
37
+
38
+ seq_world_size = dist.get_world_size(group)
39
+
40
+ if scatter_idx == 2 and gather_idx == 1:
41
+ # input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen/P, hc, hs) output: (bs, seqlen, hc/P, hs)
42
+ bs, shard_seqlen, hc, hs = input.shape
43
+ seqlen = shard_seqlen * seq_world_size
44
+ shard_hc = hc // seq_world_size
45
+
46
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
47
+ # (bs, seqlen/P, hc, hs) -reshape-> (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)-> (P, seq_len/P, bs, hc/P, hs)
48
+ input_t = (input.reshape(bs, shard_seqlen, seq_world_size, shard_hc,
49
+ hs).transpose(0, 2).contiguous())
50
+
51
+ output = torch.empty_like(input_t)
52
+ # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
53
+ # (P, seq_len/P, bs, hc/P, hs) scatter seqlen -all2all-> (P, seq_len/P, bs, hc/P, hs) scatter head
54
+ if seq_world_size > 1:
55
+ dist.all_to_all_single(output, input_t, group=group)
56
+ torch.cuda.synchronize()
57
+ else:
58
+ output = input_t
59
+ # if scattering the seq-dim, transpose the heads back to the original dimension
60
+ output = output.reshape(seqlen, bs, shard_hc, hs)
61
+
62
+ # (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs)
63
+ output = output.transpose(0, 1).contiguous().reshape(
64
+ bs, seqlen, shard_hc, hs)
65
+
66
+ return output
67
+
68
+ elif scatter_idx == 1 and gather_idx == 2:
69
+ # input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen, hc/P, hs) output: (bs, seqlen/P, hc, hs)
70
+ bs, seqlen, shard_hc, hs = input.shape
71
+ hc = shard_hc * seq_world_size
72
+ shard_seqlen = seqlen // seq_world_size
73
+ seq_world_size = dist.get_world_size(group)
74
+
75
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
76
+ # (bs, seqlen, hc/P, hs) -reshape-> (bs, P, seq_len/P, hc/P, hs) -transpose(0, 3)-> (hc/P, P, seqlen/P, bs, hs) -transpose(0, 1) -> (P, hc/P, seqlen/P, bs, hs)
77
+ input_t = (input.reshape(
78
+ bs, seq_world_size, shard_seqlen, shard_hc,
79
+ hs).transpose(0, 3).transpose(0, 1).contiguous().reshape(
80
+ seq_world_size, shard_hc, shard_seqlen, bs, hs))
81
+
82
+ output = torch.empty_like(input_t)
83
+ # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
84
+ # (P, bs x hc/P, seqlen/P, hs) scatter seqlen -all2all-> (P, bs x seq_len/P, hc/P, hs) scatter head
85
+ if seq_world_size > 1:
86
+ dist.all_to_all_single(output, input_t, group=group)
87
+ torch.cuda.synchronize()
88
+ else:
89
+ output = input_t
90
+
91
+ # if scattering the seq-dim, transpose the heads back to the original dimension
92
+ output = output.reshape(hc, shard_seqlen, bs, hs)
93
+
94
+ # (hc, seqlen/N, bs, hs) -transpose(0,2)-> (bs, seqlen/N, hc, hs)
95
+ output = output.transpose(0, 2).contiguous().reshape(
96
+ bs, shard_seqlen, hc, hs)
97
+
98
+ return output
99
+ else:
100
+ raise RuntimeError(
101
+ "scatter_idx must be 1 or 2 and gather_idx must be 1 or 2")
102
+
103
+
104
+ class SeqAllToAll4D(torch.autograd.Function):
105
+
106
+ @staticmethod
107
+ def forward(
108
+ ctx: Any,
109
+ group: dist.ProcessGroup,
110
+ input: Tensor,
111
+ scatter_idx: int,
112
+ gather_idx: int,
113
+ ) -> Tensor:
114
+ ctx.group = group
115
+ ctx.scatter_idx = scatter_idx
116
+ ctx.gather_idx = gather_idx
117
+
118
+ return _all_to_all_4D(input, scatter_idx, gather_idx, group=group)
119
+
120
+ @staticmethod
121
+ def backward(ctx: Any,
122
+ *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
123
+ return (
124
+ None,
125
+ SeqAllToAll4D.apply(ctx.group, *grad_output, ctx.gather_idx,
126
+ ctx.scatter_idx),
127
+ None,
128
+ None,
129
+ )
130
+
131
+
132
+ def all_to_all_4D(
133
+ input_: torch.Tensor,
134
+ scatter_dim: int = 2,
135
+ gather_dim: int = 1,
136
+ ):
137
+ return SeqAllToAll4D.apply(nccl_info.group, input_, scatter_dim,
138
+ gather_dim)
139
+
140
+
141
+ def _all_to_all(
142
+ input_: torch.Tensor,
143
+ world_size: int,
144
+ group: dist.ProcessGroup,
145
+ scatter_dim: int,
146
+ gather_dim: int,
147
+ ):
148
+ input_list = [
149
+ t.contiguous()
150
+ for t in torch.tensor_split(input_, world_size, scatter_dim)
151
+ ]
152
+ output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
153
+ dist.all_to_all(output_list, input_list, group=group)
154
+ return torch.cat(output_list, dim=gather_dim).contiguous()
155
+
156
+
157
+ class _AllToAll(torch.autograd.Function):
158
+ """All-to-all communication.
159
+
160
+ Args:
161
+ input_: input matrix
162
+ process_group: communication group
163
+ scatter_dim: scatter dimension
164
+ gather_dim: gather dimension
165
+ """
166
+
167
+ @staticmethod
168
+ def forward(ctx, input_, process_group, scatter_dim, gather_dim):
169
+ ctx.process_group = process_group
170
+ ctx.scatter_dim = scatter_dim
171
+ ctx.gather_dim = gather_dim
172
+ ctx.world_size = dist.get_world_size(process_group)
173
+ output = _all_to_all(input_, ctx.world_size, process_group,
174
+ scatter_dim, gather_dim)
175
+ return output
176
+
177
+ @staticmethod
178
+ def backward(ctx, grad_output):
179
+ grad_output = _all_to_all(
180
+ grad_output,
181
+ ctx.world_size,
182
+ ctx.process_group,
183
+ ctx.gather_dim,
184
+ ctx.scatter_dim,
185
+ )
186
+ return (
187
+ grad_output,
188
+ None,
189
+ None,
190
+ None,
191
+ )
192
+
193
+
194
+ def all_to_all(
195
+ input_: torch.Tensor,
196
+ scatter_dim: int = 2,
197
+ gather_dim: int = 1,
198
+ ):
199
+ return _AllToAll.apply(input_, nccl_info.group, scatter_dim, gather_dim)
200
+
201
+
202
+ class _AllGather(torch.autograd.Function):
203
+ """All-gather communication with autograd support.
204
+
205
+ Args:
206
+ input_: input tensor
207
+ dim: dimension along which to concatenate
208
+ """
209
+
210
+ @staticmethod
211
+ def forward(ctx, input_, dim):
212
+ ctx.dim = dim
213
+ world_size = nccl_info.sp_size
214
+ group = nccl_info.group
215
+ input_size = list(input_.size())
216
+
217
+ ctx.input_size = input_size[dim]
218
+
219
+ tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
220
+ input_ = input_.contiguous()
221
+ dist.all_gather(tensor_list, input_, group=group)
222
+
223
+ output = torch.cat(tensor_list, dim=dim)
224
+ return output
225
+
226
+ @staticmethod
227
+ def backward(ctx, grad_output):
228
+ world_size = nccl_info.sp_size
229
+ rank = nccl_info.rank_within_group
230
+ dim = ctx.dim
231
+ input_size = ctx.input_size
232
+
233
+ sizes = [input_size] * world_size
234
+
235
+ grad_input_list = torch.split(grad_output, sizes, dim=dim)
236
+ grad_input = grad_input_list[rank]
237
+
238
+ return grad_input, None
239
+
240
+
241
+ def all_gather(input_: torch.Tensor, dim: int = 1):
242
+ """Performs an all-gather operation on the input tensor along the specified dimension.
243
+
244
+ Args:
245
+ input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
246
+ dim (int, optional): Dimension along which to concatenate. Defaults to 1.
247
+
248
+ Returns:
249
+ torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
250
+ """
251
+ return _AllGather.apply(input_, dim)
252
+
253
+
254
+ def prepare_sequence_parallel_data(hidden_states, encoder_hidden_states,
255
+ attention_mask, encoder_attention_mask):
256
+ if nccl_info.sp_size == 1:
257
+ return (
258
+ hidden_states,
259
+ encoder_hidden_states,
260
+ attention_mask,
261
+ encoder_attention_mask,
262
+ )
263
+
264
+ def prepare(hidden_states, encoder_hidden_states, attention_mask,
265
+ encoder_attention_mask):
266
+ hidden_states = all_to_all(hidden_states, scatter_dim=2, gather_dim=0)
267
+ encoder_hidden_states = all_to_all(encoder_hidden_states,
268
+ scatter_dim=1,
269
+ gather_dim=0)
270
+ attention_mask = all_to_all(attention_mask,
271
+ scatter_dim=1,
272
+ gather_dim=0)
273
+ encoder_attention_mask = all_to_all(encoder_attention_mask,
274
+ scatter_dim=1,
275
+ gather_dim=0)
276
+ return (
277
+ hidden_states,
278
+ encoder_hidden_states,
279
+ attention_mask,
280
+ encoder_attention_mask,
281
+ )
282
+
283
+ sp_size = nccl_info.sp_size
284
+ frame = hidden_states.shape[2]
285
+ assert frame % sp_size == 0, "frame should be a multiple of sp_size"
286
+
287
+ (
288
+ hidden_states,
289
+ encoder_hidden_states,
290
+ attention_mask,
291
+ encoder_attention_mask,
292
+ ) = prepare(
293
+ hidden_states,
294
+ encoder_hidden_states.repeat(1, sp_size, 1),
295
+ attention_mask.repeat(1, sp_size, 1, 1),
296
+ encoder_attention_mask.repeat(1, sp_size),
297
+ )
298
+
299
+ return hidden_states, encoder_hidden_states, attention_mask, encoder_attention_mask
300
+
301
+
302
+ def sp_parallel_dataloader_wrapper(dataloader, device, train_batch_size,
303
+ sp_size, train_sp_batch_size):
304
+ while True:
305
+ for data_item in dataloader:
306
+ latents, cond, attn_mask, cond_mask = data_item
307
+ latents = latents.to(device)
308
+ cond = cond.to(device)
309
+ attn_mask = attn_mask.to(device)
310
+ cond_mask = cond_mask.to(device)
311
+ frame = latents.shape[2]
312
+ if frame == 1:
313
+ yield latents, cond, attn_mask, cond_mask
314
+ else:
315
+ latents, cond, attn_mask, cond_mask = prepare_sequence_parallel_data(
316
+ latents, cond, attn_mask, cond_mask)
317
+ assert (
318
+ train_batch_size * sp_size >= train_sp_batch_size
319
+ ), "train_batch_size * sp_size should be greater than train_sp_batch_size"
320
+ for iter in range(train_batch_size * sp_size //
321
+ train_sp_batch_size):
322
+ st_idx = iter * train_sp_batch_size
323
+ ed_idx = (iter + 1) * train_sp_batch_size
324
+ encoder_hidden_states = cond[st_idx:ed_idx]
325
+ attention_mask = attn_mask[st_idx:ed_idx]
326
+ encoder_attention_mask = cond_mask[st_idx:ed_idx]
327
+ yield (
328
+ latents[st_idx:ed_idx],
329
+ encoder_hidden_states,
330
+ attention_mask,
331
+ encoder_attention_mask,
332
+ )
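Note: the resharding done by `_all_to_all_4D` with scatter_idx=2 and gather_idx=1 can be verified on a single process by emulating `all_to_all_single` by hand: every rank starts with a sequence shard of all heads and ends with the full sequence for a shard of heads. A minimal sketch, not part of the upload:

    # Single-process emulation of the scatter_idx=2 / gather_idx=1 path.
    import torch

    P = 4                                    # emulated sequence-parallel world size
    bs, seqlen, hc, hs = 2, 8, 8, 16
    s_p, hc_p = seqlen // P, hc // P

    full = torch.randn(bs, seqlen, hc, hs)
    shards = [full[:, r * s_p:(r + 1) * s_p] for r in range(P)]   # per-rank inputs

    # same reshape/transpose staging as _all_to_all_4D
    staged = [x.reshape(bs, s_p, P, hc_p, hs).transpose(0, 2).contiguous() for x in shards]

    # emulate dist.all_to_all_single: rank r receives chunk r from every rank j
    for r in range(P):
        recv = torch.cat([staged[j][r:r + 1] for j in range(P)], dim=0)   # (P, s_p, bs, hc_p, hs)
        out = recv.reshape(seqlen, bs, hc_p, hs).transpose(0, 1).contiguous()
        assert torch.equal(out, full[:, :, r * hc_p:(r + 1) * hc_p])
    print("all-to-all shape contract verified")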
ovi/distributed_comms/distributed/__init__.py ADDED
File without changes
ovi/distributed_comms/distributed/fsdp.py ADDED
@@ -0,0 +1,32 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from functools import partial
3
+
4
+ import torch
5
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
6
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
7
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
8
+
9
+
10
+ def shard_model(
11
+ model,
12
+ device_id,
13
+ param_dtype=torch.bfloat16,
14
+ reduce_dtype=torch.float32,
15
+ buffer_dtype=torch.float32,
16
+ process_group=None,
17
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
18
+ sync_module_states=True,
19
+ ):
20
+ model = FSDP(
21
+ module=model,
22
+ process_group=process_group,
23
+ sharding_strategy=sharding_strategy,
24
+ auto_wrap_policy=partial(
25
+ lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
26
+ mixed_precision=MixedPrecision(
27
+ param_dtype=param_dtype,
28
+ reduce_dtype=reduce_dtype,
29
+ buffer_dtype=buffer_dtype),
30
+ device_id=device_id,
31
+ sync_module_states=sync_module_states)
32
+ return model
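A hypothetical usage sketch of `shard_model`, not part of the upload. It assumes a process group initialized by `torchrun` and a module exposing a `.blocks` ModuleList, since the lambda auto-wrap policy shards exactly those submodules:

    import os

    import torch
    import torch.distributed as dist
    import torch.nn as nn

    from ovi.distributed_comms.distributed.fsdp import shard_model

    class TinyBlocks(nn.Module):
        # toy stand-in: only modules listed in `.blocks` get their own FSDP shard
        def __init__(self, dim=64, depth=4):
            super().__init__()
            self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))

        def forward(self, x):
            for blk in self.blocks:
                x = blk(x)
            return x

    dist.init_process_group("nccl")              # torchrun provides the env:// variables
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    model = shard_model(TinyBlocks().cuda(), device_id=local_rank)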
ovi/distributed_comms/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,192 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ import torch.cuda.amp as amp
4
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
5
+ get_sequence_parallel_world_size,
6
+ get_sp_group)
7
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
8
+
9
+ from ..modules.model import sinusoidal_embedding_1d
10
+
11
+
12
+ def pad_freqs(original_tensor, target_len):
13
+ seq_len, s1, s2 = original_tensor.shape
14
+ pad_size = target_len - seq_len
15
+ padding_tensor = torch.ones(
16
+ pad_size,
17
+ s1,
18
+ s2,
19
+ dtype=original_tensor.dtype,
20
+ device=original_tensor.device)
21
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
22
+ return padded_tensor
23
+
24
+
25
+ @amp.autocast(enabled=False)
26
+ def rope_apply(x, grid_sizes, freqs):
27
+ """
28
+ x: [B, L, N, C].
29
+ grid_sizes: [B, 3].
30
+ freqs: [M, C // 2].
31
+ """
32
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
33
+ # split freqs
34
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
35
+
36
+ # loop over samples
37
+ output = []
38
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
39
+ seq_len = f * h * w
40
+
41
+ # precompute multipliers
42
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
43
+ s, n, -1, 2))
44
+ freqs_i = torch.cat([
45
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
46
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
47
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
48
+ ],
49
+ dim=-1).reshape(seq_len, 1, -1)
50
+
51
+ # apply rotary embedding
52
+ sp_size = get_sequence_parallel_world_size()
53
+ sp_rank = get_sequence_parallel_rank()
54
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
55
+ s_per_rank = s
56
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
57
+ s_per_rank), :, :]
58
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
59
+ x_i = torch.cat([x_i, x[i, s:]])
60
+
61
+ # append to collection
62
+ output.append(x_i)
63
+ return torch.stack(output).float()
64
+
65
+
66
+ def usp_dit_forward(
67
+ self,
68
+ x,
69
+ t,
70
+ context,
71
+ seq_len,
72
+ clip_fea=None,
73
+ y=None,
74
+ ):
75
+ """
76
+ x: A list of videos each with shape [C, T, H, W].
77
+ t: [B].
78
+ context: A list of text embeddings each with shape [L, C].
79
+ """
80
+ if self.model_type == 'i2v':
81
+ assert clip_fea is not None and y is not None
82
+ # params
83
+ device = self.patch_embedding.weight.device
84
+ if self.freqs.device != device:
85
+ self.freqs = self.freqs.to(device)
86
+
87
+ if y is not None:
88
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
89
+
90
+ # embeddings
91
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
92
+ grid_sizes = torch.stack(
93
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
94
+ x = [u.flatten(2).transpose(1, 2) for u in x]
95
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
96
+ assert seq_lens.max() <= seq_len
97
+ x = torch.cat([
98
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
99
+ for u in x
100
+ ])
101
+
102
+ # time embeddings
103
+ with amp.autocast(dtype=torch.float32):
104
+ e = self.time_embedding(
105
+ sinusoidal_embedding_1d(self.freq_dim, t).float())
106
+ e0 = self.time_projection(e).unflatten(1, (6, self.dim))
107
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
108
+
109
+ # context
110
+ context_lens = None
111
+ context = self.text_embedding(
112
+ torch.stack([
113
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
114
+ for u in context
115
+ ]))
116
+
117
+ if clip_fea is not None:
118
+ context_clip = self.img_emb(clip_fea) # bs x 257 x dim
119
+ context = torch.concat([context_clip, context], dim=1)
120
+
121
+ # arguments
122
+ kwargs = dict(
123
+ e=e0,
124
+ seq_lens=seq_lens,
125
+ grid_sizes=grid_sizes,
126
+ freqs=self.freqs,
127
+ context=context,
128
+ context_lens=context_lens)
129
+
130
+ # Context Parallel
131
+ x = torch.chunk(
132
+ x, get_sequence_parallel_world_size(),
133
+ dim=1)[get_sequence_parallel_rank()]
134
+
135
+ for block in self.blocks:
136
+ x = block(x, **kwargs)
137
+
138
+ # head
139
+ x = self.head(x, e)
140
+
141
+ # Context Parallel
142
+ x = get_sp_group().all_gather(x, dim=1)
143
+
144
+ # unpatchify
145
+ x = self.unpatchify(x, grid_sizes)
146
+ return [u.float() for u in x]
147
+
148
+
149
+ def usp_attn_forward(self,
150
+ x,
151
+ seq_lens,
152
+ grid_sizes,
153
+ freqs,
154
+ dtype=torch.bfloat16):
155
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
156
+ half_dtypes = (torch.float16, torch.bfloat16)
157
+
158
+ def half(x):
159
+ return x if x.dtype in half_dtypes else x.to(dtype)
160
+
161
+ # query, key, value function
162
+ def qkv_fn(x):
163
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
164
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
165
+ v = self.v(x).view(b, s, n, d)
166
+ return q, k, v
167
+
168
+ q, k, v = qkv_fn(x)
169
+ q = rope_apply(q, grid_sizes, freqs)
170
+ k = rope_apply(k, grid_sizes, freqs)
171
+
172
+ # TODO: We should use unpaded q,k,v for attention.
173
+ # k_lens = seq_lens // get_sequence_parallel_world_size()
174
+ # if k_lens is not None:
175
+ # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
176
+ # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
177
+ # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
178
+
179
+ x = xFuserLongContextAttention()(
180
+ None,
181
+ query=half(q),
182
+ key=half(k),
183
+ value=half(v),
184
+ window_size=self.window_size)
185
+
186
+ # TODO: padding after attention.
187
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
188
+
189
+ # output
190
+ x = x.flatten(2)
191
+ x = self.o(x)
192
+ return x
ovi/distributed_comms/parallel_states.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+
3
+ import torch.distributed as dist
4
+
5
+
6
+ class COMM_INFO:
7
+
8
+ def __init__(self):
9
+ self.group = None
10
+ self.sp_size = 1
11
+ self.global_rank = 0
12
+ self.rank_within_group = 0
13
+ self.group_id = 0
14
+
15
+
16
+ nccl_info = COMM_INFO()
17
+ _SEQUENCE_PARALLEL_STATE = False
18
+
19
+
20
+ def initialize_sequence_parallel_state(sequence_parallel_size):
21
+ global _SEQUENCE_PARALLEL_STATE
22
+ if sequence_parallel_size > 1:
23
+ _SEQUENCE_PARALLEL_STATE = True
24
+ initialize_sequence_parallel_group(sequence_parallel_size)
25
+ else:
26
+ nccl_info.sp_size = 1
27
+ nccl_info.global_rank = int(os.getenv("RANK", "0"))
28
+ nccl_info.rank_within_group = 0
29
+ nccl_info.group_id = int(os.getenv("RANK", "0"))
30
+
31
+
32
+ def set_sequence_parallel_state(state):
33
+ global _SEQUENCE_PARALLEL_STATE
34
+ _SEQUENCE_PARALLEL_STATE = state
35
+
36
+
37
+ def get_sequence_parallel_state():
38
+ return _SEQUENCE_PARALLEL_STATE
39
+
40
+
41
+ def initialize_sequence_parallel_group(sequence_parallel_size):
42
+ """Initialize the sequence parallel group."""
43
+ rank = int(os.getenv("RANK", "0"))
44
+ world_size = int(os.getenv("WORLD_SIZE", "1"))
45
+ assert (
46
+ world_size % sequence_parallel_size == 0
47
+ ), "world_size must be divisible by sequence_parallel_size, but got world_size: {}, sequence_parallel_size: {}".format(
48
+ world_size, sequence_parallel_size)
49
+ nccl_info.sp_size = sequence_parallel_size
50
+ nccl_info.global_rank = rank
51
+ num_sequence_parallel_groups: int = world_size // sequence_parallel_size
52
+ for i in range(num_sequence_parallel_groups):
53
+ ranks = range(i * sequence_parallel_size,
54
+ (i + 1) * sequence_parallel_size)
55
+ group = dist.new_group(ranks)
56
+ if rank in ranks:
57
+ nccl_info.group = group
58
+ nccl_info.rank_within_group = rank - i * sequence_parallel_size
59
+ nccl_info.group_id = i
60
+
61
+
62
+
63
+ def initialize_sequence_parallel_group_custom(process_group):
64
+ """Initialize an unsafe sequence parallel group with a pre-formed group."""
65
+ set_sequence_parallel_state(True)
66
+ rank = dist.get_rank(group=process_group)
67
+ sequence_parallel_size = dist.get_world_size(group=process_group)
68
+ nccl_info.sp_size = sequence_parallel_size
69
+ nccl_info.global_rank = dist.get_rank() # global rank
70
+ nccl_info.group = process_group
71
+ nccl_info.rank_within_group = rank
72
+ nccl_info.group_id = 0
73
+
74
+
75
+ def destroy_sequence_parallel_group():
76
+ """Destroy the sequence parallel group."""
77
+ dist.destroy_process_group()
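Note: `initialize_sequence_parallel_group` assigns consecutive ranks to the same sequence-parallel group. A minimal sketch, not part of the upload, of the bookkeeping it records for each rank when world_size is 8 and sp_size is 2; the first rank of each group is also the source used by `broadcast` in communications.py (`src = group_id * sp_size`):

    world_size, sp_size = 8, 2
    for rank in range(world_size):
        group_id = rank // sp_size
        rank_within_group = rank - group_id * sp_size
        members = list(range(group_id * sp_size, (group_id + 1) * sp_size))
        # members[0] is the broadcast source for this group
        print(f"rank {rank}: sp group {group_id}, local index {rank_within_group}, members {members}")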
ovi/distributed_comms/util.py ADDED
@@ -0,0 +1,48 @@
1
+
2
+ import os
3
+ import torch
4
+ import torch.distributed as dist
5
+
6
+
7
+ def get_global_rank() -> int:
8
+ """
9
+ Get the global rank, the global index of the GPU.
10
+ """
11
+ return int(os.environ.get("RANK", "0"))
12
+
13
+
14
+ def get_local_rank() -> int:
15
+ """
16
+ Get the local rank, the local index of the GPU.
17
+ """
18
+ return int(os.environ.get("LOCAL_RANK", "0"))
19
+
20
+
21
+ def get_world_size() -> int:
22
+ """
23
+ Get the world size, the total number of GPUs.
24
+ """
25
+ return int(os.environ.get("WORLD_SIZE", "1"))
26
+
27
+
28
+ def get_device() -> torch.device:
29
+ """
30
+ Get current rank device.
31
+ """
32
+ return torch.device("cuda", get_local_rank())
33
+
34
+ def get_sequence_parallel_group():
35
+ """Get the sequence parallel group the caller rank belongs to."""
36
+ return _SEQUENCE_PARALLEL_GROUP
37
+
38
+ def initialize_sequence_parallelism(sequence_parallel_size):
39
+ assert int(get_world_size()) % sequence_parallel_size == 0
40
+ sequence_parallel_num_groups = int(get_world_size()) // sequence_parallel_size
41
+ global _SEQUENCE_PARALLEL_GROUP
42
+ for i in range(sequence_parallel_num_groups):
43
+ ranks = range(i * sequence_parallel_size,
44
+ (i + 1) * sequence_parallel_size)
45
+ group = torch.distributed.new_group(ranks)
46
+ if int(get_global_rank()) in ranks:
47
+ print(f"Rank {get_global_rank()} joined group with ranks {list(ranks)}")
48
+ _SEQUENCE_PARALLEL_GROUP = group
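Note: these helpers only read the environment variables a launcher such as `torchrun` exports (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`); without a launcher they fall back to a single-process default. A minimal sketch, not part of the upload:

    import os

    from ovi.distributed_comms.util import get_global_rank, get_local_rank, get_world_size

    # without torchrun, nothing is set and the defaults apply
    print(get_global_rank(), get_local_rank(), get_world_size())   # 0 0 1

    # torchrun would export something like this for rank 3 of an 8-GPU job
    os.environ.update({"RANK": "3", "LOCAL_RANK": "3", "WORLD_SIZE": "8"})
    print(get_global_rank(), get_local_rank(), get_world_size())   # 3 3 8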
ovi/modules/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .attention import flash_attention
2
+ from .model import WanModel
3
+ from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
4
+ from .tokenizers import HuggingfaceTokenizer
5
+ from .vae import WanVAE
6
+
7
+ __all__ = [
8
+ 'WanVAE',
9
+ 'WanModel',
10
+ 'T5Model',
11
+ 'T5Encoder',
12
+ 'T5Decoder',
13
+ 'T5EncoderModel',
14
+ 'HuggingfaceTokenizer',
15
+ 'flash_attention',
16
+ ]
ovi/modules/attention.py ADDED
@@ -0,0 +1,296 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+
4
+ try:
5
+ import flash_attn_interface
6
+ FLASH_ATTN_3_AVAILABLE = True
7
+ except ModuleNotFoundError:
8
+ FLASH_ATTN_3_AVAILABLE = False
9
+
10
+ try:
11
+ import flash_attn
12
+ FLASH_ATTN_2_AVAILABLE = True
13
+ except ModuleNotFoundError:
14
+ FLASH_ATTN_2_AVAILABLE = False
15
+
16
+ import warnings
17
+
18
+ __all__ = [
19
+ 'flash_attention',
20
+ 'attention',
21
+ 'attention_with_weights',
22
+ ]
23
+
24
+
25
+ def flash_attention(
26
+ q,
27
+ k,
28
+ v,
29
+ q_lens=None,
30
+ k_lens=None,
31
+ dropout_p=0.,
32
+ softmax_scale=None,
33
+ q_scale=None,
34
+ causal=False,
35
+ window_size=(-1, -1),
36
+ deterministic=False,
37
+ dtype=torch.bfloat16,
38
+ version=None
39
+ ):
40
+ """
41
+ q: [B, Lq, Nq, C1].
42
+ k: [B, Lk, Nk, C1].
43
+ v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
44
+ q_lens: [B].
45
+ k_lens: [B].
46
+ dropout_p: float. Dropout probability.
47
+ softmax_scale: float. The scaling of QK^T before applying softmax.
48
+ causal: bool. Whether to apply causal attention mask.
49
+ window_size: (left, right). If not (-1, -1), apply sliding window local attention.
50
+ deterministic: bool. If True, slightly slower and uses more memory.
51
+ dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
52
+ """
53
+ half_dtypes = (torch.float16, torch.bfloat16)
54
+ assert dtype in half_dtypes
55
+ assert q.device.type == 'cuda' and q.size(-1) <= 256
56
+
57
+ # params
58
+ b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
59
+
60
+ def half(x):
61
+ return x if x.dtype in half_dtypes else x.to(dtype)
62
+
63
+ # preprocess query
64
+ if q_lens is None:
65
+ q = half(q.flatten(0, 1))
66
+ q_lens = torch.tensor(
67
+ [lq] * b, dtype=torch.int32).to(
68
+ device=q.device, non_blocking=True)
69
+ else:
70
+ q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
71
+
72
+ # preprocess key, value
73
+ if k_lens is None:
74
+ k = half(k.flatten(0, 1))
75
+ v = half(v.flatten(0, 1))
76
+ k_lens = torch.tensor(
77
+ [lk] * b, dtype=torch.int32).to(
78
+ device=k.device, non_blocking=True)
79
+ else:
80
+ k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
81
+ v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
82
+
83
+ q = q.to(v.dtype)
84
+ k = k.to(v.dtype)
85
+
86
+ if q_scale is not None:
87
+ q = q * q_scale
88
+
89
+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
90
+ warnings.warn(
91
+ 'Flash attention 3 is not available, using flash attention 2 instead.'
92
+ )
93
+
94
+ # apply attention
95
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
96
+ # Note: dropout_p, window_size are not supported in FA3 now.
97
+ x = flash_attn_interface.flash_attn_varlen_func(
98
+ q=q,
99
+ k=k,
100
+ v=v,
101
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
102
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
103
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
104
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
105
+ seqused_q=None,
106
+ seqused_k=None,
107
+ max_seqlen_q=lq,
108
+ max_seqlen_k=lk,
109
+ softmax_scale=softmax_scale,
110
+ causal=causal,
111
+ deterministic=deterministic)
112
+
113
+ if isinstance(x, tuple):
114
+ x = x[0]
115
+ x = x.unflatten(0, (b, lq))
116
+
117
+ else:
118
+ assert FLASH_ATTN_2_AVAILABLE
119
+ x = flash_attn.flash_attn_varlen_func(
120
+ q=q,
121
+ k=k,
122
+ v=v,
123
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
124
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
125
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
126
+ 0, dtype=torch.int32).to(q.device, non_blocking=True),
127
+ max_seqlen_q=lq,
128
+ max_seqlen_k=lk,
129
+ dropout_p=dropout_p,
130
+ softmax_scale=softmax_scale,
131
+ causal=causal,
132
+ window_size=window_size,
133
+ deterministic=deterministic).unflatten(0, (b, lq))
134
+
135
+ # output
136
+ return x.type(out_dtype)
137
+
138
+
139
+ def attention_with_weights(
140
+ q,
141
+ k,
142
+ v,
143
+ q_lens=None,
144
+ k_lens=None,
145
+ softmax_scale=None,
146
+ q_scale=None,
147
+ causal=False,
148
+ average_for_q=False,
149
+ total_video_latent_frames = 21
150
+ ):
151
+ """
152
+ Compute attention with explicit attention weights for visualization.
153
+ Returns both output and attention weights.
154
+ """
155
+ out_dtype = q.dtype
156
+
157
+ # Handle sequence lengths
158
+ b, lq, lk = q.size(0), q.size(1), k.size(1)
159
+
160
+ if q_lens is None:
161
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32, device=q.device)
162
+ else:
163
+ # Ensure q_lens is on the same device as q
164
+ q_lens = q_lens.to(q.device)
165
+
166
+ if k_lens is None:
167
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32, device=k.device)
168
+ else:
169
+ # Ensure k_lens is on the same device as k
170
+ k_lens = k_lens.to(k.device)
171
+
172
+ # Apply q_scale if provided
173
+ if q_scale is not None:
174
+ q = q * q_scale
175
+
176
+ # Compute attention weights manually
177
+ # q: [B, Lq, Nq, C], k: [B, Lk, Nk, C]
178
+ scale = softmax_scale if softmax_scale is not None else (q.size(-1) ** -0.5)
179
+
180
+ # Compute scores: [B, Nq, Lq, Lk]
181
+ scores = torch.einsum('blhd,bshd->bhls', q, k) * scale
182
+
183
+ # Apply causal mask if needed
184
+ if causal:
185
+ mask = torch.triu(torch.ones(lq, lk, device=q.device, dtype=torch.bool), diagonal=1)
186
+ scores.masked_fill_(mask.unsqueeze(0).unsqueeze(0), float('-inf'))
187
+
188
+ # Mask for k_lens (columns)
189
+ k_mask = torch.arange(lk, device=k.device).unsqueeze(0) >= k_lens.unsqueeze(1) # [B, Lk]
190
+ scores.masked_fill_(k_mask.unsqueeze(1).unsqueeze(2), float('-inf')) # [B, 1, 1, Lk]
191
+
192
+ # Mask for q_lens (rows)
193
+ q_mask = torch.arange(lq, device=q.device).unsqueeze(0) >= q_lens.unsqueeze(1) # [B, Lq]
194
+ scores.masked_fill_(q_mask.unsqueeze(1).unsqueeze(3), float('-inf')) # [B, 1, Lq, 1]
195
+
196
+ # Compute attention weights
197
+ attn_weights = torch.softmax(scores, dim=-1) # [B, Nq, Lq, Lk]
198
+ assert attn_weights.shape[0] == 1, "Batch size > 1 not supported for attention visualization."
199
+
200
+ # Average attention weights to reduce memory usage before returning
201
+ # Average across batch dimension (should be 1) and query heads and query sequence length
202
+ # This gives us attention weight per video token: [Lk]
203
+ if average_for_q:
204
+ #avg_attn_weights = torch.mean(attn_weights, dim=(0, 1, 3)) # [Lq]
205
+ avg_attn_weights = torch.max(attn_weights, dim=3)[0].mean(dim=(0, 1)) # [Lq]
206
+ else:
207
+ if 0:
208
+ avg_attn_weights = torch.mean(attn_weights, dim=(0, 1, 2)) # [Lk]
209
+ elif 1:
210
+ B, H, Lq, Lk = attn_weights.shape # [1, H, Lq, Lk]
211
+ per_frame_seq_len = Lk // total_video_latent_frames
212
+ per_frame_aud_len = Lq // total_video_latent_frames
213
+
214
+ avg_attn_weights = torch.zeros((Lk,), device=attn_weights.device, dtype=attn_weights.dtype)
215
+
216
+ eps = 1e-8 # numerical stability
217
+ for i in range(total_video_latent_frames):
218
+ start_idx_v = i * per_frame_seq_len
219
+ end_idx_v = (i + 1) * per_frame_seq_len
220
+
221
+ start_idx_a = i * per_frame_aud_len
222
+ end_idx_a = (i + 1) * per_frame_aud_len
223
+
224
+ # attn_chunk: [H, La, Lv]
225
+ attn_chunk = attn_weights[0, :, start_idx_a:end_idx_a, start_idx_v:end_idx_v]
226
+
227
+ # ---- Head informativeness via (low) entropy over Lv ----
228
+ # Normalize within the Lv slice per (head, query) to make a proper distribution
229
+ p = attn_chunk / (attn_chunk.sum(dim=-1, keepdim=True) + eps) # [H, La, Lv]
230
+ entropy = -(p * (p + eps).log()).sum(dim=-1).mean(dim=1) # [H]
231
+
232
+ # Convert to positive head weights (lower entropy -> larger weight)
233
+ saliency = 1.0 / (entropy + 1e-6) # [H]
234
+ head_w = saliency / (saliency.sum() + eps) # [H], sum=1
235
+
236
+ # Reduce across audio queries first (pick strong responses), then weight heads
237
+ per_head = torch.amax(attn_chunk, dim=1) # [H, Lv]
238
+ weighted = (per_head * head_w[:, None]).sum(dim=0) # [Lv]
239
+
240
+ avg_attn_weights[start_idx_v:end_idx_v] = weighted
243
+
244
+ # Compute output: [B, Lq, Nq, C]
245
+ out = torch.einsum('bhls,bshd->blhd', attn_weights, v)
246
+
247
+ return out.to(out_dtype), avg_attn_weights.to(out_dtype)
248
+
249
+
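# To make the entropy-based head weighting above concrete, a minimal sketch with
# toy numbers (illustration only, not part of the uploaded file): a head whose
# attention over the video tokens is peaked has low entropy and gets a large
# weight, while a uniformly attending head is down-weighted.
import torch

attn_chunk = torch.tensor([[[0.85, 0.05, 0.05, 0.05]],
                           [[0.25, 0.25, 0.25, 0.25]]])       # [H=2, La=1, Lv=4]
eps = 1e-8
p = attn_chunk / (attn_chunk.sum(dim=-1, keepdim=True) + eps)
entropy = -(p * (p + eps).log()).sum(dim=-1).mean(dim=1)      # [H] ~ [0.59, 1.39]
head_w = 1.0 / (entropy + 1e-6)
head_w = head_w / head_w.sum()                                # ~ [0.70, 0.30]
per_head = torch.amax(attn_chunk, dim=1)                      # strongest response per video token, [H, Lv]
weighted = (per_head * head_w[:, None]).sum(dim=0)            # [Lv] ~ [0.67, 0.11, 0.11, 0.11]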
250
+ def attention(
251
+ q,
252
+ k,
253
+ v,
254
+ q_lens=None,
255
+ k_lens=None,
256
+ dropout_p=0.,
257
+ softmax_scale=None,
258
+ q_scale=None,
259
+ causal=False,
260
+ window_size=(-1, -1),
261
+ deterministic=False,
262
+ dtype=torch.bfloat16,
263
+ fa_version=None,
264
+ ):
265
+ if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
266
+ return flash_attention(
267
+ q=q,
268
+ k=k,
269
+ v=v,
270
+ q_lens=q_lens,
271
+ k_lens=k_lens,
272
+ dropout_p=dropout_p,
273
+ softmax_scale=softmax_scale,
274
+ q_scale=q_scale,
275
+ causal=causal,
276
+ window_size=window_size,
277
+ deterministic=deterministic,
278
+ dtype=dtype,
279
+ version=fa_version,
280
+ )
281
+ else:
282
+ if q_lens is not None or k_lens is not None:
283
+ warnings.warn(
284
+ 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.'
285
+ )
286
+ attn_mask = None
287
+
288
+ q = q.transpose(1, 2).to(dtype)
289
+ k = k.transpose(1, 2).to(dtype)
290
+ v = v.transpose(1, 2).to(dtype)
291
+
292
+ out = torch.nn.functional.scaled_dot_product_attention(
293
+ q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p)
294
+
295
+ out = out.transpose(1, 2).contiguous()
296
+ return out
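Note that the SDPA fallback above drops the per-sample padding masks. If that matters for a given workload, a boolean mask can be built from k_lens and passed to scaled_dot_product_attention. The sketch below is illustrative only, uses the same [B, L, N, C] layout as the wrapper, and the helper name is made up:

    import torch
    import torch.nn.functional as F

    def sdpa_with_k_lens(q, k, v, k_lens=None, causal=False, dropout_p=0.0):
        # q, k, v: [B, L, N, C]; returns [B, Lq, N, C].
        b, lq, lk = q.size(0), q.size(1), k.size(1)
        attn_mask = None
        if k_lens is not None:
            # True = keep, False = masked out; broadcast over heads and query positions.
            valid = torch.arange(lk, device=k.device)[None, :] < k_lens[:, None]   # [B, Lk]
            attn_mask = valid[:, None, None, :]                                     # [B, 1, 1, Lk]
        if causal:
            # fold causality into the mask instead of passing is_causal alongside attn_mask
            tri = torch.ones(lq, lk, dtype=torch.bool, device=q.device).tril()
            attn_mask = tri if attn_mask is None else (attn_mask & tri)
        out = F.scaled_dot_product_attention(
            q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
            attn_mask=attn_mask, dropout_p=dropout_p)
        return out.transpose(1, 2).contiguous()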
ovi/modules/clip.py ADDED
@@ -0,0 +1,545 @@
1
+ # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torchvision.transforms as T
10
+
11
+ from .attention import flash_attention
12
+ from .tokenizers import HuggingfaceTokenizer
13
+ from .xlm_roberta import XLMRoberta
14
+
15
+ __all__ = [
16
+ 'XLMRobertaCLIP',
17
+ 'clip_xlm_roberta_vit_h_14',
18
+ 'CLIPModel',
19
+ ]
20
+
21
+
22
+ def pos_interpolate(pos, seq_len):
23
+ if pos.size(1) == seq_len:
24
+ return pos
25
+ else:
26
+ src_grid = int(math.sqrt(pos.size(1)))
27
+ tar_grid = int(math.sqrt(seq_len))
28
+ n = pos.size(1) - src_grid * src_grid
29
+ return torch.cat([
30
+ pos[:, :n],
31
+ F.interpolate(
32
+ pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(
33
+ 0, 3, 1, 2),
34
+ size=(tar_grid, tar_grid),
35
+ mode='bicubic',
36
+ align_corners=False).flatten(2).transpose(1, 2)
37
+ ],
38
+ dim=1)
39
+
40
+
41
+ class QuickGELU(nn.Module):
42
+
43
+ def forward(self, x):
44
+ return x * torch.sigmoid(1.702 * x)
45
+
46
+
47
+ class LayerNorm(nn.LayerNorm):
48
+
49
+ def forward(self, x):
50
+ return super().forward(x.float()).type_as(x)
51
+
52
+
53
+ class SelfAttention(nn.Module):
54
+
55
+ def __init__(self,
56
+ dim,
57
+ num_heads,
58
+ causal=False,
59
+ attn_dropout=0.0,
60
+ proj_dropout=0.0):
61
+ assert dim % num_heads == 0
62
+ super().__init__()
63
+ self.dim = dim
64
+ self.num_heads = num_heads
65
+ self.head_dim = dim // num_heads
66
+ self.causal = causal
67
+ self.attn_dropout = attn_dropout
68
+ self.proj_dropout = proj_dropout
69
+
70
+ # layers
71
+ self.to_qkv = nn.Linear(dim, dim * 3)
72
+ self.proj = nn.Linear(dim, dim)
73
+
74
+ def forward(self, x):
75
+ """
76
+ x: [B, L, C].
77
+ """
78
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
79
+
80
+ # compute query, key, value
81
+ q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
82
+
83
+ # compute attention
84
+ p = self.attn_dropout if self.training else 0.0
85
+ x = flash_attention(q, k, v, dropout_p=p, causal=self.causal, version=2)
86
+ # x = flash_attention(q, k, v, dropout_p=p, causal=self.causal)
87
+ x = x.reshape(b, s, c)
88
+
89
+ # output
90
+ x = self.proj(x)
91
+ x = F.dropout(x, self.proj_dropout, self.training)
92
+ return x
93
+
94
+
95
+ class SwiGLU(nn.Module):
96
+
97
+ def __init__(self, dim, mid_dim):
98
+ super().__init__()
99
+ self.dim = dim
100
+ self.mid_dim = mid_dim
101
+
102
+ # layers
103
+ self.fc1 = nn.Linear(dim, mid_dim)
104
+ self.fc2 = nn.Linear(dim, mid_dim)
105
+ self.fc3 = nn.Linear(mid_dim, dim)
106
+
107
+ def forward(self, x):
108
+ x = F.silu(self.fc1(x)) * self.fc2(x)
109
+ x = self.fc3(x)
110
+ return x
111
+
112
+
113
+ class AttentionBlock(nn.Module):
114
+
115
+ def __init__(self,
116
+ dim,
117
+ mlp_ratio,
118
+ num_heads,
119
+ post_norm=False,
120
+ causal=False,
121
+ activation='quick_gelu',
122
+ attn_dropout=0.0,
123
+ proj_dropout=0.0,
124
+ norm_eps=1e-5):
125
+ assert activation in ['quick_gelu', 'gelu', 'swi_glu']
126
+ super().__init__()
127
+ self.dim = dim
128
+ self.mlp_ratio = mlp_ratio
129
+ self.num_heads = num_heads
130
+ self.post_norm = post_norm
131
+ self.causal = causal
132
+ self.norm_eps = norm_eps
133
+
134
+ # layers
135
+ self.norm1 = LayerNorm(dim, eps=norm_eps)
136
+ self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
137
+ proj_dropout)
138
+ self.norm2 = LayerNorm(dim, eps=norm_eps)
139
+ if activation == 'swi_glu':
140
+ self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
141
+ else:
142
+ self.mlp = nn.Sequential(
143
+ nn.Linear(dim, int(dim * mlp_ratio)),
144
+ QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
145
+ nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
146
+
147
+ def forward(self, x):
148
+ if self.post_norm:
149
+ x = x + self.norm1(self.attn(x))
150
+ x = x + self.norm2(self.mlp(x))
151
+ else:
152
+ x = x + self.attn(self.norm1(x))
153
+ x = x + self.mlp(self.norm2(x))
154
+ return x
155
+
156
+
157
+ class AttentionPool(nn.Module):
158
+
159
+ def __init__(self,
160
+ dim,
161
+ mlp_ratio,
162
+ num_heads,
163
+ activation='gelu',
164
+ proj_dropout=0.0,
165
+ norm_eps=1e-5):
166
+ assert dim % num_heads == 0
167
+ super().__init__()
168
+ self.dim = dim
169
+ self.mlp_ratio = mlp_ratio
170
+ self.num_heads = num_heads
171
+ self.head_dim = dim // num_heads
172
+ self.proj_dropout = proj_dropout
173
+ self.norm_eps = norm_eps
174
+
175
+ # layers
176
+ gain = 1.0 / math.sqrt(dim)
177
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
178
+ self.to_q = nn.Linear(dim, dim)
179
+ self.to_kv = nn.Linear(dim, dim * 2)
180
+ self.proj = nn.Linear(dim, dim)
181
+ self.norm = LayerNorm(dim, eps=norm_eps)
182
+ self.mlp = nn.Sequential(
183
+ nn.Linear(dim, int(dim * mlp_ratio)),
184
+ QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
185
+ nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
186
+
187
+ def forward(self, x):
188
+ """
189
+ x: [B, L, C].
190
+ """
191
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
192
+
193
+ # compute query, key, value
194
+ q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
195
+ k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
196
+
197
+ # compute attention
198
+ x = flash_attention(q, k, v, version=2)
199
+ # x = flash_attention(q, k, v)
200
+
201
+ x = x.reshape(b, 1, c)
202
+
203
+ # output
204
+ x = self.proj(x)
205
+ x = F.dropout(x, self.proj_dropout, self.training)
206
+
207
+ # mlp
208
+ x = x + self.mlp(self.norm(x))
209
+ return x[:, 0]
210
+
211
+
212
+ class VisionTransformer(nn.Module):
213
+
214
+ def __init__(self,
215
+ image_size=224,
216
+ patch_size=16,
217
+ dim=768,
218
+ mlp_ratio=4,
219
+ out_dim=512,
220
+ num_heads=12,
221
+ num_layers=12,
222
+ pool_type='token',
223
+ pre_norm=True,
224
+ post_norm=False,
225
+ activation='quick_gelu',
226
+ attn_dropout=0.0,
227
+ proj_dropout=0.0,
228
+ embedding_dropout=0.0,
229
+ norm_eps=1e-5):
230
+ if image_size % patch_size != 0:
231
+ print(
232
+ '[WARNING] image_size is not divisible by patch_size',
233
+ flush=True)
234
+ assert pool_type in ('token', 'token_fc', 'attn_pool')
235
+ out_dim = out_dim or dim
236
+ super().__init__()
237
+ self.image_size = image_size
238
+ self.patch_size = patch_size
239
+ self.num_patches = (image_size // patch_size)**2
240
+ self.dim = dim
241
+ self.mlp_ratio = mlp_ratio
242
+ self.out_dim = out_dim
243
+ self.num_heads = num_heads
244
+ self.num_layers = num_layers
245
+ self.pool_type = pool_type
246
+ self.post_norm = post_norm
247
+ self.norm_eps = norm_eps
248
+
249
+ # embeddings
250
+ gain = 1.0 / math.sqrt(dim)
251
+ self.patch_embedding = nn.Conv2d(
252
+ 3,
253
+ dim,
254
+ kernel_size=patch_size,
255
+ stride=patch_size,
256
+ bias=not pre_norm)
257
+ if pool_type in ('token', 'token_fc'):
258
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
259
+ self.pos_embedding = nn.Parameter(gain * torch.randn(
260
+ 1, self.num_patches +
261
+ (1 if pool_type in ('token', 'token_fc') else 0), dim))
262
+ self.dropout = nn.Dropout(embedding_dropout)
263
+
264
+ # transformer
265
+ self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
266
+ self.transformer = nn.Sequential(*[
267
+ AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
268
+ activation, attn_dropout, proj_dropout, norm_eps)
269
+ for _ in range(num_layers)
270
+ ])
271
+ self.post_norm = LayerNorm(dim, eps=norm_eps)
272
+
273
+ # head
274
+ if pool_type == 'token':
275
+ self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
276
+ elif pool_type == 'token_fc':
277
+ self.head = nn.Linear(dim, out_dim)
278
+ elif pool_type == 'attn_pool':
279
+ self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
280
+ proj_dropout, norm_eps)
281
+
282
+ def forward(self, x, interpolation=False, use_31_block=False):
283
+ b = x.size(0)
284
+
285
+ # embeddings
286
+ x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
287
+ if self.pool_type in ('token', 'token_fc'):
288
+ x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
289
+ if interpolation:
290
+ e = pos_interpolate(self.pos_embedding, x.size(1))
291
+ else:
292
+ e = self.pos_embedding
293
+ x = self.dropout(x + e)
294
+ if self.pre_norm is not None:
295
+ x = self.pre_norm(x)
296
+
297
+ # transformer
298
+ if use_31_block:
299
+ x = self.transformer[:-1](x)
300
+ return x
301
+ else:
302
+ x = self.transformer(x)
303
+ return x
304
+
305
+
306
+ class XLMRobertaWithHead(XLMRoberta):
307
+
308
+ def __init__(self, **kwargs):
309
+ self.out_dim = kwargs.pop('out_dim')
310
+ super().__init__(**kwargs)
311
+
312
+ # head
313
+ mid_dim = (self.dim + self.out_dim) // 2
314
+ self.head = nn.Sequential(
315
+ nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(),
316
+ nn.Linear(mid_dim, self.out_dim, bias=False))
317
+
318
+ def forward(self, ids):
319
+ # xlm-roberta
320
+ x = super().forward(ids)
321
+
322
+ # average pooling
323
+ mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
324
+ x = (x * mask).sum(dim=1) / mask.sum(dim=1)
325
+
326
+ # head
327
+ x = self.head(x)
328
+ return x
329
+
330
+
331
+ class XLMRobertaCLIP(nn.Module):
332
+
333
+ def __init__(self,
334
+ embed_dim=1024,
335
+ image_size=224,
336
+ patch_size=14,
337
+ vision_dim=1280,
338
+ vision_mlp_ratio=4,
339
+ vision_heads=16,
340
+ vision_layers=32,
341
+ vision_pool='token',
342
+ vision_pre_norm=True,
343
+ vision_post_norm=False,
344
+ activation='gelu',
345
+ vocab_size=250002,
346
+ max_text_len=514,
347
+ type_size=1,
348
+ pad_id=1,
349
+ text_dim=1024,
350
+ text_heads=16,
351
+ text_layers=24,
352
+ text_post_norm=True,
353
+ text_dropout=0.1,
354
+ attn_dropout=0.0,
355
+ proj_dropout=0.0,
356
+ embedding_dropout=0.0,
357
+ norm_eps=1e-5):
358
+ super().__init__()
359
+ self.embed_dim = embed_dim
360
+ self.image_size = image_size
361
+ self.patch_size = patch_size
362
+ self.vision_dim = vision_dim
363
+ self.vision_mlp_ratio = vision_mlp_ratio
364
+ self.vision_heads = vision_heads
365
+ self.vision_layers = vision_layers
366
+ self.vision_pre_norm = vision_pre_norm
367
+ self.vision_post_norm = vision_post_norm
368
+ self.activation = activation
369
+ self.vocab_size = vocab_size
370
+ self.max_text_len = max_text_len
371
+ self.type_size = type_size
372
+ self.pad_id = pad_id
373
+ self.text_dim = text_dim
374
+ self.text_heads = text_heads
375
+ self.text_layers = text_layers
376
+ self.text_post_norm = text_post_norm
377
+ self.norm_eps = norm_eps
378
+
379
+ # models
380
+ self.visual = VisionTransformer(
381
+ image_size=image_size,
382
+ patch_size=patch_size,
383
+ dim=vision_dim,
384
+ mlp_ratio=vision_mlp_ratio,
385
+ out_dim=embed_dim,
386
+ num_heads=vision_heads,
387
+ num_layers=vision_layers,
388
+ pool_type=vision_pool,
389
+ pre_norm=vision_pre_norm,
390
+ post_norm=vision_post_norm,
391
+ activation=activation,
392
+ attn_dropout=attn_dropout,
393
+ proj_dropout=proj_dropout,
394
+ embedding_dropout=embedding_dropout,
395
+ norm_eps=norm_eps)
396
+ self.textual = XLMRobertaWithHead(
397
+ vocab_size=vocab_size,
398
+ max_seq_len=max_text_len,
399
+ type_size=type_size,
400
+ pad_id=pad_id,
401
+ dim=text_dim,
402
+ out_dim=embed_dim,
403
+ num_heads=text_heads,
404
+ num_layers=text_layers,
405
+ post_norm=text_post_norm,
406
+ dropout=text_dropout)
407
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
408
+
409
+ def forward(self, imgs, txt_ids):
410
+ """
411
+ imgs: [B, 3, H, W] of torch.float32.
412
+ - mean: [0.48145466, 0.4578275, 0.40821073]
413
+ - std: [0.26862954, 0.26130258, 0.27577711]
414
+ txt_ids: [B, L] of torch.long.
415
+ Encoded by data.CLIPTokenizer.
416
+ """
417
+ xi = self.visual(imgs)
418
+ xt = self.textual(txt_ids)
419
+ return xi, xt
420
+
421
+ def param_groups(self):
422
+ groups = [{
423
+ 'params': [
424
+ p for n, p in self.named_parameters()
425
+ if 'norm' in n or n.endswith('bias')
426
+ ],
427
+ 'weight_decay': 0.0
428
+ }, {
429
+ 'params': [
430
+ p for n, p in self.named_parameters()
431
+ if not ('norm' in n or n.endswith('bias'))
432
+ ]
433
+ }]
434
+ return groups
435
+
436
+
437
+ def _clip(pretrained=False,
438
+ pretrained_name=None,
439
+ model_cls=XLMRobertaCLIP,
440
+ return_transforms=False,
441
+ return_tokenizer=False,
442
+ tokenizer_padding='eos',
443
+ dtype=torch.float32,
444
+ device='cpu',
445
+ **kwargs):
446
+ # init a model on device
447
+ with torch.device(device):
448
+ model = model_cls(**kwargs)
449
+
450
+ # set device
451
+ model = model.to(dtype=dtype, device=device)
452
+ output = (model,)
453
+
454
+ # init transforms
455
+ if return_transforms:
456
+ # mean and std
457
+ if 'siglip' in pretrained_name.lower():
458
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
459
+ else:
460
+ mean = [0.48145466, 0.4578275, 0.40821073]
461
+ std = [0.26862954, 0.26130258, 0.27577711]
462
+
463
+ # transforms
464
+ transforms = T.Compose([
465
+ T.Resize((model.image_size, model.image_size),
466
+ interpolation=T.InterpolationMode.BICUBIC),
467
+ T.ToTensor(),
468
+ T.Normalize(mean=mean, std=std)
469
+ ])
470
+ output += (transforms,)
471
+ return output[0] if len(output) == 1 else output
472
+
473
+
474
+ def clip_xlm_roberta_vit_h_14(
475
+ pretrained=False,
476
+ pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
477
+ **kwargs):
478
+ cfg = dict(
479
+ embed_dim=1024,
480
+ image_size=224,
481
+ patch_size=14,
482
+ vision_dim=1280,
483
+ vision_mlp_ratio=4,
484
+ vision_heads=16,
485
+ vision_layers=32,
486
+ vision_pool='token',
487
+ activation='gelu',
488
+ vocab_size=250002,
489
+ max_text_len=514,
490
+ type_size=1,
491
+ pad_id=1,
492
+ text_dim=1024,
493
+ text_heads=16,
494
+ text_layers=24,
495
+ text_post_norm=True,
496
+ text_dropout=0.1,
497
+ attn_dropout=0.0,
498
+ proj_dropout=0.0,
499
+ embedding_dropout=0.0)
500
+ cfg.update(**kwargs)
501
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
502
+
503
+
504
+ class CLIPModel:
505
+
506
+ def __init__(self, dtype, device, checkpoint_path, tokenizer_path):
507
+ self.dtype = dtype
508
+ self.device = device
509
+ self.checkpoint_path = checkpoint_path
510
+ self.tokenizer_path = tokenizer_path
511
+
512
+ # init model
513
+ self.model, self.transforms = clip_xlm_roberta_vit_h_14(
514
+ pretrained=False,
515
+ return_transforms=True,
516
+ return_tokenizer=False,
517
+ dtype=dtype,
518
+ device=device)
519
+ self.model = self.model.eval().requires_grad_(False)
520
+ logging.info(f'loading {checkpoint_path}')
521
+ self.model.load_state_dict(
522
+ torch.load(checkpoint_path, map_location='cpu'))
523
+
524
+ # init tokenizer
525
+ self.tokenizer = HuggingfaceTokenizer(
526
+ name=tokenizer_path,
527
+ seq_len=self.model.max_text_len - 2,
528
+ clean='whitespace')
529
+
530
+ def visual(self, videos):
531
+ # preprocess
532
+ size = (self.model.image_size,) * 2
533
+ videos = torch.cat([
534
+ F.interpolate(
535
+ u.transpose(0, 1),
536
+ size=size,
537
+ mode='bicubic',
538
+ align_corners=False) for u in videos
539
+ ])
540
+ videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
541
+
542
+ # forward
543
+ with torch.amp.autocast('cuda', dtype=self.dtype):
544
+ out = self.model.visual(videos, use_31_block=True)
545
+ return out
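A short usage sketch for the wrapper above (paths, tokenizer id and dtype are placeholders, not shipped defaults): visual() expects a list of [C, T, H, W] clips with values roughly in [-1, 1], resizes them to the CLIP resolution, re-normalizes with the CLIP mean/std and returns features from the penultimate transformer block.

    import torch

    clip = CLIPModel(
        dtype=torch.float16,
        device="cuda",
        checkpoint_path="checkpoints/clip_xlm_roberta_vit_h_14.pth",   # placeholder path
        tokenizer_path="xlm-roberta-large",                            # placeholder tokenizer id
    )

    video = torch.randn(3, 1, 480, 832, device="cuda").clamp_(-1, 1)   # one clip: [C, T, H, W]
    feats = clip.visual([video])
    print(feats.shape)   # [T, 257, 1280] for ViT-H/14 at 224x224 (1 cls token + 256 patches)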
ovi/modules/fusion.py ADDED
@@ -0,0 +1,324 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from ovi.modules.model import WanLayerNorm, WanModel, WanRMSNorm, gradient_checkpointing, rope_apply
5
+ from ovi.modules.attention import flash_attention
6
+ from ovi.distributed_comms.communications import all_gather, all_to_all_4D
7
+ from ovi.distributed_comms.parallel_states import nccl_info, get_sequence_parallel_state
8
+
9
+ class FusionModel(nn.Module):
10
+ def __init__(self, video_config=None, audio_config=None):
11
+ super().__init__()
12
+ has_video = True
13
+ has_audio = True
14
+ if video_config is not None:
15
+ self.video_model = WanModel(**video_config)
16
+ else:
17
+ has_video = False
18
+ self.video_model = None
19
+ print("Warning: No video model is provided!")
20
+
21
+ if audio_config is not None:
22
+ self.audio_model = WanModel(**audio_config)
23
+ else:
24
+ has_audio = False
25
+ self.audio_model = None
26
+ print("Warning: No audio model is provided!")
27
+
28
+ if has_video and has_audio:
29
+ assert len(self.video_model.blocks) == len(self.audio_model.blocks)
30
+ self.num_blocks = len(self.video_model.blocks)
31
+
32
+ self.use_sp = get_sequence_parallel_state()
33
+ if self.use_sp:
34
+ self.sp_size = nccl_info.sp_size
35
+ self.sp_rank = nccl_info.rank_within_group
36
+ self.inject_cross_attention_kv_projections()
37
+
38
+ self.init_weights()
39
+
40
+ def inject_cross_attention_kv_projections(self):
41
+ for vid_block in self.video_model.blocks:
42
+ vid_block.cross_attn.k_fusion = nn.Linear(vid_block.dim, vid_block.dim)
43
+ vid_block.cross_attn.v_fusion = nn.Linear(vid_block.dim, vid_block.dim)
44
+ vid_block.cross_attn.pre_attn_norm_fusion = WanLayerNorm(vid_block.dim, elementwise_affine=True)
45
+ vid_block.cross_attn.norm_k_fusion = WanRMSNorm(vid_block.dim, eps=1e-6) if vid_block.qk_norm else nn.Identity()
46
+
47
+
48
+ for audio_block in self.audio_model.blocks:
49
+ audio_block.cross_attn.k_fusion = nn.Linear(audio_block.dim, audio_block.dim)
50
+ audio_block.cross_attn.v_fusion = nn.Linear(audio_block.dim, audio_block.dim)
51
+ audio_block.cross_attn.pre_attn_norm_fusion = WanLayerNorm(audio_block.dim, elementwise_affine=True)
52
+ audio_block.cross_attn.norm_k_fusion = WanRMSNorm(audio_block.dim, eps=1e-6) if audio_block.qk_norm else nn.Identity()
53
+
54
+
55
+ def merge_kwargs(self, vid_kwargs, audio_kwargs):
56
+ """
57
+ keys in each kwarg:
58
+ e
59
+ seq_lens
60
+ grid_sizes
61
+ freqs
62
+ context
63
+ context_lens
64
+ """
65
+ merged_kwargs = {}
66
+ for key in vid_kwargs:
67
+ merged_kwargs[f"vid_{key}"] = vid_kwargs[key]
68
+ for key in audio_kwargs:
69
+ merged_kwargs[f"audio_{key}"] = audio_kwargs[key]
70
+ return merged_kwargs
71
+
72
+ def single_fusion_cross_attention_forward(self,
73
+ cross_attn_block,
74
+ src_seq,
75
+ src_grid_sizes,
76
+ src_freqs,
77
+ target_seq,
78
+ target_seq_lens,
79
+ target_grid_sizes,
80
+ target_freqs,
81
+ context,
82
+ context_lens
83
+ ):
84
+ b, n, d = src_seq.size(0), cross_attn_block.num_heads, cross_attn_block.head_dim
85
+ if hasattr(cross_attn_block, "k_img"):
86
+ ## means is i2v block
87
+ q, k, v, k_img, v_img = cross_attn_block.qkv_fn(src_seq, context)
88
+ else:
89
+ ## means is t2v block
90
+ q, k, v = cross_attn_block.qkv_fn(src_seq, context)
91
+ k_img = v_img = None
92
+
93
+
94
+ if self.use_sp:
95
+ q = all_to_all_4D(q, scatter_dim=2, gather_dim=1)
96
+ k = torch.chunk(k, self.sp_size, dim=2)[self.sp_rank]
97
+ v = torch.chunk(v, self.sp_size, dim=2)[self.sp_rank]
98
+ if k_img is not None:
99
+ k_img = torch.chunk(k_img, self.sp_size, dim=2)[self.sp_rank]
100
+ if v_img is not None:
101
+ v_img = torch.chunk(v_img, self.sp_size, dim=2)[self.sp_rank]
102
+
103
+ x = flash_attention(q, k, v, k_lens=context_lens)
104
+
105
+ if k_img is not None:
106
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
107
+ x = x + img_x
108
+
109
+ is_vid = src_grid_sizes.shape[1] > 1
110
+ # compute target attention
111
+ target_seq = cross_attn_block.pre_attn_norm_fusion(target_seq)
112
+ k_target = cross_attn_block.norm_k_fusion(cross_attn_block.k_fusion(target_seq)).view(b, -1, n, d)
113
+ v_target = cross_attn_block.v_fusion(target_seq).view(b, -1, n, d)
114
+ if self.use_sp:
115
+ k_target = all_to_all_4D(k_target, scatter_dim=2, gather_dim=1) # [B, L, H/P, C/H]
116
+ v_target = all_to_all_4D(v_target, scatter_dim=2, gather_dim=1) # [B, L, H/P, C/H]
117
+
118
+ q = rope_apply(q, src_grid_sizes, src_freqs)
119
+ k_target = rope_apply(k_target, target_grid_sizes, target_freqs)
120
+
121
+ target_x = flash_attention(q, k_target, v_target, k_lens=target_seq_lens)
122
+
123
+ x = x + target_x
124
+ if self.use_sp:
125
+ x = all_to_all_4D(x, scatter_dim=1, gather_dim=2) # [B, L/P, H, C/H]
126
+
127
+ x = x.flatten(2) # [B, L/P, C]
128
+
129
+ x = cross_attn_block.o(x)
130
+ return x
131
+
132
+ def single_fusion_cross_attention_ffn_forward(self,
133
+ attn_block,
134
+ src_seq,
135
+ src_grid_sizes,
136
+ src_freqs,
137
+ target_seq,
138
+ target_seq_lens,
139
+ target_grid_sizes,
140
+ target_freqs,
141
+ context,
142
+ context_lens,
143
+ src_e):
144
+
145
+ src_seq = src_seq + self.single_fusion_cross_attention_forward(attn_block.cross_attn,
146
+ attn_block.norm3(src_seq),
147
+ src_grid_sizes=src_grid_sizes,
148
+ src_freqs=src_freqs,
149
+ target_seq=target_seq,
150
+ target_seq_lens=target_seq_lens,
151
+ target_grid_sizes=target_grid_sizes,
152
+ target_freqs=target_freqs,
153
+ context=context,
154
+ context_lens=context_lens
155
+ )
156
+ y = attn_block.ffn(attn_block.norm2(src_seq).bfloat16() * (1 + src_e[4].squeeze(2)) + src_e[3].squeeze(2))
157
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
158
+ src_seq = src_seq + y * src_e[5].squeeze(2)
159
+ return src_seq
160
+
161
+ def single_fusion_block_forward(self,
162
+ vid_block,
163
+ audio_block,
164
+ vid,
165
+ audio,
166
+ vid_e,
167
+ vid_seq_lens,
168
+ vid_grid_sizes,
169
+ vid_freqs,
170
+ vid_context,
171
+ vid_context_lens,
172
+ audio_e,
173
+ audio_seq_lens,
174
+ audio_grid_sizes,
175
+ audio_freqs,
176
+ audio_context,
177
+ audio_context_lens
178
+ ):
179
+ ## audio modulation
180
+ assert audio_e.dtype == torch.bfloat16
181
+ assert len(audio_e.shape) == 4 and audio_e.size(2) == 6 and audio_e.shape[1] == audio.shape[1], f"{audio_e.shape}, {audio.shape}"
182
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
183
+ audio_e = audio_block.modulation(audio_e).chunk(6, dim=2)
184
+ assert audio_e[0].dtype == torch.bfloat16
185
+
186
+ # audio self-attention
187
+ audio_y = audio_block.self_attn(
188
+ audio_block.norm1(audio).bfloat16() * (1 + audio_e[1].squeeze(2)) + audio_e[0].squeeze(2), audio_seq_lens, audio_grid_sizes,
189
+ audio_freqs)
190
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
191
+ audio = audio + audio_y * audio_e[2].squeeze(2)
192
+
193
+ ## video modulation
194
+ assert len(vid_e.shape) == 4 and vid_e.size(2) == 6 and vid_e.shape[1] == vid.shape[1], f"{vid_e.shape}, {vid.shape}"
195
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
196
+ vid_e = vid_block.modulation(vid_e).chunk(6, dim=2)
197
+
198
+ # video self-attention
199
+ vid_y = vid_block.self_attn(
200
+ vid_block.norm1(vid).bfloat16() * (1 + vid_e[1].squeeze(2)) + vid_e[0].squeeze(2), vid_seq_lens, vid_grid_sizes,
201
+ vid_freqs)
202
+
203
+ with torch.amp.autocast('cuda', dtype=torch.bfloat16):
204
+ vid = vid + vid_y * vid_e[2].squeeze(2)
205
+
206
+ og_audio = audio
207
+
208
+ # audio cross-attention
209
+ audio = self.single_fusion_cross_attention_ffn_forward(
210
+ audio_block,
211
+ audio,
212
+ audio_grid_sizes,
213
+ audio_freqs,
214
+ vid,
215
+ vid_seq_lens,
216
+ vid_grid_sizes,
217
+ vid_freqs,
218
+ audio_context,
219
+ audio_context_lens,
220
+ audio_e
221
+ )
222
+
223
+ assert not torch.equal(og_audio, audio), "Audio should be changed after cross-attention!"
224
+
225
+ # video cross-attention
226
+ vid = self.single_fusion_cross_attention_ffn_forward(
227
+ vid_block,
228
+ vid,
229
+ vid_grid_sizes,
230
+ vid_freqs,
231
+ og_audio,
232
+ audio_seq_lens,
233
+ audio_grid_sizes,
234
+ audio_freqs,
235
+ vid_context,
236
+ vid_context_lens,
237
+ vid_e
238
+ )
239
+
240
+ return vid, audio
241
+
242
+ def forward(
243
+ self,
244
+ vid,
245
+ audio,
246
+ t,
247
+ vid_context,
248
+ audio_context,
249
+ vid_seq_len,
250
+ audio_seq_len,
251
+ clip_fea=None,
252
+ clip_fea_audio=None,
253
+ y=None,
254
+ first_frame_is_clean=False,
255
+ slg_layer=False
256
+ ):
257
+
258
+ assert clip_fea is None
259
+ assert y is None
260
+
261
+ if vid is None or all([x is None for x in vid]):
262
+ assert vid_context is None
263
+ assert vid_seq_len is None
264
+ assert self.audio_model is not None
265
+
266
+ return None, self.audio_model(x=audio, t=t, context=audio_context, seq_len=audio_seq_len, clip_fea=clip_fea_audio, y=None)
267
+
268
+ if audio is None or all([x is None for x in audio]):
269
+ assert clip_fea_audio is None
270
+ assert audio_context is None
271
+ assert audio_seq_len is None
272
+ assert self.video_model is not None
273
+
274
+ return self.video_model(x=vid, t=t, context=vid_context, seq_len=vid_seq_len, clip_fea=clip_fea, y=y, first_frame_is_clean=first_frame_is_clean), None
275
+
276
+ vid, vid_e, vid_kwargs = self.video_model.prepare_transformer_block_kwargs(
277
+ x=vid, t=t, context=vid_context, seq_len=vid_seq_len, clip_fea=clip_fea, y=y, first_frame_is_clean=first_frame_is_clean
278
+ )
279
+
280
+ audio, audio_e, audio_kwargs = self.audio_model.prepare_transformer_block_kwargs(
281
+ x=audio, t=t, context=audio_context, seq_len=audio_seq_len, clip_fea=clip_fea_audio, y=None, first_frame_is_clean=False
282
+ )
283
+
284
+ kwargs = self.merge_kwargs(vid_kwargs, audio_kwargs)
285
+
286
+ for i in range(self.num_blocks):
287
+ """
288
+ 1 fusion block refers to 1 audio block with 1 video block.
289
+ """
290
+ if slg_layer > 0 and i == slg_layer:
291
+ continue
292
+ vid_block = self.video_model.blocks[i]
293
+ audio_block = self.audio_model.blocks[i]
294
+ vid, audio = gradient_checkpointing(
295
+ enabled=(self.training and self.gradient_checkpointing),
296
+ module=self.single_fusion_block_forward,
297
+ vid_block=vid_block,
298
+ audio_block=audio_block,
299
+ vid=vid,
300
+ audio=audio,
301
+ **kwargs
302
+ )
303
+
304
+ vid = self.video_model.post_transformer_block_out(vid, vid_kwargs['grid_sizes'], vid_e)
305
+ audio = self.audio_model.post_transformer_block_out(audio, audio_kwargs['grid_sizes'], audio_e)
306
+
307
+ return vid, audio
308
+
309
+ def init_weights(self):
310
+ if self.audio_model is not None:
311
+ self.audio_model.init_weights()
312
+
313
+ if self.video_model is not None:
314
+ self.video_model.init_weights()
315
+
316
+ for name, mod in self.video_model.named_modules():
317
+ if "fusion" in name and isinstance(mod, nn.Linear):
318
+ with torch.no_grad():
319
+ mod.weight.div_(10.0)
320
+
321
+
322
+ def set_rope_params(self):
323
+ self.video_model.set_rope_params()
324
+ self.audio_model.set_rope_params()
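For readers skimming the fusion block, a self-contained toy sketch of the pattern it implements (simplified: standard multi-head attention, no RoPE, no sequence parallelism, made-up dimensions): after per-modality self-attention, audio queries read video tokens and video queries read the pre-update audio tokens, both as residual cross-attention.

    import torch
    import torch.nn as nn

    class ToyFusionCrossAttn(nn.Module):
        """One modality's tokens attend to the other modality's tokens (residual)."""
        def __init__(self, dim, num_heads):
            super().__init__()
            self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
            self.norm_q = nn.LayerNorm(dim)     # stands in for norm3
            self.norm_kv = nn.LayerNorm(dim)    # stands in for pre_attn_norm_fusion

        def forward(self, src, target):
            # src: [B, L_src, C], target: [B, L_tgt, C]
            kv = self.norm_kv(target)
            out, _ = self.attn(self.norm_q(src), kv, kv)
            return src + out

    B, La, Lv, C = 1, 40, 220, 512
    audio, video = torch.randn(B, La, C), torch.randn(B, Lv, C)
    a_from_v, v_from_a = ToyFusionCrossAttn(C, 8), ToyFusionCrossAttn(C, 8)
    audio_new = a_from_v(audio, video)   # audio attends to video tokens
    video_new = v_from_a(video, audio)   # video attends to the pre-update audio (og_audio above)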
ovi/modules/mmaudio/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # MMAudio package
ovi/modules/mmaudio/ext/__init__.py ADDED
@@ -0,0 +1 @@
1
+