kaiw7 committed on Oct 22, 2025

Commit

e490e7e

verified ·

1 Parent(s): 2c7d185

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +24 -0
.gitignore +218 -0
.pre-commit-config.yaml +31 -0
LICENSE +696 -0
README.md +184 -0
assets/demo/Fig7-JAVG/case1.mp4 +3 -0
assets/demo/Fig7-JAVG/case2.mp4 +3 -0
assets/demo/FigA11-X-Cond/A2V.mp4 +3 -0
assets/demo/FigA11-X-Cond/AI2V.mp4 +3 -0
assets/demo/FigA11-X-Cond/AV-Ext.mp4 +3 -0
assets/demo/FigA11-X-Cond/GT.mp4 +3 -0
assets/demo/FigA11-X-Cond/I2AV.mp4 +3 -0
assets/demo/FigA11-X-Cond/V2A.mp4 +3 -0
assets/demo/FigA9-JAVG/case1.mp4 +3 -0
assets/demo/FigA9-JAVG/case2.mp4 +3 -0
assets/demo/FigA9-JAVG/case3.mp4 +3 -0
assets/demo/FigA9-JAVG/case4.mp4 +3 -0
assets/demo/FigA9-JAVG/case5.mp4 +3 -0
assets/demo/FigA9-JAVG/case6.mp4 +3 -0
assets/demo/FigA9-JAVG/case7.mp4 +3 -0
assets/demo/audio_prompts.txt +1222 -0
assets/demo/prompts.txt +16 -0
assets/docs/data.md +193 -0
assets/image/JavisDiT-framework-resized.png +3 -0
assets/image/JavisDiT-intro-resized.png +3 -0
assets/image/logo.png +3 -0
assets/src/funasr_utils_load_utils.py +262 -0
assets/src/pytorchvideo_augmentations.py +481 -0
configs/dit/inference/16x256x256.py +31 -0
configs/dit/inference/1x256x256-class.py +31 -0
configs/dit/inference/1x256x256.py +32 -0
configs/dit/train/16x256x256.py +50 -0
configs/dit/train/1x256x256.py +51 -0
configs/javisdit-v0-1/inference/audio_sample.py +58 -0
configs/javisdit-v0-1/inference/sample.py +77 -0
configs/javisdit-v0-1/inference/sample_240p4s.py +77 -0
configs/javisdit-v0-1/misc/extract_st_prior_va.py +92 -0
configs/javisdit-v0-1/misc/extract_va.py +88 -0
configs/javisdit-v0-1/train/stage1_audio.py +113 -0
configs/javisdit-v0-1/train/stage2_prior.py +107 -0
configs/javisdit-v0-1/train/stage2_prior_feat.py +81 -0
configs/javisdit-v0-1/train/stage3_jav.py +152 -0
configs/javisdit-v0-1/train/stage3_jav_feat.py +130 -0
configs/latte/inference/16x256x256-class.py +30 -0
configs/latte/inference/16x256x256.py +31 -0
configs/latte/train/16x256x256.py +49 -0
configs/opensora-v1-1/inference/sample-ref.py +64 -0
configs/opensora-v1-1/inference/sample.py +44 -0
configs/opensora-v1-1/train/benchmark.py +102 -0
configs/opensora-v1-1/train/image.py +66 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/demo/Fig7-JAVG/case1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/Fig7-JAVG/case2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/A2V.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/AI2V.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/AV-Ext.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/GT.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/I2AV.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA11-X-Cond/V2A.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case4.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case5.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case6.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/demo/FigA9-JAVG/case7.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/image/JavisDiT-framework-resized.png filter=lfs diff=lfs merge=lfs -text
+assets/image/JavisDiT-intro-resized.png filter=lfs diff=lfs merge=lfs -text
+assets/image/logo.png filter=lfs diff=lfs merge=lfs -text
+eval/javisbench/src/ImageBind/.assets/bird_audio.wav filter=lfs diff=lfs merge=lfs -text
+eval/javisbench/src/ImageBind/.assets/bird_image.jpg filter=lfs diff=lfs merge=lfs -text
+eval/javisbench/src/ImageBind/.assets/car_audio.wav filter=lfs diff=lfs merge=lfs -text
+eval/javisbench/src/ImageBind/.assets/dog_audio.wav filter=lfs diff=lfs merge=lfs -text
+javisdit/models/Y-fUsuo90K0g.wav filter=lfs diff=lfs merge=lfs -text
+javisdit/models/out.wav filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,218 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+.vscode/
+# macos
+*.DS_Store
+# misc files
+data
+dataset/
+datasets
+!javisdit/datasets
+!tools/datasets
+runs
+checkpoints
+weights
+outputs
+ablation
+!configs/**/ablation/
+!scripts/**/ablation/
+exps
+samples
+logs
+pretrained_models
+evaluation_results/
+cache/
+*.swp
+debug/
+*/debug.py
+third_party/
+deprecated
+nohup.*
+tmp
+*.zip
+*.tar
+*.tar.gz
+run.sh
+interface.py
+# Secret files
+hostfile
+gradio_cached_examples/
+wandb/
+# vae weights
+eval/vae/flolpips/weights/
+# npm
+node_modules/
+package-lock.json
+package.json
+# PLLaVA
+tools/caption/pllava_dir/PLLaVA/
+# vbench
+vbench
+!eval/vbench
+vbench2_beta_i2v

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,31 @@

+repos:
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.2.1
+    hooks:
+      - id: autoflake
+        name: autoflake (python)
+        args: ['--in-place']
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: sort all imports (python)
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.9.1
+    hooks:
+      - id: black
+        name: black formatter
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-merge-conflict
+      - id: check-case-conflict
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: ['--fix=lf']

LICENSE ADDED Viewed

	@@ -0,0 +1,696 @@

+Copyright 2025. All rights reserved.
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2024 HPC-AI Technology Inc.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   =========================================================================
+   This project is inspired by the listed projects and is subject to the following licenses:
+   1. Latte (https://github.com/Vchitect/Latte/blob/main/LICENSE)
+   Copyright 2024 Latte
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   2. PixArt-alpha (https://github.com/PixArt-alpha/PixArt-alpha/blob/master/LICENSE)
+   Copyright (C) 2024 PixArt-alpha/PixArt-alpha
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Affero General Public License as published
+   by the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Affero General Public License for more details.
+   You should have received a copy of the GNU Affero General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+   3. dpm-solver (https://github.com/LuChengTHU/dpm-solver/blob/main/LICENSE)
+   MIT License
+   Copyright (c) 2022 Cheng Lu
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   4. DiT (https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt)
+   Attribution-NonCommercial 4.0 International
+   =======================================================================
+   Creative Commons Corporation ("Creative Commons") is not a law firm and
+   does not provide legal services or legal advice. Distribution of
+   Creative Commons public licenses does not create a lawyer-client or
+   other relationship. Creative Commons makes its licenses and related
+   information available on an "as-is" basis. Creative Commons gives no
+   warranties regarding its licenses, any material licensed under their
+   terms and conditions, or any related information. Creative Commons
+   disclaims all liability for damages resulting from their use to the
+   fullest extent possible.
+   Using Creative Commons Public Licenses
+   Creative Commons public licenses provide a standard set of terms and
+   conditions that creators and other rights holders may use to share
+   original works of authorship and other material subject to copyright
+   and certain other rights specified in the public license below. The
+   following considerations are for informational purposes only, are not
+   exhaustive, and do not form part of our licenses.
+      Considerations for licensors: Our public licenses are
+      intended for use by those authorized to give the public
+      permission to use material in ways otherwise restricted by
+      copyright and certain other rights. Our licenses are
+      irrevocable. Licensors should read and understand the terms
+      and conditions of the license they choose before applying it.
+      Licensors should also secure all rights necessary before
+      applying our licenses so that the public can reuse the
+      material as expected. Licensors should clearly mark any
+      material not subject to the license. This includes other CC-
+      licensed material, or material used under an exception or
+      limitation to copyright. More considerations for licensors:
+      wiki.creativecommons.org/Considerations_for_licensors
+      Considerations for the public: By using one of our public
+      licenses, a licensor grants the public permission to use the
+      licensed material under specified terms and conditions. If
+      the licensor's permission is not necessary for any reason--for
+      example, because of any applicable exception or limitation to
+      copyright--then that use is not regulated by the license. Our
+      licenses grant only permissions under copyright and certain
+      other rights that a licensor has authority to grant. Use of
+      the licensed material may still be restricted for other
+      reasons, including because others have copyright or other
+      rights in the material. A licensor may make special requests,
+      such as asking that all changes be marked or described.
+      Although not required by our licenses, you are encouraged to
+      respect those requests where reasonable. More considerations
+      for the public:
+      wiki.creativecommons.org/Considerations_for_licensees
+   =======================================================================
+   Creative Commons Attribution-NonCommercial 4.0 International Public
+   License
+   By exercising the Licensed Rights (defined below), You accept and agree
+   to be bound by the terms and conditions of this Creative Commons
+   Attribution-NonCommercial 4.0 International Public License ("Public
+   License"). To the extent this Public License may be interpreted as a
+   contract, You are granted the Licensed Rights in consideration of Your
+   acceptance of these terms and conditions, and the Licensor grants You
+   such rights in consideration of benefits the Licensor receives from
+   making the Licensed Material available under these terms and
+   conditions.
+   Section 1 -- Definitions.
+   a. Adapted Material means material subject to Copyright and Similar
+      Rights that is derived from or based upon the Licensed Material
+      and in which the Licensed Material is translated, altered,
+      arranged, transformed, or otherwise modified in a manner requiring
+      permission under the Copyright and Similar Rights held by the
+      Licensor. For purposes of this Public License, where the Licensed
+      Material is a musical work, performance, or sound recording,
+      Adapted Material is always produced where the Licensed Material is
+      synched in timed relation with a moving image.
+   b. Adapter's License means the license You apply to Your Copyright
+      and Similar Rights in Your contributions to Adapted Material in
+      accordance with the terms and conditions of this Public License.
+   c. Copyright and Similar Rights means copyright and/or similar rights
+      closely related to copyright including, without limitation,
+      performance, broadcast, sound recording, and Sui Generis Database
+      Rights, without regard to how the rights are labeled or
+      categorized. For purposes of this Public License, the rights
+      specified in Section 2(b)(1)-(2) are not Copyright and Similar
+      Rights.
+   d. Effective Technological Measures means those measures that, in the
+      absence of proper authority, may not be circumvented under laws
+      fulfilling obligations under Article 11 of the WIPO Copyright
+      Treaty adopted on December 20, 1996, and/or similar international
+      agreements.
+   e. Exceptions and Limitations means fair use, fair dealing, and/or
+      any other exception or limitation to Copyright and Similar Rights
+      that applies to Your use of the Licensed Material.
+   f. Licensed Material means the artistic or literary work, database,
+      or other material to which the Licensor applied this Public
+      License.
+   g. Licensed Rights means the rights granted to You subject to the
+      terms and conditions of this Public License, which are limited to
+      all Copyright and Similar Rights that apply to Your use of the
+      Licensed Material and that the Licensor has authority to license.
+   h. Licensor means the individual(s) or entity(ies) granting rights
+      under this Public License.
+   i. NonCommercial means not primarily intended for or directed towards
+      commercial advantage or monetary compensation. For purposes of
+      this Public License, the exchange of the Licensed Material for
+      other material subject to Copyright and Similar Rights by digital
+      file-sharing or similar means is NonCommercial provided there is
+      no payment of monetary compensation in connection with the
+      exchange.
+   j. Share means to provide material to the public by any means or
+      process that requires permission under the Licensed Rights, such
+      as reproduction, public display, public performance, distribution,
+      dissemination, communication, or importation, and to make material
+      available to the public including in ways that members of the
+      public may access the material from a place and at a time
+      individually chosen by them.
+   k. Sui Generis Database Rights means rights other than copyright
+      resulting from Directive 96/9/EC of the European Parliament and of
+      the Council of 11 March 1996 on the legal protection of databases,
+      as amended and/or succeeded, as well as other essentially
+      equivalent rights anywhere in the world.
+   l. You means the individual or entity exercising the Licensed Rights
+      under this Public License. Your has a corresponding meaning.
+   Section 2 -- Scope.
+   a. License grant.
+         1. Subject to the terms and conditions of this Public License,
+            the Licensor hereby grants You a worldwide, royalty-free,
+            non-sublicensable, non-exclusive, irrevocable license to
+            exercise the Licensed Rights in the Licensed Material to:
+               a. reproduce and Share the Licensed Material, in whole or
+                  in part, for NonCommercial purposes only; and
+               b. produce, reproduce, and Share Adapted Material for
+                  NonCommercial purposes only.
+         2. Exceptions and Limitations. For the avoidance of doubt, where
+            Exceptions and Limitations apply to Your use, this Public
+            License does not apply, and You do not need to comply with
+            its terms and conditions.
+         3. Term. The term of this Public License is specified in Section
+            6(a).
+         4. Media and formats; technical modifications allowed. The
+            Licensor authorizes You to exercise the Licensed Rights in
+            all media and formats whether now known or hereafter created,
+            and to make technical modifications necessary to do so. The
+            Licensor waives and/or agrees not to assert any right or
+            authority to forbid You from making technical modifications
+            necessary to exercise the Licensed Rights, including
+            technical modifications necessary to circumvent Effective
+            Technological Measures. For purposes of this Public License,
+            simply making modifications authorized by this Section 2(a)
+            (4) never produces Adapted Material.
+         5. Downstream recipients.
+               a. Offer from the Licensor -- Licensed Material. Every
+                  recipient of the Licensed Material automatically
+                  receives an offer from the Licensor to exercise the
+                  Licensed Rights under the terms and conditions of this
+                  Public License.
+               b. No downstream restrictions. You may not offer or impose
+                  any additional or different terms or conditions on, or
+                  apply any Effective Technological Measures to, the
+                  Licensed Material if doing so restricts exercise of the
+                  Licensed Rights by any recipient of the Licensed
+                  Material.
+         6. No endorsement. Nothing in this Public License constitutes or
+            may be construed as permission to assert or imply that You
+            are, or that Your use of the Licensed Material is, connected
+            with, or sponsored, endorsed, or granted official status by,
+            the Licensor or others designated to receive attribution as
+            provided in Section 3(a)(1)(A)(i).
+   b. Other rights.
+         1. Moral rights, such as the right of integrity, are not
+            licensed under this Public License, nor are publicity,
+            privacy, and/or other similar personality rights; however, to
+            the extent possible, the Licensor waives and/or agrees not to
+            assert any such rights held by the Licensor to the limited
+            extent necessary to allow You to exercise the Licensed
+            Rights, but not otherwise.
+         2. Patent and trademark rights are not licensed under this
+            Public License.
+         3. To the extent possible, the Licensor waives any right to
+            collect royalties from You for the exercise of the Licensed
+            Rights, whether directly or through a collecting society
+            under any voluntary or waivable statutory or compulsory
+            licensing scheme. In all other cases the Licensor expressly
+            reserves any right to collect such royalties, including when
+            the Licensed Material is used other than for NonCommercial
+            purposes.
+   Section 3 -- License Conditions.
+   Your exercise of the Licensed Rights is expressly made subject to the
+   following conditions.
+   a. Attribution.
+         1. If You Share the Licensed Material (including in modified
+            form), You must:
+               a. retain the following if it is supplied by the Licensor
+                  with the Licensed Material:
+                  i. identification of the creator(s) of the Licensed
+                     Material and any others designated to receive
+                     attribution, in any reasonable manner requested by
+                     the Licensor (including by pseudonym if
+                     designated);
+                  ii. a copyright notice;
+                  iii. a notice that refers to this Public License;
+                  iv. a notice that refers to the disclaimer of
+                     warranties;
+                  v. a URI or hyperlink to the Licensed Material to the
+                     extent reasonably practicable;
+               b. indicate if You modified the Licensed Material and
+                  retain an indication of any previous modifications; and
+               c. indicate the Licensed Material is licensed under this
+                  Public License, and include the text of, or the URI or
+                  hyperlink to, this Public License.
+         2. You may satisfy the conditions in Section 3(a)(1) in any
+            reasonable manner based on the medium, means, and context in
+            which You Share the Licensed Material. For example, it may be
+            reasonable to satisfy the conditions by providing a URI or
+            hyperlink to a resource that includes the required
+            information.
+         3. If requested by the Licensor, You must remove any of the
+            information required by Section 3(a)(1)(A) to the extent
+            reasonably practicable.
+         4. If You Share Adapted Material You produce, the Adapter's
+            License You apply must not prevent recipients of the Adapted
+            Material from complying with this Public License.
+   Section 4 -- Sui Generis Database Rights.
+   Where the Licensed Rights include Sui Generis Database Rights that
+   apply to Your use of the Licensed Material:
+   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+      to extract, reuse, reproduce, and Share all or a substantial
+      portion of the contents of the database for NonCommercial purposes
+      only;
+   b. if You include all or a substantial portion of the database
+      contents in a database in which You have Sui Generis Database
+      Rights, then the database in which You have Sui Generis Database
+      Rights (but not its individual contents) is Adapted Material; and
+   c. You must comply with the conditions in Section 3(a) if You Share
+      all or a substantial portion of the contents of the database.
+   For the avoidance of doubt, this Section 4 supplements and does not
+   replace Your obligations under this Public License where the Licensed
+   Rights include other Copyright and Similar Rights.
+   Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+   c. The disclaimer of warranties and limitation of liability provided
+      above shall be interpreted in a manner that, to the extent
+      possible, most closely approximates an absolute disclaimer and
+      waiver of all liability.
+   Section 6 -- Term and Termination.
+   a. This Public License applies for the term of the Copyright and
+      Similar Rights licensed here. However, if You fail to comply with
+      this Public License, then Your rights under this Public License
+      terminate automatically.
+   b. Where Your right to use the Licensed Material has terminated under
+      Section 6(a), it reinstates:
+         1. automatically as of the date the violation is cured, provided
+            it is cured within 30 days of Your discovery of the
+            violation; or
+         2. upon express reinstatement by the Licensor.
+      For the avoidance of doubt, this Section 6(b) does not affect any
+      right the Licensor may have to seek remedies for Your violations
+      of this Public License.
+   c. For the avoidance of doubt, the Licensor may also offer the
+      Licensed Material under separate terms or conditions or stop
+      distributing the Licensed Material at any time; however, doing so
+      will not terminate this Public License.
+   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+      License.
+   Section 7 -- Other Terms and Conditions.
+   a. The Licensor shall not be bound by any additional or different
+      terms or conditions communicated by You unless expressly agreed.
+   b. Any arrangements, understandings, or agreements regarding the
+      Licensed Material not stated herein are separate from and
+      independent of the terms and conditions of this Public License.
+   Section 8 -- Interpretation.
+   a. For the avoidance of doubt, this Public License does not, and
+      shall not be interpreted to, reduce, limit, restrict, or impose
+      conditions on any use of the Licensed Material that could lawfully
+      be made without permission under this Public License.
+   b. To the extent possible, if any provision of this Public License is
+      deemed unenforceable, it shall be automatically reformed to the
+      minimum extent necessary to make it enforceable. If the provision
+      cannot be reformed, it shall be severed from this Public License
+      without affecting the enforceability of the remaining terms and
+      conditions.
+   c. No term or condition of this Public License will be waived and no
+      failure to comply consented to unless expressly agreed to by the
+      Licensor.
+   d. Nothing in this Public License constitutes or may be interpreted
+      as a limitation upon, or waiver of, any privileges and immunities
+      that apply to the Licensor or You, including from the legal
+      processes of any jurisdiction or authority.
+   =======================================================================
+   Creative Commons is not a party to its public
+   licenses. Notwithstanding, Creative Commons may elect to apply one of
+   its public licenses to material it publishes and in those instances
+   will be considered the “Licensor.” The text of the Creative Commons
+   public licenses is dedicated to the public domain under the CC0 Public
+   Domain Dedication. Except for the limited purpose of indicating that
+   material is shared under a Creative Commons public license or as
+   otherwise permitted by the Creative Commons policies published at
+   creativecommons.org/policies, Creative Commons does not authorize the
+   use of the trademark "Creative Commons" or any other trademark or logo
+   of Creative Commons without its prior written consent including,
+   without limitation, in connection with any unauthorized modifications
+   to any of its public licenses or any other arrangements,
+   understandings, or agreements concerning use of licensed material. For
+   the avoidance of doubt, this paragraph does not form part of the
+   public licenses.
+   Creative Commons may be contacted at creativecommons.org.
+   5. OpenDiT (https://github.com/NUS-HPC-AI-Lab/OpenDiT/blob/master/LICENSE)
+   Copyright OpenDiT
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+   6. Open-Sora (https://github.com/hpcaitech/Open-Sora/blob/main/LICENSE)
+   Copyright Open-Sora
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,184 @@

+# Improved Quality, Synchrony, and Preference Alignment for Joint Audio-Video Generation
+This codebase is built upon [JavisDiT](https://github.com/JavisDiT/JavisDiT). Many thanks to the authors for their contribution.
+## Installation
+For CUDA 12.1, you can install the dependencies with the following commands.
+```bash
+# create a virtual env and activate (conda as an example)
+conda create -n javisdit python=3.10
+conda activate javisdit
+# install torch, torchvision and xformers
+pip install -r requirements/requirements-cu121.txt
+# install ffmpeg
+conda install "ffmpeg<7" -c conda-forge -y
+# the default installation is for inference only
+pip install -v .
+# for development mode, `pip install -v -e .`
+# to skip dependencies, `pip install -v -e . --no-deps`
+# replace the installed pytorchvideo and funasr source files with the patched copies from assets/src
+export PYTHON_SITE_PACKAGES=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
+cp assets/src/pytorchvideo_augmentations.py ${PYTHON_SITE_PACKAGES}/pytorchvideo/transforms/augmentations.py
+cp assets/src/funasr_utils_load_utils.py ${PYTHON_SITE_PACKAGES}/funasr/utils/load_utils.py
+# (optional but recommended) install flash attention
+# set enable_flash_attn=False in config to disable flash attention
+pip install packaging ninja
+pip install flash-attn --no-build-isolation
+```
+## Training
+### Data Preparation
+In this project, we use a `.csv` file to manage all the training entries and their attributes for efficient training:
+| path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text|
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---|
+| /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy |
+The content of the columns may vary across training stages. Detailed instructions for each training stage can be found [here](assets/docs/data.md).
+### Stage1 - Audio Pre-Train
+In this stage, we perform audio pretraining to initialize the text-to-audio generation capability:
+```bash
+torchrun --standalone --nproc_per_node 8 \
+    scripts/train.py \
+    configs/wan2.1/train/stage1_audio.py \
+    --data-path data/meta/audio/train_audio.csv
+```
+The resulting checkpoints will be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc/model`. You can move the checkpoints to `exps/audio_pretrain/` for later use.
+```bash
+mkdir -p exps/audio_pretrain
+mv runs/000-Wan2_1_T2V_1_3B/epoch049-global_step53000 exps/audio_pretrain/
+```
+### Stage2 - Audio-Video SFT
+In this stage, we perform finetuning for joint audio-video generation (with LoRA adaptation):
+```bash
+torchrun --standalone --nproc_per_node 8 \
+    scripts/train_prior.py \
+    configs/wan2.1/train/stage2_audio_video.py \
+    --data-path data/meta/video/train_av_sft.csv
+```
+The resulting checkpoints will be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc` with the `model` and `lora` subfolders. You can move the checkpoints to `exps/audio_video_sft/` for later use.
+```bash
+mkdir -p exps/audio_video_sft
+mv runs/000-Wan2_1_T2V_1_3B/epoch001-global_step13000 exps/audio_video_sft/
+```
+### Stage3 - Audio-Video DPO
+In this stage, we perform DPO to align joint audio-video generation with human preference (reuse and update the LoRA parameters learned from the previous stage):
+```bash
+torchrun --standalone --nproc_per_node 8 \
+    scripts/train.py \
+    configs/wan2.1/train/stage3_audio_video_dpo.py \
+    --data-path /data/meta/avdpo/train_av_dpo.csv
+```
+The resulting checkpoints will also be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc` with the `model` and `lora` subfolders. You can move the checkpoints to `checkpoints/` for inference and evaluation.
+```bash
+mv runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc checkpoints/your_model
+```
+## Inference
+The basic command line inference is as follows:
+```bash
+resolution=480p # or 240p
+num_frames=65  # 4s
+aspect_ratio="9:16"
+DATASET="JavisBench"  # or JavisBench-mini
+prompt_path="data/eval/JavisBench/${DATASET}.csv"
+save_dir="samples/${DATASET}"
+model_path="checkpoints/your_model"
+ngpus=1
+torchrun --standalone --nproc_per_node ${ngpus} \
+    scripts/inference.py \
+    configs/wan2.1/inference/sample.py \
+    --resolution ${resolution} --num-frames ${num_frames} --aspect-ratio ${aspect_ratio} \
+    --prompt-path ${prompt_path} --model-path ${model_path} \
+    --save-dir ${save_dir} --verbose 1
+# (Optional, for evaluation) Extract audios from generated videos
+python -m tools.datasets.convert video ${save_dir} --output ${save_dir}/meta.csv
+python -m tools.datasets.datautil ${save_dir}/meta.csv --extract-audio --audio-sr 16000
+rm -f ${save_dir}/meta*.csv
+```
+Setting `--verbose 2` will display the progress of a single diffusion process. You can also replace `--prompt-path ${prompt_path}` with a single prompt to generate a single video, e.g. `--prompt "a beautiful waterfall"`.
+## Evaluation
+### Installation
+Install necessary packages:
+```bash
+pip install -r requirements/requirements-eval.txt
+```
+Download the meta file and data of [JavisBench](https://huggingface.co/datasets/JavisDiT/JavisBench), and put them into `data/eval/`:
+```bash
+cd /path/to/JavisDiT
+mkdir -p data/eval
+huggingface-cli download --repo-type dataset JavisDiT/JavisBench --local-dir data/eval/JavisBench
+```
+### Evaluation on JavisBench or JavisBench-mini
+Run the following code and the results will be saved in `./evaluation_results`. For details, please refer to the [JavisBench README](eval/javisbench/README.md).
+```bash
+MAX_FRAMES=16
+IMAGE_SIZE=224
+MAX_AUDIO_LEN_S=4.0
+# Params to calculate JavisScore
+WINDOW_SIZE_S=2.0
+WINDOW_OVERLAP_S=1.5
+METRICS="all"
+RESULTS_DIR="./evaluation_results"
+DATASET="JavisBench"  # or JavisBench-mini
+INPUT_FILE="data/eval/JavisBench/${DATASET}.csv"
+FVD_AVCACHE_PATH="data/eval/JavisBench/cache/fvd_fad/${DATASET}-vanilla-max4s.pt"
+INFER_DATA_DIR="samples/${DATASET}"
+python -m eval.javisbench.main \
+  --input_file "${INPUT_FILE}" \
+  --infer_data_dir "${INFER_DATA_DIR}" \
+  --output_file "${RESULTS_DIR}/${DATASET}.json" \
+  --max_frames ${MAX_FRAMES} \
+  --image_size ${IMAGE_SIZE} \
+  --max_audio_len_s ${MAX_AUDIO_LEN_S} \
+  --window_size_s ${WINDOW_SIZE_S} \
+  --window_overlap_s ${WINDOW_OVERLAP_S} \
+  --fvd_avcache_path ${FVD_AVCACHE_PATH} \
+  --metrics ${METRICS}
+```

assets/demo/Fig7-JAVG/case1.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98a50cd8a5395c8961df05e51f3eb13eb4ee01ba9d56d949208083c52b0f8c79
+size 280678

assets/demo/Fig7-JAVG/case2.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc657fb7679449e2b44dcc277b001d87114e51fa8665ae782af062827ba6d0a4
+size 286608

assets/demo/FigA11-X-Cond/A2V.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9edc892ccf4f050d047c91dc25a11400b2be5628369d3659efb4a5db019a49bc
+size 553560

assets/demo/FigA11-X-Cond/AI2V.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91c9c88664c640860c90dc47b7ee44fdff5fe08255de0e6f349a3be78af7bd8c
+size 564407

assets/demo/FigA11-X-Cond/AV-Ext.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60f1781b846859f0c7878f3f541dacb3d83c27d48d23753d8b058a39c314acf3
+size 598493

assets/demo/FigA11-X-Cond/GT.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90b0ffb6de655e2fb1457011bb26a150d6a1472e29313b58bd2ab21b50047f2e
+size 289664

assets/demo/FigA11-X-Cond/I2AV.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01b65d9b301389e2691efc054cc7e66dad0fd5d56dce15c0077ff0b8baff5f84
+size 565559

assets/demo/FigA11-X-Cond/V2A.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc5d55a5d211295314a932e4007b8fe9505bc417c33921c4103e37b6e8a09863
+size 567901

assets/demo/FigA9-JAVG/case1.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ccf5f2c3100a945c84f4f035cc26fc71cb582f824f5b0eee2c3254b08bd7a653
+size 599929

assets/demo/FigA9-JAVG/case2.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d99920708093b21c0a2e8ca51fb2e4bcede4818bf4c6267f22e1c0775960842d
+size 562301

assets/demo/FigA9-JAVG/case3.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68d95f9577d1162b5d2c23e9e4f5e3862be1afe1c8a7f0b3a18f3ee274f6438f
+size 569684

assets/demo/FigA9-JAVG/case4.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02cdb118dcc68b6cbe5fef868c8586677d6c50e9c7607c3d4b8d8f651a5e3d49
+size 559562

assets/demo/FigA9-JAVG/case5.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d75a37d17261386226b64b4b856bdbd70b6ba4a2561fee4fbbbcab9fe1fa35d5
+size 537903

assets/demo/FigA9-JAVG/case6.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ccae34cbd657a06a690b550304949322c4975bdb99f742eb74b40e2971048399
+size 579138

assets/demo/FigA9-JAVG/case7.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e534daf85a87b4448036077e19593ebb76aae07623ae3fd307db1e71a3c03dfc
+size 558740

assets/demo/audio_prompts.txt ADDED Viewed

	@@ -0,0 +1,1222 @@

+People are clicking, breathing, and speaking.
+Sound effects, music, and human sounds are heard.
+Laughter and conversation are heard, speech synthesizer sounds are heard, and breathing is heard.
+Whistling and wind noise is present.
+Mechanisms make beeping and tapping sounds.
+A woman speaks near running water, a man and a child speak, and music plays.
+A car alarm goes off and a car drives by with the sound of wind.
+Thunks and background noise can be heard.
+A turkey makes sounds and mechanisms tick.
+A ringtone is ringing repeatedly.
+A sine wave plays.
+Background noise, bird songs, surface contact, and the sound of a clock are heard.
+Male and female speech, television, and crowd sounds are heard, along with buzzers.
+Cats make noise in a noisy background.
+An engine hums in the background.
+People are talking and a door is slamming and knocking while music is playing.
+A bicycle is moving and there is background noise.
+Sound effects are occurring.
+Men sing with music.
+Farts, background noise, sound effects, and men speaking with chirping birds in the background.
+People are cooing, making noise, and speaking while background noise persists.
+A rooster is crowing and mechanisms are functioning.
+People are coughing and making sounds in the background.
+Various surface contacts and ticks are heard in the background.
+Men speak and breathe next to music and each other.
+Whispering and human sounds come from mechanisms.
+Women are slurping and speaking over background noise.
+A truck horn blares.
+Background noise and people talking, drinking, and eating can be heard.
+Clicking and tearing mechanisms can be heard.
+A dial tone is heard.
+People are chewing and making noise in the background, with sounds of glass clinking and whispering.
+People are speaking and background noise is present.
+Background noise, breaking sounds, and conversation can be heard as men speak and laugh.
+A phone rings with background noise.
+A car is revving and auto racing.
+The sound of horse hooves and mechanisms can be heard with wind noise in the background.
+Women are speaking and scraping sounds are present.
+Background noise and a doorbell ringing are heard.
+A stomach rumbles with background noise.
+A sliding door is opened and closed with ticking mechanisms heard.
+Sound effects are heard.
+A static noise is heard with a sine wave in the background and music is playing.
+Glass sounds, a ringtone, tapping, speech synthesis, laughter, and ticking are heard.
+A camera clicks with male speech and mechanical sounds.
+A sine wave plays.
+Men are speaking with background noise.
+Music is playing.
+A heartbeat and background noise can be heard.
+Pigs are oinking, music is playing, and a woman is speaking.
+Mechanisms and an alarm are heard, with music playing.
+Goats are bleating and a clock is ticking.
+A heavy engine and barking can be heard, with laughter and more barking in the background.
+People are playing a video game and chatting with sound effects and footsteps.
+Men are speaking with background noise.
+Breathing and mechanisms make ticking and pattering sounds.
+Chewing and surface contact sounds are heard.
+Background noise, laughter, and ticking sounds are heard.
+A sound effect is heard.
+Snoring and breathing sounds can be heard in the background.
+Dial tones, background noise, ticking, and a man speaking can be heard.
+The sound of a woman speaking is heard over background noise and caterwauling.
+A man speaks over a plopping sound effect.
+Mechanisms make sounds along with gushing water and roaring big cats.
+Music is played, with a tick and a sheep bleating.
+A ding is heard.
+A single sound effect is heard.
+Men are speaking and a road vehicle is heard with human voices.
+There is a buzz, wind, and bird songs, followed by ticking sounds.
+Clicking and typing on a computer keyboard, breathing, and a man speaking are heard.
+Music and wind play together.
+Cutting sounds, birds chirping, and rustling can be heard.
+A trickle sound and frogs are croaking.
+A bird calls, followed by a fart and background noise.
+Heart sounds are heard in a repeating pattern.
+A sine wave is heard with speech synthesizers.
+A sound effect is heard.
+Music, a tick, and a sound effect are heard.
+A man speaks as farts and laughter are heard in the background with wind noise.
+A woman walks and speaks while sound effects and mechanical noises can be heard.
+A boing, music, and female singing are heard.
+Someone speaks with surface contact and shuffling card sounds.
+A sound effect occurs.
+Mechanisms and laughter mix with wind, running water and splashes.
+Surface contact occurs with whacks and thwacks as people laugh.
+A woman is speaking and heart sounds and clicking are heard in the background.
+Wind noise is heard with a buzz.
+Cats purr with background noise.
+A bus is heard, air brakes are activated and birds chirp.
+Background noise, bird calls, and surface contact sounds are heard.
+A doorbell, man speaking, music, and surface sounds are heard.
+A doorbell rings.
+Birds are singing with mechanisms and ticking in the background.
+Beeps and male speech are heard.
+Wind, mechanisms, horse sounds, bird sounds, and wind noise can be heard.
+The sound of a waterfall.
+Music and background noise are heard.
+Machinery runs, scrapes, ticks, and taps occur.
+An explosion and shouting are heard, with background noise and people talking.
+Speech synthesizers, human voices, and plops are heard.
+A dog is barking and mechanisms are heard.
+Howls occur among mechanisms.
+Tapping sounds are interspersed with whispers.
+A jet engine is heard.
+Sound effects and machine gun sounds.
+Heartbeats are heard, with brief tones and background noise.
+Various ticks and mechanisms can be heard.
+Radio, telephone dialing, and a man speaking are heard with background noise.
+Background noise and sound effects are present.
+Laughter, man's voice and human sounds can be heard in the background noise.
+Only music is heard.
+Sound effects and background noise are heard.
+Various sound effects are heard, including clicking, whooshing, and rumbling.
+Laughter and a thunk occur amidst background noise and sound effects.
+Background noise, smoke alarms, and a woman speaking can be heard.
+There is reverberation and a bell sound with surface contact.
+A man is speaking and zipping up with background noise.
+Wind, vehicle, sirens, and crunching sounds.
+A purring sound is heard with background noise.
+A microwave beeps and a door slams in the background noise.
+A camera is clicking with background noise and mechanisms.
+People are chewing, whispering, and breathing.
+Surface contact sounds and a man's speech are heard in the background noise.
+Music is playing, there are ticking sounds and sound effects.
+Heartbeats sound repeatedly.
+Sound effects, video game sounds, and human voices are heard.
+Background noise and clicking sounds can be heard.
+A sound effect, a man speaking, and an explosion occur.
+A dog barks and yips amidst background noise and human sounds.
+A person is typing on a computer keyboard with background noise.
+Sound effects and plops mix with camera clicks and other mechanisms.
+A sine wave is heard.
+A sine wave can be heard.
+Music is playing and an electronic tuner is being used.
+A man speaks over wind noise, roosters crowing, and ticks.
+Crunching sounds and background noise are heard with music playing.
+People are chewing and making noise.
+A bus is honking its horn.
+A cat is purring and there is background noise.
+Mechanisms are ticking.
+Background noise and breathing are heard.
+Sine waves, busy signals, dialing, sound effects, and other sourceless sounds are heard.
+Gunfire, a man speaking, and gasping are heard, followed by video game sounds.
+A single, isolated sound effect is played.
+Hooves clip-clop amidst background noise.
+A sink is filling or washing, and water is heard in the background noise.
+Chewing and biting sounds mix with ticking and surface contact noises.
+A man is speaking while a camera clicks in the background.
+A sound effect is played.
+Wind is blowing and animals can be heard.
+A sine wave plays.
+A motor vehicle is moving on the road, a man is speaking, and there are sounds of a ticking, tapping, and an owl.
+Background noise, ticking sounds, sneezing, breathing, and a woman speaking are heard.
+Crinkling and rustling sounds with background noise.
+Male speech is heard over surface contact and background noise.
+A person is walking and playing a video game with whacks and human voices.
+A bell tolls and wind blows as a sheep bleats and a child speaks.
+A heartbeat can be heard with background noise and ticking sounds.
+Wind, coughing, car honking, speech, and female speech can be heard with wind noise picked up by the microphone.
+A human voice speaks during a thunderstorm with background noise and clicking sounds.
+A man is speaking over background noise and clicking sounds.
+Mechanical and breathing sounds mix with surface contact and whispering.
+A water tap drips in the background, a man speaks, cutlery clinks, and a scraping sound is heard.
+Wind, bleating goats, bird songs, and buzzing sounds mix.
+Chirping of birds is heard.
+Whispering, crumpling, and breathing sounds are heard.
+A mouse clicks and surfaces are contacted with background noise.
+Ticking sounds and surface contact are occurring repeatedly.
+There are various sounds including water, video game effects, and human voice.
+A man speaks with background noise and a sound effect is heard.
+Firecrackers and wind can be heard with sound effects.
+Mechanisms operate with tapping and surface contact.
+A man speaks and sighs as a clock ticks.
+A man speaks as a mechanical fan runs and clicking sounds are heard in the background.
+Ticking and firecracker sounds are heard in the background.
+Music is playing with taps in the background.
+A heartbeat is heard with background noise.
+A hum is heard followed by heartbeats and sounds of writing and plopping.
+Wind, cars, and various objects are making noise and moving.
+Background noise and animals (crickets, animals growling) are heard.
+People are laughing and speaking over gurgling water and breathing sounds.
+Keypress tones, background noise, a cash register, and a ding can be heard.
+Music, men speaking, and artillery fire can be heard.
+A man is speaking, farting, and tapping with mechanisms.
+A mix of speech, sound effects, and music.
+A whooshing sound, surface contact, and an explosion are heard.
+Background noise and cooing sounds are heard.
+Gunshots and clangs are heard among brief tones.
+Breaking sounds are accompanied by footsteps and mechanisms.
+People are making sounds, with a cat meowing and background noise.
+Mechanisms, surface contact, pouring, tapping, breathing, and more pouring and tapping can be heard.
+A firecracker is heard, followed by footsteps and background noise.
+Background noise is heard, with various ringtones and keypress tones.
+Mechanisms with human sounds, ticks, taps, and female speech.
+Human voices speaking with background noise, surface contact, and male speech are heard.
+Water flows and mechanisms make ticking sounds.
+An alarm is ringing.
+Cards shuffle, mechanisms sound, and breathing is heard.
+Mechanisms, ticking, and beeping sounds are heard.
+A man speaks and clicks are heard, followed by surface contact and more clicking.
+Mechanisms, tapping, and ticking sounds are heard.
+Music and background noise are heard.
+A man is speaking while a crowd is making noise.
+A whip is heard in the wind, along with the sound of glass clinking and human voices.
+Sound effects, dings, and speech synthesizer sounds are heard.
+Whistling, sound effects, and mechanisms are heard.
+Music plays over noise.
+Background noise and purring are heard.
+Mechanisms and cawing birds make sounds while glass is heard.
+Sheep bleat and wind noise can be heard.
+A man speaks, followed by echo, background noise, and brief tones.
+Surface contact, ticking, and male speech are heard over background noise.
+An aircraft engine is heard.
+A tuning fork, ticking, and scraping noises create a busy environment.
+Scissors are heard, followed by a coin dropping, beeps, music, video game sounds, and sound effects.
+Sound effects, breaking, background noise, whispering, and human sounds are present.
+An aircraft engine is heard.
+A sound effect plays with background noise.
+A cat purrs as something ticks and birds sing.
+A busy signal is heard.
+Animals and wind noise with laughter, bleating, and whispering.
+A sound effect is heard.
+Something is being poured with background noise.
+A phone dial tone is followed by keypress tones and a sound effect.
+Coin dropping, ticking, and mechanism sounds are heard.
+Pigs and music are heard.
+Liquid is heard, followed by footsteps and video game sounds.
+Music plays and a man speaks, followed by a bell ringing.
+Mechanisms can be heard, a cat meows, a woman speaks, and breathing sounds can be heard.
+A woman is speaking while birds chirp and ticks are heard.
+A fart is heard, speech synthesizers are speaking, and laughter is heard.
+Horns honk repeatedly from a mid-frequency engine.
+Walking sounds and whistling with some human voice and sound effects.
+The wind blows, with ticking, wind noise, breathing, laughter, and human sounds heard.
+Bursting and plopping sounds, and sound effects are heard.
+Beeps, camera sounds, tapping, and surface contact can be heard.
+A brief tone is heard.
+People are tapping and making surface contact in a noisy environment.
+Background noise and barking dogs are heard with bird vocalizations and ticking sounds.
+A man is speaking, breathing, making surface contact sounds and crumpling papers.
+Background noise, walking, splashing, and gurgling are heard.
+Music and sound effects are played.
+Child and adult speech is heard amid background noise and ticking.
+Background noise, sound effects, and speech synthesizer.
+A hammer is being used and making tapping noises.
+Mechanical sounds alternate with patter and mouse sounds, and men speak intermittently.
+Men are speaking, with a hair dryer running in the background.
+An electric shaver ticks.
+Human voices and snoring can be heard.
+Women and a man are speaking, a pig is heard, and people are breathing and making contact sounds.
+A woman is speaking, people are talking and kids are playing in the background while water can be heard and occasionally someone takes a breath.
+A single ping sound is heard.
+Background noise and camera sounds are heard.
+Sonar beeps in the background.
+A bicycle bell rings.
+Something ticks while making surface contact.
+Heartbeats are heard over background noise and water, with occasional music.
+Chewing and animal sounds are heard with a tap and thump.
+A sine wave is playing.
+Beeps are heard repetitively.
+An air horn is heard, and there is background noise with tapping sounds.
+A man burps, makes human sounds, speaks and breathes with mechanisms and surface contact noises.
+Growling and breathing sounds are heard.
+A man speaks with background noise while birds sing.
+Wind blows, birds are chirping, mechanisms are ticking, and ticking sounds are heard.
+Music, ding sounds, and typing on a typewriter.
+Music is playing with background noise.
+A man speaks while various mechanisms and surface contacts occur, and breathing is heard.
+Wind blows, animals make sounds, and birds sing.
+Breathing and gobbling sounds can be heard in the background.
+Breathing sounds are heard.
+Music is playing with a man speaking and background noise.
+Sound effects occur.
+Birds are flying and tapping with background noise and pigeon sounds.
+Birds sing and chirp, with wind noise and occasional bird calls.
+The background noise is interrupted by beeps.
+A man is speaking while tools can be heard in the background.
+Paper rustles repeatedly alongside ticking sounds.
+A video game is playing and a man is speaking.
+Wind rustles and horses neigh.
+A sound effect is heard.
+Cards are being shuffled with background noise.
+Scissors are being used repeatedly.
+A busy signal, beeps, keypress tone, ticking, and background noise are heard.
+Singing and a ding fill the air with melody.
+A woman is speaking, people are breathing, and there is background noise, ticking, kids speaking, humming, tapping, and more breathing.
+Sound effects are heard.
+Sound effects and heartbeat with music are heard.
+Rustling sound is heard repeatedly with background noise.
+A car is reversing with beeps and scrapes.
+People are shouting and talking over background noise and a child speaking.
+Whips cracking are heard.
+Mice are heard scurrying.
+A bird makes clucking sounds, crows, and flaps its wings.
+Music is playing.
+A whale is vocalizing and water splashes.
+Cars honk and tick as they pass each other on the road.
+Chewing and bird vocalizations are heard, with background noise.
+A beep sounds.
+A sound effect is followed by a glass shatter, mechanisms and ticks.
+A video game is being played with various sound effects and whacking noises.
+People make human sounds and laugh.
+Women are speaking, breathing, and making clicking and sound effects.
+A man is singing and background noise and writing sounds are present.
+Mechanisms are being used and music is playing over a background noise.
+A sine wave is playing.
+Women are laughing and speaking.
+A knock is heard in the background.
+A man is speaking with background noise, objects are making contact with a surface and car horns are honking.
+Wind and horns are heard, with a group of people giggling.
+People are speaking, and mechanisms, cutlery, and boiling sounds can be heard.
+A woman speaks after sneezing, then there are ticking and breathing sounds.
+A man is speaking and sheep are bleating with ticking sounds in the background.
+Glass is shattering and a siren is sounding in the wind.
+Pigs are heard, with wind and ticking sounds.
+A man's speech and background noise are accompanied by various bird calls and laughter.
+A dog pants with sounds of scraping and ticking in the background.
+A speech synthesizer is speaking with background noise and clicking sounds.
+Sound effects, video game sounds, human voice, and more are heard.
+Mechanisms are operating, with breathing, sneezing, and surface contact sounds.
+Birds chirp and croak with background noise.
+Medium frequency engines and car horns alternate with ticks.
+Music is playing with background noise, video game sound, and a police car is passing by with its siren.
+Sound effects and sine waves are heard.
+A bell rings and there is the sound of mechanisms, typing on a computer keyboard, scraping, breaking, and laughter.
+Mechanisms and speech with background noise are heard.
+Music is playing with a ticking clock and ticking sounds.
+Background noise, bird chirping, and birds are heard.
+A whale vocalization and a sound effect can be heard.
+Music and background noise, and ticks can be heard.
+Owls are hooting and mechanisms are functioning.
+Female singing and music are heard.
+Water is flowing, birds are chirping and tweeting and people are speaking.
+Multiple people engage in various speech patterns while whispering and background noise are present.
+Windows are being closed and a tap and background noise are heard.
+Wind, breathing, birds chirping, a man speaking, and ticking sounds can be heard.
+A man is speaking, breathing, and speaking again with background noise.
+Various video game sounds and human voices are heard.
+Typing, writing, and clicking sounds can be heard while a man speaks and mechanisms and breathing sounds are in the background.
+A ticking sound is heard with fireworks and mechanisms.
+Background noise can be heard, followed by a woman speaking and surface contact.
+People are walking and birds are singing outside.
+Farts, squealing, and mechanisms are in operation.
+Honking and background noise are heard, with a man speaking.
+Men are speaking and making grunting sounds with mechanisms in the background.
+A series of sound effects are being played.
+Birds chirp and a car horn sounds in the background.
+A child is speaking with background noise.
+Women speak and machinery sounds while glass clinks.
+Cars are revving and accelerating with bleats and a bell.
+Wind noise, rustling, breathing, sniffing, and background noise are heard.
+Birds sing and an explosion is heard over background noise.
+Crickets are chirping and there are occasional plops.
+Music and background noise are heard.
+Background noise and sound effects of tapping and plopping are present along with video game sounds.
+Music and sound effects play.
+A man speaks, with wind noise and surface contact sounds.
+A series of sound effects are playing with background music.
+The sound of horses walking is mixed with some background noise.
+A sine wave and chirp tone are heard.
+Wind and nature sounds, including crickets and whooshing, can be heard.
+A stomach grumbles amidst background noise.
+A man speaks, bird songs play, and a skateboard moves over background noise.
+Children are speaking and birds are singing with waterfowl and human sounds.
+Background noise and shuffling sounds are present.
+A man is speaking with music and a ding sound.
+A man is speaking and slapping while mechanisms are heard.
+Video game sounds are heard before something breaks.
+Sound effects, speech, and a ding are heard.
+Flapping and wind noise are heard.
+An explosion, screaming, music, a man speaking, and video game sounds occur.
+Sounds of paper crinkling, mechanisms, and surface contact are heard.
+A woman is speaking, music is playing, and sound effects can be heard.
+Only mechanisms can be heard.
+Music plays with ticking mechanisms and human sounds.
+Typewriters type and sound effects play.
+Men are speaking and clicking sounds are heard.
+A man is speaking, breathing, chewing, and background noise is heard.
+Music plays continuously.
+A sound effect is followed by a woman speaking.
+Music is playing.
+Honking cars can be heard in the background noise.
+A dial tone is heard followed by music.
+Bird sounds, thunks, and mechanisms can be heard.
+An animal makes noises while mechanisms are operating.
+A beep is followed by a man humming, speaking and ticking with background noise.
+A variety of birds are chirping, animals are making noises, people are talking and singing.
+Men are talking with background noise.
+Background noise, clicking, footsteps, and farts are heard.
+An alarm clock rings in the background.
+Background noise and wind noise with microphone, neighing, and a man speaking are heard.
+A sound effect is heard.
+Sanding and female speech are heard with background noise.
+Birds are chirping and singing, with occasional coughing and movement.
+Humans make sounds, run and make sound effects.
+A bicycle bell rings.
+Background noise with a doorbell ringing.
+Heartbeats are heard with background noise.
+Glass is clinking and surfaces are being contacted with background noise.
+A man is speaking and typing on a computer keyboard.
+Fire crackles as a woman sings.
+Various birds chirp, crickets chirp, the wind blows, and footsteps are heard.
+Background noise is present while crickets chirp, paper rustles, and tearing sounds can be heard.
+A man is speaking, bird calls are heard, and artillery fire can be heard.
+Background noise and clicking sounds are heard.
+Mechanisms and paper are crumpling and rustling.
+Music is playing.
+Wind is blowing, mechanisms are ticking, birds are chirping and cawing, and tapping is heard.
+Mechanisms and women are speaking, and taps are heard.
+Chirping birds and wind are heard, then a woman sings.
+Various sounds including whistling, speech, music and tapping are heard.
+Background noise, wind, vehicles passing by, chopping, birds singing, laughter, and breathing are heard.
+Heart sounds and background noise are heard, with occasional whoops and laughter.
+A bell rings and a train is passing by, blowing its horn.
+Music, wind, and wind chimes are heard.
+Various human voices and sound effects are heard.
+Camera clicks and mechanical mechanisms are heard.
+A human voice can be heard.
+Paper is rustled.
+Wind noise, airplane sounds, bird calls, and squeaks are heard.
+Various sound effects are heard with whooshing noises.
+Video game sounds and an animal are heard.
+Water flows, splashes, and people breathe.
+People are making various human sounds, with sound effects in the background.
+A sound effect accompanies a child's speech.
+Dogs are barking and there is background noise.
+Men speaking and breathing are heard in the background.
+Footsteps can be heard walking.
+Mechanisms, human voices, and breathing are heard with slapping and tapping sounds.
+Clicking and human sounds are heard with background noise.
+Background noise, radio, and women speaking are heard.
+Multiple sound effects are heard, with a purring sound in between.
+Music plays and a clock ticks and sings.
+Birds chirp and tweet in the background.
+Breathing, chewing, and surface contact sounds are heard with background noise.
+A bell sound, such as that of a doorbell, is heard.
+Background noise and men speaking can be heard.
+Mechanisms and shuffling cards sound, with occasional honks of a vehicle horn.
+Mechanisms and rodents are heard.
+Beeping machinery sounds are heard.
+A sound is made by scratching followed by a tap.
+Mechanisms are making whacking sounds, while a gurgling noise and speech synthesizer are heard.
+A doorbell and barking are heard.
+Background music is playing.
+Sounds of running, gasping, and breathing are heard with background noise.
+A motorcycle makes a sound, followed by video game sounds and tire skidding.
+Background noise, conversation, tapping, and speech are heard, with a child speaking.
+A man speaks with background noise and clicking.
+Wild animals make noises.
+Mosquitoes, slapping sounds, crickets, and mosquitoes are heard.
+The sound of a rumble is present.
+Background noise, ticking, thumping, chirping, and tweeting are heard.
+Background noise is present and a buzzer is heard.
+Men are speaking and spraying with ticking sounds in the background.
+Music and sound effects are heard.
+People are talking, whispering, laughing, and making sounds in a noisy environment.
+A sound effect is heard in this audio sequence.
+A female is speaking while a heartbeat is heard in the background.
+A mechanical fan is running and wind is blowing through the microphone.
+Video game sounds, wind, laughter, gasping, and men speaking are heard.
+Bells and ticks are heard with background noise.
+A device beeps, someone screams and cries, followed by a whack and static.
+A motorcycle revs and accelerates.
+Music plays as water flows and background noise is heard.
+Wind is blowing, water is heard, a sailboat is sailing, and breathing is heard in the background.
+Background noise and ticking sounds are heard.
+Background noise, a woman is speaking, and music is playing.
+Whispering and tapping sounds are interspersed with breathing and surface contact.
+Mechanisms and bouncing sounds can be heard with background breathing.
+Wind is blowing, birds are singing, and a man is speaking, with gunshots in between.
+A firecracker goes off with background noise.
+A woman speaks and writes with tapping and breathing sounds in a noisy background.
+Footsteps and video game sounds are heard.
+Mechanisms, shuffling cards, and surface contact sounds are heard.
+Fire, running, a vehicle, and a police car siren can be heard.
+Pigeons, doves, birds, barks, and ticks are heard with background noise.
+A zipper is opened, and people talk and breathe, with a horse neighing and a tap running.
+Scraping and mechanism sounds are heard, a television is on, and a man is speaking.
+A ringtone, doorbell, and breathing sounds are heard.
+A man speaks while birds sing and a rowboat moves through a river.
+Background noise, bird calls, and music are heard.
+A heartbeat and hum are audible.
+Bells ring with background noise and wind sounds.
+A person is crying and sobbing while mechanisms make noise and they take breaths.
+A sine wave sound is heard.
+Laughter, a bell, and mechanisms accompany a woman speaking.
+Tapping and smoke alarm sounds can be heard.
+Mechanisms can be heard with a human voice.
+Beeping and human voices are heard with noise.
+Animals barking, bleating, and panting with background noise and ticks.
+Tapping, writing, and mechanisms are making noise.
+Background noise, whacks, and breaking sounds are heard.
+Thunderstorm and ticking sounds with human sounds in the background.
+Heartbeats are heard alongside footsteps, wind, and the sound of a car.
+A heartbeat is heard, followed by female and male speech and a sound effect.
+Zippers are being opened and closed amidst background noise.
+People are talking and making noises on a surface with breathing and ticking sounds.
+Footsteps, speaking, thumping sounds are heard.
+Sound effects play with a thump.
+A whip cracks, people make sounds, and a man speaks and laughs with ticking in the background.
+A man is typing on a computer and playing video games while talking to someone and listening to music.
+Background noise and a hoot are heard along with tapping.
+Birds sing, people walk and talk, and a vehicle drives by.
+People are making chewing, human sounds and mechanisms are in motion.
+People laugh, a woman speaks, rodents make noise, clicks and taps are heard, breathing and mechanisms are heard.
+Yipping sounds with background noise are heard.
+Mechanisms, men talking and breathing are heard.
+A telephone bell rings.
+Scissors are being used.
+A motor vehicle (road) is heard, followed by tire squealing or skidding.
+Background noise and human sounds are present.
+Bird calls mixed with crows and sound effects.
+Heavy footsteps and background noise are heard as a man speaks.
+A man speaks while sounds of ticking, tapping, and mechanisms can be heard.
+Background noise and an explosion are heard.
+Music plays with video game and sound effects.
+Background noise and animal sounds are heard.
+Sound effects and background noise can be heard.
+Mechanisms are functioning, with a dial tone and busy signal.
+Music and coins dropping are heard.
+Bird calls, barking, running, background noise, and a dog growling are heard.
+Heart sounds and mechanisms are heard.
+Waterfowl and bells are heard while the wind blows.
+Mechanisms and an explosion are heard.
+A foghorn sounds and music plays over background noise.
+A chime is ringing with background noise.
+Whistling, background noise, and men speaking are heard.
+Music and speech noise can be heard.
+An arrow flies and a car is heard in the background.
+Background noise, mouse patter and mouse sounds can be heard.
+Tapping sounds and sonar noises repeat.
+Wind is blowing, a police car siren is sounding, and crows are cawing.
+Sounds of mechanisms, video games, white noise, clicking, footsteps, breaking, gunfire, ticking, and dripping are heard.
+Dogs are panting.
+Sound effects and clanging are heard.
+Writing and music accompany sound effects from mechanisms.
+Male speech, breathing, and background noise are heard throughout.
+A bell is ringing and heart sounds are heard along with background noise and mechanisms.
+Animal sounds dominate.
+Cats and dogs are meowing and barking to the sound of music.
+The sounds of wind, telephones, and people speaking are heard.
+Bouncing and mechanism sounds are heard.
+Music and sound effects are playing.
+A man is speaking with a tap and glass shattering with a gasp.
+Cooing, mechanical fan, tapping, and ticking sounds are heard.
+A man speaks and breathes over the sound of music.
+Boing sounds are repeated.
+Wind blows as a rowboat glides on water.
+Barking and human sounds can be heard with surface contact, human voice, and ticking.
+Roosters cluck and crow amidst wind noise.
+A bicycle bell and mechanisms can be heard.
+A bell is ringing while mechanisms are operating.
+A bird is singing, and a vehicle honks.
+Mechanisms operate while a man speaks, then a foghorn sounds.
+Video game sounds mix with machine gun fire.
+Breaking sounds are heard.
+Mechanisms and purring are heard with ticking in the background.
+A sine wave tone is heard.
+Male speech, television, gasps, yelling, and laughter are heard with ticking sounds.
+People are laughing, coughing, and making fart sounds, with music and background noise.
+A tuning fork and background noise are heard.
+A roaring sound with clicking is heard, and a man speaks.
+Mechanisms, footsteps, laughter, and speech are heard.
+Mechanisms mix with breathing and chewing and liquid sounds.
+A woman is speaking with background noise and more female speech.
+A man speaks with background noise, music plays, and clicking is heard.
+Children speak, cough, and make noises, with background noise.
+Mechanisms make noise as something rolls.
+A man speaks, breathes, and ticks while background noise occurs.
+Singing and music play with background noise and men speaking.
+Sound effects and background noise are heard.
+Croaking frogs and crickets make sounds.
+A woman is speaking, clock ticks are heard, and a woman is speaking more.
+A clock ticks and music plays against a background of noise.
+A bell sounds with human sounds and a sound effect.
+Surface contact and human sounds are heard.
+A car horn sounds amid background noise, followed by bicycle bells and ticking, along with human sounds.
+Women are speaking, hubbub and background noise are heard, animals and children are making noises, and human voices are heard.
+A sine wave is heard.
+People are talking and a beep is heard in the background.
+Only music plays.
+Music plays with background noise and a crowd clapping.
+Music and background noise can be heard.
+A woman speaks while snoring and with mechanisms.
+Insects, wind, and men speaking are heard, with birds chirping.
+Music is playing.
+Background noise and laughter are heard, then several coughs, more laughter.
+A cow is mooing with an echo heard in the background.
+Noise and heart sounds are heard.
+A man is speaking and breathing, with background noise.
+Music is playing, people are making sounds, and a plop is heard.
+People are chewing, clicking, and making ticking noises.
+Chickens can be heard clucking in a noisy background.
+Mechanisms beep amidst background noise.
+Only sonar sounds can be heard.
+An alarm clock is ringing with background noise and wind noise.
+A river is flowing, wind is blowing and a man is speaking with ticks in the background.
+A man is speaking with a water tap and mechanisms in the background.
+Crackling and wind sounds, video game noises, and music can be heard.
+Mechanisms are making sounds.
+The sound of a heartbeat is heard repeatedly.
+Plops, music, background noise, and woman speaking and breathing.
+A loud slam is heard.
+A bus is driving.
+Men are speaking with background noise.
+A creaking noise is heard in a repetitive pattern.
+A series of heartbeats with distortion in the audio.
+A man and woman speak, wind blows, a dog barks, scrapes, and howls.
+Wind blows, mechanisms are heard, a man speaks, human voices are heard, birds are chirping, frogs are heard, and an arrow is released.
+A sine wave is played.
+Female and male singing, sound effects and video game sounds can be heard.
+Writing and background noise are heard.
+Various sound effects and ticking are heard.
+People are speaking, clicking sounds are present, and background noise and human voices can be heard.
+A sewing machine is being operated.
+Ticking, background noise, and a chirp tone are heard.
+Background noise and breathing can be heard.
+A printer is printing while music plays.
+Background noise, beeping, a man speaking, and an alarm are heard.
+Mechanisms and splatter sounds are heard.
+Various sounds occur including clicking, tones, and noise.
+A man speaks and an aircraft is heard.
+Applause is heard.
+The sounds of an electric toothbrush, a man speaking, human sounds, a woman speaking, and laughter are heard.
+Only background noise is heard.
+A child is speaking, and mechanisms are making ticking sounds along with a coughing sound.
+A beep, man speaking, and radio sounds are heard with background noise and ticking.
+A groan and other sound effects.
+Music is playing.
+A heartbeat and animal sounds are heard.
+Wheezing and coughing sounds.
+A man is speaking while mechanisms are moving.
+Heartbeats, background noise, and a sound effect are heard.
+A basketball is bouncing and footsteps are heard with wind noise.
+Background noise can be heard along with men's speech and human voices.
+A sound effect is playing.
+Wind and mechanisms are heard, with sneezes and human sounds in the background.
+Music and a man speaking are heard.
+A sound effect plays with music.
+People cough, sing, breathe, and music plays.
+Chewing and crunching sounds, along with surface contact, can be heard, along with background noise.
+An eruption occurs and people are shouting and speaking.
+Video game sounds and children speaking are heard over plops and music.
+A woman is speaking and writing with mechanical sounds in the background.
+Shuffling cards and men speaking are heard.
+Men are speaking, tapping, and a dog barking is heard.
+Background noise, tapping, laughing, and animal and speech synthesizer sounds are heard.
+Footsteps, surface contact, gunshots, a man speaking, and laughter are heard over video game sounds.
+Footsteps, writing, and mechanisms are heard.
+Background music is being played.
+Birds call while wind blows and surfaces are touched.
+A man is speaking with background noise and music is playing.
+An explosion occurs followed by speech from a synthesizer.
+A stream is flowing while birds are singing.
+Mechanisms, stomping, running, walking, and speech are heard.
+Brief tones, mechanisms, and surface contact with chopping are heard.
+Beeps and clicks are heard in background noise.
+A fierce roar is heard.
+Running, panting, and a sheep bleating with wind noise.
+A snap and thud are heard with background noise.
+A beep, a man speaking, and birds chirping are heard with a sound effect.
+Wind chimes ring and background noise can be heard.
+Mechanisms, zipping, and a man speaking can be heard.
+Footsteps, music, and a thunk sound.
+Writing is heard, then background noise and more writing.
+Music with noise and sound effects.
+Ticking mechanisms and human sounds can be heard along with breathing and surface contact.
+Women speak amid the sounds of mechanisms and crumpling.
+Birds are singing and ticking sounds are heard in the background.
+A cough can be heard.
+Mechanisms are ticking and traffic noise can be heard.
+Mechanisms, clicking, male speaking, and surface contact are heard.
+A chirp tone is heard.
+A sigh is heard.
+Background noise is heard.
+A single ding is heard.
+The hum of electricity is heard and chickens and birds are singing.
+Birds coo and flap their wings while a woman speaks intermittently.
+A telephone bell rings, mechanisms sound, and there is speech synthesizer and breathing before an explosion.
+Children are talking, laughing, and making noise while adults are slapping and speaking.
+Background noise, barking, and growling dogs are heard.
+Music plays as a woman speaks and coughs.
+Background noise, gunshot sounds, and water are heard.
+An engine is running and there are human sounds, squeaks, and brief tones.
+Crowded speech noise fills the background.
+Background noise and jangling keys are heard before a door opens.
+A doorbell rings and a dog barks.
+Goats bleat, mechanisms, man's speech, bird songs, and bleats are heard.
+A zipper zips, writing is heard, and thuds and animal sounds punctuate the background noise.
+Background noise accompanies male speech.
+A motorcycle and sound effects operate.
+Roars and bird songs are heard along with camera sounds.
+Horns are honking and background noise is present.
+Birds are singing and calling, with wind noise and turkeys heard.
+A person is humming, breathing, and listening to ticking while mechanisms are heard in the background.
+People are speaking and breathing, with crumpling sounds in the background.
+Camera sounds, ticks, and background noise are heard.
+Wind blows and guns are fired while a man speaks.
+Heartbeats are heard repeatedly.
+Footsteps can be heard tapping on a surface, followed by a whip being cracked and background noise.
+Mechanisms and footsteps are heard alongside whispering, breathing, and female speech.
+Background noise, tapping, and human sounds and breathing can be heard.
+Wind noise and wind sounds are heard through a microphone.
+The wind blows with animal sounds, human voices, and birds singing.
+A clicking sound is heard as mechanisms are used.
+Heartbeats sound before glass shatters in the background.
+Horses and people are speaking in a noisy environment with ticking sounds.
+Beeps are sounding.
+Crowd noise, speech, and laughter are heard.
+People are chewing, dishes are clanging, and background noise and human sounds are present.
+A clock ticks and mechanisms are heard, followed by surface contact.
+A man speaks over sound effects.
+Women are speaking and mechanisms, including a blender, are in use.
+Thunderstorm rages as a man speaks.
+Clicking and plopping sounds can be heard amid music.
+A busy signal is heard, followed by a woman speaking, tapping sounds, and various human sounds and mechanisms.
+Wind is blowing and a truck is reversing, honking, and making air brake sounds with people talking and giggling.
+A plop sound is heard.
+Mechanisms sound, a cat meows, a tap is heard, and a woman speaks over background noise.
+A sink is filling or being washed, and a man speaks over music and bird calls.
+Music and surface contact sounds mix with a ticking noise.
+Wind is heard as a car makes tire squealing noises and something ticks.
+Camera sounds are heard with background noise and ticking.
+Car sounds, wind, men speaking, car horns, and conversation are heard.
+Background noise blends with female speech.
+An aircraft and video game sounds can be heard.
+A man speaks, background noise, clicking, the man speaks again, breathing is heard, and the man speaks again.
+A woman speaks, a dog barks, a man speaks, and crunching can be heard over background noise.
+A mechanism beeps.
+People are laughing and a woman is speaking over background noise.
+Background noise with ticking sounds.
+Wind and bleats are heard, along with microphone noise.
+Background noise and breathing are heard, followed by multiple men speaking.
+An ambulance siren and wind can be heard.
+Wind blowing, mechanical fan sounds, breathing, and laughter are heard.
+A mid-frequency engine makes ticking and beeping sounds.
+People are making sounds, coughing, and breathing.
+Scratch sounds are heard in the background noise.
+Surface contacts, human speech, and ticks are heard over mechanisms.
+A woman is speaking, breathing, and clicking.
+Birds sing, gun fires, wind blows, man speaks, car honks.
+The sound of a horse snorting is heard.
+A mid-frequency engine and a vehicle horn are heard.
+A telephone is dialing with ticking sounds and breathing in the background.
+The sound of a sine wave.
+Water is flowing.
+Crickets are chirping.
+An explosion takes place.
+Wind, mechanisms, shouting, bird tweets, and an explosion are heard.
+Traffic noise and water sounds mix with ticking and human voice.
+Wind, arrows, and ticking can be heard while both male and female speech is present.
+A firecracker is heard.
+Various brief tones, reverberations, and animal sounds play.
+The sound of a heartbeat is heard.
+Mechanisms, coughing, and surface contact sounds are heard with ticking and breathing.
+A man speaks and guns are fired with ticking sounds in the background.
+Animals and background noise are heard repeatedly.
+A sine wave and background noise are heard with beeps, ticks, slamming sounds, and breathing.
+Mechanisms, tapping, bird flight and vocalization, and tweeting sounds can be heard.
+Wind, human voices, and wind noise can be heard.
+Video game sounds and music play with sound effects and clicking.
+Music and animal sounds can be heard.
+Background noise is heard while a heartbeat is heard repeatedly.
+Wild animals are heard in the background.
+Background music, video game sounds, and a man's voice are heard.
+Clicking sounds occur with a ringing tone in the background.
+A jet engine roars and sound effects play.
+Footsteps are heard along with background noise, a busy signal, and music.
+A heartbeat is heard repeatedly.
+A loud slam is heard.
+Music, background noise, and a woman speaking fill the background.
+Babbling and female speech is heard with background noise and a child speaking.
+A woman is speaking, birds are chirping, and a rooster is crowing.
+Background noise and birds singing, with ticking sounds in between.
+A person's heartbeats can be heard with background noise.
+A speech synthesizer speaks as someone slaps and taps.
+Ticking and surface contact accompany speech and computer typing.
+An unknown sound effect is played.
+Footsteps and music are heard.
+A boing sound is heard.
+Footsteps and a tap are followed by a rumble.
+Video game sounds, footsteps, and a dial tone are heard.
+A cat meows.
+A boing sound is heard.
+A noise is heard.
+People are speaking and conversing over background noise.
+Heartbeats and music fill the air.
+Pattering sounds are made by mechanisms.
+A man speaks, footsteps are heard, and a cow moos over background human sounds.
+A toilet flush is followed by music and a woman speaking.
+Background noise mixes with whispering and wind blowing.
+Heartbeats, music, and a river flow as a man speaks.
+People walk, speak, and make slapping sounds in a noisy environment.
+Television plays amidst noise.
+Background noise, male speech, and breaking sounds are heard.
+Tapping and scratching can be heard over background noise.
+Noise, speech, human sounds, and music can be heard with background noise.
+Background noise and clicking sounds accompany men speaking.
+Various breathing, mechanical, and conversational noises occur with medium engine and child speech sounds.
+A woman and a man are speaking and laughing while a TV plays in the background.
+Howling wind, noise, and mechanisms are heard.
+Sound effects and surface contacts are heard over background noise.
+Mechanisms are heard.
+Mechanisms make ticking noises.
+Cars honk their horns.
+A woman speaks while scissors cut and there are tapping and surface contact sounds.
+A buzzer is buzzing repeatedly.
+There is a sine wave and bouncing sounds with music.
+Sound effects, speech synthesizers speaking are heard.
+Birds are vocalizing, an owl is heard, and surface contact is made.
+Surface contact, background noise, and a doorbell are heard.
+Beeping sounds repeat multiple times.
+Mechanisms make a brief tone sound.
+Music is playing.
+Footsteps are heard.
+Men are speaking, using a computer keyboard, and making speech sounds.
+Music is playing.
+Flapping wings of birds in flight.
+People are breathing, laughing, and honking over wind, ice cream truck, and gasps.
+An animal makes noise amongst clicks and sounds.
+A brief tone is heard.
+A man is speaking and laughing, with a door slamming sound and ticking in the background.
+Music can be heard.
+Whispering and ticking with background noise.
+Radios beep and play while an alarm and police car siren sound.
+A bicycle bell rings and mechanisms are heard.
+Mechanisms, beeping, and a camera sound are heard.
+There is background noise along with ticking and drinking sounds.
+A water tap runs with background noise.
+A man speaks among background noise.
+Mechanisms beep and objects make contact with surfaces.
+Various sound effects are heard with occasional animal sounds.
+Music is playing.
+Music plays with whispering and heartbeat sounds in the background.
+A machine is working as a woman speaks.
+Sound effects are being played.
+A speech synthesizer produces sound effects with background noise.
+Children are speaking, sneezing, and breathing in a noisy environment.
+An explosion and music can be heard.
+Wind, chewing, bird songs, brief tones, human voices, and coughing are heard.
+Background noise and female speech mix with barking and panting dog sounds.
+Sniffing and barking are heard amidst human sounds and background noise.
+Wind, ticks, a brief tone, male speech, a rooster, and crowing are heard.
+A printer and surface contact are heard in the background.
+An ice cream truck plays music while a man sings.
+Vehicles honk as mechanisms make noise, followed by a dripping sound.
+A dog barks with sound effects and background noise.
+A ding-dong and mechanisms can be heard.
+Men speaking, breathing, and clicking accompany background noise.
+A bell is ringing and background noise and wind noise is heard.
+Background noise and birds singing are heard, with occasional sound effects.
+A sine wave is heard.
+Pulses alternate with background noise and a sound effect.
+An explosion causes glass to shatter, followed by an eruption.
+Music, creaking, slamming, and thunking can be heard.
+Laughter and beeping sounds are heard in the background while a woman speaks.
+Music is playing, with a man running and panting, a slapping sound, background noise, and a car passing by.
+Whispering and ticking sounds are heard with background noise.
+A heartbeat sound is being recorded.
+Lions are heard roaring with background noise.
+People are talking and making various sounds.
+A man is speaking with background noise and ticking sounds.
+A conversation takes place on a rowboat over water and ticking sounds.
+A continuous sine wave sound.
+A mid-frequency engine is heard, surface contact is made, zippers are being used, breathing is heard, and a car is accelerating and revving.
+A dog is barking, people are speaking, and arrows are being shot with background noise, bird chirping, and footsteps.
+Wind and animal sounds mix with human voices and a bleat sound.
+A man is speaking and making beeping and mechanical sounds.
+Music and various sound effects are playing, including ding sounds.
+Men speak and mechanisms and clicking sounds are heard.
+Mechanisms, arrow sounds, crickets, and human voices are heard.
+A man speaks over a crowd of people speaking.
+A telephone is ringing, people are talking and laughing, and a child is speaking.
+Sonar signals are heard over background noise.
+Wind is blowing and a horse is whinnying.
+Music is playing and machine guns are firing.
+A man speaks and a tuning fork is struck amid background noise.
+A bell reverberates in the air.
+Heart sounds are heard with background noise.
+Music and echoes, mechanisms, and men speaking can be heard with footsteps.
+Mechanisms, female speech, and music play.
+Mechanisms make ticking noises and music is playing.
+Hunting tools are used to call ducks as wind blows and a man speaks.
+Mechanisms and laughter are heard as a pig oinks and breathing and music are heard.
+A civil defense siren and a sound effect are played.
+Water is flowing, mechanisms are operating.
+Insects are chirping and an arrow is heard while a person is speaking.
+Heartbeats and background noise are heard with a sound effect.
+A woman is speaking and clicking sounds are heard.
+A sine wave accompanies thumping, surface contact, and clicking noises.
+Men speaking, mechanisms, water, and surface contact can be heard.
+An alarm clock goes off amidst background noise.
+A woman is speaking and music is playing, with the sound of a horse-drawn carriage heard.
+There is booing and a sound effect.
+Background noise and ticks are heard.
+Gobbling and sound effects are heard.
+Footsteps, a sheep's bleat, and a plop are heard.
+Noise and sound effects accompany clicking.
+A man speaks, types, and breathes near a computer keyboard.
+A sound effect is heard.
+Sound effects are heard in quick succession.
+Taps and chirping birds are heard, as well as the sound of the wind and surface contact.
+An emergency vehicle is in operation with wind noise and bird sounds.
+Wind, a speedboat, and human voices are heard.
+Music and telephone bells ringing.
+Music and surface contact.
+Music is playing.
+Mechanisms and ticking sounds accompany filing and surface contact.
+Wind, human voices, and female speech can be heard along with wind noise.
+A doorbell rings and dogs are barking.
+Sound effects occur repeatedly.
+A person is gasping, talking, and breathing.
+A woman is singing and tapping with background noise.
+A woman is speaking.
+Mechanisms, tapping, and male speech with grunts are heard.
+Wind blows and birds sing and bleat.
+Background noise and cap guns are firing.
+A man speaks while mechanisms tick, followed by conversation and a woman speaking.
+A man is speaking with mechanisms and a dial tone sound.
+Crickets chirp amidst background noise.
+Noise, dial tones, and music with a coin drop are present.
+Wind noise is heard, an arrow is shot, female speech and conversation are heard, and ticking is heard.
+Background noise and the sounds of horse hooves, with more background noise.
+A groan is heard.
+Surface contact, clicking and typewriter sounds are heard.
+Surface contact and chopping sounds with mechanisms.
+Water is boiling and there is breathing in the background.
+A man is speaking over background noise.
+Whispering and female speech are accompanied by breathing.
+Background noise, mechanisms, and a telephone ringing accompany barking, speech synthesizer and dialing sounds.
+There is crinkling, mechanical fan noise, and surface contact.
+Only mechanisms sounds are heard.
+A dog barks and a pig squeals while some background noise and an animal sound can be heard.
+Mechanisms clank, an animal makes a noise, and a woman speaks.
+A printer makes clicking sounds amidst background noise and beeps.
+Background noise and surface contact sounds are heard, along with the sound of an arrow being shot.
+Laughter, beeps, and speech are interrupted by telephone ringing and tapping sounds.
+People are whispering, breathing and mechanisms are making sounds.
+Heartbeats are heard with ticking sounds in the background.
+A ding sound is heard.
+Mechanisms clack, and people clap.
+Footsteps, chirping birds, rustling leaves, and animal sounds are heard in the background.
+Wind noise, crows, and a barking dog can be heard in the background.
+A medium-frequency engine can be heard, and people are speaking and whispering near bleating animals.
+Background noise with a heartbeat rhythm is present.
+Background noise is present with sounds of surface contact and human voices.
+A bird is cooing and chirping with wind noise and human voices.
+Wind, speaking, and tapping are heard over background noise and conversations.
+People are walking, laughing, breathing, and speaking with wind, bird calls, and a vehicle in the background.
+Background noise and heartbeat sounds are heard.
+Music is playing.
+Crumpling sounds are heard in the background.
+Music, man's speech, shouting, and slamming can be heard.
+Background noise is present before a buzzer sounds.
+A man is speaking over music and human sounds.
+A variety of sound effects play over background noise and music.
+The wind blows while the heartbeat can be heard.
+Music plays on television as a man breathes, sings, snores, laughs, and breathes.
+A woman is speaking while ticks are heard in the background.
+A video game is being played with shots being fired.
+A train is moving and a car is honking.
+Mechanisms are heard, and turkeys are making sounds, with wind and ticking in the background.
+A rumble is heard, followed by heartbeats.
+Men are speaking, music is playing, and whistling and choir can be heard.
+Birds are chirping and rustling, and wind is blowing.
+Busy signal, clicking, and breathing with background noise.
+Tap dancing and music play while a human voice is heard.
+Conversations and mechanisms can be heard with a cat meowing.
+An eruption occurs, and ticking can be heard amidst speech.
+Men are speaking, clicking, and typing on a computer with breathing sounds.
+A man is speaking with turkey sounds and background noise.
+Men are speaking and having a conversation.
+Background noise, telephone bells ringing, and a man is speaking.
+Water trickles while an engine hums.
+Drips and surface contact sounds occur.
+A man is speaking, writing and tapping, and there is background noise.
+Continuous surface contact sounds and background noise heard.
+Breathing and whistling are heard in background noise.
+Running sounds and a boing sound can be heard.
+A speech synthesizer, echo, glass shatter, and more speech synthesizer are heard.
+A person sighs.
+Cats making noise, background noise, and human voices.
+A man is speaking on the phone with background noise.
+Music plays continuously.
+Spray is heard, along with background noise and bird songs.
+People are speaking and chopping, with breathing, laughter, footsteps, and clapping.
+Music and hoots play as a man sings and a speech synthesizer can be heard.
+Clicking and whooshing noises occur.
+Pigs squeal, wind blows, and men are talking.
+Background noise is heard while whispering and breathing can be heard, followed by a busy signal.
+An explosion is heard.
+Music is playing.
+Men are speaking over background noise and breathing is heard.
+A mechanical fan is running.
+Beeping, ticking mechanisms are heard.
+A man speaks and taps are heard amidst breathing and ticking sounds.
+People are shouting amidst music and human voices.
+Breathing, ticking, jangling keys, and various surface contacts are heard in the background.
+Tools and mechanisms are present, with a man speaking.
+Mechanisms produce background noise.
+A cat is purring and background noise is heard.
+Background noise and human sounds are heard.
+Water is ticking and splashing as a man speaks and laughter is heard.
+An effects unit is being used with background noise and ticking sounds.
+People are gargling and speaking with human sounds in the background.
+People are eating and making noises with their cutlery and dishes while background noise is heard.
+A power tool and ticking sounds are heard.
+Sound effects are heard repeatedly.
+A person is sneezing, breathing, and a dog barks.
+A sine wave sound is heard.
+Laughter, typing, a man speaking, sound effects, and mechanisms.
+Music and noise are heard.
+An alarm, ticking, speech, and background noise are heard.
+A ticking noise is heard.
+A man is speaking and shuffling cards.
+Bells are ringing in a change ringing pattern.
+Music is playing.
+A woman and a man speak, with background noise in the background.
+A man speaks as a housefly buzzes and makes surface contact.
+Human voices and music can be heard.
+Heartbeats are heard repeatedly.
+An aircraft engine is heard with wind noises and bird vocalizations.
+Bleats and barks are heard with background noise, and the pattern repeats.
+An emergency vehicle is speeding past, revving its engine.
+A knock is heard.
+Whistling, background noise, clicks, and keyboard sounds are heard, followed by speech.
+Mechanisms are being operated continuously.
+A mechanical fan and wind are heard.
+A crack is heard.
+Music is heard against background noise.
+Mechanisms are moving, birds are flying and singing.
+A bell is ringing and mechanisms are heard.
+There is wind noise and a man is speaking.
+A camera is in use, with occasional ticking and background noise.
+Wind is blowing, a gunshot is heard, a tick is heard, and a sound effect is heard.
+Noise and sound effects are heard.
+Only music is heard.
+Sound effects and beeps are heard while music plays.
+A man speaks over music with a pulsing beat.
+Wind is blowing, and a basketball is bouncing.
+A woman is speaking, glass shatters, and breathing can be heard.
+A background noise is heard with yells and rattles.
+Keyboard typing, background noise, and men speaking are heard.
+A cat is purring, with clicking and background noise.
+Music plays while liquid drips and various sound effects are heard.
+A sigh is heard.
+People are chewing and slurping while whispering and making surface contact.
+A man is speaking with music and whale vocalizations are heard.
+Wind and mechanisms are heard with heartbeat sounds.
+A bird is singing, barking dogs are heard, and an arrow is shot.
+Mechanisms are moving with snoring.
+A ding, sound effect, and background noise are heard.
+Breathing and sniffing sounds can be heard in the background.
+Background music is playing.
+People are clicking and typing, with a man speaking occasionally.
+Beeps repeatedly sound.
+A man is speaking and zipping his clothing in a small room with monologue narration and breathing.
+Sound effects and clapping are interspersed with occasional farts.
+There is silence in a small room.
+Breathing is heard.
+Beeps are repeating.
+A door opens and closes with footsteps and a sigh in a large room or hall.
+Bicycle bells ring with background noise.
+A man and a woman are speaking, with music and silence in between.
+Wind noise is heard in the countryside.
+People chew and tap sounds are made.
+A telephone bell ringing, tapping, and a man speaking are heard.
+Mechanisms and a printer are heard.
+Sound effects play.
+Only the sound of a heartbeat is heard.
+The sound of a bell is heard.
+Fire is burning and people are speaking with spraying mechanisms.
+Whooshes, swooshes, and swishes are heard repeatedly.
+Liquid is heard, followed by thumps and tapping, and then men speaking.
+A man is speaking and hooting sounds are heard.
+Men are speaking and using a computer keyboard.
+Men speak and a camera makes sounds.
+Scissors cutting are heard in background noise.
+The sound of humming fills the air.
+Sound effects play continuously.
+A chirp tone is heard.
+A heart is beating with ticking sounds.
+Music and sound effects are playing.
+Something is breaking with shouting and screaming.
+Men are speaking and a printer is working with mechanisms.
+People whisper, breathe, chew, and crunch with wind noise.
+Dogs are barking and birds are chirping in the background.
+Sound effects and animal noises are heard.
+Coins are dropped, and objects are tapped and contacted with a surface.
+A church bell rings over background noise.
+Silence is interrupted by a man speaking and more speech.
+Background noise is heard, interrupted by chopping sounds.
+Men and a woman are speaking and croaks are heard.
+Water is flowing with clicking sounds.
+Music with sound effects and women speaking.
+Background music is playing with sound effects and croaking.
+A tuning fork is being struck multiple times in a small room.
+Bells are ringing and a man is speaking while wind noise is heard.
+Footsteps are heard with a woman speaking.
+Music is playing.
+Soft music is playing.
+Sound effects, silence, speech and a child speaking are heard with occasional plops.
+Music can be heard.
+Footsteps are heard in a small room, followed by silence and more footsteps, then a camera clicking.
+Human voice and a rumble are heard.
+Heartbeats alternate with noise, clicking, and more heartbeats.
+A noise is followed by a ding.
+White noise is being generated.
+Music plays with liquid flowing in the background.
+Wind is blowing, a man is speaking on a rowboat, splashing and splattering sounds with animal sounds and laughter.
+Silence is broken by beeps.
+The sound of a mechanical fan and ticking and tapping sounds can be heard.
+Music is playing, with a man speaking and tapping in the background.
+Wolves are howling.
+Music is playing.
+A sound effect, tap and creaking.
+Music and a chorus effect are heard.
+A croak is heard over background noise.
+A man is speaking and a ticking sound is heard.
+Scraping sounds are heard multiple times.
+Televisions, female speech, and the sound of rats can be heard in a small room.
+Music is playing.
+A chirp tone can be heard.
+Men are speaking, sighing, and walking.
+Wind noise can be heard in a field recording.
+Whispers, speech, and biting sounds are heard.
+Bicycles ring their bells and wind noise can be heard.
+A stomach is rumbling.
+Tapping, motor vehicles, speech, laughter, and ticking sounds can be heard in the background.
+Silence, drilling, tooling, and power tool sounds are heard.
+A church bell is ringing with an echo and music is playing.
+A bell is ringing.
+Microphone wind noise and machine gun fire heard.
+Music is playing.
+Breathing and water sounds can be heard.
+A woman is speaking and pouring liquid, with ticking and glass clinking heard.
+A series of beeps can be heard repeatedly.
+A woman speaks and speech synthesizer, speech, and sound effects are heard.
+Men speak and tap while turkeys vocalize.
+A man speaks while slapping and thumping sounds are heard with laughter.
+A telephone line is silent until a busy signal is heard.
+A slapping or smacking sound is heard.
+A buzzer is heard.
+Mechanisms and animal sounds with chirping birds are heard.
+Silence alternates with beeps.
+Music is played with a chorus effect.
+A subway door is opening and closing with a ding-dong sound.
+A woman whispers and speaks with human sounds and breathing in the background.
+Mechanisms and ticking are heard as a man speaks.
+Sounds in a small room.
+Mechanisms, people speaking, squealing, and conversation are heard.
+A man speaks, then there is silence followed by music.
+A snake rattles, people are speaking, and breathing is heard with wind blowing into the microphone.
+An alarm clock goes off and the sound of tapping and ticking follows.
+A microwave beeps in a quiet room with sounds of mechanisms.
+Coughing, speech, clicking sounds, and breathing can be heard.
+A tuning fork in a small room.
+Music is playing in the background.
+Heartbeats are heard.
+Sound effects, whooshes, and male speech occur with breathing.
+Music is playing.
+A person is breathing and sounds effect can be heard.
+Heartbeat sounds can be heard.
+Skateboards are being ridden.
+A sine wave is being played.
+A child is speaking, sheep are bleating, and a man is tapping and laughing.
+A cat purrs and meows as a ticking sound continues.
+Music is playing.
+A tuning fork is ringing in a small room.
+Writing and speech can be heard in a small room with narration and monologue.
+Roars and growls are heard.
+Animals and rodents are making noise in a small room with laughter.
+Silence alternates with sine wave sounds.
+A mechanical fan runs.
+Sonar is the only sound.
+Tapping precedes music.
+Clicking sounds play rhythmically with music.
+Whale vocalization is audible.
+Whacking noises and bouncing sounds are heard in a small room.
+A sine wave is produced.
+Animal sounds mix with beeps and a man's speech.
+Bird calls punctuate a siren's wail.
+Taps and whacks produce rhythmic sounds.
+A hammer strikes in silence.
+Laughter and speech mix with television and breathing sounds.
+There are various sound effects and rumbling noises.
+Computer keyboards click repeatedly.
+Music plays in a small room with a chorus effect.
+Stomach rumbling and a tap sound can be heard.
+Beeping and busy signals are heard during telephone calls, along with speech from a man.
+Whispering accompanies the sound of chewing.
+There is speech and tick-tock sounds with occasional scraping.
+A coin drops.
+A sine wave is heard.
+Music plays with shouting, gunshots, glass shattering, clapping, and singing.
+Music is heard.
+Scissors cut while tapping occurs and women speak.
+Laughter is followed by the sound of a fart and noise made by a human voice.
+Music plays near the ocean with the sound of a bell in the distance.
+Ticking sounds are repeated many times.
+The music plays intermittently among silences.
+Babbling is followed by silence.
+Music plays with background noise.
+Wind, shouting, clicking and speech can be heard.
+Gunshots and tapping sounds can be heard.
+Music plays inside a large room or hall.
+There is silence and a sound effect.
+An engine runs as animals walk, pant, and breathe with children's speech and footsteps.
+Domestic animals purr in a small room.
+Farts and a beep sound.
+There are various sounds including silence, television, music, and sound effects.
+Camera mechanisms click.
+Conversations, laughter, and breathing are heard along with frog and animal sounds.
+Sound effects including farts are present.
+A person is breathing, and a telephone is ringing while conversations occur.
+Speech and breathing can be heard along with computer keyboard sounds in a small room.
+A cat meows twice.
+There is silence and then music starts playing.
+The sounds of mechanisms and chewing are heard with breathing and tapping.
+Someone is screaming.
+Taps and alarm clocks make repetitive sounds.
+Wind and water make noise while a man speaks.

assets/demo/prompts.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+A group of anthropomorphic mushrooms having a disco party in the middle of a dark enchanted forest, with glowing neon lights and exaggerated dance moves, their smooth textures and reflective surfaces emphasizing a comical 3D look.
+A panda bear with distinct black patches climbs and rests on a wooden log platform amid lush, natural foliage.
+A vibrant green parrot with hints of yellow and blue perches on a person’s lap, who is wearing grey pants. The parrot features a white beak, grey head, and a black eye. In the background, a red couch and a TV displaying a colorful video with “bilibili” text complete the scene.
+A black-and-white film captures a pianist playing in an empty, decaying theater. His deft fingers and echoing music create a haunting ambiance as dust motes float in the faint light. The gleaming grand piano under the spotlight contrasts with the worn seats and peeling walls, evoking nostalgia.
+Chinese ancient style, realism. A young woman, dressed in an embroidered red qipao, walks along the ancient streets of a bustling Chinese town. The red lanterns hanging above her sway gently in the evening breeze, and her calm, confident stride contrasts with the lively atmosphere of merchants and performers around her.
+A tomato surfing on a piece of lettuce down a waterfall of ranch dressing, with exaggerated surfing moves and creamy wave effects to highlight the 3D animated fun.
+A man in a gray hoodie and a woman in a light gray jacket jog along a residential sidewalk, smiling and chatting. They pass a beige house with a vibrant garden and street lamp on a bright, sunny day. The medium shot captures their movement amid lush greenery, creating a serene, cinematic scene.
+A coastal landscape painting with a prominent archway is displayed on an easel in a bright studio. A camera pan reveals a table cluttered with art supplies and a potted plant, enhancing the artistic vibe. Large windows and soft natural lighting create a cozy, creative atmosphere.
+A scene from disaster movie.
+A candid medium shot captures a woman in a white car, wearing glasses, a yellow top, and a black jacket, with her arm resting on the open window. Behind her, a stone-faced house surrounded by lush greenery basks in natural sunlight, creating a warm and realistic scene.
+Two women sit on a beige couch in a cozy, warmly lit room with a brick wall backdrop. They engage in a cheerful conversation, smiling and toasting red wine in an intimate medium shot.
+A woman with her hair in a bun walks along a city sidewalk, gently touching a lush hedge. Dressed in a plaid jacket and beige pants with a tan backpack, her calm presence is captured in natural daylight against an urban backdrop.
+A breathtaking aerial view shows a river winding like a dark ribbon through lush fields and hills, reflecting the soft pink-orange hues of sunrise or sunset in a serene, picturesque landscape.
+A man performs push-ups on a wooden bench in a sunny park, captured from a side angle in a medium shot. The focus is on his upper body and technique, with natural sunlight accentuating the scene. Lush greenery and distant park-goers contribute to the energetic, realistic setting
+A playful dog in a pink coat with a red leash dashes across a muddy field with sparse crops. The camera tracks its energetic movement from right to left against a backdrop of trees and distant power lines under an overcast sky. The realistic, medium shot captures a candid, lively moment in soft, diffused light.
+A drone camera circles a historic church on a rocky outcrop along the Amalfi Coast, highlighting its stunning architecture, tiered patios, and the dramatic coastal views with waves crashing below and people enjoying the scene in the warm afternoon light.

assets/docs/data.md ADDED Viewed

	@@ -0,0 +1,193 @@

+## Data Preparation
+### Stage1 - JavisDiT-audio
+In this stage, we only need audio files to initialize the audio generation capability:
+| path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text| audio_text|
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| placeholder.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | placeholder | yyy |
+Download the audios (including [AudioCaps](https://drive.google.com/file/d/16J1CVu7EZPD_22FxitZ0TpOd__FwzOmx/view?usp=drive_link), [VGGSound](https://huggingface.co/datasets/Loie/VGGSound), [AudioSet](https://huggingface.co/datasets/agkphysics/AudioSet), [WavCaps](https://huggingface.co/datasets/cvssp/WavCaps), [Clotho](https://zenodo.org/records/3490684), [ESC50](https://github.com/karolpiczak/ESC-50?tab=readme-ov-file#download), [MACS](https://zenodo.org/records/2589280), [UrbanSound8K](https://urbansounddataset.weebly.com/urbansound8k.html), [MusicInstrument](https://www.kaggle.com/datasets/soumendraprasad/musical-instruments-sound-dataset), [GTZAN](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification), etc.), and put them into the same folder `/path/to/audios`. Follow the commands to automatically generate a `train_audio.csv` for configuration:
+```bash
+ROOT_AUDIO="/path/to/audios"
+ROOT_META="./data/meta/audio"
+# 1.1 Create a meta file from a unified audio folder. This should output ${ROOT_META}/meta.csv
+python -m tools.datasets.convert audio ${ROOT_AUDIO} --output ${ROOT_META}/meta.csv
+# 1.2 Get audio information. This should output ${ROOT_META}/meta_ainfo.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta.csv --audio-info
+# 2.1 Trim audios within 30 seconds. This should overwrite the raw audios by default and output ${ROOT_META}/meta_ainfo_trim30s.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta_ainfo.csv --trim-audio 30
+# 2.2 Unify the sample rate to 16k Hz for all audios. This should output ${ROOT_META}/meta_ainfo_trim30s_sr16000.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta_ainfo_trim30s.csv --resample-audio --audio-sr 16000
+# 3.1 Set dummy videos. This should output ${ROOT_META}/meta_ainfo_trim30s_sr16000_dummy_videos.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta_ainfo_trim30s_sr16000.csv --dummy-video
+# 3.2 Get training meta csv. This should output ${ROOT_META}/train_audio.csv
+python -m tools.datasets.find_audio_ds all \
+    --data_root ${ROOT_AUDIO} \
+    --meta_file ${ROOT_META}/meta_ainfo_trim30s_sr16000_dummy_videos.csv \
+    --save_file ${ROOT_META}/train_audio.csv
+```
+### Stage2 - JavisDiT-prior
+As detailed in our [paper](https://arxiv.org/pdf/2503.23377), the prior estimator is trained with the contrastive learning paradigm.
+We take the extracted spatio-temporal priors as **anchor**, view the paired audio-video in the training datasets as **positive samples**, and randomly augment the audio or video to construct asynchronized audio-video pairs as **negative samples**.
+In particular, spatial- and temporal-asynchronization are separately generated.
+| path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text | unpaired_audio_path |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy | /path/to/zzz.wav |
+#### Ground-truth synchronized audio-video pairs
+Follow the instructions in [Stage3](#stage3---javisdit-jav) to read the audio-video information from the training dataset (e.g., [TAVGBench](https://github.com/OpenNLPLab/TAVGBench)).
+The obtained basic meta file can be `/path/to/train_jav.csv`.
+#### Offline asynchronized audio generation
+Given a synchronized audio-video pair, we efficiently construct asynchronized audio-video pairs by generating standalone audios from [AudioLDM2](https://github.com/haoheliu/AudioLDM2) without reference videos.
+The native text description, native video, and generated audio jointly contribute to an asynchronized (negative) sample for contrastive learning.
+Generated audio paths will be recorded in the `unpaired_audio_path` column.
+```bash
+ROOT_META="./data/meta/prior"
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4\
+    tools/st_prior/gen_unpaired_audios.py \
+    --input_meta ${ROOT_META}/train_jav.csv \
+    --output_dir ./data/st_prior/audio/unpaired \
+    --output_meta ${ROOT_META}/train_prior.csv \
+    --match_duration
+```
+#### Online asynchronized audio-video augmentation
+This part is implemented in `javisdit/datasets/augment.py`, where we developed various spatial/temporal augmentations for video/audio samples independently to construct spatially/temporally asynchronized audio-video pairs.
+For implementation details please kindly refer to our [paper](https://arxiv.org/pdf/2503.23377) and [code](javisdit/datasets/augment.py), and here we introduce the data preparation to perform corresponding augmentations:
+- Auxiliary Video Resource ([SA-V](https://ai.meta.com/datasets/segment-anything-video/))
+For video spatial augmentation, one of the efficient approaches is to randomly add a sounding-object's masklet into a video sequence, causing spatial asynchrony between video and audio pairs.
+Here we take the training set of [SA-V](https://ai.meta.com/datasets/segment-anything-video/) to collect native object masklets at 6fps:
+```
+data/st_prior/video/SA_V/
+├── sav_train
+│   ├── sav_000
+│   ├── sav_001
+│   └── sav_002
+```
+Then, we utilize [GroundedSAM](https://github.com/zhengyuhang123/GroundedSAM.git) to extend 6fps annotations to 24fps masklets:
+```bash
+mkdir third_party && cd third_party
+git clone https://github.com/zhengyuhang123/GroundedSAM.git
+cd GroundedSAM
+export AM_I_DOCKER=False
+export BUILD_WITH_CUDA=True
+python -m pip install -e segment_anything
+pip install --no-build-isolation -e GroundingDINO
+wget -P EfficientSAM/ https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_sam.pt
+cd ../../
+ls data/st_prior/video/SA_V/sav_train/sav_*/*.mp4 > data/st_prior/video/SA_V/sa_v_list.txt
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 \
+    tools/st_prior/get_masklets.py \
+    --data_path data/st_prior/video/SA_V/sa_v_list.txt \
+    --output_dir data/st_prior/video/SA_V/crops
+ls data/st_prior/video/SA_V/crops/*.json > data/st_prior/video/SA_V/crops/pool_list.txt
+```
+The extracted masklets will be stored as:
+```
+data/st_prior/video/SA_V/crops/
+├── pool_list.txt
+├── sav_000001_mask_000.mp4
+├── sav_000001_masklet_000.mp4
+├── sav_000001_meta_000.json
+├── sav_000002_mask_000.mp4
+├── sav_000002_mask_001.mp4
+├── sav_000002_masklet_000.mp4
+├── sav_000002_masklet_001.mp4
+├── sav_000002_meta_000.json
+├── sav_000002_meta_001.json
+├── ...
+```
+- Auxiliary Audio Resource ([AudioSep](https://github.com/Audio-AGI/AudioSep))
+After separating audio sources from original audio files, we can apply arbitrary addition and deletion operations on audios to introduce spatial asynchrony between video and audio pairs:
+```bash
+cd third_party
+git clone https://github.com/Audio-AGI/AudioSep.git
+cd ..
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 \
+    tools/st_prior/sep_audios.py \
+    --audio_path /path/to/TAVGBench \
+    --output_path ./data/st_prior/audio/TAVGBench
+ls data/st_prior/audio/TAVGBench/*.wav > data/st_prior/audio/TAVGBench/pool_list.txt
+```
+### Stage3 - JavisDiT-jav
+Here we provide an example with [TAVGBench](https://github.com/OpenNLPLab/TAVGBench) to prepare video-audio-text triplets for training. You can easily transfer to your own datasets.
+| path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text|
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---|
+| /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy |
+With our cleaned [`release_captions_clean.txt`](https://huggingface.co/datasets/JavisDiT/TAVGBench_clean/tree/main) file, the following script will automatically generate a `train_jav.csv` for configuration:
+```bash
+ROOT_VIDEO="/path/to/videos"
+ROOT_META="./data/meta/TAVGBench"
+fmin=10  # minimal frames for each video
+# 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
+python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv
+# 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin${fmin}.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin ${fmin}
+# 2.1 Unify FPS to 24 Hz for all videos. This will change the raw videos, and output ${ROOT_META}/meta_info_fmin${fmin}_fps24.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}.csv --uni-fps 24 --overwrite
+# 2.2 Extract audios from videos, and fix the sample rate to 16k Hz for all audios. This should output ${ROOT_META}/meta_info_fmin${fmin}_fps24_au_sr16000.csv
+python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_fps24.csv --extract-audio --audio-sr 16000
+# 3.1 Get training meta csv. This should output ${ROOT_META}/train_jav.csv
+python -m tools.datasets.find_jav_ds tavgbench \
+    --meta_src /path/to/TAVGBench_clean/release_captions_clean.txt \
+    --meta_file ${ROOT_META}/meta_info_fmin${fmin}_fps24_au_sr16000.csv \
+    --save_file ${ROOT_META}/train_jav.csv
+```
+If you get multiple data sources, just merge the csv files to a single one:
+```bash
+python -m tools.datasets.datautil ds1.csv ds2.csv ... --output /path/to/output.csv
+```

assets/image/JavisDiT-framework-resized.png ADDED Viewed

Git LFS Details

SHA256: 5bc4e9c38d8e249568a982151f52db29d47a99a3d1b11cb97b55cefe737339cc
Pointer size: 132 Bytes
Size of remote file: 1.75 MB

assets/image/JavisDiT-intro-resized.png ADDED Viewed

Git LFS Details

SHA256: 03bd381cfe4945dbd85b2680bb199469c015c93bebc49ab05f096de684df7e4a
Pointer size: 132 Bytes
Size of remote file: 9.24 MB

assets/image/logo.png ADDED Viewed

Git LFS Details

SHA256: a04bc6126c6bf8f3c6e92815ba203ac8c5950ad360c85d3341f06dbbe98d5fdc
Pointer size: 131 Bytes
Size of remote file: 241 kB

assets/src/funasr_utils_load_utils.py ADDED Viewed

	@@ -0,0 +1,262 @@

+import os
+import torch
+import json
+from io import BytesIO
+import torch.distributed as dist
+import numpy as np
+import kaldiio
+import librosa
+import torchaudio
+import time
+import logging
+from torch.nn.utils.rnn import pad_sequence
+try:
+    from funasr.download.file import download_from_url
+except:
+    print("urllib is not installed, if you infer from url, please install it first.")
+import pdb
+import subprocess
+from subprocess import CalledProcessError, run
+try:
+    from pydub import AudioSegment
+except:
+    pass
+def is_ffmpeg_installed():
+    try:
+        output = subprocess.check_output(["ffmpeg", "-version"], stderr=subprocess.STDOUT)
+        return "ffmpeg version" in output.decode("utf-8")
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return False
+use_ffmpeg = False
+if is_ffmpeg_installed():
+    use_ffmpeg = True
+else:
+    print(
+        "Notice: ffmpeg is not installed. torchaudio is used to load audio\n"
+        "If you want to use ffmpeg backend to load audio, please install it by:"
+        "\n\tsudo apt install ffmpeg # ubuntu"
+        "\n\t# brew install ffmpeg # mac"
+    )
+def load_audio_text_image_video(
+    data_or_path_or_list,
+    fs: int = 16000,
+    audio_fs: int = 16000,
+    data_type="sound",
+    tokenizer=None,
+    **kwargs,
+):
+    if isinstance(data_or_path_or_list, (list, tuple)):
+        if data_type is not None and isinstance(data_type, (list, tuple)):
+            data_types = [data_type] * len(data_or_path_or_list)
+            data_or_path_or_list_ret = [[] for d in data_type]
+            for i, (data_type_i, data_or_path_or_list_i) in enumerate(
+                zip(data_types, data_or_path_or_list)
+            ):
+                for j, (data_type_j, data_or_path_or_list_j) in enumerate(
+                    zip(data_type_i, data_or_path_or_list_i)
+                ):
+                    data_or_path_or_list_j = load_audio_text_image_video(
+                        data_or_path_or_list_j,
+                        fs=fs,
+                        audio_fs=audio_fs,
+                        data_type=data_type_j,
+                        tokenizer=tokenizer,
+                        **kwargs,
+                    )
+                    data_or_path_or_list_ret[j].append(data_or_path_or_list_j)
+            return data_or_path_or_list_ret
+        else:
+            return [
+                load_audio_text_image_video(
+                    audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs
+                )
+                for audio in data_or_path_or_list
+            ]
+    if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith(
+        ("http://", "https://")
+    ):  # download url to local file
+        data_or_path_or_list = download_from_url(data_or_path_or_list)
+    if (isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list)) or hasattr(data_or_path_or_list, 'read'):  # local file or bytes io
+        if data_type is None or data_type == "sound":
+            if hasattr(data_or_path_or_list, "read") and hasattr(data_or_path_or_list, "seek"):
+                data_or_path_or_list.seek(0)
+            # if use_ffmpeg:
+            #     data_or_path_or_list = _load_audio_ffmpeg(data_or_path_or_list, sr=fs)
+            #     data_or_path_or_list = torch.from_numpy(data_or_path_or_list).squeeze()  # [n_samples,]
+            # else:
+            #     data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
+            #     if kwargs.get("reduce_channels", True):
+            #         data_or_path_or_list = data_or_path_or_list.mean(0)
+            try:
+                data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
+                if kwargs.get("reduce_channels", True):
+                    data_or_path_or_list = data_or_path_or_list.mean(0)
+            except:
+                data_or_path_or_list = _load_audio_ffmpeg(data_or_path_or_list, sr=fs)
+                data_or_path_or_list = torch.from_numpy(
+                    data_or_path_or_list
+                ).squeeze()  # [n_samples,]
+        elif data_type == "text" and tokenizer is not None:
+            data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+        elif data_type == "image":  # undo
+            pass
+        elif data_type == "video":  # undo
+            pass
+        # if data_in is a file or url, set is_final=True
+        if "cache" in kwargs:
+            kwargs["cache"]["is_final"] = True
+            kwargs["cache"]["is_streaming_input"] = False
+    elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
+        data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+    elif isinstance(data_or_path_or_list, np.ndarray):  # audio sample point
+        data_or_path_or_list = torch.from_numpy(data_or_path_or_list)  # .squeeze()  # [n_samples,]
+    elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark":
+        data_mat = kaldiio.load_mat(data_or_path_or_list)
+        if isinstance(data_mat, tuple):
+            audio_fs, mat = data_mat
+        else:
+            mat = data_mat
+        if mat.dtype == "int16" or mat.dtype == "int32":
+            mat = mat.astype(np.float64)
+            mat = mat / 32768
+        if mat.ndim == 2:
+            mat = mat[:, 0]
+        data_or_path_or_list = mat
+    else:
+        pass
+        # print(f"unsupport data type: {data_or_path_or_list}, return raw data")
+    if audio_fs != fs and data_type != "text":
+        resampler = torchaudio.transforms.Resample(audio_fs, fs)
+        data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
+    return data_or_path_or_list
+def load_bytes(input):
+    try:
+        input = validate_frame_rate(input)
+    except:
+        pass
+    middle_data = np.frombuffer(input, dtype=np.int16)
+    middle_data = np.asarray(middle_data)
+    if middle_data.dtype.kind not in "iu":
+        raise TypeError("'middle_data' must be an array of integers")
+    dtype = np.dtype("float32")
+    if dtype.kind != "f":
+        raise TypeError("'dtype' must be a floating point type")
+    i = np.iinfo(middle_data.dtype)
+    abs_max = 2 ** (i.bits - 1)
+    offset = i.min + abs_max
+    array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
+    return array
+def validate_frame_rate(
+    input,
+    fs: int = 16000,
+):
+    # 将文件读取为字节流
+    byte_data = BytesIO(input)
+    # 使用 pydub 加载音频
+    try:
+        audio = AudioSegment.from_file(byte_data)
+    except:
+        raise RuntimeError(
+            "You are decoding the pcm data, please install pydub first. via `pip install pydub`."
+        )
+    # 确保采样率为 16000 Hz
+    if audio.frame_rate != fs:
+        audio = audio.set_frame_rate(fs)
+        # 将重新采样后的音频导出为字节流
+        output = BytesIO()
+        audio.export(output, format="wav")
+        output.seek(0)
+        # 获取重新采样后的字节流数据
+        input = output.read()
+    return input
+def extract_fbank(data, data_len=None, data_type: str = "sound", frontend=None, **kwargs):
+    if isinstance(data, np.ndarray):
+        data = torch.from_numpy(data)
+        if len(data.shape) < 2:
+            data = data[None, :]  # data: [batch, N]
+        data_len = [data.shape[1]] if data_len is None else data_len
+    elif isinstance(data, torch.Tensor):
+        if len(data.shape) < 2:
+            data = data[None, :]  # data: [batch, N]
+        data_len = [data.shape[1]] if data_len is None else data_len
+    elif isinstance(data, (list, tuple)):
+        data_list, data_len = [], []
+        for data_i in data:
+            if isinstance(data_i, np.ndarray):
+                data_i = torch.from_numpy(data_i)
+            if not isinstance(data_i, torch.Tensor) or data_i.shape[0] < 2:
+                data_i = torch.zeros(2)
+            data_list.append(data_i)
+            data_len.append(data_i.shape[0])
+        data = pad_sequence(data_list, batch_first=True)  # data: [batch, N]
+    data, data_len = frontend(data, data_len, **kwargs)
+    if isinstance(data_len, (list, tuple)):
+        data_len = torch.tensor([data_len])
+    return data.to(torch.float32), data_len.to(torch.int32)
+def _load_audio_ffmpeg(file: str, sr: int = 16000):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    # This launches a subprocess to decode audio while down-mixing
+    # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sr),
+        "-"
+    ]
+    # fmt: on
+    try:
+        out = run(cmd, capture_output=True, check=True).stdout
+    except CalledProcessError as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

assets/src/pytorchvideo_augmentations.py ADDED Viewed

	@@ -0,0 +1,481 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+"""Video transforms that are used for advanced augmentation methods."""
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+import torchvision
+from torchvision.transforms.functional import InterpolationMode, F_t
+# Maximum global magnitude used for video augmentation. The ``*_to_arg`` helpers
+# below divide a level by this value to get the fraction of the maximum change.
+_AUGMENTATION_MAX_LEVEL = 10
+def _check_fill_arg(kwargs):
+    """
+    Check if kwargs contains key ``fill``.
+    """
+    assert "fill" in kwargs, "Need to have fill in kwargs."
+def _autocontrast(video: torch.Tensor, **kwargs) -> torch.Tensor:
+    """
+    Maximize contrast of a video by remapping its pixels per channel so that the lowest
+    becomes black and the lightest becomes white.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        **kwargs: Ignored by this transform.
+    Returns:
+        torch.Tensor: Output of ``torchvision.transforms.functional.autocontrast``.
+    """
+    return torchvision.transforms.functional.autocontrast(video)
+def _equalize(video: torch.Tensor, **kwargs) -> torch.Tensor:
+    """
+    Equalize the histogram of a video by applying a non-linear mapping to the input in
+    order to create a uniform distribution of grayscale values in the output.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+    """
+    if video.dtype != torch.uint8:
+        video_type = video.dtype
+        video = (video * 255).to(torch.uint8)
+        return (torchvision.transforms.functional.equalize(video) / 255).to(video_type)
+    return torchvision.transforms.functional.equalize(video)
+def _invert(video: torch.Tensor, **kwargs) -> torch.Tensor:
+    """
+    Invert the colors of a video.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        **kwargs: Ignored by this transform.
+    Returns:
+        torch.Tensor: Output of ``torchvision.transforms.functional.invert``.
+    """
+    return torchvision.transforms.functional.invert(video)
+def _rotate(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Rotate the image by angle.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): The rotation angle value in degrees, counter-clockwise.
+    """
+    _check_fill_arg(kwargs)
+    return torchvision.transforms.functional.rotate(
+        video, factor, fill=kwargs["fill"], interpolation=InterpolationMode.BILINEAR
+    )
+def _solarize(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Solarize an video by inverting all pixel values above a threshold.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+    """
+    if video.dtype == torch.uint8:
+        return torchvision.transforms.functional.solarize(video, int(factor * 255.0))
+    else:
+        return torchvision.transforms.functional.solarize(video, factor)
+def _adjust_contrast(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Adjust contrast of a video.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to adjust the contrast. Can be any non-negative
+            number. 0 gives a solid gray video, 1 gives the original video while 2
+            increases the contrast by a factor of 2.
+        **kwargs: Ignored by this transform.
+    """
+    return torchvision.transforms.functional.adjust_contrast(video, factor)
+def _adjust_saturation(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Adjust the saturation of a video.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to adjust the saturation. 0 will give a black and
+            white video, 1 will give the original video while 2 will enhance the
+            saturation by a factor of 2.
+        **kwargs: Ignored by this transform.
+    """
+    return torchvision.transforms.functional.adjust_saturation(video, factor)
+def _adjust_brightness(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Adjust brightness of a video.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to adjust the brightness. Can be any
+            non-negative number. 0 gives a black video, 1 gives the original video
+            while 2 increases the brightness by a factor of 2.
+        **kwargs: Ignored by this transform.
+    """
+    return torchvision.transforms.functional.adjust_brightness(video, factor)
+def _adjust_sharpness(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
+    """
+    Adjust the sharpness of a video.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to adjust the sharpness. Can be any non-negative
+            number. 0 gives a blurred video, 1 gives the original video while 2
+            increases the sharpness by a factor of 2.
+        **kwargs: Ignored by this transform.
+    """
+    return torchvision.transforms.functional.adjust_sharpness(video, factor)
+def _posterize(video: torch.Tensor, factor: float, **kwargs):
+    """
+    Posterize an image by reducing the number of bits for each color channel.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): The number of bits to keep for each channel (0-8).
+    """
+    if factor >= 8:
+        return video
+    if video.dtype != torch.uint8:
+        video_type = video.dtype
+        video = (video * 255).to(torch.uint8)
+        return (torchvision.transforms.functional.posterize(video, factor) / 255).to(
+            video_type
+        )
+    return torchvision.transforms.functional.posterize(video, factor)
+def _shear_x(video: torch.Tensor, factor: float, **kwargs):
+    """
+    Shear the video along the horizontal axis.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to shear along the horizontal axis using the affine
+            matrix.
+    """
+    _check_fill_arg(kwargs)
+    translation_offset = video.size(-2) * factor / 2
+    return F_t.affine(
+        video,
+        [1, factor, translation_offset, 0, 1, 0],
+        fill=kwargs["fill"],
+        interpolation="bilinear",
+    )
+def _shear_y(video: torch.Tensor, factor: float, **kwargs):
+    """
+    Shear the video along the vertical axis.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much to shear along the vertical axis using the affine
+            matrix.
+    """
+    _check_fill_arg(kwargs)
+    translation_offset = video.size(-1) * factor / 2
+    return F_t.affine(
+        video,
+        [1, 0, 0, factor, 1, translation_offset],
+        fill=kwargs["fill"],
+        interpolation="bilinear",
+    )
+def _translate_x(video: torch.Tensor, factor: float, **kwargs):
+    """
+    Translate the video along the vertical axis.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much (relative to the image size) to translate along the
+            vertical axis.
+    """
+    _check_fill_arg(kwargs)
+    translation_offset = factor * video.size(-1)
+    return F_t.affine(
+        video,
+        [1, 0, translation_offset, 0, 1, 0],
+        fill=kwargs["fill"],
+        interpolation="bilinear",
+    )
+def _translate_y(video: torch.Tensor, factor: float, **kwargs):
+    """
+    Translate the video along the vertical axis.
+    Args:
+        video (torch.Tensor): Video tensor with shape (T, C, H, W).
+        factor (float): How much (relative to the image size) to translate along the
+            horizontal axis.
+    """
+    _check_fill_arg(kwargs)
+    translation_offset = factor * video.size(-2)
+    return F_t.affine(
+        video,
+        [1, 0, 0, 0, 1, translation_offset],
+        fill=kwargs["fill"],
+        interpolation="bilinear",
+    )
+def _randomly_negate(magnitude: float) -> float:
+    """
+    Negate input value with 50% chance.
+    Args:
+        magnitude (float): Input value.
+    """
+    return magnitude if torch.rand(1).item() > 0.5 else -magnitude
+def _increasing_magnitude_to_arg(level: int, params: Tuple[float, float]) -> Tuple[float]:
+    """
+    Convert level to transform magnitude. This assumes transform magnitude increases
+    linearly with level.
+    Args:
+        level (int): Level value.
+        params (Tuple[float, float]): Params contains two values: 1) Base transform
+            magnitude when level is 0; 2) Maximum increase in transform magnitude
+            when level is at maximum.
+    Returns:
+        Tuple[float]: One-element tuple holding the transform argument.
+    """
+    magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
+    return (params[0] + magnitude,)
+def _increasing_randomly_negate_to_arg(
+    level: int, params: Tuple[float, float]
+) -> Tuple[float]:
+    """
+    Convert level to transform magnitude. This assumes transform magnitude increases
+    (or decreases with 50% chance) linearly with level.
+    Args:
+        level (int): Level value.
+        params (Tuple[float, float]): Params contains two values: 1) Base transform
+            magnitude when level is 0; 2) Maxmimum increasing in transform magnitude
+            when level is at maxmimum.
+    """
+    magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
+    return (params[0] + _randomly_negate(magnitude),)
+def _decreasing_int_to_arg(level: int, params: Tuple[int, int]) -> Tuple[int]:
+    """
+    Convert level to transform magnitude. This assumes transform magnitude decreases
+    linearly with level. The return value is converted to int.
+    Args:
+        level (int): Level value.
+        params (Tuple[float, float]): Params contains two values: 1) Base transform
+            magnitude when level is 0; 2) Maxmimum decreasing in transform magnitude
+            when level is at maxmimum.
+    """
+    magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
+    return (params[0] - int(magnitude),)
+def _decreasing_to_arg(level: int, params: Tuple[float, float]) -> Tuple[float]:
+    """
+    Convert level to transform magnitude. This assumes transform magnitude decreases
+    linearly with level.
+    Args:
+        level (int): Level value.
+        params (Tuple[float, float]): Params contains two values: 1) Base transform
+            magnitude when level is 0; 2) Maxmimum decreasing in transform magnitude
+            when level is at maxmimum.
+    """
+    magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
+    return (params[0] - magnitude,)
+# A dictionary that contains transform names (key) and their corresponding transform
+# functions (value). The three tables below share the same set of keys.
+_NAME_TO_TRANSFORM_FUNC = {
+    "AdjustBrightness": _adjust_brightness,
+    "AdjustContrast": _adjust_contrast,
+    "AdjustSaturation": _adjust_saturation,
+    "AdjustSharpness": _adjust_sharpness,
+    "AutoContrast": _autocontrast,
+    "Equalize": _equalize,
+    "Invert": _invert,
+    "Rotate": _rotate,
+    "Posterize": _posterize,
+    "Solarize": _solarize,
+    "ShearX": _shear_x,
+    "ShearY": _shear_y,
+    "TranslateX": _translate_x,
+    "TranslateY": _translate_y,
+}
+# A dictionary that contains transform names (key) and their corresponding level
+# functions (value), which converts the magnitude to the transform function arguments.
+# None marks transforms whose function takes no ``factor`` argument.
+_LEVEL_TO_ARG = {
+    "AdjustBrightness": _increasing_randomly_negate_to_arg,
+    "AdjustContrast": _increasing_randomly_negate_to_arg,
+    "AdjustSaturation": _increasing_randomly_negate_to_arg,
+    "AdjustSharpness": _increasing_randomly_negate_to_arg,
+    "AutoContrast": None,
+    "Equalize": None,
+    "Invert": None,
+    "Rotate": _increasing_randomly_negate_to_arg,
+    "Posterize": _decreasing_int_to_arg,
+    "Solarize": _decreasing_to_arg,
+    "ShearX": _increasing_randomly_negate_to_arg,
+    "ShearY": _increasing_randomly_negate_to_arg,
+    "TranslateX": _increasing_randomly_negate_to_arg,
+    "TranslateY": _increasing_randomly_negate_to_arg,
+}
+# A dictionary that contains transform names (key) and their corresponding maximum
+# transform (value). Each tuple is (base value at level 0, maximum change at the
+# maximum level), the ``params`` format read by the level functions above.
+_TRANSFORM_MAX_PARAMS = {
+    "AdjustBrightness": (1, 0.9),
+    "AdjustContrast": (1, 0.9),
+    "AdjustSaturation": (1, 0.9),
+    "AdjustSharpness": (1, 0.9),
+    "AutoContrast": None,
+    "Equalize": None,
+    "Invert": None,
+    "Rotate": (0, 30),
+    "Posterize": (4, 4),
+    "Solarize": (1, 1),
+    "ShearX": (0, 0.3),
+    "ShearY": (0, 0.3),
+    "TranslateX": (0, 0.45),
+    "TranslateY": (0, 0.45),
+}
+# Default hyperparameters for sampling magnitude.
+SAMPLING_DEFAULT_HPARAS = {"sampling_std": 0.5}
+# Default hyperparameters for transform functions; ``fill`` is required by the
+# rotate, shear and translate transforms.
+TRANSFORM_DEFAULT_HPARAS = {"fill": (0.5, 0.5, 0.5)}
+class AugmentTransform:
+    """
+    Callable that applies one named augmentation to a video with probability
+    ``prob``, using a magnitude that is re-sampled on every call.
+    """
+    def __init__(
+        self,
+        transform_name: str,
+        magnitude: int = 10,
+        prob: float = 0.5,
+        name_to_transform_func: Optional[Dict[str, Callable]] = None,
+        level_to_arg: Optional[Dict[str, Callable]] = None,
+        transform_max_paras: Optional[Dict[str, Tuple]] = None,
+        transform_hparas: Optional[Dict[str, Any]] = None,
+        sampling_type: str = "gaussian",
+        sampling_hparas: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        The AugmentTransform composes a video transform that performs augmentation
+        based on a maximum magnitude. AugmentTransform also offers flexible ways to
+        generate augmentation magnitude based on different sampling strategies.
+        Args:
+            transform_name (str): The name of the video transform function.
+            magnitude (int): Magnitude used for transform function. It is the mean
+                for gaussian sampling and the upper bound for uniform sampling.
+            prob (float): The probability of applying the transform function.
+            name_to_transform_func (Optional[Dict[str, Callable]]): A Dictionary that
+                contains mapping of the transform name to the transform function.
+                By default, it uses _NAME_TO_TRANSFORM_FUNC.
+            level_to_arg (Optional[Dict[str, Callable]]): A Dictionary that contains
+                mapping of the transform name to its level function, which converts
+                the magnitude to the transform function arguments. A value of None
+                means the transform takes no magnitude-dependent arguments. By
+                default, it uses _LEVEL_TO_ARG.
+            transform_max_paras (Optional[Dict[str, Tuple]]): A Dictionary that
+                contains mapping of the transform name to its maximum transform
+                magnitude. By default, it uses _TRANSFORM_MAX_PARAMS.
+            transform_hparas (Optional[Dict[Any]]): Transform hyper parameters.
+                Needs to have key fill. By default, it uses TRANSFORM_DEFAULT_HPARAS.
+            sampling_type (str): Sampling method for magnitude of transform. It should
+                be either gaussian or uniform.
+            sampling_hparas (Optional[Dict[Any]]): Hyper parameters for sampling. If
+                gaussian sampling is used, it needs to have key sampling_std. If
+                uniform sampling is used, it needs to have keys sampling_data_type
+                ("int" or "float") and sampling_min. By default, it uses
+                SAMPLING_DEFAULT_HPARAS.
+        """
+        assert sampling_type in ["gaussian", "uniform"], (
+            "sampling_type must be either 'gaussian' or 'uniform'"
+        )
+        name_to_transform_func = name_to_transform_func or _NAME_TO_TRANSFORM_FUNC
+        level_to_arg = level_to_arg or _LEVEL_TO_ARG
+        transform_max_paras = transform_max_paras or _TRANSFORM_MAX_PARAMS
+        self.transform_hparas = transform_hparas or TRANSFORM_DEFAULT_HPARAS
+        self.sampling_type = sampling_type
+        self.sampling_hparas = sampling_hparas or SAMPLING_DEFAULT_HPARAS
+        assert "fill" in self.transform_hparas
+        if self.sampling_type == "gaussian":
+            assert "sampling_std" in self.sampling_hparas
+        if self.sampling_type == "uniform":
+            assert "sampling_data_type" in self.sampling_hparas
+            assert "sampling_min" in self.sampling_hparas
+            if self.sampling_hparas["sampling_data_type"] == "int":
+                assert isinstance(self.sampling_hparas["sampling_min"], int)
+            elif self.sampling_hparas["sampling_data_type"] == "float":
+                assert isinstance(self.sampling_hparas["sampling_min"], (int, float))
+        assert transform_name in name_to_transform_func, (
+            f"unknown transform_name: {transform_name}"
+        )
+        self.max_level = _AUGMENTATION_MAX_LEVEL
+        self.transform_name = transform_name
+        self.magnitude = magnitude
+        self.transform_fn = name_to_transform_func[transform_name]
+        self.level_fn = level_to_arg[transform_name]
+        self.level_paras = transform_max_paras[transform_name]
+        self.prob = prob
+    def _get_magnitude(self) -> float:
+        """
+        Get magnitude based on sampling type.
+        Returns:
+            The sampled magnitude. Gaussian samples are clamped to
+            [0, self.max_level]; uniform samples lie between sampling_min and
+            self.magnitude.
+        Raises:
+            ValueError: If uniform sampling is used with a sampling_data_type other
+                than 'int' or 'float'.
+            NotImplementedError: If self.sampling_type is not supported.
+        """
+        if self.sampling_type == "gaussian":
+            sampled = torch.normal(
+                self.magnitude, self.sampling_hparas["sampling_std"], size=(1,)
+            ).item()
+            # Clamp the draw to the valid level range.
+            return max(0, min(self.max_level, sampled))
+        elif self.sampling_type == "uniform":
+            if self.sampling_hparas["sampling_data_type"] == "int":
+                # torch.randint excludes the upper bound, hence magnitude + 1.
+                return torch.randint(
+                    self.sampling_hparas["sampling_min"], self.magnitude + 1, size=(1,)
+                ).item()
+            elif self.sampling_hparas["sampling_data_type"] == "float":
+                # Rescale a draw from [0, 1) to [sampling_min, magnitude).
+                return (
+                    torch.rand(size=(1,)).item()
+                    * (self.magnitude - self.sampling_hparas["sampling_min"])
+                    + self.sampling_hparas["sampling_min"]
+                )
+            else:
+                raise ValueError("sampling_data_type must be either 'int' or 'float'")
+        else:
+            raise NotImplementedError
+    def __call__(self, video: torch.Tensor) -> torch.Tensor:
+        """
+        The input is a video tensor.
+        Args:
+            video (torch.Tensor): Input video tensor with shape (T, C, H, W).
+        Returns:
+            The input video unchanged when the transform is skipped, otherwise the
+            output of the transform function.
+        """
+        # torch.rand draws from [0, 1), so comparing with >= applies the transform
+        # with probability exactly self.prob: prob=0 never applies it (a strict >
+        # would still apply it when the draw is exactly 0.0) and prob=1 always does.
+        if torch.rand(1).item() >= self.prob:
+            return video
+        magnitude = self._get_magnitude()
+        # Transforms without a level function take no magnitude-dependent arguments.
+        level_args = (
+            self.level_fn(magnitude, self.level_paras)
+            if self.level_fn is not None
+            else ()
+        )
+        return self.transform_fn(video, *level_args, **self.transform_hparas)

configs/dit/inference/16x256x256.py ADDED Viewed

	@@ -0,0 +1,31 @@

+num_frames = 16
+fps = 8
+image_size = (256, 256)
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "bf16"
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/samples/"

configs/dit/inference/1x256x256-class.py ADDED Viewed

	@@ -0,0 +1,31 @@

+num_frames = 1
+fps = 1
+image_size = (256, 256)
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="label_1000",
+    from_pretrained="DiT-XL-2-256x256.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=1000,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "bf16"
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_id.txt"
+save_dir = "./samples/samples/"

configs/dit/inference/1x256x256.py ADDED Viewed

	@@ -0,0 +1,32 @@

+num_frames = 1
+fps = 1
+image_size = (256, 256)
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "bf16"
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_labels.txt"
+save_dir = "./samples/samples/"

configs/dit/train/16x256x256.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    from_pretrained="DiT-XL-2-256x256.pt",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0

configs/dit/train/1x256x256.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=1,
+    image_size=(256, 256),
+    transform_name="center",
+)
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = 128
+lr = 1e-4  # according to DiT repo
+grad_clip = 1.0

configs/javisdit-v0-1/inference/audio_sample.py ADDED Viewed

	@@ -0,0 +1,58 @@

+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = "4s"
+fps = 24
+audio_fps = 16000
+frame_interval = 1
+save_fps = 24
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "OpenSora"
+dtype = "bf16"
+loop = 1  # loop for video extension
+condition_frame_length = 5  # used for video extension conditioning
+align = 5  # TODO: unknown mechanism, maybe for conditional frame alignment?
+verbose = 2
+audio_only = True
+model = dict(
+    type="VASTDiT3-XL/2",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-audio",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # audio generation only
+    only_infer_audio=True,
+    freeze_video_branch=True,
+    freeze_y_embedder=False,
+    train_st_prior_attn=False,
+    train_va_cross_attn=False,
+    audio_patch_size=(4, 1)
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+aes = 6.5    # aesthetic score
+flow = None  # motion score

configs/javisdit-v0-1/inference/sample.py ADDED Viewed

	@@ -0,0 +1,77 @@

+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = "4s"
+fps = 24
+audio_fps = 16000
+frame_interval = 1
+save_fps = 24
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "OpenSora"
+dtype = "bf16"
+loop = 1  # loop for video extension
+condition_frame_length = 5  # used for video extension conditioning
+align = 5  # TODO: unknown mechanism, maybe for conditional frame alignment?
+verbose = 2
+spatial_token_num = 32
+temporal_token_num = 32
+st_prior_channel = 128
+model = dict(
+    type="VASTDiT3-XL/2",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-jav",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # video-audio joint generation
+    freeze_y_embedder=True,
+    freeze_video_branch=True,
+    freeze_audio_branch=True,
+    train_st_prior_attn=True,
+    train_va_cross_attn=True,
+    spatial_prior_len=spatial_token_num,
+    temporal_prior_len=temporal_token_num,
+    st_prior_channel=st_prior_channel,
+    audio_patch_size=(4, 1)
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+prior_encoder = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
+    spatial_token_num=spatial_token_num,
+    temporal_token_num=temporal_token_num,
+    out_dim=st_prior_channel,
+    hidden_size=512,
+    apply_sampling=True,
+    encode_va=False,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+aes = 6.5    # aesthetic score
+flow = None  # motion score

configs/javisdit-v0-1/inference/sample_240p4s.py ADDED Viewed

	@@ -0,0 +1,77 @@

+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = "4s"
+fps = 24
+audio_fps = 16000
+frame_interval = 1
+save_fps = 24
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "OpenSora"
+dtype = "bf16"
+loop = 1  # loop for video extension
+condition_frame_length = 5  # used for video extension conditioning
+align = 5  # TODO: unknown mechanism, maybe for conditional frame alignment?
+verbose = 2
+spatial_token_num = 32
+temporal_token_num = 32
+st_prior_channel = 128
+model = dict(
+    type="VASTDiT3-XL/2",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-jav-240p4s",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # video-audio joint generation
+    freeze_y_embedder=True,
+    freeze_video_branch=True,
+    freeze_audio_branch=True,
+    train_st_prior_attn=True,
+    train_va_cross_attn=True,
+    spatial_prior_len=spatial_token_num,
+    temporal_prior_len=temporal_token_num,
+    st_prior_channel=st_prior_channel,
+    audio_patch_size=(4, 1)
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+prior_encoder = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
+    spatial_token_num=spatial_token_num,
+    temporal_token_num=temporal_token_num,
+    out_dim=st_prior_channel,
+    hidden_size=512,
+    apply_sampling=True,
+    encode_va=False,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+aes = 6.5    # aesthetic score
+flow = None  # motion score

configs/javisdit-v0-1/misc/extract_st_prior_va.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# Dataset settings
+dataset = dict(
+    type="VariableVideoAudioTextDataset",
+    direct_load_video_clip=True,
+    transform_name="resize_crop",
+    audio_transform_name="mel_spec_audioldm2",
+    neg_aug=1,
+    neg_aug_kwargs=dict(
+        video_augmentation_pool="./data/st_prior/video/SA-V",
+        audio_augmentation_pool="./data/st_prior/audio/TAVGBench",
+    ),
+    require_onset=True
+)
+# webvid
+bucket_config = {  # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
+    # image size: {num frames: (accept prob(s), batch size)}
+    "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
+    # ---
+    "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    # ---
+    "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
+    "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
+    # ---
+    "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
+    # ---
+    "720p": {51: (0.03, 1)},
+    "1024": {51: (0.03, 1)},
+}
+grad_checkpoint = True
+# Acceleration settings
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+# Model settings
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+# text_encoder = dict(
+#     type="t5",
+#     from_pretrained="DeepFloyd/t5-v1_1-xxl",
+#     model_max_length=300,
+# )
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 50
+save_total_limit = 2
+bin_size = 16  # 1GB, 4195 bins
+log_time = False
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/misc/extract_va.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Dataset settings
+dataset = dict(
+    type="VariableVideoAudioTextDataset",
+    direct_load_video_clip=True,
+    transform_name="resize_crop",
+    audio_transform_name="mel_spec_audioldm2",
+)
+# load_text_features = True
+# webvid
+bucket_config = {  # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
+    # image size: {num frames: (accept prob(s), batch size)}
+    "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
+    # ---
+    "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    # ---
+    "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
+    "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
+    # ---
+    "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
+    # ---
+    "720p": {51: (0.03, 1)},
+    "1024": {51: (0.03, 1)},
+}
+grad_checkpoint = True
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+# Model settings
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+# text_encoder = dict(
+#     type="t5",
+#     from_pretrained="DeepFloyd/t5-v1_1-xxl",
+#     model_max_length=300,
+# )
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 50
+save_total_limit = 2
+bin_size = 64  # 1GB, 4195 bins
+log_time = False
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/train/stage1_audio.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# Dataset settings
+audio_only=True
+dataset = dict(
+    type="VariableVideoAudioTextDataset",
+    transform_name="resize_crop",
+    audio_transform_name="mel_spec_audioldm2",
+    audio_only=audio_only
+)
+# webvid
+bucket_config = {  # 5s/it, randomly assigning raw videos to pre-defined and proper buckets
+    # image size: {num frames: (accept prob(s), batch size)}
+    # # 28G?
+    # "144p": {51: (1.0, 96), 102: ((1.0, 0.7), 48), 204: ((1.0, 0.3), 24), 408: ((1.0, 0.5), 12)},
+    # # 32G
+    # "144p": {51: (1.0, 128), 102: ((1.0, 0.7), 64), 204: ((1.0, 0.3), 32), 408: ((1.0, 0.5), 16)},
+    # 45G
+    "144p": {51: (1.0, 256), 102: ((1.0, 0.7), 128), 204: ((1.0, 0.3), 64), 408: ((1.0, 0.5), 32)},
+    # 60-70G
+    # "144p": {51: (1.0, 384), 102: ((1.0, 0.7), 192), 204: ((1.0, 0.3), 128), 96: ((1.0, 0.5), 48)},
+    # 80G+
+    # "144p": {51: (1.0, 512), 102: ((1.0, 0.7), 256), 204: ((1.0, 0.3), 128), 408: ((1.0, 0.5), 64)},
+}
+grad_checkpoint = True
+# Acceleration settings
+num_workers = 16
+num_bucket_build_workers = 8
+dtype = "bf16"
+plugin = "zero2"
+# Model settings
+model = dict(
+    type="VASTDiT3-XL/2",
+    weight_init_from=[
+        "./checkpoints/OpenSora-STDiT-v3/model.safetensors"
+    ],
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # audio generation only
+    only_train_audio=True,
+    freeze_video_branch=True,
+    freeze_y_embedder=False,
+    train_st_prior_attn=False,
+    train_va_cross_attn=False,
+    audio_patch_size=(4, 1)
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 50
+log_every = 10
+ckpt_every = 250
+save_total_limit = 2
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/train/stage2_prior.py ADDED Viewed

	@@ -0,0 +1,107 @@

+spatial_token_num = 32
+temporal_token_num = 32
+st_prior_channel = 128
+# Dataset settings
+dataset = dict(
+    type="VariableVideoAudioTextDataset",
+    direct_load_video_clip=True,
+    transform_name="resize_crop",
+    audio_transform_name="mel_spec_audioldm2",
+    neg_aug=1,
+    neg_aug_kwargs=dict(
+        video_augmentation_pool="./data/st_prior/video/SA-V",
+        audio_augmentation_pool="./data/st_prior/audio/TAVGBench",
+    ),
+)
+load_text_features = False # TODO: text encoder does not take too much time
+# webvid
+bucket_config = {  # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
+    # image size: {num frames: (accept prob(s), batch size)}
+    "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
+    # ---
+    "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    # ---
+    "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
+    "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
+    # ---
+    "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
+    # ---
+    "720p": {51: (0.03, 1)},
+    "1024": {51: (0.03, 1)},
+}
+# Acceleration settings
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+# Model settings
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+model = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    spatial_token_num=spatial_token_num,
+    temporal_token_num=temporal_token_num,
+    out_dim=st_prior_channel,
+    hidden_size=512,
+    apply_sampling=True,
+    encode_va=True,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True
+)
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 2
+log_every = 10
+ckpt_every = 200
+save_total_limit = 2
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-5
+warmup_steps = 100
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/train/stage2_prior_feat.py ADDED Viewed

	@@ -0,0 +1,81 @@

+spatial_token_num = 32
+temporal_token_num = 32
+st_prior_channel = 128
+# Dataset settings
+dataset = dict(type="BatchFeatureDataset")
+load_va_features = True
+load_text_features = False # TODO: text encoder does not take too much time
+# Acceleration settings
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+# Model settings
+# vae = dict(
+#     type="OpenSoraVAE_V1_2",
+#     from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+#     micro_frame_size=17,
+#     micro_batch_size=4,
+# )
+# audio_vae = dict(
+#     type="AudioLDM2",
+#     from_pretrained="cvssp/audioldm2",
+# )
+model = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    spatial_token_num=spatial_token_num,
+    temporal_token_num=temporal_token_num,
+    out_dim=st_prior_channel,
+    hidden_size=512,
+    apply_sampling=True,
+    encode_va=True,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True
+)
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 2
+log_every = 10
+ckpt_every = 200
+save_total_limit = 2
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-5
+warmup_steps = 100
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/train/stage3_jav.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# Dataset settings
+dataset = dict(
+    type="VariableVideoAudioTextDataset",
+    direct_load_video_clip=True,
+    transform_name="resize_crop",
+    audio_transform_name="mel_spec_audioldm2",
+)
+load_text_features = False
+# webvid
+bucket_config = {  # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
+    # image size: {num frames: (accept prob(s), batch size)}
+    "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
+    # ---
+    "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
+    # ---
+    "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
+    "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
+    # ---
+    "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
+    # ---
+    "720p": {51: (0.03, 1)},
+    "1024": {51: (0.03, 1)},
+}
+grad_checkpoint = True
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+# Model settings
+spatial_prior_len = 32
+temporal_prior_len = 32
+st_prior_channel = 128
+model = dict(
+    type="VASTDiT3-XL/2",
+    weight_init_from=[
+        "./checkpoints/JavisDiT-v0.1-audio",
+        "./checkpoints/OpenSora-STDiT-v3/model.safetensors",
+    ],
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # video-audio joint generation
+    only_train_audio=False,
+    freeze_y_embedder=True,
+    freeze_video_branch=True,
+    freeze_audio_branch=True,
+    train_st_prior_attn=True,
+    train_va_cross_attn=True,
+    spatial_prior_len=spatial_prior_len,
+    temporal_prior_len=temporal_prior_len,
+    st_prior_channel=st_prior_channel,
+    audio_patch_size=(4, 1)
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+audio_vae = dict(
+    type="AudioLDM2",
+    from_pretrained="cvssp/audioldm2",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    # shardformer=True,
+)
+prior_encoder = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
+    spatial_token_num=spatial_prior_len,
+    temporal_token_num=temporal_prior_len,
+    out_dim=st_prior_channel,
+    apply_sampling=True,
+    encode_va=False,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+# Mask settings
+# The ratios below sum to 0.30 (30%).
+mask_ratios = {
+    "random":              0.01,
+    "video_to_audio":      0.05,   # func1
+    "audio_to_video":      0.05,   # func2
+    "sound_image_animate": 0.03,
+    "intepolate":          0.03,
+    "quarter_random":      0.005,
+    "quarter_head":        0.05,   # func3
+    "quarter_tail":        0.005,
+    "quarter_head_tail":   0.005,
+    "image_random":        0.005,
+    "image_head":          0.05,   # func4
+    "image_tail":          0.005,
+    "image_head_tail":     0.005,
+}
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 2
+log_every = 10
+ckpt_every = 50
+save_total_limit = 2
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
+# audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/javisdit-v0-1/train/stage3_jav_feat.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# Dataset settings
+dataset = dict(type="BatchFeatureDataset")
+load_va_features = True  # NOTE(review): presumably reads pre-extracted video/audio features, which would explain the commented-out vae/audio_vae below - confirm
+load_text_features = False  # text_encoder is still configured below
+# Acceleration settings
+num_workers = 8
+grad_checkpoint = True
+dtype = "bf16"
+plugin = "zero2"
+# Model settings
+spatial_prior_len = 32  # reused by both model and prior_encoder below
+temporal_prior_len = 32  # reused by both model and prior_encoder below
+st_prior_channel = 128  # reused by model (st_prior_channel) and prior_encoder (out_dim)
+model = dict(
+    type="VASTDiT3-XL/2",
+    weight_init_from=[
+        "./checkpoints/JavisDiT-v0.1-audio",
+        "./checkpoints/OpenSora-STDiT-v3/model.safetensors",
+    ],
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    # video-audio joint generation
+    only_train_audio=False,
+    freeze_y_embedder=True,
+    freeze_video_branch=True,
+    freeze_audio_branch=True,
+    train_st_prior_attn=True,
+    train_va_cross_attn=True,
+    spatial_prior_len=spatial_prior_len,
+    temporal_prior_len=temporal_prior_len,
+    st_prior_channel=st_prior_channel,
+    audio_patch_size=(4, 1)
+)
+# vae = dict(
+#     type="OpenSoraVAE_V1_2",
+#     from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+#     micro_frame_size=17,
+#     micro_batch_size=4,
+# )
+# audio_vae = dict(
+#     type="AudioLDM2",
+#     from_pretrained="cvssp/audioldm2",
+# )
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    # shardformer=True,
+)
+prior_encoder = dict(
+    type="STIBPrior",
+    imagebind_ckpt_path="./checkpoints",
+    from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
+    spatial_token_num=spatial_prior_len,
+    temporal_token_num=temporal_prior_len,
+    out_dim=st_prior_channel,
+    apply_sampling=True,
+    encode_va=False,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+# Mask settings
+# The ratios below sum to 0.30 (30%).
+mask_ratios = {
+    "random":              0.01,
+    "video_to_audio":      0.05,   # func1
+    "audio_to_video":      0.05,   # func2
+    "sound_image_animate": 0.03,
+    "intepolate":          0.03,  # NOTE(review): spelled "intepolate" (sic); the key must match whatever consumes mask_ratios, so do not rename without checking
+    "quarter_random":      0.005,
+    "quarter_head":        0.05,   # func3
+    "quarter_tail":        0.005,
+    "quarter_head_tail":   0.005,
+    "image_random":        0.005,
+    "image_head":          0.05,   # func4
+    "image_tail":          0.005,
+    "image_head_tail":     0.005,
+}
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 2
+log_every = 10
+ckpt_every = 50
+save_total_limit = 2
+# Optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
+# Audio settings
+sampling_rate = 16000
+mel_bins = 64
+audio_cfg = {
+    "preprocessing": {
+        "audio": {
+            "sampling_rate": sampling_rate,
+            "max_wav_value": 32768.0,
+            "duration": 10.24,  # presumably seconds - confirm
+        },
+        "stft": {
+            "filter_length": 1024,
+            "hop_length": 160,  # 160 / 16000 = 10 ms per hop if hop_length is in samples (assumed)
+            "win_length": 1024,
+        },
+        "mel": {
+            "n_mel_channels": mel_bins,
+            "mel_fmin": 0,
+            "mel_fmax": 8000,  # equals sampling_rate / 2 if expressed in Hz (assumed)
+        }
+    },
+    "augmentation": {
+        "mixup": 0.0,
+    }
+}

configs/latte/inference/16x256x256-class.py ADDED Viewed

	@@ -0,0 +1,30 @@

+num_frames = 16
+fps = 8
+image_size = (256, 256)
+# Define model (class-conditioned: condition="label_101", text_encoder type "classes")
+model = dict(
+    type="Latte-XL/2",
+    condition="label_101",
+    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=101,  # same 101 as in condition="label_101" above
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "bf16"
+# Others (batching, seed, prompt source, output directory)
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_id.txt"
+save_dir = "./samples/samples/"

configs/latte/inference/16x256x256.py ADDED Viewed

	@@ -0,0 +1,31 @@

+num_frames = 16
+fps = 8
+image_size = (256, 256)
+# Define model (text-conditioned: condition="text", CLIP text encoder below)
+model = dict(
+    type="Latte-XL/2",
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",  # NOTE(review): looks like a placeholder, not a real checkpoint; presumably replaced before running - confirm
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "bf16"
+# Others (batching, seed, prompt source, output directory)
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/samples/"

configs/latte/train/16x256x256.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,  # not set in this file; presumably supplied at launch time - confirm
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="Latte-XL/2",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others (seed, output/logging, checkpointing, optimization)
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/inference/sample-ref.py ADDED Viewed

	@@ -0,0 +1,64 @@

+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+# Condition: each prompt is free text followed by a JSON object with "reference_path" and "mask_strategy"
+prompt_path = None
+prompt = [
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
+    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
+    'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
+    'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
+    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
+    '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
+]
+loop = 2
+condition_frame_length = 4
+# Presumably the comma-separated fields of each "mask_strategy" entry above, in order (confirm against the docs linked below): (
+#   loop id, [the loop index of the condition image or video]
+#   reference id, [the index of the condition image or video in the reference_path]
+#   reference start, [the start frame of the condition image or video]
+#   target start, [the location to insert]
+#   length, [the number of frames to insert]
+#   edit_ratio [the edit rate of the condition image or video]
+# )
+# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
+# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
+    input_sq_size=512,
+    qk_norm=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3,  # or None
+)
+dtype = "bf16"
+# Others (batching, seed, output directory)
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"

configs/opensora-v1-1/inference/sample.py ADDED Viewed

	@@ -0,0 +1,44 @@

+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
+    input_sq_size=512,
+    qk_norm=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3,  # or None
+)
+dtype = "bf16"
+# Condition (prompt source: a prompt file, or an inline prompt)
+prompt_path = "./assets/texts/t2v_samples.txt"
+prompt = None  # prompt has higher priority than prompt_path
+# Others (batching, seed, output directory)
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"

configs/opensora-v1-1/train/benchmark.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# This file is only for batch size search and is not used for training.
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+# bucket config format:
+# 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
+# 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
+# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
+# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
+# 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
+bucket_config = {
+    # == manual search ==
+    # "240p": {128: (1.0, 2)}, # 4.28s/it
+    # "240p": {64: (1.0, 4)},
+    # "240p": {32: (1.0, 8)},  # 4.6s/it
+    # "240p": {16: (1.0, 16)},  # 4.6s/it
+    # "480p": {16: (1.0, 4)},  # 4.6s/it
+    # "720p": {16: (1.0, 2)},  # 5.89s/it
+    # "256": {1: (1.0, 256)},  # 4.5s/it
+    # "512": {1: (1.0, 96)}, # 4.7s/it
+    # "512": {1: (1.0, 128)}, # 6.3s/it
+    # "480p": {1: (1.0, 50)},  # 4.0s/it
+    # "1024": {1: (1.0, 32)},  # 6.8s/it
+    # "1024": {1: (1.0, 20)}, # 4.3s/it
+    # "1080p": {1: (1.0, 16)}, # 8.6s/it
+    # "1080p": {1: (1.0, 8)},  # 4.4s/it
+    # == stage 2 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 3 ==
+    "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},  # NOTE(review): (20, 40) puts 20 in the slot the formats above call "prob" - confirm this is intended
+}
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others (seed, output/logging, checkpointing, optimization)
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = None  # presumably unused because batch sizes come from bucket_config - confirm
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/image.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+# Every bucket below uses num_frames == 1; values are 2-tuples (presumably (prob, batch_size) - confirm).
+bucket_config = {  # 6s/it
+    "256": {1: (1.0, 256)},
+    "512": {1: (1.0, 80)},
+    "480p": {1: (1.0, 52)},
+    "1024": {1: (1.0, 20)},
+    "1080p": {1: (1.0, 8)},
+}
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others (seed, output/logging, checkpointing, optimization)
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = 10  # only for logging
+lr = 2e-5
+grad_clip = 1.0