kaiw7 committed on
Commit
e490e7e
·
verified ·
1 Parent(s): 2c7d185

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +24 -0
  2. .gitignore +218 -0
  3. .pre-commit-config.yaml +31 -0
  4. LICENSE +696 -0
  5. README.md +184 -0
  6. assets/demo/Fig7-JAVG/case1.mp4 +3 -0
  7. assets/demo/Fig7-JAVG/case2.mp4 +3 -0
  8. assets/demo/FigA11-X-Cond/A2V.mp4 +3 -0
  9. assets/demo/FigA11-X-Cond/AI2V.mp4 +3 -0
  10. assets/demo/FigA11-X-Cond/AV-Ext.mp4 +3 -0
  11. assets/demo/FigA11-X-Cond/GT.mp4 +3 -0
  12. assets/demo/FigA11-X-Cond/I2AV.mp4 +3 -0
  13. assets/demo/FigA11-X-Cond/V2A.mp4 +3 -0
  14. assets/demo/FigA9-JAVG/case1.mp4 +3 -0
  15. assets/demo/FigA9-JAVG/case2.mp4 +3 -0
  16. assets/demo/FigA9-JAVG/case3.mp4 +3 -0
  17. assets/demo/FigA9-JAVG/case4.mp4 +3 -0
  18. assets/demo/FigA9-JAVG/case5.mp4 +3 -0
  19. assets/demo/FigA9-JAVG/case6.mp4 +3 -0
  20. assets/demo/FigA9-JAVG/case7.mp4 +3 -0
  21. assets/demo/audio_prompts.txt +1222 -0
  22. assets/demo/prompts.txt +16 -0
  23. assets/docs/data.md +193 -0
  24. assets/image/JavisDiT-framework-resized.png +3 -0
  25. assets/image/JavisDiT-intro-resized.png +3 -0
  26. assets/image/logo.png +3 -0
  27. assets/src/funasr_utils_load_utils.py +262 -0
  28. assets/src/pytorchvideo_augmentations.py +481 -0
  29. configs/dit/inference/16x256x256.py +31 -0
  30. configs/dit/inference/1x256x256-class.py +31 -0
  31. configs/dit/inference/1x256x256.py +32 -0
  32. configs/dit/train/16x256x256.py +50 -0
  33. configs/dit/train/1x256x256.py +51 -0
  34. configs/javisdit-v0-1/inference/audio_sample.py +58 -0
  35. configs/javisdit-v0-1/inference/sample.py +77 -0
  36. configs/javisdit-v0-1/inference/sample_240p4s.py +77 -0
  37. configs/javisdit-v0-1/misc/extract_st_prior_va.py +92 -0
  38. configs/javisdit-v0-1/misc/extract_va.py +88 -0
  39. configs/javisdit-v0-1/train/stage1_audio.py +113 -0
  40. configs/javisdit-v0-1/train/stage2_prior.py +107 -0
  41. configs/javisdit-v0-1/train/stage2_prior_feat.py +81 -0
  42. configs/javisdit-v0-1/train/stage3_jav.py +152 -0
  43. configs/javisdit-v0-1/train/stage3_jav_feat.py +130 -0
  44. configs/latte/inference/16x256x256-class.py +30 -0
  45. configs/latte/inference/16x256x256.py +31 -0
  46. configs/latte/train/16x256x256.py +49 -0
  47. configs/opensora-v1-1/inference/sample-ref.py +64 -0
  48. configs/opensora-v1-1/inference/sample.py +44 -0
  49. configs/opensora-v1-1/train/benchmark.py +102 -0
  50. configs/opensora-v1-1/train/image.py +66 -0
.gitattributes CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/demo/Fig7-JAVG/case1.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ assets/demo/Fig7-JAVG/case2.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/demo/FigA11-X-Cond/A2V.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ assets/demo/FigA11-X-Cond/AI2V.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ assets/demo/FigA11-X-Cond/AV-Ext.mp4 filter=lfs diff=lfs merge=lfs -text
41
+ assets/demo/FigA11-X-Cond/GT.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ assets/demo/FigA11-X-Cond/I2AV.mp4 filter=lfs diff=lfs merge=lfs -text
43
+ assets/demo/FigA11-X-Cond/V2A.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ assets/demo/FigA9-JAVG/case1.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ assets/demo/FigA9-JAVG/case2.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ assets/demo/FigA9-JAVG/case3.mp4 filter=lfs diff=lfs merge=lfs -text
47
+ assets/demo/FigA9-JAVG/case4.mp4 filter=lfs diff=lfs merge=lfs -text
48
+ assets/demo/FigA9-JAVG/case5.mp4 filter=lfs diff=lfs merge=lfs -text
49
+ assets/demo/FigA9-JAVG/case6.mp4 filter=lfs diff=lfs merge=lfs -text
50
+ assets/demo/FigA9-JAVG/case7.mp4 filter=lfs diff=lfs merge=lfs -text
51
+ assets/image/JavisDiT-framework-resized.png filter=lfs diff=lfs merge=lfs -text
52
+ assets/image/JavisDiT-intro-resized.png filter=lfs diff=lfs merge=lfs -text
53
+ assets/image/logo.png filter=lfs diff=lfs merge=lfs -text
54
+ eval/javisbench/src/ImageBind/.assets/bird_audio.wav filter=lfs diff=lfs merge=lfs -text
55
+ eval/javisbench/src/ImageBind/.assets/bird_image.jpg filter=lfs diff=lfs merge=lfs -text
56
+ eval/javisbench/src/ImageBind/.assets/car_audio.wav filter=lfs diff=lfs merge=lfs -text
57
+ eval/javisbench/src/ImageBind/.assets/dog_audio.wav filter=lfs diff=lfs merge=lfs -text
58
+ javisdit/models/Y-fUsuo90K0g.wav filter=lfs diff=lfs merge=lfs -text
59
+ javisdit/models/out.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ .idea/
162
+ .vscode/
163
+
164
+ # macos
165
+ *.DS_Store
166
+
167
+ # misc files
168
+ data
169
+ dataset/
170
+ datasets
171
+ !javisdit/datasets
172
+ !tools/datasets
173
+ runs
174
+ checkpoints
175
+ weights
176
+ outputs
177
+ ablation
178
+ !configs/**/ablation/
179
+ !scripts/**/ablation/
180
+ exps
181
+ samples
182
+ logs
183
+ pretrained_models
184
+ evaluation_results/
185
+ cache/
186
+ *.swp
187
+ debug/
188
+ */debug.py
189
+ third_party/
190
+ deprecated
191
+ nohup.*
192
+ tmp
193
+ *.zip
194
+ *.tar
195
+ *.tar.gz
196
+ run.sh
197
+ interface.py
198
+
199
+ # Secret files
200
+ hostfile
201
+ gradio_cached_examples/
202
+ wandb/
203
+
204
+ # vae weights
205
+ eval/vae/flolpips/weights/
206
+
207
+ # npm
208
+ node_modules/
209
+ package-lock.json
210
+ package.json
211
+
212
+ # PLLaVA
213
+ tools/caption/pllava_dir/PLLaVA/
214
+
215
+ # vbench
216
+ vbench
217
+ !eval/vbench
218
+ vbench2_beta_i2v
.pre-commit-config.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+
3
+ - repo: https://github.com/PyCQA/autoflake
4
+ rev: v2.2.1
5
+ hooks:
6
+ - id: autoflake
7
+ name: autoflake (python)
8
+ args: ['--in-place']
9
+
10
+ - repo: https://github.com/pycqa/isort
11
+ rev: 5.12.0
12
+ hooks:
13
+ - id: isort
14
+ name: sort all imports (python)
15
+
16
+ - repo: https://github.com/psf/black-pre-commit-mirror
17
+ rev: 23.9.1
18
+ hooks:
19
+ - id: black
20
+ name: black formatter
21
+
22
+ - repo: https://github.com/pre-commit/pre-commit-hooks
23
+ rev: v4.3.0
24
+ hooks:
25
+ - id: check-yaml
26
+ - id: check-merge-conflict
27
+ - id: check-case-conflict
28
+ - id: trailing-whitespace
29
+ - id: end-of-file-fixer
30
+ - id: mixed-line-ending
31
+ args: ['--fix=lf']
LICENSE ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2025. All rights reserved.
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2024 HPC-AI Technology Inc.
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
203
+
204
+ =========================================================================
205
+ This project is inspired by the listed projects and is subject to the following licenses:
206
+
207
+ 1. Latte (https://github.com/Vchitect/Latte/blob/main/LICENSE)
208
+
209
+ Copyright 2024 Latte
210
+
211
+ Licensed under the Apache License, Version 2.0 (the "License");
212
+ you may not use this file except in compliance with the License.
213
+ You may obtain a copy of the License at
214
+
215
+ http://www.apache.org/licenses/LICENSE-2.0
216
+
217
+ Unless required by applicable law or agreed to in writing, software
218
+ distributed under the License is distributed on an "AS IS" BASIS,
219
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
220
+ See the License for the specific language governing permissions and
221
+ limitations under the License.
222
+
223
+ 2. PixArt-alpha (https://github.com/PixArt-alpha/PixArt-alpha/blob/master/LICENSE)
224
+
225
+ Copyright (C) 2024 PixArt-alpha/PixArt-alpha
226
+
227
+ This program is free software: you can redistribute it and/or modify
228
+ it under the terms of the GNU Affero General Public License as published
229
+ by the Free Software Foundation, either version 3 of the License, or
230
+ (at your option) any later version.
231
+
232
+ This program is distributed in the hope that it will be useful,
233
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
234
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
235
+ GNU Affero General Public License for more details.
236
+
237
+ You should have received a copy of the GNU Affero General Public License
238
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
239
+
240
+ 3. dpm-solver (https://github.com/LuChengTHU/dpm-solver/blob/main/LICENSE)
241
+
242
+ MIT License
243
+
244
+ Copyright (c) 2022 Cheng Lu
245
+
246
+ Permission is hereby granted, free of charge, to any person obtaining a copy
247
+ of this software and associated documentation files (the "Software"), to deal
248
+ in the Software without restriction, including without limitation the rights
249
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
250
+ copies of the Software, and to permit persons to whom the Software is
251
+ furnished to do so, subject to the following conditions:
252
+
253
+ The above copyright notice and this permission notice shall be included in all
254
+ copies or substantial portions of the Software.
255
+
256
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
257
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
258
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
259
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
260
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
261
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
262
+ SOFTWARE.
263
+
264
+ 4. DiT (https://github.com/facebookresearch/DiT/blob/main/LICENSE.txt)
265
+
266
+ Attribution-NonCommercial 4.0 International
267
+
268
+ =======================================================================
269
+
270
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
271
+ does not provide legal services or legal advice. Distribution of
272
+ Creative Commons public licenses does not create a lawyer-client or
273
+ other relationship. Creative Commons makes its licenses and related
274
+ information available on an "as-is" basis. Creative Commons gives no
275
+ warranties regarding its licenses, any material licensed under their
276
+ terms and conditions, or any related information. Creative Commons
277
+ disclaims all liability for damages resulting from their use to the
278
+ fullest extent possible.
279
+
280
+ Using Creative Commons Public Licenses
281
+
282
+ Creative Commons public licenses provide a standard set of terms and
283
+ conditions that creators and other rights holders may use to share
284
+ original works of authorship and other material subject to copyright
285
+ and certain other rights specified in the public license below. The
286
+ following considerations are for informational purposes only, are not
287
+ exhaustive, and do not form part of our licenses.
288
+
289
+ Considerations for licensors: Our public licenses are
290
+ intended for use by those authorized to give the public
291
+ permission to use material in ways otherwise restricted by
292
+ copyright and certain other rights. Our licenses are
293
+ irrevocable. Licensors should read and understand the terms
294
+ and conditions of the license they choose before applying it.
295
+ Licensors should also secure all rights necessary before
296
+ applying our licenses so that the public can reuse the
297
+ material as expected. Licensors should clearly mark any
298
+ material not subject to the license. This includes other CC-
299
+ licensed material, or material used under an exception or
300
+ limitation to copyright. More considerations for licensors:
301
+ wiki.creativecommons.org/Considerations_for_licensors
302
+
303
+ Considerations for the public: By using one of our public
304
+ licenses, a licensor grants the public permission to use the
305
+ licensed material under specified terms and conditions. If
306
+ the licensor's permission is not necessary for any reason--for
307
+ example, because of any applicable exception or limitation to
308
+ copyright--then that use is not regulated by the license. Our
309
+ licenses grant only permissions under copyright and certain
310
+ other rights that a licensor has authority to grant. Use of
311
+ the licensed material may still be restricted for other
312
+ reasons, including because others have copyright or other
313
+ rights in the material. A licensor may make special requests,
314
+ such as asking that all changes be marked or described.
315
+ Although not required by our licenses, you are encouraged to
316
+ respect those requests where reasonable. More considerations
317
+ for the public:
318
+ wiki.creativecommons.org/Considerations_for_licensees
319
+
320
+ =======================================================================
321
+
322
+ Creative Commons Attribution-NonCommercial 4.0 International Public
323
+ License
324
+
325
+ By exercising the Licensed Rights (defined below), You accept and agree
326
+ to be bound by the terms and conditions of this Creative Commons
327
+ Attribution-NonCommercial 4.0 International Public License ("Public
328
+ License"). To the extent this Public License may be interpreted as a
329
+ contract, You are granted the Licensed Rights in consideration of Your
330
+ acceptance of these terms and conditions, and the Licensor grants You
331
+ such rights in consideration of benefits the Licensor receives from
332
+ making the Licensed Material available under these terms and
333
+ conditions.
334
+
335
+ Section 1 -- Definitions.
336
+
337
+ a. Adapted Material means material subject to Copyright and Similar
338
+ Rights that is derived from or based upon the Licensed Material
339
+ and in which the Licensed Material is translated, altered,
340
+ arranged, transformed, or otherwise modified in a manner requiring
341
+ permission under the Copyright and Similar Rights held by the
342
+ Licensor. For purposes of this Public License, where the Licensed
343
+ Material is a musical work, performance, or sound recording,
344
+ Adapted Material is always produced where the Licensed Material is
345
+ synched in timed relation with a moving image.
346
+
347
+ b. Adapter's License means the license You apply to Your Copyright
348
+ and Similar Rights in Your contributions to Adapted Material in
349
+ accordance with the terms and conditions of this Public License.
350
+
351
+ c. Copyright and Similar Rights means copyright and/or similar rights
352
+ closely related to copyright including, without limitation,
353
+ performance, broadcast, sound recording, and Sui Generis Database
354
+ Rights, without regard to how the rights are labeled or
355
+ categorized. For purposes of this Public License, the rights
356
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
357
+ Rights.
358
+ d. Effective Technological Measures means those measures that, in the
359
+ absence of proper authority, may not be circumvented under laws
360
+ fulfilling obligations under Article 11 of the WIPO Copyright
361
+ Treaty adopted on December 20, 1996, and/or similar international
362
+ agreements.
363
+
364
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
365
+ any other exception or limitation to Copyright and Similar Rights
366
+ that applies to Your use of the Licensed Material.
367
+
368
+ f. Licensed Material means the artistic or literary work, database,
369
+ or other material to which the Licensor applied this Public
370
+ License.
371
+
372
+ g. Licensed Rights means the rights granted to You subject to the
373
+ terms and conditions of this Public License, which are limited to
374
+ all Copyright and Similar Rights that apply to Your use of the
375
+ Licensed Material and that the Licensor has authority to license.
376
+
377
+ h. Licensor means the individual(s) or entity(ies) granting rights
378
+ under this Public License.
379
+
380
+ i. NonCommercial means not primarily intended for or directed towards
381
+ commercial advantage or monetary compensation. For purposes of
382
+ this Public License, the exchange of the Licensed Material for
383
+ other material subject to Copyright and Similar Rights by digital
384
+ file-sharing or similar means is NonCommercial provided there is
385
+ no payment of monetary compensation in connection with the
386
+ exchange.
387
+
388
+ j. Share means to provide material to the public by any means or
389
+ process that requires permission under the Licensed Rights, such
390
+ as reproduction, public display, public performance, distribution,
391
+ dissemination, communication, or importation, and to make material
392
+ available to the public including in ways that members of the
393
+ public may access the material from a place and at a time
394
+ individually chosen by them.
395
+
396
+ k. Sui Generis Database Rights means rights other than copyright
397
+ resulting from Directive 96/9/EC of the European Parliament and of
398
+ the Council of 11 March 1996 on the legal protection of databases,
399
+ as amended and/or succeeded, as well as other essentially
400
+ equivalent rights anywhere in the world.
401
+
402
+ l. You means the individual or entity exercising the Licensed Rights
403
+ under this Public License. Your has a corresponding meaning.
404
+
405
+ Section 2 -- Scope.
406
+
407
+ a. License grant.
408
+
409
+ 1. Subject to the terms and conditions of this Public License,
410
+ the Licensor hereby grants You a worldwide, royalty-free,
411
+ non-sublicensable, non-exclusive, irrevocable license to
412
+ exercise the Licensed Rights in the Licensed Material to:
413
+
414
+ a. reproduce and Share the Licensed Material, in whole or
415
+ in part, for NonCommercial purposes only; and
416
+
417
+ b. produce, reproduce, and Share Adapted Material for
418
+ NonCommercial purposes only.
419
+
420
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
421
+ Exceptions and Limitations apply to Your use, this Public
422
+ License does not apply, and You do not need to comply with
423
+ its terms and conditions.
424
+
425
+ 3. Term. The term of this Public License is specified in Section
426
+ 6(a).
427
+
428
+ 4. Media and formats; technical modifications allowed. The
429
+ Licensor authorizes You to exercise the Licensed Rights in
430
+ all media and formats whether now known or hereafter created,
431
+ and to make technical modifications necessary to do so. The
432
+ Licensor waives and/or agrees not to assert any right or
433
+ authority to forbid You from making technical modifications
434
+ necessary to exercise the Licensed Rights, including
435
+ technical modifications necessary to circumvent Effective
436
+ Technological Measures. For purposes of this Public License,
437
+ simply making modifications authorized by this Section 2(a)
438
+ (4) never produces Adapted Material.
439
+
440
+ 5. Downstream recipients.
441
+
442
+ a. Offer from the Licensor -- Licensed Material. Every
443
+ recipient of the Licensed Material automatically
444
+ receives an offer from the Licensor to exercise the
445
+ Licensed Rights under the terms and conditions of this
446
+ Public License.
447
+
448
+ b. No downstream restrictions. You may not offer or impose
449
+ any additional or different terms or conditions on, or
450
+ apply any Effective Technological Measures to, the
451
+ Licensed Material if doing so restricts exercise of the
452
+ Licensed Rights by any recipient of the Licensed
453
+ Material.
454
+
455
+ 6. No endorsement. Nothing in this Public License constitutes or
456
+ may be construed as permission to assert or imply that You
457
+ are, or that Your use of the Licensed Material is, connected
458
+ with, or sponsored, endorsed, or granted official status by,
459
+ the Licensor or others designated to receive attribution as
460
+ provided in Section 3(a)(1)(A)(i).
461
+
462
+ b. Other rights.
463
+
464
+ 1. Moral rights, such as the right of integrity, are not
465
+ licensed under this Public License, nor are publicity,
466
+ privacy, and/or other similar personality rights; however, to
467
+ the extent possible, the Licensor waives and/or agrees not to
468
+ assert any such rights held by the Licensor to the limited
469
+ extent necessary to allow You to exercise the Licensed
470
+ Rights, but not otherwise.
471
+
472
+ 2. Patent and trademark rights are not licensed under this
473
+ Public License.
474
+
475
+ 3. To the extent possible, the Licensor waives any right to
476
+ collect royalties from You for the exercise of the Licensed
477
+ Rights, whether directly or through a collecting society
478
+ under any voluntary or waivable statutory or compulsory
479
+ licensing scheme. In all other cases the Licensor expressly
480
+ reserves any right to collect such royalties, including when
481
+ the Licensed Material is used other than for NonCommercial
482
+ purposes.
483
+
484
+ Section 3 -- License Conditions.
485
+
486
+ Your exercise of the Licensed Rights is expressly made subject to the
487
+ following conditions.
488
+
489
+ a. Attribution.
490
+
491
+ 1. If You Share the Licensed Material (including in modified
492
+ form), You must:
493
+
494
+ a. retain the following if it is supplied by the Licensor
495
+ with the Licensed Material:
496
+
497
+ i. identification of the creator(s) of the Licensed
498
+ Material and any others designated to receive
499
+ attribution, in any reasonable manner requested by
500
+ the Licensor (including by pseudonym if
501
+ designated);
502
+
503
+ ii. a copyright notice;
504
+
505
+ iii. a notice that refers to this Public License;
506
+
507
+ iv. a notice that refers to the disclaimer of
508
+ warranties;
509
+
510
+ v. a URI or hyperlink to the Licensed Material to the
511
+ extent reasonably practicable;
512
+
513
+ b. indicate if You modified the Licensed Material and
514
+ retain an indication of any previous modifications; and
515
+
516
+ c. indicate the Licensed Material is licensed under this
517
+ Public License, and include the text of, or the URI or
518
+ hyperlink to, this Public License.
519
+
520
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
521
+ reasonable manner based on the medium, means, and context in
522
+ which You Share the Licensed Material. For example, it may be
523
+ reasonable to satisfy the conditions by providing a URI or
524
+ hyperlink to a resource that includes the required
525
+ information.
526
+
527
+ 3. If requested by the Licensor, You must remove any of the
528
+ information required by Section 3(a)(1)(A) to the extent
529
+ reasonably practicable.
530
+
531
+ 4. If You Share Adapted Material You produce, the Adapter's
532
+ License You apply must not prevent recipients of the Adapted
533
+ Material from complying with this Public License.
534
+
535
+ Section 4 -- Sui Generis Database Rights.
536
+
537
+ Where the Licensed Rights include Sui Generis Database Rights that
538
+ apply to Your use of the Licensed Material:
539
+
540
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
541
+ to extract, reuse, reproduce, and Share all or a substantial
542
+ portion of the contents of the database for NonCommercial purposes
543
+ only;
544
+
545
+ b. if You include all or a substantial portion of the database
546
+ contents in a database in which You have Sui Generis Database
547
+ Rights, then the database in which You have Sui Generis Database
548
+ Rights (but not its individual contents) is Adapted Material; and
549
+
550
+ c. You must comply with the conditions in Section 3(a) if You Share
551
+ all or a substantial portion of the contents of the database.
552
+
553
+ For the avoidance of doubt, this Section 4 supplements and does not
554
+ replace Your obligations under this Public License where the Licensed
555
+ Rights include other Copyright and Similar Rights.
556
+
557
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
558
+
559
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
560
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
561
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
562
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
563
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
564
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
565
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
566
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
567
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
568
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
569
+
570
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
571
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
572
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
573
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
574
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
575
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
576
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
577
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
578
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
579
+
580
+ c. The disclaimer of warranties and limitation of liability provided
581
+ above shall be interpreted in a manner that, to the extent
582
+ possible, most closely approximates an absolute disclaimer and
583
+ waiver of all liability.
584
+
585
+ Section 6 -- Term and Termination.
586
+
587
+ a. This Public License applies for the term of the Copyright and
588
+ Similar Rights licensed here. However, if You fail to comply with
589
+ this Public License, then Your rights under this Public License
590
+ terminate automatically.
591
+
592
+ b. Where Your right to use the Licensed Material has terminated under
593
+ Section 6(a), it reinstates:
594
+
595
+ 1. automatically as of the date the violation is cured, provided
596
+ it is cured within 30 days of Your discovery of the
597
+ violation; or
598
+
599
+ 2. upon express reinstatement by the Licensor.
600
+
601
+ For the avoidance of doubt, this Section 6(b) does not affect any
602
+ right the Licensor may have to seek remedies for Your violations
603
+ of this Public License.
604
+
605
+ c. For the avoidance of doubt, the Licensor may also offer the
606
+ Licensed Material under separate terms or conditions or stop
607
+ distributing the Licensed Material at any time; however, doing so
608
+ will not terminate this Public License.
609
+
610
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
611
+ License.
612
+
613
+ Section 7 -- Other Terms and Conditions.
614
+
615
+ a. The Licensor shall not be bound by any additional or different
616
+ terms or conditions communicated by You unless expressly agreed.
617
+
618
+ b. Any arrangements, understandings, or agreements regarding the
619
+ Licensed Material not stated herein are separate from and
620
+ independent of the terms and conditions of this Public License.
621
+
622
+ Section 8 -- Interpretation.
623
+
624
+ a. For the avoidance of doubt, this Public License does not, and
625
+ shall not be interpreted to, reduce, limit, restrict, or impose
626
+ conditions on any use of the Licensed Material that could lawfully
627
+ be made without permission under this Public License.
628
+
629
+ b. To the extent possible, if any provision of this Public License is
630
+ deemed unenforceable, it shall be automatically reformed to the
631
+ minimum extent necessary to make it enforceable. If the provision
632
+ cannot be reformed, it shall be severed from this Public License
633
+ without affecting the enforceability of the remaining terms and
634
+ conditions.
635
+
636
+ c. No term or condition of this Public License will be waived and no
637
+ failure to comply consented to unless expressly agreed to by the
638
+ Licensor.
639
+
640
+ d. Nothing in this Public License constitutes or may be interpreted
641
+ as a limitation upon, or waiver of, any privileges and immunities
642
+ that apply to the Licensor or You, including from the legal
643
+ processes of any jurisdiction or authority.
644
+
645
+ =======================================================================
646
+
647
+ Creative Commons is not a party to its public
648
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
649
+ its public licenses to material it publishes and in those instances
650
+ will be considered the “Licensor.” The text of the Creative Commons
651
+ public licenses is dedicated to the public domain under the CC0 Public
652
+ Domain Dedication. Except for the limited purpose of indicating that
653
+ material is shared under a Creative Commons public license or as
654
+ otherwise permitted by the Creative Commons policies published at
655
+ creativecommons.org/policies, Creative Commons does not authorize the
656
+ use of the trademark "Creative Commons" or any other trademark or logo
657
+ of Creative Commons without its prior written consent including,
658
+ without limitation, in connection with any unauthorized modifications
659
+ to any of its public licenses or any other arrangements,
660
+ understandings, or agreements concerning use of licensed material. For
661
+ the avoidance of doubt, this paragraph does not form part of the
662
+ public licenses.
663
+
664
+ Creative Commons may be contacted at creativecommons.org.
665
+
666
+ 5. OpenDiT (https://github.com/NUS-HPC-AI-Lab/OpenDiT/blob/master/LICENSE)
667
+
668
+ Copyright OpenDiT
669
+
670
+ Licensed under the Apache License, Version 2.0 (the "License");
671
+ you may not use this file except in compliance with the License.
672
+ You may obtain a copy of the License at
673
+
674
+ http://www.apache.org/licenses/LICENSE-2.0
675
+
676
+ Unless required by applicable law or agreed to in writing, software
677
+ distributed under the License is distributed on an "AS IS" BASIS,
678
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
679
+ See the License for the specific language governing permissions and
680
+ limitations under the License.
681
+
682
+ 6. Open-Sora (https://github.com/hpcaitech/Open-Sora/blob/main/LICENSE)
683
+
684
+ Copyright Open-Sora
685
+
686
+ Licensed under the Apache License, Version 2.0 (the "License");
687
+ you may not use this file except in compliance with the License.
688
+ You may obtain a copy of the License at
689
+
690
+ http://www.apache.org/licenses/LICENSE-2.0
691
+
692
+ Unless required by applicable law or agreed to in writing, software
693
+ distributed under the License is distributed on an "AS IS" BASIS,
694
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
695
+ See the License for the specific language governing permissions and
696
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Improved Quality, Synchrony, and Preference Alignment for Joint Audio-Video Generation
2
+
3
+ This codebase is built upon [JavisDiT](https://github.com/JavisDiT/JavisDiT). Many thanks for their contribution.
4
+
5
+ ## Installation
6
+
7
+ For CUDA 12.1, you can install the dependencies with the following commands.
8
+
9
+ ```bash
10
+ # create a virtual env and activate (conda as an example)
11
+ conda create -n javisdit python=3.10
12
+ conda activate javisdit
13
+
14
+ # install torch, torchvision and xformers
15
+ pip install -r requirements/requirements-cu121.txt
16
+
17
+ # install ffmpeg
18
+ conda install "ffmpeg<7" -c conda-forge -y
19
+
20
+ # the default installation is for inference only
21
+ pip install -v .
22
+ # for development mode, `pip install -v -e .`
23
+ # to skip dependencies, `pip install -v -e . --no-deps`
24
+
25
+ # replace the installed pytorchvideo and funasr source files with the patched copies from assets/src
26
+ export PYTHON_SITE_PACKAGES=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
27
+ cp assets/src/pytorchvideo_augmentations.py ${PYTHON_SITE_PACKAGES}/pytorchvideo/transforms/augmentations.py
28
+ cp assets/src/funasr_utils_load_utils.py ${PYTHON_SITE_PACKAGES}/funasr/utils/load_utils.py
29
+
30
+ # (optional but recommended) install flash attention
31
+ # set enable_flash_attn=False in config to disable flash attention
32
+ pip install packaging ninja
33
+ pip install flash-attn --no-build-isolation
34
+ ```
35
+
36
+ ## Training
37
+
38
+ ### Data Preparation
39
+
40
+ In this project, we use a `.csv` file to manage all the training entries and their attributes for efficient training:
41
+
42
+ | path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text|
43
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---|
44
+ | /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy |
45
+
46
+ The content of the columns may vary across training stages. Detailed instructions for each training stage can be found [here](assets/docs/data.md).
47
+
48
+ ### Stage1 - Audio Pre-Train
49
+
50
+ In this stage, we perform audio pretraining to initialize the text-to-audio generation capability:
51
+
52
+ ```bash
53
+ torchrun --standalone --nproc_per_node 8 \
54
+ scripts/train.py \
55
+ configs/wan2.1/train/stage1_audio.py \
56
+ --data-path data/meta/audio/train_audio.csv
57
+ ```
58
+
59
+ The resulting checkpoints will be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc/model`. You can move the checkpoints to `exps/audio_pretrain/` for later use.
60
+
61
+ ```bash
62
+ mkdir -p exps/audio_pretrain
63
+ mv runs/000-Wan2_1_T2V_1_3B/epoch049-global_step53000 exps/audio_pretrain/
64
+ ```
65
+
66
+ ### Stage2 - Audio-Video SFT
67
+
68
+ In this stage, we perform finetuning for joint audio-video generation (with LoRA adaptation):
69
+
70
+ ```bash
71
+ torchrun --standalone --nproc_per_node 8 \
72
+ scripts/train_prior.py \
73
+ configs/wan2.1/train/stage2_audio_video.py \
74
+ --data-path data/meta/video/train_av_sft.csv
75
+ ```
76
+
77
+ The resulting checkpoints will be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc` with the `model` and `lora` subfolders. You can move the checkpoints to `exps/audio_video_sft/` for later use.
78
+
79
+ ```bash
80
+ mkdir -p exps/audio_video_sft
81
+ mv runs/000-Wan2_1_T2V_1_3B/epoch001-global_step13000 exps/audio_video_sft/
82
+ ```
83
+
84
+ ### Stage3 - Audio-Video DPO
85
+
86
+ In this stage, we perform DPO to align joint audio-video generation with human preference (reuse and update the LoRA parameters learned from the previous stage):
87
+
88
+ ```bash
89
+ torchrun --standalone --nproc_per_node 8 \
90
+ scripts/train.py \
91
+ configs/wan2.1/train/stage3_audio_video_dpo.py \
92
+ --data-path data/meta/avdpo/train_av_dpo.csv
93
+ ```
94
+
95
+ The resulting checkpoints will also be saved at `runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc` with the `model` and `lora` subfolders. You can move the checkpoints to `checkpoints/` for inference and evaluation.
96
+
97
+ ```bash
98
+ mv runs/0aa-Wan2_1_T2V_1_3B/epoch0bb-global_stepccc checkpoints/your_model
99
+ ```
100
+
101
+ ## Inference
102
+
103
+ The basic command line inference is as follows:
104
+
105
+ ```bash
106
+ resolution=480p # or 240p
107
+ num_frames=65 # 4s
108
+ aspect_ratio="9:16"
109
+
110
+ DATASET="JavisBench" # or JavisBench-mini
111
+ prompt_path="data/eval/JavisBench/${DATASET}.csv"
112
+ save_dir="samples/${DATASET}"
113
+
114
+ model_path="checkpoints/your_model"
115
+ ngpus=1
116
+
117
+ torchrun --standalone --nproc_per_node ${ngpus} \
118
+ scripts/inference.py \
119
+ configs/wan2.1/inference/sample.py \
120
+ --resolution ${resolution} --num-frames ${num_frames} --aspect-ratio ${aspect_ratio} \
121
+ --prompt-path ${prompt_path} --model-path ${model_path} \
122
+ --save-dir ${save_dir} --verbose 1
123
+
124
+ # (Optional, for evaluation) Extract audios from generated videos
125
+ python -m tools.datasets.convert video ${save_dir} --output ${save_dir}/meta.csv
126
+ python -m tools.datasets.datautil ${save_dir}/meta.csv --extract-audio --audio-sr 16000
127
+ rm -f ${save_dir}/meta*.csv
128
+ ```
129
+
130
+ Setting `--verbose 2` will display the progress of a single diffusion process. You can also replace `--prompt-path ${prompt_path}` with a single prompt to generate a single video, such as `--prompt "a beautiful waterfall"`.
131
+
132
+
133
+ ## Evaluation
134
+
135
+ ### Installation
136
+
137
+ Install necessary packages:
138
+
139
+ ```bash
140
+ pip install -r requirements/requirements-eval.txt
141
+ ```
142
+
143
+ Download the meta file and data of [JavisBench](https://huggingface.co/datasets/JavisDiT/JavisBench), and put them into `data/eval/`:
144
+
145
+ ```bash
146
+ cd /path/to/JavisDiT
147
+ mkdir -p data/eval
148
+
149
+ huggingface-cli download --repo-type dataset JavisDiT/JavisBench --local-dir data/eval/JavisBench
150
+ ```
151
+
152
+ ### Evaluation on JavisBench or JavisBench-mini
153
+
154
+ Run the following code and the results will be saved in `./evaluation_results`. For details, please refer to the [JavisBench README](eval/javisbench/README.md).
155
+
156
+ ```bash
157
+ MAX_FRAMES=16
158
+ IMAGE_SIZE=224
159
+ MAX_AUDIO_LEN_S=4.0
160
+
161
+ # Params to calculate JavisScore
162
+ WINDOW_SIZE_S=2.0
163
+ WINDOW_OVERLAP_S=1.5
164
+
165
+ METRICS="all"
166
+ RESULTS_DIR="./evaluation_results"
167
+
168
+ DATASET="JavisBench" # or JavisBench-mini
169
+ INPUT_FILE="data/eval/JavisBench/${DATASET}.csv"
170
+ FVD_AVCACHE_PATH="data/eval/JavisBench/cache/fvd_fad/${DATASET}-vanilla-max4s.pt"
171
+ INFER_DATA_DIR="samples/${DATASET}"
172
+
173
+ python -m eval.javisbench.main \
174
+ --input_file "${INPUT_FILE}" \
175
+ --infer_data_dir "${INFER_DATA_DIR}" \
176
+ --output_file "${RESULTS_DIR}/${DATASET}.json" \
177
+ --max_frames ${MAX_FRAMES} \
178
+ --image_size ${IMAGE_SIZE} \
179
+ --max_audio_len_s ${MAX_AUDIO_LEN_S} \
180
+ --window_size_s ${WINDOW_SIZE_S} \
181
+ --window_overlap_s ${WINDOW_OVERLAP_S} \
182
+ --fvd_avcache_path ${FVD_AVCACHE_PATH} \
183
+ --metrics ${METRICS}
184
+ ```
assets/demo/Fig7-JAVG/case1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98a50cd8a5395c8961df05e51f3eb13eb4ee01ba9d56d949208083c52b0f8c79
3
+ size 280678
assets/demo/Fig7-JAVG/case2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc657fb7679449e2b44dcc277b001d87114e51fa8665ae782af062827ba6d0a4
3
+ size 286608
assets/demo/FigA11-X-Cond/A2V.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9edc892ccf4f050d047c91dc25a11400b2be5628369d3659efb4a5db019a49bc
3
+ size 553560
assets/demo/FigA11-X-Cond/AI2V.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91c9c88664c640860c90dc47b7ee44fdff5fe08255de0e6f349a3be78af7bd8c
3
+ size 564407
assets/demo/FigA11-X-Cond/AV-Ext.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f1781b846859f0c7878f3f541dacb3d83c27d48d23753d8b058a39c314acf3
3
+ size 598493
assets/demo/FigA11-X-Cond/GT.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90b0ffb6de655e2fb1457011bb26a150d6a1472e29313b58bd2ab21b50047f2e
3
+ size 289664
assets/demo/FigA11-X-Cond/I2AV.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b65d9b301389e2691efc054cc7e66dad0fd5d56dce15c0077ff0b8baff5f84
3
+ size 565559
assets/demo/FigA11-X-Cond/V2A.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc5d55a5d211295314a932e4007b8fe9505bc417c33921c4103e37b6e8a09863
3
+ size 567901
assets/demo/FigA9-JAVG/case1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf5f2c3100a945c84f4f035cc26fc71cb582f824f5b0eee2c3254b08bd7a653
3
+ size 599929
assets/demo/FigA9-JAVG/case2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99920708093b21c0a2e8ca51fb2e4bcede4818bf4c6267f22e1c0775960842d
3
+ size 562301
assets/demo/FigA9-JAVG/case3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68d95f9577d1162b5d2c23e9e4f5e3862be1afe1c8a7f0b3a18f3ee274f6438f
3
+ size 569684
assets/demo/FigA9-JAVG/case4.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02cdb118dcc68b6cbe5fef868c8586677d6c50e9c7607c3d4b8d8f651a5e3d49
3
+ size 559562
assets/demo/FigA9-JAVG/case5.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75a37d17261386226b64b4b856bdbd70b6ba4a2561fee4fbbbcab9fe1fa35d5
3
+ size 537903
assets/demo/FigA9-JAVG/case6.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccae34cbd657a06a690b550304949322c4975bdb99f742eb74b40e2971048399
3
+ size 579138
assets/demo/FigA9-JAVG/case7.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e534daf85a87b4448036077e19593ebb76aae07623ae3fd307db1e71a3c03dfc
3
+ size 558740
assets/demo/audio_prompts.txt ADDED
@@ -0,0 +1,1222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ People are clicking, breathing, and speaking.
2
+ Sound effects, music, and human sounds are heard.
3
+ Laughter and conversation are heard, speech synthesizer sounds are heard, and breathing is heard.
4
+ Whistling and wind noise is present.
5
+ Mechanisms make beeping and tapping sounds.
6
+ A woman speaks near running water, a man and a child speak, and music plays.
7
+ A car alarm goes off and a car drives by with the sound of wind.
8
+ Thunks and background noise can be heard.
9
+ A turkey makes sounds and mechanisms tick.
10
+ A ringtone is ringing repeatedly.
11
+ A sine wave plays.
12
+ Background noise, bird songs, surface contact, and the sound of a clock are heard.
13
+ Male and female speech, television, and crowd sounds are heard, along with buzzers.
14
+ Cats make noise in a noisy background.
15
+ An engine hums in the background.
16
+ People are talking and a door is slamming and knocking while music is playing.
17
+ A bicycle is moving and there is background noise.
18
+ Sound effects are occurring.
19
+ Men sing with music.
20
+ Farts, background noise, sound effects, and men speaking with chirping birds in the background.
21
+ People are cooing, making noise, and speaking while background noise persists.
22
+ A rooster is crowing and mechanisms are functioning.
23
+ People are coughing and making sounds in the background.
24
+ Various surface contacts and ticks are heard in the background.
25
+ Men speak and breathe next to music and each other.
26
+ Whispering and human sounds come from mechanisms.
27
+ Women are slurping and speaking over background noise.
28
+ A truck horn blares.
29
+ Background noise and people talking, drinking, and eating can be heard.
30
+ Clicking and tearing mechanisms can be heard.
31
+ A dial tone is heard.
32
+ People are chewing and making noise in the background, with sounds of glass clinking and whispering.
33
+ People are speaking and background noise is present.
34
+ Background noise, breaking sounds, and conversation can be heard as men speak and laugh.
35
+ A phone rings with background noise.
36
+ A car is revving and auto racing.
37
+ The sound of horse hooves and mechanisms can be heard with wind noise in the background.
38
+ Women are speaking and scraping sounds are present.
39
+ Background noise and a doorbell ringing are heard.
40
+ A stomach rumbles with background noise.
41
+ A sliding door is opened and closed with ticking mechanisms heard.
42
+ Sound effects are heard.
43
+ A static noise is heard with a sine wave in the background and music is playing.
44
+ Glass sounds, a ringtone, tapping, speech synthesis, laughter, and ticking are heard.
45
+ A camera clicks with male speech and mechanical sounds.
46
+ A sine wave plays.
47
+ Men are speaking with background noise.
48
+ Music is playing.
49
+ A heartbeat and background noise can be heard.
50
+ Pigs are oinking, music is playing, and a woman is speaking.
51
+ Mechanisms and an alarm are heard, with music playing.
52
+ Goats are bleating and a clock is ticking.
53
+ A heavy engine and barking can be heard, with laughter and more barking in the background.
54
+ People are playing a video game and chatting with sound effects and footsteps.
55
+ Men are speaking with background noise.
56
+ Breathing and mechanisms make ticking and pattering sounds.
57
+ Chewing and surface contact sounds are heard.
58
+ Background noise, laughter, and ticking sounds are heard.
59
+ A sound effect is heard.
60
+ Snoring and breathing sounds can be heard in the background.
61
+ Dial tones, background noise, ticking, and a man speaking can be heard.
62
+ The sound of a woman speaking is heard over background noise and caterwauling.
63
+ A man speaks over a plopping sound effect.
64
+ Mechanisms make sounds along with gushing water and roaring big cats.
65
+ Music is played, with a tick and a sheep bleating.
66
+ A ding is heard.
67
+ A single sound effect is heard.
68
+ Men are speaking and a road vehicle is heard with human voices.
69
+ There is a buzz, wind, and bird songs, followed by ticking sounds.
70
+ Clicking and typing on a computer keyboard, breathing, and a man speaking are heard.
71
+ Music and wind play together.
72
+ Cutting sounds, birds chirping, and rustling can be heard.
73
+ A trickle sound and frogs are croaking.
74
+ A bird calls, followed by a fart and background noise.
75
+ Heart sounds are heard in a repeating pattern.
76
+ A sine wave is heard with speech synthesizers.
77
+ A sound effect is heard.
78
+ Music, a tick, and a sound effect are heard.
79
+ A man speaks as farts and laughter are heard in the background with wind noise.
80
+ A woman walks and speaks while sound effects and mechanical noises can be heard.
81
+ A boing, music, and female singing are heard.
82
+ Someone speaks with surface contact and shuffling card sounds.
83
+ A sound effect occurs.
84
+ Mechanisms and laughter mix with wind, running water and splashes.
85
+ Surface contact occurs with whacks and thwacks as people laugh.
86
+ A woman is speaking and heart sounds and clicking are heard in the background.
87
+ Wind noise is heard with a buzz.
88
+ Cats purr with background noise.
89
+ A bus is heard, air brakes are activated and birds chirp.
90
+ Background noise, bird calls, and surface contact sounds are heard.
91
+ A doorbell, man speaking, music, and surface sounds are heard.
92
+ A doorbell rings.
93
+ Birds are singing with mechanisms and ticking in the background.
94
+ Beeps and male speech are heard.
95
+ Wind, mechanisms, horse sounds, bird sounds, and wind noise can be heard.
96
+ The sound of a waterfall.
97
+ Music and background noise are heard.
98
+ Machinery runs, scrapes, ticks, and taps occur.
99
+ An explosion and shouting are heard, with background noise and people talking.
100
+ Speech synthesizers, human voices, and plops are heard.
101
+ A dog is barking and mechanisms are heard.
102
+ Howls occur among mechanisms.
103
+ Tapping sounds are interspersed with whispers.
104
+ A jet engine is heard.
105
+ Sound effects and machine gun sounds.
106
+ Heartbeats are heard, with brief tones and background noise.
107
+ Various ticks and mechanisms can be heard.
108
+ Radio, telephone dialing, and a man speaking are heard with background noise.
109
+ Background noise and sound effects are present.
110
+ Laughter, a man's voice, and human sounds can be heard over the background noise.
111
+ Only music is heard.
112
+ Sound effects and background noise are heard.
113
+ Various sound effects are heard, including clicking, whooshing, and rumbling.
114
+ Laughter and a thunk occur amidst background noise and sound effects.
115
+ Background noise, smoke alarms, and a woman speaking can be heard.
116
+ There is reverberation and a bell sound with surface contact.
117
+ A man is speaking and zipping up with background noise.
118
+ Wind, vehicle, sirens, and crunching sounds.
119
+ A purring sound is heard with background noise.
120
+ A microwave beeps and a door slams in the background noise.
121
+ A camera is clicking with background noise and mechanisms.
122
+ People are chewing, whispering, and breathing.
123
+ Surface contact sounds and a man's speech are heard in the background noise.
124
+ Music is playing, there are ticking sounds and sound effects.
125
+ Heartbeats sound repeatedly.
126
+ Sound effects, video game sounds, and human voices are heard.
127
+ Background noise and clicking sounds can be heard.
128
+ A sound effect, a man speaking, and an explosion occur.
129
+ A dog barks and yips amidst background noise and human sounds.
130
+ A person is typing on a computer keyboard with background noise.
131
+ Sound effects and plops mix with camera clicks and other mechanisms.
132
+ A sine wave is heard.
133
+ A sine wave can be heard.
134
+ Music is playing and an electronic tuner is being used.
135
+ A man speaks over wind noise, roosters crowing, and ticks.
136
+ Crunching sounds and background noise are heard with music playing.
137
+ People are chewing and making noise.
138
+ A bus is honking its horn.
139
+ A cat is purring and there is background noise.
140
+ Mechanisms are ticking.
141
+ Background noise and breathing are heard.
142
+ Sine waves, busy signals, dialing, sound effects, and other sourceless sounds are heard.
143
+ Gunfire, a man speaking, and gasping are heard, followed by video game sounds.
144
+ A single, isolated sound effect is played.
145
+ Hooves clip-clop amidst background noise.
146
+ A sink is filling or washing, and water is heard in the background noise.
147
+ Chewing and biting sounds mix with ticking and surface contact noises.
148
+ A man is speaking while a camera clicks in the background.
149
+ A sound effect is played.
150
+ Wind is blowing and animals can be heard.
151
+ A sine wave plays.
152
+ A motor vehicle is moving on the road, a man is speaking, and there are sounds of a ticking, tapping, and an owl.
153
+ Background noise, ticking sounds, sneezing, breathing, and a woman speaking are heard.
154
+ Crinkling and rustling sounds with background noise.
155
+ Male speech is heard over surface contact and background noise.
156
+ A person is walking and playing a video game with whacks and human voices.
157
+ A bell tolls and wind blows as a sheep bleats and a child speaks.
158
+ A heartbeat can be heard with background noise and ticking sounds.
159
+ Wind, coughing, car honking, speech, and female speech can be heard with wind noise picked up by the microphone.
160
+ A human voice speaks during a thunderstorm with background noise and clicking sounds.
161
+ A man is speaking over background noise and clicking sounds.
162
+ Mechanical and breathing sounds mix with surface contact and whispering.
163
+ A water tap drips in the background, a man speaks, cutlery clinks, and a scraping sound is heard.
164
+ Wind, bleating goats, bird songs, and buzzing sounds mix.
165
+ Chirping of birds is heard.
166
+ Whispering, crumpling, and breathing sounds are heard.
167
+ A mouse clicks and surfaces are contacted with background noise.
168
+ Ticking sounds and surface contact are occurring repeatedly.
169
+ There are various sounds including water, video game effects, and human voice.
170
+ A man speaks with background noise and a sound effect is heard.
171
+ Firecrackers and wind can be heard with sound effects.
172
+ Mechanisms operate with tapping and surface contact.
173
+ A man speaks and sighs as a clock ticks.
174
+ A man speaks as a mechanical fan runs and clicking sounds are heard in the background.
175
+ Ticking and firecracker sounds are heard in the background.
176
+ Music is playing with taps in the background.
177
+ A heartbeat is heard with background noise.
178
+ A hum is heard followed by heartbeats and sounds of writing and plopping.
179
+ Wind, cars, and various objects are making noise and moving.
180
+ Background noise and animals (crickets, animals growling) are heard.
181
+ People are laughing and speaking over gurgling water and breathing sounds.
182
+ Keypress tones, background noise, a cash register, and a ding can be heard.
183
+ Music, men speaking, and artillery fire can be heard.
184
+ A man is speaking, farting, and tapping with mechanisms.
185
+ A mix of speech, sound effects, and music.
186
+ A whooshing sound, surface contact, and an explosion are heard.
187
+ Background noise and cooing sounds are heard.
188
+ Gunshots and clangs are heard among brief tones.
189
+ Breaking sounds are accompanied by footsteps and mechanisms.
190
+ People are making sounds, with a cat meowing and background noise.
191
+ Mechanisms, surface contact, pouring, tapping, breathing, and more pouring and tapping can be heard.
192
+ A firecracker is heard, followed by footsteps and background noise.
193
+ Background noise is heard, with various ringtones and keypress tones.
194
+ Mechanisms with human sounds, ticks, taps, and female speech.
195
+ Human voices speaking with background noise, surface contact, and male speech are heard.
196
+ Water flows and mechanisms make ticking sounds.
197
+ An alarm is ringing.
198
+ Cards shuffle, mechanisms sound, and breathing is heard.
199
+ Mechanisms, ticking, and beeping sounds are heard.
200
+ A man speaks and clicks are heard, followed by surface contact and more clicking.
201
+ Mechanisms, tapping, and ticking sounds are heard.
202
+ Music and background noise are heard.
203
+ A man is speaking while a crowd is making noise.
204
+ A whip is heard in the wind, along with the sound of glass clinking and human voices.
205
+ Sound effects, dings, and speech synthesizer sounds are heard.
206
+ Whistling, sound effects, and mechanisms are heard.
207
+ Music plays over noise.
208
+ Background noise and purring are heard.
209
+ Mechanisms and cawing birds make sounds while glass is heard.
210
+ Sheep bleat and wind noise can be heard.
211
+ A man speaks, followed by echo, background noise, and brief tones.
212
+ Surface contact, ticking, and male speech are heard over background noise.
213
+ An aircraft engine is heard.
214
+ A tuning fork, ticking, and scraping noises create a busy environment.
215
+ Scissors are heard, followed by a coin dropping, beeps, music, video game sounds, and sound effects.
216
+ Sound effects, breaking, background noise, whispering, and human sounds are present.
217
+ An aircraft engine is heard.
218
+ A sound effect plays with background noise.
219
+ A cat purrs as something ticks and birds sing.
220
+ A busy signal is heard.
221
+ Animals and wind noise with laughter, bleating, and whispering.
222
+ A sound effect is heard.
223
+ Something is being poured with background noise.
224
+ A phone dial tone is followed by keypress tones and a sound effect.
225
+ Coin dropping, ticking, and mechanism sounds are heard.
226
+ Pigs and music are heard.
227
+ Liquid is heard, followed by footsteps and video game sounds.
228
+ Music plays and a man speaks, followed by a bell ringing.
229
+ Mechanisms can be heard, a cat meows, a woman speaks, and breathing sounds can be heard.
230
+ A woman is speaking while birds chirp and ticks are heard.
231
+ A fart is heard, speech synthesizers are speaking, and laughter is heard.
232
+ Horns honk repeatedly from a mid-frequency engine.
233
+ Walking sounds and whistling with some human voice and sound effects.
234
+ The wind blows, with ticking, wind noise, breathing, laughter, and human sounds heard.
235
+ Bursting and plopping sounds, and sound effects are heard.
236
+ Beeps, camera sounds, tapping, and surface contact can be heard.
237
+ A brief tone is heard.
238
+ People are tapping and making surface contact in a noisy environment.
239
+ Background noise and barking dogs are heard with bird vocalizations and ticking sounds.
240
+ A man is speaking, breathing, making surface contact sounds and crumpling papers.
241
+ Background noise, walking, splashing, and gurgling are heard.
242
+ Music and sound effects are played.
243
+ Child and adult speech is heard amid background noise and ticking.
244
+ Background noise, sound effects, and speech synthesizer.
245
+ A hammer is being used and making tapping noises.
246
+ Mechanical sounds alternate with patter and mouse sounds, and men speak intermittently.
247
+ Men are speaking, with a hair dryer running in the background.
248
+ An electric shaver ticks.
249
+ Human voices and snoring can be heard.
250
+ Women and a man are speaking, a pig is heard, and people are breathing and making contact sounds.
251
+ A woman is speaking, people are talking and kids are playing in the background while water can be heard and occasionally someone takes a breath.
252
+ A single ping sound is heard.
253
+ Background noise and camera sounds are heard.
254
+ Sonar beeps in the background.
255
+ A bicycle bell rings.
256
+ Something ticks while making surface contact.
257
+ Heartbeats are heard over background noise and water, with occasional music.
258
+ Chewing and animal sounds are heard with a tap and thump.
259
+ A sine wave is playing.
260
+ Beeps are heard repetitively.
261
+ An air horn is heard, and there is background noise with tapping sounds.
262
+ A man burps, makes human sounds, speaks and breathes with mechanisms and surface contact noises.
263
+ Growling and breathing sounds are heard.
264
+ A man speaks with background noise while birds sing.
265
+ Wind, birds are chirping, mechanisms are ticking, and ticks are ticking.
266
+ Music, ding sounds, and typing on a typewriter.
267
+ Music is playing with background noise.
268
+ A man speaks while various mechanisms and surface contacts occur, and breathing is heard.
269
+ Wind blows, animals make sounds, and birds sing.
270
+ Breathing and gobbling sounds can be heard in the background.
271
+ Breathing sounds are heard.
272
+ Music is playing with a man speaking and background noise.
273
+ Sound effects occur.
274
+ Birds are flying and tapping with background noise and pigeon sounds.
275
+ Birds sing and chirp, with wind noise and occasional bird calls.
276
+ The background noise is interrupted by beeps.
277
+ A man is speaking while tools can be heard in the background.
278
+ Paper rustles repeatedly alongside ticking sounds.
279
+ A video game is playing and a man is speaking.
280
+ Wind rustles and horses neigh.
281
+ A sound effect is heard.
282
+ Cards are being shuffled with background noise.
283
+ Scissors are being used repeatedly.
284
+ A busy signal, beeps, keypress tone, ticking, and background noise are heard.
285
+ Singing and a ding fill the air with melody.
286
+ A woman is speaking, people are breathing, and there is background noise, ticking, kids speaking, humming, tapping, and more breathing.
287
+ Sound effects are heard.
288
+ Sound effects and heartbeat with music are heard.
289
+ Rustling sound is heard repeatedly with background noise.
290
+ A car is reversing with beeps and scrapes.
291
+ People are shouting and talking over background noise and a child speaking.
292
+ Whips cracking are heard.
293
+ Mice are heard scurrying.
294
+ A bird makes clucking sounds, crows, and flaps its wings.
295
+ Music is playing.
296
+ A whale is vocalizing and water splashes.
297
+ Cars honk and tick as they pass each other on the road.
298
+ Chewing and bird vocalizations are heard, with background noise.
299
+ A beep sounds.
300
+ A sound effect is followed by a glass shatter, mechanisms and ticks.
301
+ A video game is being played with various sound effects and whacking noises.
302
+ People make human sounds and laugh.
303
+ Women are speaking, breathing, and making clicking and sound effects.
304
+ A man is singing and background noise and writing sounds are present.
305
+ Mechanisms are being used and music is playing over a background noise.
306
+ A sine wave is playing.
307
+ Women are laughing and speaking.
308
+ A knock is heard in the background.
309
+ A man is speaking with background noise, objects are making contact with a surface and car horns are honking.
310
+ Wind and horns are heard, with a group of people giggling.
311
+ People are speaking, and mechanisms, cutlery, and boiling sounds can be heard.
312
+ A woman speaks after sneezing, then there are ticking and breathing sounds.
313
+ A man is speaking and sheep are bleating with ticking sounds in the background.
314
+ Glass is shattering and a siren is sounding in the wind.
315
+ Pigs are heard, with wind and ticking sounds.
316
+ A man's speech and background noise are accompanied by various bird calls and laughter.
317
+ A dog pants with sounds of scraping and ticking in the background.
318
+ A speech synthesizer is speaking with background noise and clicking sounds.
319
+ Sound effects, video game sounds, human voice, and more are heard.
320
+ Mechanisms are operating, with breathing, sneezing, and surface contact sounds.
321
+ Birds chirp and croak with background noise.
322
+ Medium frequency engines and car horns alternate with ticks.
323
+ Music is playing with background noise, video game sound, and a police car is passing by with its siren.
324
+ Sound effects and sine waves are heard.
325
+ A bell rings and there is the sound of mechanisms, typing on a computer keyboard, scraping, breaking, and laughter.
326
+ Mechanisms and speech with background noise are heard.
327
+ Music is playing with a ticking clock and ticking sounds.
328
+ Background noise, bird chirping, and birds are heard.
329
+ A whale vocalization and a sound effect can be heard.
330
+ Music and background noise, and ticks can be heard.
331
+ Owls are hooting and mechanisms are functioning.
332
+ Female singing and music are heard.
333
+ Water is flowing, birds are chirping and tweeting and people are speaking.
334
+ Multiple people engage in various speech patterns while whispering and background noise are present.
335
+ Windows are being closed and a tap and background noise are heard.
336
+ Wind, breathing, birds chirping, a man speaking, and ticking sounds can be heard.
337
+ A man is speaking, breathing, and speaking again with background noise.
338
+ Various video game sounds and human voices are heard.
339
+ Typing, writing, and clicking sounds can be heard while a man speaks and mechanisms and breathing sounds are in the background.
340
+ A ticking sound is heard with fireworks and mechanisms.
341
+ Background noise can be heard, followed by a woman speaking and surface contact.
342
+ People are walking and birds are singing outside.
343
+ Farts, squealing, and mechanisms are in operation.
344
+ Honking and background noise are heard, with a man speaking.
345
+ Men are speaking and making grunting sounds with mechanisms in the background.
346
+ A series of sound effects are being played.
347
+ Birds chirp and a car horn sounds in the background.
348
+ A child is speaking with background noise.
349
+ Women speak and machinery sounds while glass clinks.
350
+ Cars are revving and accelerating with bleats and a bell.
351
+ Wind noise, rustling, breathing, sniffing, and background noise are heard.
352
+ Birds sing and an explosion is heard over background noise.
353
+ Crickets are chirping and there are occasional plops.
354
+ Music and background noise are heard.
355
+ Background noise and sound effects of tapping and plopping are present along with video game sounds.
356
+ Music and sound effects play.
357
+ A man speaks, with wind noise and surface contact sounds.
358
+ A series of sound effects are playing with background music.
359
+ The sound of horses walking is mixed with some background noise.
360
+ A sine wave and chirp tone are heard.
361
+ Wind and nature sounds, including crickets and whooshing, can be heard.
362
+ A stomach grumbles amidst background noise.
363
+ A man speaks, bird songs play, and a skateboard moves over background noise.
364
+ Children are speaking and birds are singing with waterfowl and human sounds.
365
+ Background noise and shuffling sounds are present.
366
+ A man is speaking with music and a ding sound.
367
+ A man is speaking and slapping while mechanisms are heard.
368
+ Video game sounds are heard before something breaks.
369
+ Sound effects, speech, and a ding are heard.
370
+ Flapping and wind noise are heard.
371
+ An explosion, screaming, music, a man speaking, and video game sounds occur.
372
+ Sounds of paper crinkling, mechanisms, and surface contact are heard.
373
+ A woman is speaking, music is playing, and sound effects can be heard.
374
+ Only mechanisms can be heard.
375
+ Music plays with ticking mechanisms and human sounds.
376
+ Typewriters type and sound effects play.
377
+ Men are speaking and clicking sounds are heard.
378
+ A man is speaking, breathing, chewing, and background noise is heard.
379
+ Music plays continuously.
380
+ A sound effect is followed by a woman speaking.
381
+ Music is playing.
382
+ Honking cars can be heard in the background noise.
383
+ A dial tone is heard followed by music.
384
+ Bird sounds, thunks, and mechanisms can be heard.
385
+ An animal makes noises while mechanisms are operating.
386
+ A beep is followed by a man humming, speaking and ticking with background noise.
387
+ A variety of birds are chirping, animals are making noises, people are talking and singing.
388
+ Men are talking with background noise.
389
+ Background noise, clicking, footsteps, and farts are heard.
390
+ An alarm clock rings in the background.
391
+ Background noise and wind noise with microphone, neighing, and a man speaking are heard.
392
+ A sound effect is heard.
393
+ Sanding and female speech are heard with background noise.
394
+ Birds are chirping and singing, with occasional coughing and movement.
395
+ Humans make sounds, run and make sound effects.
396
+ A bicycle bell rings.
397
+ Background noise with a doorbell ringing.
398
+ Heartbeats are heard with background noise.
399
+ Glass is clinking and surfaces are being contacted with background noise.
400
+ A man is speaking and typing on a computer keyboard.
401
+ Fire crackles as a woman sings.
402
+ Various birds chirp, crickets chirp, the wind blows, and footsteps are heard.
403
+ Background noise is present while crickets chirp, paper rustles, and tearing sounds can be heard.
404
+ A man is speaking, bird calls are heard, and artillery fire can be heard.
405
+ Background noise and clicking sounds are heard.
406
+ Mechanisms and paper are crumpling and rustling.
407
+ Music is playing.
408
+ Wind is blowing, mechanisms are ticking, birds are chirping and cawing, and tapping is heard.
409
+ Mechanisms and women are speaking, and taps are heard.
410
+ Chirping birds and wind are heard, then a woman sings.
411
+ Various sounds including whistling, speech, music and tapping are heard.
412
+ Background noise, wind, vehicles passing by, chopping, birds singing, laughter, and breathing are heard.
413
+ Heart sounds and background noise are heard, with occasional whoops and laughter.
414
+ A bell rings and a train is passing by, blowing its horn.
415
+ Music, wind, and wind chimes are heard.
416
+ Various human voices and sound effects are heard.
417
+ Camera clicks and mechanical mechanisms are heard.
418
+ A human voice can be heard.
419
+ Paper is rustled.
420
+ Wind noise, airplane sounds, bird calls, and squeaks are heard.
421
+ Various sound effects are heard with whooshing noises.
422
+ Video game sounds and an animal are heard.
423
+ Water flows, splashes, and people breathe.
424
+ People are making various human sounds, with sound effects in the background.
425
+ A sound effect accompanies a child's speech.
426
+ Dogs are barking and there is background noise.
427
+ Men speaking and breathing are heard in the background.
428
+ Footsteps can be heard walking.
429
+ Mechanisms, human voices, and breathing are heard with slapping and tapping sounds.
430
+ Clicking and human sounds are heard with background noise.
431
+ Background noise, radio, and women speaking are heard.
432
+ Multiple sound effects are heard, with a purring sound in between.
433
+ Music plays and a clock ticks and sings.
434
+ Birds chirp and tweet in the background.
435
+ Breathing, chewing, and surface contact sounds are heard with background noise.
436
+ A bell sound, such as that of a doorbell, is heard.
437
+ Background noise and men speaking can be heard.
438
+ Mechanisms and shuffling cards sound, with occasional honks of a vehicle horn.
439
+ Mechanisms and rodents are heard.
440
+ Beeping machinery sounds are heard.
441
+ A sound is made by scratching followed by a tap.
442
+ Mechanisms are making whacking sounds, while a gurgling noise and speech synthesizer are heard.
443
+ A doorbell and barking are heard.
444
+ Background music is playing.
445
+ Sounds of running, gasping, and breathing are heard with background noise.
446
+ A motorcycle makes a sound, followed by video game sounds and tire skidding.
447
+ Background noise, conversation, tapping, and speech are heard, with a child speaking.
448
+ A man speaks with background noise and clicking.
449
+ Wild animals make noises.
450
+ Mosquitoes, slapping sounds, crickets, and mosquitoes are heard.
451
+ The sound of a rumble is present.
452
+ Background noise, ticking, thumping, chirping, and tweeting are heard.
453
+ Background noise is present and a buzzer is heard.
454
+ Men are speaking and spraying with ticking sounds in the background.
455
+ Music and sound effects are heard.
456
+ People are talking, whispering, laughing, and making sounds in a noisy environment.
457
+ A sound effect is heard in this audio sequence.
458
+ A female is speaking while a heartbeat is heard in the background.
459
+ A mechanical fan is running and wind is blowing through the microphone.
460
+ Video game sounds, wind, laughter, gasping, and men speaking are heard.
461
+ Bells and ticks are heard with background noise.
462
+ A device beeps, someone screams and cries, followed by a whack and static.
463
+ A motorcycle revs and accelerates.
464
+ Music plays as water flows and background noise is heard.
465
+ Wind is blowing, water is heard, a sailboat is sailing, and breathing is heard in the background.
466
+ Background noise and ticking sounds are heard.
467
+ Background noise, a woman is speaking, and music is playing.
468
+ Whispering and tapping sounds are interspersed with breathing and surface contact.
469
+ Mechanisms and bouncing sounds can be heard with background breathing.
470
+ Wind is blowing, birds are singing, and a man is speaking, with gunshots in between.
471
+ A firecracker goes off with background noise.
472
+ A woman speaks and writes with tapping and breathing sounds in a noisy background.
473
+ Footsteps and video game sounds are heard.
474
+ Mechanisms, shuffling cards, and surface contact sounds are heard.
475
+ Fire, running, a vehicle, and a police car siren can be heard.
476
+ Pigeons, doves, birds, barks, and ticks are heard with background noise.
477
+ A zipper is opened, and people talk and breathe, with a horse neighing and a tap running.
478
+ Scraping and mechanism sounds are heard, a television is on, and a man is speaking.
479
+ A ringtone, doorbell, and breathing sounds are heard.
480
+ A man speaks while birds sing and a rowboat moves through a river.
481
+ Background noise, bird calls, and music are heard.
482
+ A heartbeat and hum are audible.
483
+ Bells ring with background noise and wind sounds.
484
+ A person is crying and sobbing while mechanisms make noise and they take breaths.
485
+ A sine wave sound is heard.
486
+ Laughter, a bell, and mechanisms accompany a woman speaking.
487
+ Tapping and smoke alarm sounds can be heard.
488
+ Mechanisms can be heard with a human voice.
489
+ Beeping and human voices are heard with noise.
490
+ Animals barking, bleating, and panting with background noise and ticks.
491
+ Tapping, writing, and mechanisms are making noise.
492
+ Background noise, whacks, and breaking sounds are heard.
493
+ Thunderstorm and ticking sounds with human sounds in the background.
494
+ Heartbeats are heard alongside footsteps, wind, and the sound of a car.
495
+ A heartbeat is heard, followed by female and male speech and a sound effect.
496
+ Zippers are being opened and closed amidst background noise.
497
+ People are talking and making noises on a surface with breathing and ticking sounds.
498
+ Footsteps, speaking, and thumping sounds are heard.
499
+ Sound effects play with a thump.
500
+ A whip cracks, people make sounds, and a man speaks and laughs with ticking in the background.
501
+ A man is typing on a computer and playing video games while talking to someone and listening to music.
502
+ Background noise and a hoot are heard along with tapping.
503
+ Birds sing, people walk and talk, and a vehicle drives by.
504
+ People are making chewing and other human sounds, and mechanisms are in motion.
505
+ People laugh, a woman speaks, rodents make noise, clicks and taps are heard, breathing and mechanisms are heard.
506
+ Yipping sounds with background noise are heard.
507
+ Mechanisms, men talking and breathing are heard.
508
+ A telephone bell rings.
509
+ Scissors are being used.
510
+ A motor vehicle (road) is heard, followed by tire squealing or skidding.
511
+ Background noise and human sounds are present.
512
+ Bird calls mixed with crows and sound effects.
513
+ Heavy footsteps and background noise are heard as a man speaks.
514
+ A man speaks while sounds of ticking, tapping, and mechanisms can be heard.
515
+ Background noise and an explosion are heard.
516
+ Music plays with video game and sound effects.
517
+ Background noise and animal sounds are heard.
518
+ Sound effects and background noise can be heard.
519
+ Mechanisms are functioning, with a dial tone and busy signal.
520
+ Music and coins dropping are heard.
521
+ Bird calls, barking, running, background noise, and a dog growling are heard.
522
+ Heart sounds and mechanisms are heard.
523
+ Waterfowl and bells are heard while the wind blows.
524
+ Mechanisms and an explosion are heard.
525
+ A foghorn sounds and music plays over background noise.
526
+ A chime is ringing with background noise.
527
+ Whistling, background noise, and men speaking are heard.
528
+ Music and speech noise can be heard.
529
+ An arrow flies and a car is heard in the background.
530
+ Background noise, mouse patter and mouse sounds can be heard.
531
+ Tapping sounds and sonar noises repeat.
532
+ Wind is blowing, a police car siren is sounding, and crows are cawing.
533
+ Sounds of mechanisms, video games, white noise, clicking, footsteps, breaking, gunfire, ticking, and dripping are heard.
534
+ Dogs are panting.
535
+ Sound effects and clanging are heard.
536
+ Writing and music accompany sound effects from mechanisms.
537
+ Male speech, breathing, and background noise are heard throughout.
538
+ A bell is ringing and heart sounds are heard along with background noise and mechanisms.
539
+ Animal sounds dominate.
540
+ Cats and dogs are meowing and barking to the sound of music.
541
+ The sounds of wind, telephones, and people speaking are heard.
542
+ Bouncing and mechanism sounds are heard.
543
+ Music and sound effects are playing.
544
+ A man is speaking with a tap and glass shattering with a gasp.
545
+ Cooing, mechanical fan, tapping, and ticking sounds are heard.
546
+ A man speaks and breathes over the sound of music.
547
+ Boing sounds are repeated.
548
+ Wind blows as a rowboat glides on water.
549
+ Barking and human sounds can be heard with surface contact, human voice, and ticking.
550
+ Roosters cluck and crow amidst wind noise.
551
+ A bicycle bell and mechanisms can be heard.
552
+ A bell is ringing while mechanisms are operating.
553
+ A bird is singing, and a vehicle honks.
554
+ Mechanisms operate while a man speaks, then a foghorn sounds.
555
+ Video game sounds mix with machine gun fire.
556
+ Breaking sounds are heard.
557
+ Mechanisms and purring are heard with ticking in the background.
558
+ A sine wave tone is heard.
559
+ Male speech, television, gasps, yelling, and laughter are heard with ticking sounds.
560
+ People are laughing, coughing, and making fart sounds, with music and background noise.
561
+ A tuning fork and background noise is heard.
562
+ A roaring sound with clicking and a man speaks.
563
+ Mechanisms, footsteps, laughter, and speech are heard.
564
+ Mechanisms mix with breathing and chewing and liquid sounds.
565
+ A woman is speaking with background noise and more female speech.
566
+ A man speaks with background noise, music plays, and clicking is heard.
567
+ Children speak, cough, and make noises, with background noise.
568
+ Mechanisms make noise as something rolls.
569
+ A man speaks, breathes, and ticks while background noise occurs.
570
+ Singing and music play with background noise and men speaking.
571
+ Sound effects and background noise are heard.
572
+ Croaking frogs and crickets make sounds.
573
+ A woman is speaking, clock ticks are heard, and a woman is speaking more.
574
+ A clock ticks and music plays against a background of noise.
575
+ A bell sounds with human sounds and a sound effect.
576
+ Surface contact and human sounds are heard.
577
+ A car horn sounds amid background noise, followed by bicycle bells and ticking, along with human sounds.
578
+ Women are speaking, hubbub and background noise is heard, animals and children are making noises, and human voices are heard.
579
+ A sine wave is heard.
580
+ People are talking and a beep is heard in the background.
581
+ Only music plays.
582
+ Music plays with background noise and a crowd clapping.
583
+ Music and background noise can be heard.
584
+ A woman speaks while snoring and with mechanisms.
585
+ Insects, wind, and men speaking are heard, with birds chirping.
586
+ Music is playing.
587
+ Background noise and laughter are heard, then several coughs, more laughter.
588
+ A cow is mooing with an echo heard in the background.
589
+ Noise and heart sounds are heard.
590
+ A man is speaking and breathing, with background noise.
591
+ Music is playing, people are making sounds, and a plop is heard.
592
+ People are chewing, clicking, and making ticking noises.
593
+ Chickens can be heard clucking in a noisy background.
594
+ Mechanisms beep amidst background noise.
595
+ Only sonar sounds can be heard.
596
+ An alarm clock is ringing with background noise and wind noise.
597
+ A river is flowing, wind is blowing and a man is speaking with ticks in the background.
598
+ A man is speaking with a water tap and mechanisms in the background.
599
+ Crackling and wind sounds, video game noises, and music can be heard.
600
+ Mechanisms are making sounds.
601
+ The sound of a heartbeat is heard repeatedly.
602
+ Plops, music, background noise, and woman speaking and breathing.
603
+ A loud slam is heard.
604
+ A bus is driving.
605
+ Men are speaking with background noise.
606
+ A creaking noise is heard in a repetitive pattern.
607
+ A series of heartbeats with distortion in the audio.
608
+ A man and woman speak, wind blows, a dog barks, scrapes, and howls.
609
+ Wind blows, mechanisms are heard, a man speaks, human voices are heard, birds are chirping, frogs are heard, and an arrow is released.
610
+ A sine wave is played.
611
+ Female and male singing, sound effects and video game sounds can be heard.
612
+ Writing and background noise are heard.
613
+ Various sound effects and ticking are heard.
614
+ People are speaking, clicking sounds are present, and background noise and human voices can be heard.
615
+ A sewing machine is being operated.
616
+ Ticking, background noise, and a chirp tone are heard.
617
+ Background noise and breathing can be heard.
618
+ A printer is printing while music plays.
619
+ Background noise, beeping, a man speaking, and an alarm are heard.
620
+ Mechanisms and splatter sounds are heard.
621
+ Various sounds occur including clicking, tones, and noise.
622
+ A man speaks and an aircraft is heard.
623
+ Applause is heard.
624
+ The sounds of an electric toothbrush, a man speaking, human sounds, a woman speaking, and laughter are heard.
625
+ Only background noise is heard.
626
+ A child is speaking, and mechanisms are making ticking sounds along with a coughing sound.
627
+ A beep, man speaking, and radio sounds are heard with background noise and ticking.
628
+ A groan and other sound effects.
629
+ Music is playing.
630
+ A heartbeat and animal sounds are heard.
631
+ Wheezing and coughing sounds.
632
+ A man is speaking while mechanisms are moving.
633
+ Heartbeats, background noise, and a sound effect are heard.
634
+ A basketball is bouncing and footsteps are heard with wind noise.
635
+ Background noise can be heard along with men's speech and human voices.
636
+ A sound effect is playing.
637
+ Wind and mechanisms are heard, with sneezes and human sounds in the background.
638
+ Music and a man speaking are heard.
639
+ A sound effect plays with music.
640
+ People cough, sing, breathe, and music plays.
641
+ Chewing and crunching sounds, along with surface contact, can be heard, along with background noise.
642
+ An eruption occurs and people are shouting and speaking.
643
+ Video game sounds and children speaking are heard over plops and music.
644
+ A woman is speaking and writing with mechanical sounds in the background.
645
+ Shuffling cards and men speaking is heard.
646
+ Men are speaking, tapping, and a dog barking is heard.
647
+ Background noise, tapping, laughing, and animal and speech synthesizer sounds are heard.
648
+ Footsteps, surface contact, gunshots, man speaking and laughter are heard over video game sounds.
649
+ Footsteps, writing, and mechanisms are heard.
650
+ Background music is being played.
651
+ Birds call while wind blows and surfaces are touched.
652
+ A man is speaking with background noise and music is playing.
653
+ An explosion occurs followed by speech from a synthesizer.
654
+ A stream is flowing while birds are singing.
655
+ Mechanisms, stomping, running, walking, and speech are heard.
656
+ Brief tones, mechanisms, and surface contact with chopping are heard.
657
+ Beeps and clicks are heard in background noise.
658
+ A fierce roar is heard.
659
+ Running, panting, and a sheep bleating with wind noise.
660
+ A snap and thud are heard with background noise.
661
+ A beep, a man speaking, and birds chirping are heard with a sound effect.
662
+ Wind chimes ring and background noise can be heard.
663
+ Mechanisms, zipping, and man speaking can be heard.
664
+ Footsteps, music, and a thunk sound.
665
+ Writing is heard, then background noise and more writing.
666
+ Music with noise and sound effects.
667
+ Ticking mechanisms and human sounds can be heard along with breathing and surface contact.
668
+ Women speak amid the sounds of mechanisms and crumpling.
669
+ Birds are singing and ticking sounds are heard in the background.
670
+ A cough can be heard.
671
+ Mechanisms are ticking and traffic noise can be heard.
672
+ Mechanisms, clicking, male speaking, and surface contact are heard.
673
+ A chirp tone is heard.
674
+ A sigh is heard.
675
+ Background noise is heard.
676
+ A single ding is heard.
677
+ The hum of electricity is heard and chickens and birds are singing.
678
+ Birds coo and flap their wings while a woman speaks intermittently.
679
+ A telephone bell rings, mechanisms sound, and there is speech synthesizer and breathing before an explosion.
680
+ Children are talking, laughing, and making noise while adults are slapping and speaking.
681
+ Background noise, barking, and growling dogs are heard.
682
+ Music plays as a woman speaks and coughs.
683
+ Background noise, gunshot sounds, and water are heard.
684
+ An engine is running and there are human sounds, squeaks, and brief tones.
685
+ Crowded speech noise fills the background.
686
+ Background noise and jangling keys are heard before a door opens.
687
+ A doorbell rings and a dog barks.
688
+ Goats bleat, mechanisms, man's speech, bird songs, and bleats are heard.
689
+ A zipper zips, writing is heard, and thuds and animal sounds punctuate the background noise.
690
+ Background noise accompanies male speech.
691
+ A motorcycle and sound effects operate.
692
+ Roars and bird songs are heard along with camera sounds.
693
+ Horns are honking and background noise is present.
694
+ Birds are singing and calling, with wind noise and turkeys heard.
695
+ A person is humming, breathing, and listening to ticking while mechanisms are heard in the background.
696
+ People are speaking and breathing, with crumpling sounds in the background.
697
+ Camera sounds, ticks, and background noise are heard.
698
+ Wind blows and guns are fired while a man speaks.
699
+ Heartbeats are heard repeatedly.
700
+ Footsteps can be heard tapping on a surface, followed by a whip being cracked and background noise.
701
+ Mechanisms and footsteps are heard alongside whispering, breathing, and female speech.
702
+ Background noise, tapping, and human sounds and breathing can be heard.
703
+ Wind noise and wind sounds are heard through a microphone.
704
+ The wind blows with animal sounds, human voices, and birds singing.
705
+ A clicking sound is heard as mechanisms are used.
706
+ Heartbeats sound before glass shatters in the background.
707
+ Horses and people are speaking in a noisy environment with ticking sounds.
708
+ Beeps are sounding.
709
+ Crowd noise, speech, and laughter are heard.
710
+ People are chewing, dishes are clanging, and background noise and human sounds are present.
711
+ A clock ticks and mechanisms are heard, followed by surface contact.
712
+ A man speaks over sound effects.
713
+ Women are speaking and mechanisms, including a blender, are in use.
714
+ Thunderstorm rages as a man speaks.
715
+ Clicking and plopping sounds can be heard amid music.
716
+ A busy signal is heard, followed by a woman speaking, tapping sounds, and various human sounds and mechanisms.
717
+ Wind is blowing and a truck is reversing, honking, and making air brake sounds with people talking and giggling.
718
+ A plop sound is heard.
719
+ Mechanisms sound, a cat meows, a tap is heard, and a woman speaks over background noise.
720
+ A sink is filling or being washed, and a man speaks over music and bird calls.
721
+ Music and surface contact sounds mix with a ticking noise.
722
+ Wind is heard as a car makes tire squealing noises and something ticks.
723
+ Camera sounds are heard with background noise and ticking.
724
+ Car sounds, wind, men speaking, car horns, and conversation are heard.
725
+ Background noise blends with female speech.
726
+ An aircraft and video game sounds can be heard.
727
+ A man speaks, background noise, clicking, the man speaks again, breathing is heard, and the man speaks again.
728
+ A woman speaks, a dog barks, a man speaks, and crunching can be heard over background noise.
729
+ A mechanism beeps.
730
+ People are laughing and a woman is speaking over background noise.
731
+ Background noise with ticking sounds.
732
+ Wind and bleats are heard, along with microphone noise.
733
+ Background noise and breathing are heard, followed by multiple men speaking.
734
+ An ambulance siren and wind can be heard.
735
+ Wind blowing, mechanical fan sounds, breathing, and laughter are heard.
736
+ A mid-frequency engine makes ticking and beeping sounds.
737
+ People are making sounds, coughing, and breathing.
738
+ Scratch sounds are heard in the background noise.
739
+ Surface contacts, human speech, and ticks are heard over mechanisms.
740
+ A woman is speaking, breathing, and clicking.
741
+ Birds sing, gun fires, wind blows, man speaks, car honks.
742
+ The sound of a horse snorting is heard.
743
+ A mid-frequency engine and a vehicle horn are heard.
744
+ A telephone is dialing with ticking sounds and breathing in the background.
745
+ The sound of a sine wave.
746
+ Water is flowing.
747
+ Crickets are chirping.
748
+ An explosion takes place.
749
+ Wind, mechanisms, shouting, bird tweets, and an explosion are heard.
750
+ Traffic noise and water sounds mix with ticking and human voice.
751
+ Wind, arrows, and ticking can be heard while both male and female speech is present.
752
+ A firecracker is heard.
753
+ Various brief tones, reverberations, and animal sounds play.
754
+ The sound of a heartbeat is heard.
755
+ Mechanisms, coughing, and surface contact sounds are heard with ticking and breathing.
756
+ A man speaks and guns are fired with ticking sounds in the background.
757
+ Animals and background noise are heard repeatedly.
758
+ A sine wave and background noise are heard with beeps, ticks, slamming sounds, and breathing.
759
+ Mechanisms, tapping, bird flight and vocalization, and tweeting sounds can be heard.
760
+ Wind, human voices, and wind noise can be heard.
761
+ Video game sounds and music play with sound effects and clicking.
762
+ Music and animal sounds can be heard.
763
+ Background noise is heard while a heartbeat is heard repeatedly.
764
+ Wild animals are heard in the background.
765
+ Background music, video game sounds, and a man's voice are heard.
766
+ Clicking sounds occur with a ringing tone in the background.
767
+ A jet engine roars and sound effects play.
768
+ Footsteps are heard along with background noise, a busy signal, and music.
769
+ A heartbeat is heard repeatedly.
770
+ A loud slam is heard.
771
+ Music, background noise, and a woman speaking fill the background.
772
+ Babbling and female speech is heard with background noise and a child speaking.
773
+ A woman is speaking, birds are chirping, and a rooster is crowing.
774
+ Background noise and birds singing, with ticking sounds in between.
775
+ A person's heartbeats can be heard with background noise.
776
+ A speech synthesizer speaks as someone slaps and taps.
777
+ Ticking and surface contact accompany speech and computer typing.
778
+ An unknown sound effect is played.
779
+ Footsteps and music are heard.
780
+ A boing sound is heard.
781
+ Footsteps and a tap are followed by a rumble.
782
+ Video game sounds, footsteps, and a dial tone are heard.
783
+ A cat meows.
784
+ A boing sound is heard.
785
+ A noise is heard.
786
+ People are speaking and conversing over background noise.
787
+ Heartbeats and music fill the air.
788
+ Pattering sounds are made by mechanisms.
789
+ A man speaks, footsteps are heard, and a cow moos over background human sounds.
790
+ A toilet flush is followed by music and a woman speaking.
791
+ Background noise mixes with whispering and wind blowing.
792
+ Heartbeats, music, and a river flow as a man speaks.
793
+ People walk, speak, and make slapping sounds in a noisy environment.
794
+ Television plays amidst noise.
795
+ Background noise, male speech, and breaking sounds are heard.
796
+ Tapping and scratching can be heard over background noise.
797
+ Noise, speech, human sounds, and music can be heard with background noise.
798
+ Background noise and clicking sounds accompany men speaking.
799
+ Various breathing, mechanical, and conversational noises occur with medium engine and child speech sounds.
800
+ A woman and a man are speaking and laughing while a TV plays in the background.
801
+ Howling wind, noise, and mechanisms are heard.
802
+ Sound effects and surface contacts are heard over background noise.
803
+ Mechanisms are heard.
804
+ Mechanisms make ticking noises.
805
+ Cars honk their horns.
806
+ A woman speaks while scissors cut and there are tapping and surface contact sounds.
807
+ A buzzer is buzzing repeatedly.
808
+ There is a sine wave and bouncing sounds with music.
809
+ Sound effects, speech synthesizers speaking are heard.
810
+ Birds are vocalizing, an owl is heard, and surface contact is made.
811
+ Surface contact, background noise, and a doorbell are heard.
812
+ Beeping sounds repeat multiple times.
813
+ Mechanisms make a brief tone sound.
814
+ Music is playing.
815
+ Footsteps are heard.
816
+ Men are speaking, using a computer keyboard, and making speech sounds.
817
+ Music is playing.
818
+ Flapping wings of birds in flight.
819
+ People are breathing, laughing, and honking over wind, ice cream truck, and gasps.
820
+ An animal makes noise amongst clicks and sounds.
821
+ A brief tone is heard.
822
+ A man is speaking and laughing, with a door slamming sound and ticking in the background.
823
+ Music can be heard.
824
+ Whispering and ticking with background noise.
825
+ Radios beep and play while an alarm and police car siren sound.
826
+ A bicycle bell rings and mechanisms are heard.
827
+ Mechanisms, beeping, and a camera sound are heard.
828
+ There is background noise along with ticking and drinking sounds.
829
+ A water tap runs with background noise.
830
+ A man speaks among background noise.
831
+ Mechanisms beep and objects make contact with surfaces.
832
+ Various sound effects are heard with occasional animal sounds.
833
+ Music is playing.
834
+ Music plays with whispering and heartbeat sounds in the background.
835
+ A machine is working as a woman speaks.
836
+ Sound effects are being played.
837
+ A speech synthesizer produces sound effects with background noise.
838
+ Children are speaking, sneezing, and breathing in a noisy environment.
839
+ An explosion and music can be heard.
840
+ Wind, chewing, bird songs, brief tones, human voices, and coughing are heard.
841
+ Background noise and female speech mix with barking and panting dog sounds.
842
+ Sniffing and barking are heard amidst human sounds and background noise.
843
+ Wind, ticks, a brief tone, male speech, a rooster, and crowing are heard.
844
+ A printer and surface contact are heard in the background.
845
+ An ice cream truck plays music while a man sings.
846
+ Vehicles honk as mechanisms make noise, followed by a dripping sound.
847
+ A dog barks with sound effects and background noise.
848
+ A ding-dong and mechanisms can be heard.
849
+ Men speaking, breathing, and clicking accompany background noise.
850
+ A bell is ringing and background noise and wind noise is heard.
851
+ Background noise and birds singing are heard, with occasional sound effects.
852
+ A sine wave is heard.
853
+ Pulses alternate with background noise and a sound effect.
854
+ An explosion causes glass to shatter, followed by an eruption.
855
+ Music, creaking, slamming, and thunking can be heard.
856
+ Laughter and beeping sounds are heard in the background while a woman speaks.
857
+ Music is playing, with a man running and panting, a slapping sound, background noise, and a car passing by.
858
+ Whispering and ticking sounds are heard with background noise.
859
+ A heartbeat sound is being recorded.
860
+ Lions are heard roaring with background noise.
861
+ People are talking and making various sounds.
862
+ A man is speaking with background noise and ticking sounds.
863
+ A conversation takes place on a rowboat over water and ticking sounds.
864
+ A continuous sine wave sound.
865
+ A mid-frequency engine is heard, surface contact is made, zippers are being used, breathing is heard, and a car is accelerating and revving.
866
+ A dog is barking, people are speaking, and arrows are being shot with background noise, bird chirping, and footsteps.
867
+ Wind and animal sounds mix with human voices and a bleat sound.
868
+ A man is speaking and making beeping and mechanical sounds.
869
+ Music and various sound effects are playing, including ding sounds.
870
+ Men speak and mechanisms and clicking sounds are heard.
871
+ Mechanisms, arrow sounds, crickets, and human voices are heard.
872
+ A man speaks over a crowd of people speaking.
873
+ A telephone is ringing, people are talking and laughing, and a child is speaking.
874
+ Sonar signals are heard over background noise.
875
+ Wind is blowing and a horse is whinnying.
876
+ Music is playing and machine guns are firing.
877
+ A man speaks and a tuning fork is struck amid background noise.
878
+ A bell reverberates in the air.
879
+ Heart sounds are heard with background noise.
880
+ Music and echoes, mechanisms, and men speaking can be heard with footsteps.
881
+ Mechanisms, female speech, and music play.
882
+ Mechanisms make ticking noises and music is playing.
883
+ Hunting tools are used to call ducks as wind blows and a man speaks.
884
+ Mechanisms and laughter are heard as a pig oinks and breathing and music are heard.
885
+ A civil defense siren and a sound effect are played.
886
+ Water is flowing, mechanisms are operating.
887
+ Insects are chirping and an arrow is heard while a person is speaking.
888
+ Heartbeats and background noise are heard with a sound effect.
889
+ A woman is speaking and clicking sounds are heard.
890
+ A sine wave accompanies thumping, surface contact, and clicking noises.
891
+ Men speaking, mechanisms, water, and surface contact can be heard.
892
+ An alarm clock goes off amidst background noise.
893
+ A woman is speaking and music is playing, with the sound of a horse-drawn carriage heard.
894
+ There is booing and a sound effect.
895
+ Background noise and ticks are heard.
896
+ Gobbling and sound effects are heard.
897
+ Footsteps, a sheep's bleat, and a plop are heard.
898
+ Noise and sound effects accompany clicking.
899
+ A man speaks, types, and breathes near a computer keyboard.
900
+ A sound effect is heard.
901
+ Sound effects are heard in quick succession.
902
+ Taps and chirping birds are heard, as well as the sound of the wind and surface contact.
903
+ An emergency vehicle is in operation with wind noise and bird sounds.
904
+ Wind, a speedboat, and human voices are heard.
905
+ Music and telephone bells ringing.
906
+ Music and surface contact.
907
+ Music is playing.
908
+ Mechanisms and ticking sounds accompany filing and surface contact.
909
+ Wind, human voices, and female speech can be heard along with wind noise.
910
+ A doorbell rings and dogs are barking.
911
+ Sound effects occur repeatedly.
912
+ A person is gasping, talking, and breathing.
913
+ A woman is singing and tapping with background noise.
914
+ A woman is speaking.
915
+ Mechanisms, tapping, and male speech with grunts are heard.
916
+ Wind blows and birds sing and bleat.
917
+ Background noise and cap guns are firing.
918
+ A man speaks while mechanisms tick, followed by conversation and a woman speaking.
919
+ A man is speaking with mechanisms and a dial tone sound.
920
+ Crickets chirp amidst background noise.
921
+ Noise, dial tones, and music with a coin drop are present.
922
+ Wind noise is heard, an arrow is shot, female speech and conversation are heard, and ticking is heard.
923
+ Background noise and the sounds of horse hooves, with more background noise.
924
+ A groan is heard.
925
+ Surface contact, clicking and typewriter sounds are heard.
926
+ Surface contact and chopping sounds with mechanisms.
927
+ Water is boiling and there is breathing in the background.
928
+ A man is speaking over background noise.
929
+ Whispering and female speech are accompanied by breathing.
930
+ Background noise, mechanisms, and a telephone ringing accompany barking, speech synthesizer and dialing sounds.
931
+ There is crinkling, mechanical fan noise, and surface contact.
932
+ Only mechanisms sounds are heard.
933
+ A dog barks and a pig squeals while some background noise and an animal sound can be heard.
934
+ Mechanisms clank, an animal makes a noise, and a woman speaks.
935
+ A printer makes clicking sounds amidst background noise and beeps.
936
+ Background noise and surface contact sounds are heard, along with the sound of an arrow being shot.
937
+ Laughter, beeps, and speech are interrupted by telephone ringing and tapping sounds.
938
+ People are whispering, breathing and mechanisms are making sounds.
939
+ Heartbeats are heard with ticking sounds in the background.
940
+ A ding sound is heard.
941
+ Mechanisms clack, and people clap.
942
+ Footsteps, chirping birds, rustling leaves, and animal sounds are heard in the background.
943
+ Wind noise, crows, and a barking dog can be heard in the background.
944
+ A medium-frequency engine can be heard, and people are speaking and whispering near bleating animals.
945
+ Background noise with a heartbeat rhythm is present.
946
+ Background noise is present with sounds of surface contact and human voices.
947
+ A bird is cooing and chirping with wind noise and human voices.
948
+ Wind, speaking, and tapping are heard over background noise and conversations.
949
+ People are walking, laughing, breathing, and speaking with wind, bird calls, and a vehicle in the background.
950
+ Background noise and heartbeat sounds are heard.
951
+ Music is playing.
952
+ Crumpling sounds are heard in the background.
953
+ Music, man's speech, shouting, and slamming can be heard.
954
+ Background noise is present before a buzzer sounds.
955
+ A man is speaking over music and human sounds.
956
+ A variety of sound effects play over background noise and music.
957
+ The wind blows while the heartbeat can be heard.
958
+ Music plays on television as a man breathes, sings, snores, laughs, and breathes.
959
+ A woman is speaking while ticks are heard in the background.
960
+ A video game is being played with shots being fired.
961
+ A train is moving and a car is honking.
962
+ Mechanisms are heard, and turkeys are making sounds, with wind and ticking in the background.
963
+ A rumble is heard, followed by heartbeats.
964
+ Men are speaking, music is playing, and whistling and choir can be heard.
965
+ Birds are chirping and rustling, and wind is blowing.
966
+ Busy signal, clicking, and breathing with background noise.
967
+ Tap dancing and music play while a human voice is heard.
968
+ Conversations and mechanisms can be heard with a cat meowing.
969
+ An eruption occurs, and ticking can be heard amidst speech.
970
+ Men are speaking, clicking, and typing on a computer with breathing sounds.
971
+ A man is speaking with turkey sounds and background noise.
972
+ Men are speaking and having a conversation.
973
+ Background noise, telephone bells ringing, and a man is speaking.
974
+ Water trickles while an engine hums.
975
+ Drips and surface contact sounds occur.
976
+ A man is speaking, writing and tapping, and there is background noise.
977
+ Continuous surface contact sounds and background noise heard.
978
+ Breathing and whistling are heard in background noise.
979
+ Running sounds and a boing sound can be heard.
980
+ A speech synthesizer, echo, glass shatter, and more speech synthesizer are heard.
981
+ A person sighs.
982
+ Cats making noise, background noise, and human voices.
983
+ A man is speaking on the phone with background noise.
984
+ Music plays continuously.
985
+ Spray is heard, along with background noise and bird songs.
986
+ People are speaking and chopping, with breathing, laughter, footsteps, and clapping.
987
+ Music and hoots play as a man sings and a speech synthesizer can be heard.
988
+ Clicking and whooshing noises occur.
989
+ Pigs squeal, wind blows, and men are talking.
990
+ Background noise is heard while whispering and breathing can be heard, followed by a busy signal.
991
+ An explosion is heard.
992
+ Music is playing.
993
+ Men are speaking over background noise and breathing is heard.
994
+ A mechanical fan is running.
995
+ Beeping, ticking mechanisms are heard.
996
+ A man speaks and taps are heard amidst breathing and ticking sounds.
997
+ People are shouting amidst music and human voices.
998
+ Breathing, ticking, jangling keys, and various surface contacts are heard in the background.
999
+ Tools and mechanisms are present, with a man speaking.
1000
+ Mechanisms produce background noise.
1001
+ A cat is purring and background noise is heard.
1002
+ Background noise and human sounds are heard.
1003
+ Water is ticking and splashing as a man speaks and laughter is heard.
1004
+ An effects unit is being used with background noise and ticking sounds.
1005
+ People are gargling and speaking with human sounds in the background.
1006
+ People are eating and making noises with their cutlery and dishes while background noise is heard.
1007
+ A power tool and ticking sounds are heard.
1008
+ Sound effects are heard repeatedly.
1009
+ A person is sneezing, breathing, and a dog barks.
1010
+ A sine wave sound is heard.
1011
+ Laughter, typing, a man speaking, sound effects, and mechanisms.
1012
+ Music and noise are heard.
1013
+ An alarm, ticking, speech, and background noise are heard.
1014
+ A ticking noise is heard.
1015
+ A man is speaking and shuffling cards.
1016
+ Bells are ringing in a change ringing pattern.
1017
+ Music is playing.
1018
+ A woman and a man speak, with background noise in the background.
1019
+ A man speaks as a housefly buzzes and makes surface contact.
1020
+ Human voices and music can be heard.
1021
+ Heartbeats are heard repeatedly.
1022
+ An aircraft engine is heard with wind noises and bird vocalizations.
1023
+ Bleats and barks are heard with background noise, and the pattern repeats.
1024
+ An emergency vehicle is speeding past, revving its engine.
1025
+ A knock is heard.
1026
+ Whistling, background noise, clicks, and keyboard sounds are heard, followed by speech.
1027
+ Mechanisms are being operated continuously.
1028
+ A mechanical fan and wind are heard.
1029
+ A crack is heard.
1030
+ Music is heard against background noise.
1031
+ Mechanisms are moving, birds are flying and singing.
1032
+ A bell is ringing and mechanisms are heard.
1033
+ There is wind noise and a man is speaking.
1034
+ A camera is in use, with occasional ticking and background noise.
1035
+ Wind is blowing, a gunshot is heard, a tick is heard, and a sound effect is heard.
1036
+ Noise and sound effects are heard.
1037
+ Only music is heard.
1038
+ Sound effects and beeps are heard while music plays.
1039
+ A man speaks over music with a pulsing beat.
1040
+ Wind is blowing, and a basketball is bouncing.
1041
+ A woman is speaking, glass shatters, and breathing can be heard.
1042
+ A background noise is heard with yells and rattles.
1043
+ Keyboard typing, background noise, and men speaking are heard.
1044
+ A cat is purring, with clicking and background noise.
1045
+ Music plays while liquid drips and various sound effects are heard.
1046
+ A sigh is heard.
1047
+ People are chewing and slurping while whispering and making surface contact.
1048
+ A man is speaking with music and whale vocalizations are heard.
1049
+ Wind and mechanisms are heard with heartbeat sounds.
1050
+ A bird is singing, barking dogs are heard, and an arrow is shot.
1051
+ Mechanisms are moving with snoring.
1052
+ A ding, sound effect, and background noise are heard.
1053
+ Breathing and sniffing sounds can be heard in the background.
1054
+ Background music is playing.
1055
+ People are clicking and typing, with a man speaking occasionally.
1056
+ Beeps repeatedly sound.
1057
+ A man is speaking and zipping his clothing in a small room with monologue narration and breathing.
1058
+ Sound effects and clapping are interspersed with occasional farts.
1059
+ There is silence in a small room.
1060
+ Breathing is heard.
1061
+ Beeps are repeating.
1062
+ A door opens and closes with footsteps and a sigh in a large room or hall.
1063
+ Bicycle bells ring with background noise.
1064
+ A man and a woman are speaking, with music and silence in between.
1065
+ Wind noise is heard in the countryside.
1066
+ People chew and tap sounds are made.
1067
+ A telephone bell ringing, tapping, and a man speaking are heard.
1068
+ Mechanisms and a printer are heard.
1069
+ Sound effects play.
1070
+ Only the sound of a heartbeat is heard.
1071
+ The sound of a bell is heard.
1072
+ Fire is burning and people are speaking with spraying mechanisms.
1073
+ Whooshes, swooshes, and swishes are heard repeatedly.
1074
+ Liquid is heard, followed by thumps and tapping, and then men speaking.
1075
+ A man is speaking and hooting sounds are heard.
1076
+ Men are speaking and using a computer keyboard.
1077
+ Men speak and a camera makes sounds.
1078
+ Scissors cutting are heard in background noise.
1079
+ The sound of humming fills the air.
1080
+ Sound effects play continuously.
1081
+ A chirp tone is heard.
1082
+ A heart is beating with ticking sounds.
1083
+ Music and sound effects are playing.
1084
+ Something is breaking with shouting and screaming.
1085
+ Men are speaking and a printer is working with mechanisms.
1086
+ People whisper, breathe, chew, and crunch with wind noise.
1087
+ Dogs are barking and birds are chirping in the background.
1088
+ Sound effects and animal noises are heard.
1089
+ Coins are dropped, and objects are tapped and contacted with a surface.
1090
+ A church bell rings over background noise.
1091
+ Silence is interrupted by a man speaking and more speech.
1092
+ Background noise is heard, interrupted by chopping sounds.
1093
+ Men and a woman are speaking and croaks are heard.
1094
+ Water is flowing with clicking sounds.
1095
+ Music with sound effects and women speaking.
1096
+ Background music is playing with sound effects and croaking.
1097
+ A tuning fork is being struck multiple times in a small room.
1098
+ Bells are ringing and a man is speaking while wind noise is heard.
1099
+ Footsteps are heard with a woman speaking.
1100
+ Music is playing.
1101
+ Soft music is playing.
1102
+ Sound effects, silence, speech and a child speaking are heard with occasional plops.
1103
+ Music can be heard.
1104
+ Footsteps are heard in a small room, followed by silence and more footsteps, then a camera clicking.
1105
+ Human voice and a rumble are heard.
1106
+ Heartbeats alternate with noise, clicking, and more heartbeats.
1107
+ A noise is followed by a ding.
1108
+ White noise is being generated.
1109
+ Music plays with liquid flowing in the background.
1110
+ Wind is blowing, a man is speaking on a rowboat, splashing and splattering sounds with animal sounds and laughter.
1111
+ Silence is broken by beeps.
1112
+ The sound of a mechanical fan and ticking and tapping sounds can be heard.
1113
+ Music is playing, with a man speaking and tapping in the background.
1114
+ Wolves are howling.
1115
+ Music is playing.
1116
+ A sound effect, tap and creaking.
1117
+ Music and a chorus effect are heard.
1118
+ A croak is heard over background noise.
1119
+ A man is speaking and a ticking sound is heard.
1120
+ Scraping sounds are heard multiple times.
1121
+ Televisions, female speech, and the sound of rats can be heard in a small room.
1122
+ Music is playing.
1123
+ A chirp tone can be heard.
1124
+ Men are speaking, sighing, and walking.
1125
+ Wind noise can be heard in a field recording.
1126
+ Whispers, speech, and biting sounds are heard.
1127
+ Bicycles ring their bells and wind noise can be heard.
1128
+ A stomach is rumbling.
1129
+ Tapping, motor vehicles, speech, laughter, and ticking sounds can be heard in the background.
1130
+ Silence, drilling, tooling, and power tool sounds are heard.
1131
+ A church bell is ringing with an echo and music is playing.
1132
+ A bell is ringing.
1133
+ Microphone wind noise and machine gun fire heard.
1134
+ Music is playing.
1135
+ Breathing and water sounds can be heard.
1136
+ A woman is speaking and pouring liquid, with ticking and glass clinking heard.
1137
+ A series of beeps can be heard repeatedly.
1138
+ A woman speaks and speech synthesizer, speech, and sound effects are heard.
1139
+ Men speak and tap while turkeys vocalize.
1140
+ A man speaks while slapping and thumping sounds are heard with laughter.
1141
+ A telephone line is silent until a busy signal is heard.
1142
+ A slapping or smacking sound is heard.
1143
+ A buzzer is heard.
1144
+ Mechanisms and animal sounds with chirping birds are heard.
1145
+ Silence alternates with beeps.
1146
+ Music is played with a chorus effect.
1147
+ A subway door is opening and closing with a ding-dong sound.
1148
+ A woman whispers and speaks with human sounds and breathing in the background.
1149
+ Mechanisms and ticking are heard as a man speaks.
1150
+ Sounds in a small room.
1151
+ Mechanisms, people speaking, squealing, and conversation are heard.
1152
+ A man speaks, then there is silence followed by music.
1153
+ A snake rattles, people are speaking, and breathing is heard with wind blowing into the microphone.
1154
+ An alarm clock goes off and the sound of tapping and ticking follows.
1155
+ A microwave beeps in a quiet room with sounds of mechanisms.
1156
+ Coughing, speech, clicking sounds, and breathing can be heard.
1157
+ A tuning fork in a small room.
1158
+ Music is playing in the background.
1159
+ Heartbeats are heard.
1160
+ Sound effects, whooshes, and male speech occur with breathing.
1161
+ Music is playing.
1162
+ A person is breathing and sound effects can be heard.
1163
+ Heartbeat sounds can be heard.
1164
+ Skateboards are being ridden.
1165
+ A sine wave is being played.
1166
+ A child is speaking, sheep are bleating, and a man is tapping and laughing.
1167
+ A cat purrs and meows as a ticking sound continues.
1168
+ Music is playing.
1169
+ A tuning fork is ringing in a small room.
1170
+ Writing and speech can be heard in a small room with narration and monologue.
1171
+ Roars and growls are heard.
1172
+ Animals and rodents are making noise in a small room with laughter.
1173
+ Silence alternates with sine wave sounds.
1174
+ A mechanical fan runs.
1175
+ Sonar is the only sound.
1176
+ Tapping precedes music.
1177
+ Clicking sounds play rhythmically with music.
1178
+ Whale vocalization is audible.
1179
+ Whacking noises and bouncing sounds are heard in a small room.
1180
+ A sine wave is produced.
1181
+ Animal sounds mix with beeps and a man's speech.
1182
+ Bird calls punctuate a siren's wail.
1183
+ Taps and whacks produce rhythmic sounds.
1184
+ A hammer strikes in silence.
1185
+ Laughter and speech mix with television and breathing sounds.
1186
+ There are various sound effects and rumbling noises.
1187
+ Computer keyboards click repeatedly.
1188
+ Music plays in a small room with a chorus effect.
1189
+ Stomach rumbling and a tap sound can be heard.
1190
+ Beeping and busy signals are heard during telephone calls, along with speech from a man.
1191
+ Whispering accompanies the sound of chewing.
1192
+ There is speech and tick-tock sounds with occasional scraping.
1193
+ A coin drops.
1194
+ A sine wave is heard.
1195
+ Music plays with shouting, gunshots, glass shattering, clapping, and singing.
1196
+ Music is heard.
1197
+ Scissors cut while tapping occurs and women speak.
1198
+ Laughter is followed by the sound of a fart and noise made by a human voice.
1199
+ Music plays near the ocean with the sound of a bell in the distance.
1200
+ Ticking sounds are repeated many times.
1201
+ The music plays intermittently among silences.
1202
+ Babbling is followed by silence.
1203
+ Music plays with background noise.
1204
+ Wind, shouting, clicking and speech can be heard.
1205
+ Gunshots and tapping sounds can be heard.
1206
+ Music plays inside a large room or hall.
1207
+ There is silence and a sound effect.
1208
+ An engine runs as animals walk, pant, and breathe with children's speech and footsteps.
1209
+ Domestic animals purr in a small room.
1210
+ Farts and a beep sound.
1211
+ There are various sounds including silence, television, music, and sound effects.
1212
+ Camera mechanisms click.
1213
+ Conversations, laughter, and breathing are heard along with frog and animal sounds.
1214
+ Sound effects including farts are present.
1215
+ A person is breathing, and a telephone is ringing while conversations occur.
1216
+ Speech and breathing can be heard along with computer keyboard sounds in a small room.
1217
+ A cat meows twice.
1218
+ There is silence and then music starts playing.
1219
+ The sounds of mechanisms and chewing are heard with breathing and tapping.
1220
+ Someone is screaming.
1221
+ Taps and alarm clocks make repetitive sounds.
1222
+ Wind and water make noise while a man speaks.
assets/demo/prompts.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ A group of anthropomorphic mushrooms having a disco party in the middle of a dark enchanted forest, with glowing neon lights and exaggerated dance moves, their smooth textures and reflective surfaces emphasizing a comical 3D look.
2
+ A panda bear with distinct black patches climbs and rests on a wooden log platform amid lush, natural foliage.
3
+ A vibrant green parrot with hints of yellow and blue perches on a person’s lap, who is wearing grey pants. The parrot features a white beak, grey head, and a black eye. In the background, a red couch and a TV displaying a colorful video with “bilibili” text complete the scene.
4
+ A black-and-white film captures a pianist playing in an empty, decaying theater. His deft fingers and echoing music create a haunting ambiance as dust motes float in the faint light. The gleaming grand piano under the spotlight contrasts with the worn seats and peeling walls, evoking nostalgia.
5
+ Chinese ancient style, realism. A young woman, dressed in an embroidered red qipao, walks along the ancient streets of a bustling Chinese town. The red lanterns hanging above her sway gently in the evening breeze, and her calm, confident stride contrasts with the lively atmosphere of merchants and performers around her.
6
+ A tomato surfing on a piece of lettuce down a waterfall of ranch dressing, with exaggerated surfing moves and creamy wave effects to highlight the 3D animated fun.
7
+ A man in a gray hoodie and a woman in a light gray jacket jog along a residential sidewalk, smiling and chatting. They pass a beige house with a vibrant garden and street lamp on a bright, sunny day. The medium shot captures their movement amid lush greenery, creating a serene, cinematic scene.
8
+ A coastal landscape painting with a prominent archway is displayed on an easel in a bright studio. A camera pan reveals a table cluttered with art supplies and a potted plant, enhancing the artistic vibe. Large windows and soft natural lighting create a cozy, creative atmosphere.
9
+ A scene from disaster movie.
10
+ A candid medium shot captures a woman in a white car, wearing glasses, a yellow top, and a black jacket, with her arm resting on the open window. Behind her, a stone-faced house surrounded by lush greenery basks in natural sunlight, creating a warm and realistic scene.
11
+ Two women sit on a beige couch in a cozy, warmly lit room with a brick wall backdrop. They engage in a cheerful conversation, smiling and toasting red wine in an intimate medium shot.
12
+ A woman with her hair in a bun walks along a city sidewalk, gently touching a lush hedge. Dressed in a plaid jacket and beige pants with a tan backpack, her calm presence is captured in natural daylight against an urban backdrop.
13
+ A breathtaking aerial view shows a river winding like a dark ribbon through lush fields and hills, reflecting the soft pink-orange hues of sunrise or sunset in a serene, picturesque landscape.
14
+ A man performs push-ups on a wooden bench in a sunny park, captured from a side angle in a medium shot. The focus is on his upper body and technique, with natural sunlight accentuating the scene. Lush greenery and distant park-goers contribute to the energetic, realistic setting.
15
+ A playful dog in a pink coat with a red leash dashes across a muddy field with sparse crops. The camera tracks its energetic movement from right to left against a backdrop of trees and distant power lines under an overcast sky. The realistic, medium shot captures a candid, lively moment in soft, diffused light.
16
+ A drone camera circles a historic church on a rocky outcrop along the Amalfi Coast, highlighting its stunning architecture, tiered patios, and the dramatic coastal views with waves crashing below and people enjoying the scene in the warm afternoon light.
assets/docs/data.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Data Preparation
3
+
4
+
5
+ ### Stage1 - JavisDiT-audio
6
+
7
+
8
+ In this stage, we only need audio files to initialize the audio generation capability:
9
+
10
+ | path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text| audio_text|
11
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
12
+ | placeholder.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | placeholder | yyy |
13
+
14
+ Download the audios (including [AudioCaps](https://drive.google.com/file/d/16J1CVu7EZPD_22FxitZ0TpOd__FwzOmx/view?usp=drive_link), [VGGSound](https://huggingface.co/datasets/Loie/VGGSound), [AudioSet](https://huggingface.co/datasets/agkphysics/AudioSet), [WavCaps](https://huggingface.co/datasets/cvssp/WavCaps), [Clotho](https://zenodo.org/records/3490684), [ESC50](https://github.com/karolpiczak/ESC-50?tab=readme-ov-file#download), [MACS](https://zenodo.org/records/2589280), [UrbanSound8K](https://urbansounddataset.weebly.com/urbansound8k.html), [MusicInstrument](https://www.kaggle.com/datasets/soumendraprasad/musical-instruments-sound-dataset), [GTZAN](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification), etc.), and put them into the same folder `/path/to/audios`. Follow the commands to automatically generate a `train_audio.csv` for configuration:
15
+
16
+ ```bash
17
+ ROOT_AUDIO="/path/to/audios"
18
+ ROOT_META="./data/meta/audio"
19
+
20
+ # 1.1 Create a meta file from a unified audio folder. This should output ${ROOT_META}/meta.csv
21
+ python -m tools.datasets.convert audio ${ROOT_AUDIO} --output ${ROOT_META}/meta.csv
22
+
23
+ # 1.2 Get audio information. This should output ${ROOT_META}/meta_ainfo.csv
24
+ python -m tools.datasets.datautil ${ROOT_META}/meta.csv --audio-info
25
+
26
+ # 2.1 Trim audios within 30 seconds. This should overwrite the raw audios by default and output ${ROOT_META}/meta_ainfo_trim30s.csv
27
+ python -m tools.datasets.datautil ${ROOT_META}/meta_ainfo.csv --trim-audio 30
28
+
29
+ # 2.2 Unify the sample rate to 16k Hz for all audios. This should output ${ROOT_META}/audio_meta_trim30s_sr16000.csv
30
+ python -m tools.datasets.datautil ${ROOT_META}/meta_ainfo_trim30s.csv --resample-audio --audio-sr 16000
31
+
32
+ # 3.1 Set dummy videos. This should output ${ROOT_META}/audio_meta_trim30s_sr16000_dummy_videos.csv
33
+ python -m tools.datasets.datautil ${ROOT_META}/audio_meta_trim30s_sr16000.csv --dummy-video
34
+
35
+ # 3.2 Get training meta csv. This should output ${ROOT_META}/train_audio.csv
36
+ python -m tools.datasets.find_audio_ds all \
37
+ --data_root ${ROOT_AUDIO} \
38
+ --meta_file ${ROOT_META}/audio_meta_trim30s_sr16000_dummy_videos.csv \
39
+ --save_file ${ROOT_META}/train_audio.csv
40
+ ```
41
+
42
+ ### Stage2 - JavisDiT-prior
43
+
44
+ As detailed in our [paper](https://arxiv.org/pdf/2503.23377), the prior estimator is trained with the contrastive learning paradigm.
45
+ We take the extracted spatio-temporal priors as **anchor**, view the paired audio-video in the training datasets as **positive samples**, and randomly augment the audio or video to construct asynchronized audio-video pairs as **negative samples**.
46
+ In particular, spatial- and temporal-asynchronization are separately generated.
47
+
48
+ | path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text | unpaired_audio_path |
49
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
50
+ | /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy | /path/to/zzz.wav |
51
+
52
+ #### Ground-truth synchronized audio-video pairs
53
+
54
+ Follow the instructions in [Stage3](#stage3---javisdit-jav) to read the audio-video information from training dataset (e.g., [TAVGBench](https://github.com/OpenNLPLab/TAVGBench)).
55
+ The obtained basic meta file can be `/path/to/train_jav.csv`.
56
+
57
+
58
+ #### Offline asynchronized audio generation
59
+
60
+ Given a synchronized audio-video pair, we efficiently construct asynchronized audio-video pairs by generating standalone audios from [AudioLDM2](https://github.com/haoheliu/AudioLDM2) without reference videos.
61
+ The native text description, native video, and generated audio jointly contribute to an asynchronized (negative) sample for contrastive learning.
62
+ Generated audio paths will be recorded in the `unpaired_audio_path` column.
63
+
64
+ ```bash
65
+ ROOT_META="./data/meta/prior"
66
+
67
+ CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4\
68
+ tools/st_prior/gen_unpaired_audios.py \
69
+ --input_meta ${ROOT_META}/train_jav.csv \
70
+ --output_dir ./data/st_prior/audio/unpaired \
71
+ --output_meta ${ROOT_META}/train_prior.csv \
72
+ --match_duration
73
+ ```
74
+
75
+ #### Online asynchronized audio-video augmentation
76
+
77
+ This part is implemented in `javisdit/datasets/augment.py`, where we developed various spatial/temporal augmentations for video/audio samples independently to construct spatially/temporally asynchronized audio-video pairs.
78
+ For implementation details please kindly refer to our [paper](https://arxiv.org/pdf/2503.23377) and [code](javisdit/datasets/augment.py), and here we introduce the data preparation to perform corresponding augmentations:
79
+
80
+ - Auxiliary Video Resource ([SA-V](https://ai.meta.com/datasets/segment-anything-video/))
81
+
82
+ For video spatial augmentation, one of the efficient approaches is to randomly add a sounding-object's masklet into a video sequence, causing spatial asynchrony between video and audio pairs.
83
+ Here we take the training set of [SA-V](https://ai.meta.com/datasets/segment-anything-video/) to collect native object masklets at 6fps:
84
+
85
+ ```
86
+ data/st_prior/video/SA_V/
87
+ ├── sav_train
88
+ │ ├── sav_000
89
+ │ ├── sav_001
90
+ │ └── sav_002
91
+ ```
92
+
93
+ Then, we utilize [GroundedSAM](https://github.com/zhengyuhang123/GroundedSAM.git) to extend 6fps annotations to 24fps masklets:
94
+
95
+ ```bash
96
+ mkdir third_party && cd third_party
97
+
98
+ git clone https://github.com/zhengyuhang123/GroundedSAM.git
99
+
100
+ cd GroundedSAM
101
+
102
+ export AM_I_DOCKER=False
103
+ export BUILD_WITH_CUDA=True
104
+
105
+ python -m pip install -e segment_anything
106
+ pip install --no-build-isolation -e GroundingDINO
107
+
108
+ wget -P EfficientSAM/ https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_sam.pt
109
+
110
+ cd ../../
111
+
112
+ ls data/st_prior/video/SA_V/sav_train/sav_*/*.mp4 > data/st_prior/video/SA_V/sa_v_list.txt
113
+
114
+ CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 \
115
+ tools/st_prior/get_masklets.py \
116
+ --data_path data/st_prior/video/SA_V/sa_v_list.txt \
117
+ --output_dir data/st_prior/video/SA_V/crops
118
+
119
+ ls data/st_prior/video/SA_V/crops/*.json > data/st_prior/video/SA_V/crops/pool_list.txt
120
+ ```
121
+
122
+ The extracted masklets will be stored as:
123
+ ```
124
+ data/st_prior/video/SA_V/crops/
125
+ ├── pool_list.txt
126
+ ├── sav_000001_mask_000.mp4
127
+ ├── sav_000001_masklet_000.mp4
128
+ ├── sav_000001_meta_000.json
129
+ ├── sav_000002_mask_000.mp4
130
+ ├── sav_000002_mask_001.mp4
131
+ ├── sav_000002_masklet_000.mp4
132
+ ├── sav_000002_masklet_001.mp4
133
+ ├── sav_000002_meta_000.json
134
+ ├── sav_000002_meta_001.json
135
+ ├── ...
136
+ ```
137
+
138
+ - Auxiliary Audio Resource ([AudioSep](https://github.com/Audio-AGI/AudioSep))
139
+
140
+ After separating audio sources from original audio files, we can apply arbitrary addition and deletion operations on audios to introduce spatial asynchrony between video and audio pairs:
141
+
142
+ ```bash
143
+ cd third_party
144
+ git clone https://github.com/Audio-AGI/AudioSep.git
145
+
146
+ CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 \
147
+ tools/st_prior/sep_audios.py \
148
+ --audio_path /path/to/TAVGBench \
149
+ --output_path ./data/st_prior/audio/TAVGBench
150
+
151
+ ls data/st_prior/audio/TAVGBench/*.wav > data/st_prior/audio/TAVGBench/pool_list.txt
152
+ ```
153
+
154
+
155
+ ### Stage3 - JavisDiT-jav
156
+
157
+ Here we provide an example with [TAVGBench](https://github.com/OpenNLPLab/TAVGBench) to prepare video-audio-text triplets for training. You can easily transfer to your own datasets.
158
+
159
+ | path | id | relpath | num_frames | height | width | aspect_ratio | fps | resolution | audio_path | audio_fps | text|
160
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | ---|
161
+ | /path/to/xxx.mp4 | xxx | xxx.mp4 | 240 | 480 | 640 | 0.75 | 24 | 307200 | /path/to/xxx.wav | 16000 | yyy |
162
+
163
+ With our cleaned [`release_captions_clean.txt`](https://huggingface.co/datasets/JavisDiT/TAVGBench_clean/tree/main) file, the following script will automatically generate a `train_jav.csv` for configuration:
164
+
165
+ ```bash
166
+ ROOT_VIDEO="/path/to/videos"
167
+ ROOT_META="./data/meta/TAVGBench"
168
+
169
+ fmin=10 # minimal frames for each video
170
+
171
+ # 1.1 Create a meta file from a video folder. This should output ${ROOT_META}/meta.csv
172
+ python -m tools.datasets.convert video ${ROOT_VIDEO} --output ${ROOT_META}/meta.csv
173
+
174
+ # 1.2 Get video information and remove broken videos. This should output ${ROOT_META}/meta_info_fmin${fmin}.csv
175
+ python -m tools.datasets.datautil ${ROOT_META}/meta.csv --info --fmin ${fmin}
176
+
177
+ # 2.1 Unify FPS to 24 Hz for all videos. This will change the raw videos, and output ${ROOT_META}/meta_info_fmin${fmin}_fps24.csv
178
+ python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}.csv --uni-fps 24 --overwrite
179
+
180
+ # 2.2 Extract audios from videos, and fix the sample rate to 16k Hz for all audios. This should output ${ROOT_META}/meta_info_fmin${fmin}_fps24_au_sr16000.csv
181
+ python -m tools.datasets.datautil ${ROOT_META}/meta_info_fmin${fmin}_fps24.csv --extract-audio --audio-sr 16000
182
+
183
+ # 3.1 Get training meta csv. This should output ${ROOT_META}/train_jav.csv
184
+ python -m tools.datasets.find_jav_ds tavgbench \
185
+ --meta_src /path/to/TAVGBench_clean/release_captions_clean.txt \
186
+ --meta_file ${ROOT_META}/meta_info_fmin${fmin}_fps24_au_sr16000.csv \
187
+ --save_file ${ROOT_META}/train_jav.csv
188
+ ```
189
+
190
+ If you get multiple data sources, just merge the csv files to a single one:
191
+ ```bash
192
+ python -m tools.datasets.datautil ds1.csv ds2.csv ... --output /path/to/output.csv
193
+ ```
assets/image/JavisDiT-framework-resized.png ADDED

Git LFS Details

  • SHA256: 5bc4e9c38d8e249568a982151f52db29d47a99a3d1b11cb97b55cefe737339cc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.75 MB
assets/image/JavisDiT-intro-resized.png ADDED

Git LFS Details

  • SHA256: 03bd381cfe4945dbd85b2680bb199469c015c93bebc49ab05f096de684df7e4a
  • Pointer size: 132 Bytes
  • Size of remote file: 9.24 MB
assets/image/logo.png ADDED

Git LFS Details

  • SHA256: a04bc6126c6bf8f3c6e92815ba203ac8c5950ad360c85d3341f06dbbe98d5fdc
  • Pointer size: 131 Bytes
  • Size of remote file: 241 kB
assets/src/funasr_utils_load_utils.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import json
4
+ from io import BytesIO
5
+ import torch.distributed as dist
6
+ import numpy as np
7
+ import kaldiio
8
+ import librosa
9
+ import torchaudio
10
+ import time
11
+ import logging
12
+ from torch.nn.utils.rnn import pad_sequence
13
+
14
+ try:
15
+ from funasr.download.file import download_from_url
16
+ except:
17
+ print("urllib is not installed, if you infer from url, please install it first.")
18
+ import pdb
19
+ import subprocess
20
+ from subprocess import CalledProcessError, run
21
+
22
+ try:
23
+ from pydub import AudioSegment
24
+ except:
25
+ pass
26
+
27
+
28
def is_ffmpeg_installed():
    """Report whether the ``ffmpeg`` command can be run from the current PATH.

    Returns:
        True when ``ffmpeg -version`` runs successfully and its output contains
        the text ``"ffmpeg version"``; False when the executable is missing or
        exits with a non-zero status.
    """
    probe_cmd = ["ffmpeg", "-version"]
    try:
        raw_banner = subprocess.check_output(probe_cmd, stderr=subprocess.STDOUT)
        banner = raw_banner.decode("utf-8")
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return "ffmpeg version" in banner
34
+
35
+
36
# Module-level flag: True when the ffmpeg CLI was found at import time.
use_ffmpeg = True if is_ffmpeg_installed() else False
if not use_ffmpeg:
    # Tell the user which backend is used and how to install ffmpeg.
    print(
        "Notice: ffmpeg is not installed. torchaudio is used to load audio\n"
        "If you want to use ffmpeg backend to load audio, please install it by:"
        "\n\tsudo apt install ffmpeg # ubuntu"
        "\n\t# brew install ffmpeg # mac"
    )
46
+
47
+
48
def load_audio_text_image_video(
    data_or_path_or_list,
    fs: int = 16000,
    audio_fs: int = 16000,
    data_type="sound",
    tokenizer=None,
    **kwargs,
):
    """Load one input, or a (nested) batch of inputs, into in-memory data.

    Args:
        data_or_path_or_list: A single item or a list/tuple of items. An item
            may be a local path, an ``http(s)://`` URL, a file-like object
            (anything with ``read``), a NumPy array of samples, a text string,
            or a Kaldi ark specifier (with ``data_type="kaldi_ark"``).
        fs: Target sample rate in Hz for returned audio.
        audio_fs: Sample rate of the incoming audio; it is replaced by the rate
            reported by the decoder whenever a file is actually decoded.
        data_type: ``"sound"``, ``"text"``, ``"image"``, ``"video"``,
            ``"kaldi_ark"`` or ``None``; or a list/tuple of those when every
            sample of the batch is itself a tuple of fields.
        tokenizer: Object with an ``encode`` method, used for ``"text"`` data.
        **kwargs: ``reduce_channels`` (default True) averages audio channels;
            ``cache`` (a dict) gets ``is_final``/``is_streaming_input`` set when
            the input is a file. All kwargs are forwarded on recursion.

    Returns:
        The loaded item; a list of loaded items for a flat batch; or, when
        ``data_type`` is a list/tuple, one list per field holding that field
        for every sample.
    """
    if isinstance(data_or_path_or_list, (list, tuple)):
        if data_type is not None and isinstance(data_type, (list, tuple)):
            # Every sample is a tuple of fields; regroup the results per field.
            grouped = [[] for _ in data_type]
            for sample in data_or_path_or_list:
                for j, (data_type_j, item_j) in enumerate(zip(data_type, sample)):
                    loaded = load_audio_text_image_video(
                        item_j,
                        fs=fs,
                        audio_fs=audio_fs,
                        data_type=data_type_j,
                        tokenizer=tokenizer,
                        **kwargs,
                    )
                    grouped[j].append(loaded)
            return grouped
        # Flat batch: forward the tokenizer too, otherwise text items would be
        # returned un-tokenized.
        return [
            load_audio_text_image_video(
                item,
                fs=fs,
                audio_fs=audio_fs,
                data_type=data_type,
                tokenizer=tokenizer,
                **kwargs,
            )
            for item in data_or_path_or_list
        ]

    if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith(
        ("http://", "https://")
    ):  # download url to local file
        data_or_path_or_list = download_from_url(data_or_path_or_list)

    is_local_file = isinstance(data_or_path_or_list, str) and os.path.exists(
        data_or_path_or_list
    )
    if is_local_file or hasattr(data_or_path_or_list, "read"):  # local file or bytes io
        if data_type is None or data_type == "sound":
            source = data_or_path_or_list
            if hasattr(source, "read") and hasattr(source, "seek"):
                source.seek(0)
            try:
                waveform, loaded_fs = torchaudio.load(source)
                if kwargs.get("reduce_channels", True):
                    waveform = waveform.mean(0)
                data_or_path_or_list, audio_fs = waveform, loaded_fs
            except Exception:
                # torchaudio could not decode the input; fall back to ffmpeg.
                decoded = _load_audio_ffmpeg(source, sr=fs)
                data_or_path_or_list = torch.from_numpy(decoded).squeeze()  # [n_samples,]
                # ffmpeg already resampled to ``fs``; do not resample again.
                audio_fs = fs
        elif data_type == "text" and tokenizer is not None:
            data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
        elif data_type == "image":  # not implemented
            pass
        elif data_type == "video":  # not implemented
            pass

        # The input is a complete file (or url), so mark the stream as final.
        if "cache" in kwargs:
            kwargs["cache"]["is_final"] = True
            kwargs["cache"]["is_streaming_input"] = False
    elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
        data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
    elif isinstance(data_or_path_or_list, np.ndarray):  # audio sample points
        data_or_path_or_list = torch.from_numpy(data_or_path_or_list)
    elif isinstance(data_or_path_or_list, str) and data_type == "kaldi_ark":
        data_mat = kaldiio.load_mat(data_or_path_or_list)
        if isinstance(data_mat, tuple):
            audio_fs, mat = data_mat
        else:
            mat = data_mat
        if mat.dtype == "int16" or mat.dtype == "int32":
            # NOTE(review): int32 data is also scaled by 2**15 -- confirm intended.
            mat = mat.astype(np.float64)
            mat = mat / 32768
        if mat.ndim == 2:
            mat = mat[:, 0]
        data_or_path_or_list = mat
    # Any other input is returned as-is.

    # Only array-like audio can be resampled; raw strings and other
    # unsupported inputs are handed back untouched instead of crashing.
    is_array = isinstance(data_or_path_or_list, (torch.Tensor, np.ndarray))
    if audio_fs != fs and data_type != "text" and is_array:
        resampler = torchaudio.transforms.Resample(audio_fs, fs)
        data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
    return data_or_path_or_list
144
+
145
+
146
def load_bytes(input):
    """Convert a buffer of 16-bit PCM samples into a float32 waveform.

    The buffer is first passed through ``validate_frame_rate`` on a
    best-effort basis; if that step fails for any reason the bytes are used
    unchanged.

    Args:
        input: Bytes-like object interpreted as native-endian int16 samples.

    Returns:
        A 1-D ``np.float32`` array with every sample divided by 32768.
    """
    try:
        input = validate_frame_rate(input)
    except Exception:
        # Best effort only: keep the original bytes when re-encoding fails.
        # NOTE(review): on success validate_frame_rate returns WAV-formatted
        # bytes, so the header would be decoded as samples below -- confirm.
        pass

    pcm = np.frombuffer(input, dtype=np.int16)
    info = np.iinfo(pcm.dtype)
    abs_max = 2 ** (info.bits - 1)
    offset = info.min + abs_max  # 0 for signed integer types
    return (pcm.astype(np.float32) - offset) / abs_max
164
+
165
+
166
def validate_frame_rate(
    input,
    fs: int = 16000,
):
    """Decode audio bytes with pydub and re-export them as WAV at ``fs`` Hz.

    Args:
        input: Bytes of an encoded audio file.
        fs: Required frame rate in Hz; the audio is converted only when its
            frame rate differs.

    Returns:
        The bytes written by pydub's WAV export.

    Raises:
        RuntimeError: If the audio cannot be loaded with pydub (including the
            case where pydub is not installed).
    """
    # Wrap the bytes in a file-like stream.
    byte_data = BytesIO(input)

    # Load the audio with pydub.
    try:
        audio = AudioSegment.from_file(byte_data)
    except Exception as err:
        raise RuntimeError(
            "You are decoding the pcm data, please install pydub first. via `pip install pydub`."
        ) from err

    # Make sure the frame rate equals ``fs``.
    if audio.frame_rate != fs:
        audio = audio.set_frame_rate(fs)

    # Export the (possibly resampled) audio into an in-memory stream.
    output = BytesIO()
    audio.export(output, format="wav")
    output.seek(0)

    # Hand back the exported bytes.
    return output.read()
195
+
196
+
197
def extract_fbank(data, data_len=None, data_type: str = "sound", frontend=None, **kwargs):
    """Batch waveform(s) and run them through ``frontend``.

    Args:
        data: A NumPy array, a torch tensor, or a list/tuple of them. A 1-D
            array or tensor gets a leading batch axis; a list is right-padded
            into one batch.
        data_len: Lengths passed to the frontend. For a single array/tensor it
            defaults to ``[data.shape[1]]``; for a list it is always recomputed.
        data_type: Unused here.
        frontend: Callable invoked as ``frontend(data, data_len, **kwargs)``
            that returns ``(features, lengths)``.
        **kwargs: Forwarded to ``frontend``.

    Returns:
        ``(features, lengths)`` cast to float32 and int32 respectively.
    """
    if isinstance(data, np.ndarray):
        data = torch.from_numpy(data)

    if isinstance(data, torch.Tensor):
        if len(data.shape) < 2:
            data = data[None, :]  # data: [batch, N]
        if data_len is None:
            data_len = [data.shape[1]]
    elif isinstance(data, (list, tuple)):
        waveforms = []
        data_len = []
        for item in data:
            if isinstance(item, np.ndarray):
                item = torch.from_numpy(item)
            usable = isinstance(item, torch.Tensor) and not item.shape[0] < 2
            if not usable:
                # Replace unusable entries with a two-sample silent stub.
                item = torch.zeros(2)
            waveforms.append(item)
            data_len.append(item.shape[0])
        data = pad_sequence(waveforms, batch_first=True)  # data: [batch, N]

    feats, feat_len = frontend(data, data_len, **kwargs)

    if isinstance(feat_len, (list, tuple)):
        feat_len = torch.tensor([feat_len])
    return feats.to(torch.float32), feat_len.to(torch.int32)
223
+
224
+
225
+ def _load_audio_ffmpeg(file: str, sr: int = 16000):
226
+ """
227
+ Open an audio file and read as mono waveform, resampling as necessary
228
+
229
+ Parameters
230
+ ----------
231
+ file: str
232
+ The audio file to open
233
+
234
+ sr: int
235
+ The sample rate to resample the audio if necessary
236
+
237
+ Returns
238
+ -------
239
+ A NumPy array containing the audio waveform, in float32 dtype.
240
+ """
241
+
242
+ # This launches a subprocess to decode audio while down-mixing
243
+ # and resampling as necessary. Requires the ffmpeg CLI in PATH.
244
+ # fmt: off
245
+ cmd = [
246
+ "ffmpeg",
247
+ "-nostdin",
248
+ "-threads", "0",
249
+ "-i", file,
250
+ "-f", "s16le",
251
+ "-ac", "1",
252
+ "-acodec", "pcm_s16le",
253
+ "-ar", str(sr),
254
+ "-"
255
+ ]
256
+ # fmt: on
257
+ try:
258
+ out = run(cmd, capture_output=True, check=True).stdout
259
+ except CalledProcessError as e:
260
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
261
+
262
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
assets/src/pytorchvideo_augmentations.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """Video transforms that are used for advanced augmentation methods."""
4
+
5
+ from typing import Any, Callable, Dict, Optional, Tuple
6
+
7
+ import torch
8
+ import torchvision
9
+ from torchvision.transforms.functional import InterpolationMode, F_t
10
+
11
+
12
+ # Maximum global magnitude used for video augmentation.
13
+ _AUGMENTATION_MAX_LEVEL = 10
14
+
15
+
16
+ def _check_fill_arg(kwargs):
17
+ """
18
+ Check if kwargs contains key ``fill``.
19
+ """
20
+ assert "fill" in kwargs, "Need to have fill in kwargs."
21
+
22
+
23
+ def _autocontrast(video: torch.Tensor, **kwargs) -> torch.Tensor:
24
+ """
25
+ Maximize contrast of a video by remapping its pixels per channel so that the lowest
26
+ becomes black and the lightest becomes white.
27
+
28
+ Args:
29
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
30
+ """
31
+ return torchvision.transforms.functional.autocontrast(video)
32
+
33
+
34
+ def _equalize(video: torch.Tensor, **kwargs) -> torch.Tensor:
35
+ """
36
+ Equalize the histogram of a video by applying a non-linear mapping to the input in
37
+ order to create a uniform distribution of grayscale values in the output.
38
+
39
+ Args:
40
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
41
+ """
42
+ if video.dtype != torch.uint8:
43
+ video_type = video.dtype
44
+ video = (video * 255).to(torch.uint8)
45
+ return (torchvision.transforms.functional.equalize(video) / 255).to(video_type)
46
+ return torchvision.transforms.functional.equalize(video)
47
+
48
+
49
+ def _invert(video: torch.Tensor, **kwargs) -> torch.Tensor:
50
+ """
51
+ Invert the colors of a video.
52
+
53
+ Args:
54
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
55
+ """
56
+ return torchvision.transforms.functional.invert(video)
57
+
58
+
59
+ def _rotate(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
60
+ """
61
+ Rotate the image by angle.
62
+
63
+ Args:
64
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
65
+ factor (float): The rotation angle value in degrees, counter-clockwise.
66
+ """
67
+ _check_fill_arg(kwargs)
68
+ return torchvision.transforms.functional.rotate(
69
+ video, factor, fill=kwargs["fill"], interpolation=InterpolationMode.BILINEAR
70
+ )
71
+
72
+
73
+ def _solarize(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
74
+ """
75
+ Solarize a video by inverting all pixel values above a threshold.
76
+
77
+ Args:
78
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
79
+ """
80
+ if video.dtype == torch.uint8:
81
+ return torchvision.transforms.functional.solarize(video, int(factor * 255.0))
82
+ else:
83
+ return torchvision.transforms.functional.solarize(video, factor)
84
+
85
+
86
+ def _adjust_contrast(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
87
+ """
88
+ Adjust the contrast of a video.
89
+
90
+ Args:
91
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
92
+ factor (float): How much to adjust the contrast. Can be any non-negative
93
+ number. 0 gives a solid gray video, 1 gives the original video while 2
94
+ increases the contrast by a factor of 2.
95
+ """
96
+ return torchvision.transforms.functional.adjust_contrast(video, factor)
97
+
98
+
99
+ def _adjust_saturation(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
100
+ """
101
+ Adjust the saturation of a video.
102
+
103
+ Args:
104
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
105
+ factor (float): How much to adjust the saturation. 0 will give a black and
106
+ white video, 1 will give the original video while 2 will enhance the
107
+ saturation by a factor of 2.
108
+ """
109
+ return torchvision.transforms.functional.adjust_saturation(video, factor)
110
+
111
+
112
+ def _adjust_brightness(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
113
+ """
114
+ Adjust brightness of a video.
115
+
116
+ Args:
117
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
118
+ factor (float): How much to adjust the brightness. Can be any
119
+ non-negative number. 0 gives a black video, 1 gives the original video
120
+ while 2 increases the brightness by a factor of 2.
121
+ """
122
+ return torchvision.transforms.functional.adjust_brightness(video, factor)
123
+
124
+
125
+ def _adjust_sharpness(video: torch.Tensor, factor: float, **kwargs) -> torch.Tensor:
126
+ """
127
+ Adjust the sharpness of a video.
128
+
129
+ Args:
130
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
131
+ factor (float): How much to adjust the sharpness. Can be any non-negative
132
+ number. 0 gives a blurred video, 1 gives the original video while 2
133
+ increases the sharpness by a factor of 2.
134
+ """
135
+ return torchvision.transforms.functional.adjust_sharpness(video, factor)
136
+
137
+
138
+ def _posterize(video: torch.Tensor, factor: float, **kwargs):
139
+ """
140
+ Posterize a video by reducing the number of bits for each color channel.
141
+
142
+ Args:
143
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
144
+ factor (float): The number of bits to keep for each channel (0-8).
145
+ """
146
+ if factor >= 8:
147
+ return video
148
+ if video.dtype != torch.uint8:
149
+ video_type = video.dtype
150
+ video = (video * 255).to(torch.uint8)
151
+ return (torchvision.transforms.functional.posterize(video, factor) / 255).to(
152
+ video_type
153
+ )
154
+ return torchvision.transforms.functional.posterize(video, factor)
155
+
156
+
157
+ def _shear_x(video: torch.Tensor, factor: float, **kwargs):
158
+ """
159
+ Shear the video along the horizontal axis.
160
+
161
+ Args:
162
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
163
+ factor (float): How much to shear along the horizontal axis using the affine
164
+ matrix.
165
+ """
166
+ _check_fill_arg(kwargs)
167
+ translation_offset = video.size(-2) * factor / 2
168
+ return F_t.affine(
169
+ video,
170
+ [1, factor, translation_offset, 0, 1, 0],
171
+ fill=kwargs["fill"],
172
+ interpolation="bilinear",
173
+ )
174
+
175
+
176
+ def _shear_y(video: torch.Tensor, factor: float, **kwargs):
177
+ """
178
+ Shear the video along the vertical axis.
179
+
180
+ Args:
181
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
182
+ factor (float): How much to shear along the vertical axis using the affine
183
+ matrix.
184
+ """
185
+ _check_fill_arg(kwargs)
186
+ translation_offset = video.size(-1) * factor / 2
187
+ return F_t.affine(
188
+ video,
189
+ [1, 0, 0, factor, 1, translation_offset],
190
+ fill=kwargs["fill"],
191
+ interpolation="bilinear",
192
+ )
193
+
194
+
195
+ def _translate_x(video: torch.Tensor, factor: float, **kwargs):
196
+ """
197
+ Translate the video along the horizontal axis.
198
+
199
+ Args:
200
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
201
+ factor (float): How much (relative to the image size) to translate along the
202
+ horizontal axis.
203
+ """
204
+ _check_fill_arg(kwargs)
205
+ translation_offset = factor * video.size(-1)
206
+ return F_t.affine(
207
+ video,
208
+ [1, 0, translation_offset, 0, 1, 0],
209
+ fill=kwargs["fill"],
210
+ interpolation="bilinear",
211
+ )
212
+
213
+
214
+ def _translate_y(video: torch.Tensor, factor: float, **kwargs):
215
+ """
216
+ Translate the video along the vertical axis.
217
+
218
+ Args:
219
+ video (torch.Tensor): Video tensor with shape (T, C, H, W).
220
+ factor (float): How much (relative to the image size) to translate along the
221
+ vertical axis.
222
+ """
223
+ _check_fill_arg(kwargs)
224
+ translation_offset = factor * video.size(-2)
225
+ return F_t.affine(
226
+ video,
227
+ [1, 0, 0, 0, 1, translation_offset],
228
+ fill=kwargs["fill"],
229
+ interpolation="bilinear",
230
+ )
231
+
232
+
233
+ def _randomly_negate(magnitude: float) -> float:
234
+ """
235
+ Negate input value with 50% chance.
236
+
237
+ Args:
238
+ magnitude (float): Input value.
239
+ """
240
+ return magnitude if torch.rand(1).item() > 0.5 else -magnitude
241
+
242
+
243
+ def _increasing_magnitude_to_arg(level: int, params: Tuple[float, float]) -> float:
244
+ """
245
+ Convert level to transform magnitude. This assumes transform magnitude increases
246
+ linearly with level.
247
+
248
+ Args:
249
+ level (int): Level value.
250
+ params (Tuple[float, float]): Params contains two values: 1) Base transform
251
+ magnitude when level is 0; 2) Maximum increase in transform magnitude
252
+ when level is at maximum.
253
+ """
254
+ magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
255
+ return (params[0] + magnitude,)
256
+
257
+
258
+ def _increasing_randomly_negate_to_arg(
259
+ level: int, params: Tuple[float, float]
260
+ ) -> Tuple[float]:
261
+ """
262
+ Convert level to transform magnitude. This assumes transform magnitude increases
263
+ (or decreases with 50% chance) linearly with level.
264
+
265
+ Args:
266
+ level (int): Level value.
267
+ params (Tuple[float, float]): Params contains two values: 1) Base transform
268
+ magnitude when level is 0; 2) Maximum increase in transform magnitude
269
+ when level is at maximum.
270
+ """
271
+ magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
272
+ return (params[0] + _randomly_negate(magnitude),)
273
+
274
+
275
+ def _decreasing_int_to_arg(level: int, params: Tuple[int, int]) -> Tuple[int]:
276
+ """
277
+ Convert level to transform magnitude. This assumes transform magnitude decreases
278
+ linearly with level. The return value is converted to int.
279
+
280
+ Args:
281
+ level (int): Level value.
282
+ params (Tuple[float, float]): Params contains two values: 1) Base transform
283
+ magnitude when level is 0; 2) Maximum decrease in transform magnitude
284
+ when level is at maximum.
285
+ """
286
+ magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
287
+ return (params[0] - int(magnitude),)
288
+
289
+
290
+ def _decreasing_to_arg(level: int, params: Tuple[float, float]) -> Tuple[float]:
291
+ """
292
+ Convert level to transform magnitude. This assumes transform magnitude decreases
293
+ linearly with level.
294
+
295
+ Args:
296
+ level (int): Level value.
297
+ params (Tuple[float, float]): Params contains two values: 1) Base transform
298
+ magnitude when level is 0; 2) Maximum decrease in transform magnitude
299
+ when level is at maximum.
300
+ """
301
+ magnitude = (level / _AUGMENTATION_MAX_LEVEL) * params[1]
302
+ return (params[0] - magnitude,)
303
+
304
+
305
+ # A dictionary that contains transform names (key) and their corresponding transform
306
+ # functions (value).
307
+ _NAME_TO_TRANSFORM_FUNC = {
308
+ "AdjustBrightness": _adjust_brightness,
309
+ "AdjustContrast": _adjust_contrast,
310
+ "AdjustSaturation": _adjust_saturation,
311
+ "AdjustSharpness": _adjust_sharpness,
312
+ "AutoContrast": _autocontrast,
313
+ "Equalize": _equalize,
314
+ "Invert": _invert,
315
+ "Rotate": _rotate,
316
+ "Posterize": _posterize,
317
+ "Solarize": _solarize,
318
+ "ShearX": _shear_x,
319
+ "ShearY": _shear_y,
320
+ "TranslateX": _translate_x,
321
+ "TranslateY": _translate_y,
322
+ }
323
+
324
+ # A dictionary that contains transform names (key) and their corresponding level
325
+ # functions (value), which converts the magnitude to the transform function arguments.
326
+ _LEVEL_TO_ARG = {
327
+ "AdjustBrightness": _increasing_randomly_negate_to_arg,
328
+ "AdjustContrast": _increasing_randomly_negate_to_arg,
329
+ "AdjustSaturation": _increasing_randomly_negate_to_arg,
330
+ "AdjustSharpness": _increasing_randomly_negate_to_arg,
331
+ "AutoContrast": None,
332
+ "Equalize": None,
333
+ "Invert": None,
334
+ "Rotate": _increasing_randomly_negate_to_arg,
335
+ "Posterize": _decreasing_int_to_arg,
336
+ "Solarize": _decreasing_to_arg,
337
+ "ShearX": _increasing_randomly_negate_to_arg,
338
+ "ShearY": _increasing_randomly_negate_to_arg,
339
+ "TranslateX": _increasing_randomly_negate_to_arg,
340
+ "TranslateY": _increasing_randomly_negate_to_arg,
341
+ }
342
+
343
+ # A dictionary that contains transform names (key) and their corresponding maximum
344
+ # transform (value).
345
+ _TRANSFORM_MAX_PARAMS = {
346
+ "AdjustBrightness": (1, 0.9),
347
+ "AdjustContrast": (1, 0.9),
348
+ "AdjustSaturation": (1, 0.9),
349
+ "AdjustSharpness": (1, 0.9),
350
+ "AutoContrast": None,
351
+ "Equalize": None,
352
+ "Invert": None,
353
+ "Rotate": (0, 30),
354
+ "Posterize": (4, 4),
355
+ "Solarize": (1, 1),
356
+ "ShearX": (0, 0.3),
357
+ "ShearY": (0, 0.3),
358
+ "TranslateX": (0, 0.45),
359
+ "TranslateY": (0, 0.45),
360
+ }
361
+
362
+ # Hyperparameters for sampling magnitude.
363
+ SAMPLING_DEFAULT_HPARAS = {"sampling_std": 0.5}
364
+
365
+ # Hyperparameters for transform functions.
366
+ TRANSFORM_DEFAULT_HPARAS = {"fill": (0.5, 0.5, 0.5)}
367
+
368
+
369
+ class AugmentTransform:
370
+ def __init__(
371
+ self,
372
+ transform_name: str,
373
+ magnitude: int = 10,
374
+ prob: float = 0.5,
375
+ name_to_transform_func: Optional[Dict[str, Callable]] = None,
376
+ level_to_arg: Optional[Dict[str, Callable]] = None,
377
+ transform_max_paras: Optional[Dict[str, Tuple]] = None,
378
+ transform_hparas: Optional[Dict[str, Any]] = None,
379
+ sampling_type: str = "gaussian",
380
+ sampling_hparas: Optional[Dict[str, Any]] = None,
381
+ ) -> None:
382
+ """
383
+ The AugmentTransform composes a video transform that performs augmentation
384
+ based on a maximum magnitude. AugmentTransform also offers flexible ways to
385
+ generate augmentation magnitude based on different sampling strategies.
386
+
387
+ Args:
388
+ transform_name (str): The name of the video transform function.
389
+ magnitude (int): Magnitude used for transform function.
390
+ prob (float): The probability of applying each transform function.
391
+ name_to_transform_func (Optional[Dict[str, Callable]]): A Dictionary that
392
+ contains mapping of the transform name to the transform function.
393
+ level_to_arg (Optional[Dict[str, Callable]]): A Dictionary that contains
394
+ mapping of the transform name to its level function, which converts
395
+ the magnitude to the transform function arguments.
396
+ transform_max_paras (Optional[Dict[str, Tuple]]): A Dictionary that
397
+ contains mapping of the transform name to its maximum transform
398
+ magnitude.
399
+ transform_hparas (Optional[Dict[Any]]): Transform hyper parameters.
400
+ Needs to have key fill. By default, it uses transform_default_hparas.
401
+ sampling_type (str): Sampling method for magnitude of transform. It should
402
+ be either gaussian or uniform.
403
+ sampling_hparas (Optional[Dict[Any]]): Hyper parameters for sampling. If
404
+ gaussian sampling is used, it needs to have key sampling_std. By
405
+ default, it uses SAMPLING_DEFAULT_HPARAS.
406
+ """
407
+
408
+ assert sampling_type in ["gaussian", "uniform"]
409
+ name_to_transform_func = name_to_transform_func or _NAME_TO_TRANSFORM_FUNC
410
+ level_to_arg = level_to_arg or _LEVEL_TO_ARG
411
+ transform_max_paras = transform_max_paras or _TRANSFORM_MAX_PARAMS
412
+ self.transform_hparas = transform_hparas or TRANSFORM_DEFAULT_HPARAS
413
+ self.sampling_type = sampling_type
414
+ self.sampling_hparas = sampling_hparas or SAMPLING_DEFAULT_HPARAS
415
+ assert "fill" in self.transform_hparas
416
+ if self.sampling_type == "gaussian":
417
+ assert "sampling_std" in self.sampling_hparas
418
+ if self.sampling_type == "uniform":
419
+ assert "sampling_data_type" in self.sampling_hparas
420
+ assert "sampling_min" in self.sampling_hparas
421
+ if self.sampling_hparas["sampling_data_type"] == "int":
422
+ assert isinstance(self.sampling_hparas["sampling_min"], int)
423
+ elif self.sampling_hparas["sampling_data_type"] == "float":
424
+ assert isinstance(self.sampling_hparas["sampling_min"], (int, float))
425
+ assert transform_name in name_to_transform_func
426
+
427
+ self.max_level = _AUGMENTATION_MAX_LEVEL
428
+ self.transform_name = transform_name
429
+ self.magnitude = magnitude
430
+ self.transform_fn = name_to_transform_func[transform_name]
431
+ self.level_fn = level_to_arg[transform_name]
432
+ self.level_paras = transform_max_paras[transform_name]
433
+ self.prob = prob
434
+ self.sampling_type = sampling_type
435
+
436
+ def _get_magnitude(self) -> float:
437
+ """
438
+ Get magnitude based on sampling type.
439
+ """
440
+ if self.sampling_type == "gaussian":
441
+ return max(
442
+ 0,
443
+ min(
444
+ self.max_level,
445
+ torch.normal(
446
+ self.magnitude, self.sampling_hparas["sampling_std"], size=(1,)
447
+ ).item(),
448
+ ),
449
+ )
450
+ elif self.sampling_type == "uniform":
451
+ if self.sampling_hparas["sampling_data_type"] == "int":
452
+ return torch.randint(
453
+ self.sampling_hparas["sampling_min"], self.magnitude + 1, size=(1,)
454
+ ).item()
455
+ elif self.sampling_hparas["sampling_data_type"] == "float":
456
+ return (
457
+ torch.rand(size=(1,)).item()
458
+ * (self.magnitude - self.sampling_hparas["sampling_min"])
459
+ + self.sampling_hparas["sampling_min"]
460
+ )
461
+ else:
462
+ raise ValueError("sampling_data_type must be either 'int' or 'float'")
463
+ else:
464
+ raise NotImplementedError
465
+
466
+ def __call__(self, video: torch.Tensor) -> torch.Tensor:
467
+ """
468
+ The input is a video tensor.
469
+
470
+ Args:
471
+ video (torch.Tensor): Input video tensor with shape (T, C, H, W).
472
+ """
473
+ if torch.rand(1).item() > self.prob:
474
+ return video
475
+ magnitude = self._get_magnitude()
476
+ level_args = (
477
+ self.level_fn(magnitude, self.level_paras)
478
+ if self.level_fn is not None
479
+ else ()
480
+ )
481
+ return self.transform_fn(video, *level_args, **self.transform_hparas)
configs/dit/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ fps = 8
3
+ image_size = (256, 256)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="DiT-XL/2",
8
+ condition="text",
9
+ from_pretrained="PRETRAINED_MODEL",
10
+ )
11
+ vae = dict(
12
+ type="VideoAutoencoderKL",
13
+ from_pretrained="stabilityai/sd-vae-ft-ema",
14
+ )
15
+ text_encoder = dict(
16
+ type="clip",
17
+ from_pretrained="openai/clip-vit-base-patch32",
18
+ model_max_length=77,
19
+ )
20
+ scheduler = dict(
21
+ type="dpm-solver",
22
+ num_sampling_steps=20,
23
+ cfg_scale=4.0,
24
+ )
25
+ dtype = "bf16"
26
+
27
+ # Others
28
+ batch_size = 2
29
+ seed = 42
30
+ prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256-class.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (256, 256)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="DiT-XL/2",
8
+ no_temporal_pos_emb=True,
9
+ condition="label_1000",
10
+ from_pretrained="DiT-XL-2-256x256.pt",
11
+ )
12
+ vae = dict(
13
+ type="VideoAutoencoderKL",
14
+ from_pretrained="stabilityai/sd-vae-ft-ema",
15
+ )
16
+ text_encoder = dict(
17
+ type="classes",
18
+ num_classes=1000,
19
+ )
20
+ scheduler = dict(
21
+ type="dpm-solver",
22
+ num_sampling_steps=20,
23
+ cfg_scale=4.0,
24
+ )
25
+ dtype = "bf16"
26
+
27
+ # Others
28
+ batch_size = 2
29
+ seed = 42
30
+ prompt_path = "./assets/texts/imagenet_id.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (256, 256)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="DiT-XL/2",
8
+ no_temporal_pos_emb=True,
9
+ condition="text",
10
+ from_pretrained="PRETRAINED_MODEL",
11
+ )
12
+ vae = dict(
13
+ type="VideoAutoencoderKL",
14
+ from_pretrained="stabilityai/sd-vae-ft-ema",
15
+ )
16
+ text_encoder = dict(
17
+ type="clip",
18
+ from_pretrained="openai/clip-vit-base-patch32",
19
+ model_max_length=77,
20
+ )
21
+ scheduler = dict(
22
+ type="dpm-solver",
23
+ num_sampling_steps=20,
24
+ cfg_scale=4.0,
25
+ )
26
+ dtype = "bf16"
27
+
28
+ # Others
29
+ batch_size = 2
30
+ seed = 42
31
+ prompt_path = "./assets/texts/imagenet_labels.txt"
32
+ save_dir = "./samples/samples/"
configs/dit/train/16x256x256.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="DiT-XL/2",
20
+ from_pretrained="DiT-XL-2-256x256.pt",
21
+ enable_flash_attn=True,
22
+ enable_layernorm_kernel=True,
23
+ )
24
+ vae = dict(
25
+ type="VideoAutoencoderKL",
26
+ from_pretrained="stabilityai/sd-vae-ft-ema",
27
+ )
28
+ text_encoder = dict(
29
+ type="clip",
30
+ from_pretrained="openai/clip-vit-base-patch32",
31
+ model_max_length=77,
32
+ )
33
+ scheduler = dict(
34
+ type="iddpm",
35
+ timestep_respacing="",
36
+ )
37
+
38
+ # Others
39
+ seed = 42
40
+ outputs = "outputs"
41
+ wandb = False
42
+
43
+ epochs = 1000
44
+ log_every = 10
45
+ ckpt_every = 1000
46
+ load = None
47
+
48
+ batch_size = 8
49
+ lr = 2e-5
50
+ grad_clip = 1.0
configs/dit/train/1x256x256.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=1,
6
+ frame_interval=1,
7
+ image_size=(256, 256),
8
+ transform_name="center",
9
+ )
10
+
11
+ # Define acceleration
12
+ num_workers = 4
13
+ dtype = "bf16"
14
+ grad_checkpoint = False
15
+ plugin = "zero2"
16
+ sp_size = 1
17
+
18
+ # Define model
19
+ model = dict(
20
+ type="DiT-XL/2",
21
+ no_temporal_pos_emb=True,
22
+ enable_flash_attn=True,
23
+ enable_layernorm_kernel=True,
24
+ )
25
+ vae = dict(
26
+ type="VideoAutoencoderKL",
27
+ from_pretrained="stabilityai/sd-vae-ft-ema",
28
+ )
29
+ text_encoder = dict(
30
+ type="clip",
31
+ from_pretrained="openai/clip-vit-base-patch32",
32
+ model_max_length=77,
33
+ )
34
+ scheduler = dict(
35
+ type="iddpm",
36
+ timestep_respacing="",
37
+ )
38
+
39
+ # Others
40
+ seed = 42
41
+ outputs = "outputs"
42
+ wandb = False
43
+
44
+ epochs = 1000
45
+ log_every = 10
46
+ ckpt_every = 1000
47
+ load = None
48
+
49
+ batch_size = 128
50
+ lr = 1e-4 # according to DiT repo
51
+ grad_clip = 1.0
configs/javisdit-v0-1/inference/audio_sample.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resolution = "240p"
2
+ aspect_ratio = "9:16"
3
+ num_frames = "4s"
4
+ fps = 24
5
+ audio_fps = 16000
6
+ frame_interval = 1
7
+ save_fps = 24
8
+
9
+ save_dir = "./samples/samples/"
10
+ seed = 42
11
+ batch_size = 1
12
+ multi_resolution = "OpenSora"
13
+ dtype = "bf16"
14
+ loop = 1 # loop for video extension
15
+ condition_frame_length = 5 # used for video extension conditioning
16
+ align = 5 # TODO: unknown mechanism, maybe for conditional frame alignment?
17
+ verbose = 2
18
+
19
+ audio_only = True
20
+
21
+ model = dict(
22
+ type="VASTDiT3-XL/2",
23
+ from_pretrained="JavisDiT/JavisDiT-v0.1-audio",
24
+ qk_norm=True,
25
+ enable_flash_attn=True,
26
+ enable_layernorm_kernel=True,
27
+ # audio generation only
28
+ only_infer_audio=True,
29
+ freeze_video_branch=True,
30
+ freeze_y_embedder=False,
31
+ train_st_prior_attn=False,
32
+ train_va_cross_attn=False,
33
+ audio_patch_size=(4, 1)
34
+ )
35
+ vae = dict(
36
+ type="OpenSoraVAE_V1_2",
37
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
38
+ micro_frame_size=17,
39
+ micro_batch_size=4,
40
+ )
41
+ audio_vae = dict(
42
+ type="AudioLDM2",
43
+ from_pretrained="cvssp/audioldm2",
44
+ )
45
+ text_encoder = dict(
46
+ type="t5",
47
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
48
+ model_max_length=300,
49
+ )
50
+ scheduler = dict(
51
+ type="rflow",
52
+ use_timestep_transform=True,
53
+ num_sampling_steps=30,
54
+ cfg_scale=7.0,
55
+ )
56
+
57
+ aes = 6.5 # aesthetic score
58
+ flow = None # motion score
configs/javisdit-v0-1/inference/sample.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resolution = "240p"
2
+ aspect_ratio = "9:16"
3
+ num_frames = "4s"
4
+ fps = 24
5
+ audio_fps = 16000
6
+ frame_interval = 1
7
+ save_fps = 24
8
+
9
+ save_dir = "./samples/samples/"
10
+ seed = 42
11
+ batch_size = 1
12
+ multi_resolution = "OpenSora"
13
+ dtype = "bf16"
14
+ loop = 1 # loop for video extension
15
+ condition_frame_length = 5 # used for video extension conditioning
16
+ align = 5 # TODO: unknown mechanism, maybe for conditional frame alignment?
17
+ verbose = 2
18
+
19
+ spatial_token_num = 32
20
+ temporal_token_num = 32
21
+ st_prior_channel = 128
22
+
23
+ model = dict(
24
+ type="VASTDiT3-XL/2",
25
+ from_pretrained="JavisDiT/JavisDiT-v0.1-jav",
26
+ qk_norm=True,
27
+ enable_flash_attn=True,
28
+ enable_layernorm_kernel=True,
29
+ # video-audio joint generation
30
+ freeze_y_embedder=True,
31
+ freeze_video_branch=True,
32
+ freeze_audio_branch=True,
33
+ train_st_prior_attn=True,
34
+ train_va_cross_attn=True,
35
+ spatial_prior_len=spatial_token_num,
36
+ temporal_prior_len=temporal_token_num,
37
+ st_prior_channel=st_prior_channel,
38
+ audio_patch_size=(4, 1)
39
+ )
40
+ vae = dict(
41
+ type="OpenSoraVAE_V1_2",
42
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
43
+ micro_frame_size=17,
44
+ micro_batch_size=4,
45
+ )
46
+ audio_vae = dict(
47
+ type="AudioLDM2",
48
+ from_pretrained="cvssp/audioldm2",
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ )
55
+ prior_encoder = dict(
56
+ type="STIBPrior",
57
+ imagebind_ckpt_path="./checkpoints",
58
+ from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
59
+ spatial_token_num=spatial_token_num,
60
+ temporal_token_num=temporal_token_num,
61
+ out_dim=st_prior_channel,
62
+ hidden_size=512,
63
+ apply_sampling=True,
64
+ encode_va=False,
65
+ qk_norm=True,
66
+ enable_flash_attn=True,
67
+ enable_layernorm_kernel=True,
68
+ )
69
+ scheduler = dict(
70
+ type="rflow",
71
+ use_timestep_transform=True,
72
+ num_sampling_steps=30,
73
+ cfg_scale=7.0,
74
+ )
75
+
76
+ aes = 6.5 # aesthetic score
77
+ flow = None # motion score
configs/javisdit-v0-1/inference/sample_240p4s.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ resolution = "240p"
2
+ aspect_ratio = "9:16"
3
+ num_frames = "4s"
4
+ fps = 24
5
+ audio_fps = 16000
6
+ frame_interval = 1
7
+ save_fps = 24
8
+
9
+ save_dir = "./samples/samples/"
10
+ seed = 42
11
+ batch_size = 1
12
+ multi_resolution = "OpenSora"
13
+ dtype = "bf16"
14
+ loop = 1 # loop for video extension
15
+ condition_frame_length = 5 # used for video extension conditioning
16
+ align = 5 # TODO: unknown mechanism, maybe for conditional frame alignment?
17
+ verbose = 2
18
+
19
+ spatial_token_num = 32
20
+ temporal_token_num = 32
21
+ st_prior_channel = 128
22
+
23
+ model = dict(
24
+ type="VASTDiT3-XL/2",
25
+ from_pretrained="JavisDiT/JavisDiT-v0.1-jav-240p4s",
26
+ qk_norm=True,
27
+ enable_flash_attn=True,
28
+ enable_layernorm_kernel=True,
29
+ # video-audio joint generation
30
+ freeze_y_embedder=True,
31
+ freeze_video_branch=True,
32
+ freeze_audio_branch=True,
33
+ train_st_prior_attn=True,
34
+ train_va_cross_attn=True,
35
+ spatial_prior_len=spatial_token_num,
36
+ temporal_prior_len=temporal_token_num,
37
+ st_prior_channel=st_prior_channel,
38
+ audio_patch_size=(4, 1)
39
+ )
40
+ vae = dict(
41
+ type="OpenSoraVAE_V1_2",
42
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
43
+ micro_frame_size=17,
44
+ micro_batch_size=4,
45
+ )
46
+ audio_vae = dict(
47
+ type="AudioLDM2",
48
+ from_pretrained="cvssp/audioldm2",
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ )
55
+ prior_encoder = dict(
56
+ type="STIBPrior",
57
+ imagebind_ckpt_path="./checkpoints",
58
+ from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
59
+ spatial_token_num=spatial_token_num,
60
+ temporal_token_num=temporal_token_num,
61
+ out_dim=st_prior_channel,
62
+ hidden_size=512,
63
+ apply_sampling=True,
64
+ encode_va=False,
65
+ qk_norm=True,
66
+ enable_flash_attn=True,
67
+ enable_layernorm_kernel=True,
68
+ )
69
+ scheduler = dict(
70
+ type="rflow",
71
+ use_timestep_transform=True,
72
+ num_sampling_steps=30,
73
+ cfg_scale=7.0,
74
+ )
75
+
76
+ aes = 6.5 # aesthetic score
77
+ flow = None # motion score
configs/javisdit-v0-1/misc/extract_st_prior_va.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoAudioTextDataset",
4
+ direct_load_video_clip=True,
5
+ transform_name="resize_crop",
6
+ audio_transform_name="mel_spec_audioldm2",
7
+ neg_aug=1,
8
+ neg_aug_kwargs=dict(
9
+ video_augmentation_pool="./data/st_prior/video/SA-V",
10
+ audio_augmentation_pool="./data/st_prior/audio/TAVGBench",
11
+ ),
12
+ require_onset=True
13
+ )
14
+
15
+ # webvid
16
+ bucket_config = { # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
17
+ # image size : {num frame : {accept_probs, batch size}}
18
+ "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
19
+ # ---
20
+ "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
21
+ "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
22
+ # ---
23
+ "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
24
+ "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
25
+ # ---
26
+ "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
27
+ # ---
28
+ "720p": {51: (0.03, 1)},
29
+ "1024": {51: (0.03, 1)},
30
+ }
31
+ grad_checkpoint = True
32
+
33
+ # Acceleration settings
34
+ num_workers = 4
35
+ num_bucket_build_workers = 16
36
+ dtype = "bf16"
37
+ plugin = "zero2"
38
+
39
+ # Model settings
40
+ vae = dict(
41
+ type="OpenSoraVAE_V1_2",
42
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
43
+ micro_frame_size=17,
44
+ micro_batch_size=4,
45
+ )
46
+ audio_vae = dict(
47
+ type="AudioLDM2",
48
+ from_pretrained="cvssp/audioldm2",
49
+ )
50
+ # text_encoder = dict(
51
+ # type="t5",
52
+ # from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ # model_max_length=300,
54
+ # )
55
+
56
+ # Log settings
57
+ seed = 42
58
+ outputs = "outputs"
59
+ wandb = False
60
+ epochs = 1000
61
+ log_every = 10
62
+ ckpt_every = 50
63
+ save_total_limit = 2
64
+
65
+ bin_size = 16 # 1GB, 4195 bins
66
+ log_time = False
67
+
68
+ # audio settings
69
+ sampling_rate = 16000
70
+ mel_bins = 64
71
+ audio_cfg = {
72
+ "preprocessing": {
73
+ "audio": {
74
+ "sampling_rate": sampling_rate,
75
+ "max_wav_value": 32768.0,
76
+ "duration": 10.24,
77
+ },
78
+ "stft": {
79
+ "filter_length": 1024,
80
+ "hop_length": 160,
81
+ "win_length": 1024,
82
+ },
83
+ "mel": {
84
+ "n_mel_channels": mel_bins,
85
+ "mel_fmin": 0,
86
+ "mel_fmax": 8000,
87
+ }
88
+ },
89
+ "augmentation": {
90
+ "mixup": 0.0,
91
+ }
92
+ }
configs/javisdit-v0-1/misc/extract_va.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoAudioTextDataset",
4
+ direct_load_video_clip=True,
5
+ transform_name="resize_crop",
6
+ audio_transform_name="mel_spec_audioldm2",
7
+ )
8
+ # load_text_features = True
9
+
10
+ # webvid
11
+ bucket_config = { # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
12
+ # resolution : {num_frames : (accept_prob or (accept_prob, accept_prob) pair, batch_size)}
13
+ "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
14
+ # ---
15
+ "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
16
+ "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
17
+ # ---
18
+ "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
19
+ "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
20
+ # ---
21
+ "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
22
+ # ---
23
+ "720p": {51: (0.03, 1)},
24
+ "1024": {51: (0.03, 1)},
25
+ }
26
+ grad_checkpoint = True
27
+
28
+ # Acceleration settings
29
+ num_workers = 8
30
+ num_bucket_build_workers = 16
31
+ dtype = "bf16"
32
+ plugin = "zero2"
33
+
34
+ # Model settings
35
+ vae = dict(
36
+ type="OpenSoraVAE_V1_2",
37
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
38
+ micro_frame_size=17,
39
+ micro_batch_size=4,
40
+ )
41
+ audio_vae = dict(
42
+ type="AudioLDM2",
43
+ from_pretrained="cvssp/audioldm2",
44
+ )
45
+ # text_encoder = dict(
46
+ # type="t5",
47
+ # from_pretrained="DeepFloyd/t5-v1_1-xxl",
48
+ # model_max_length=300,
49
+ # )
50
+
51
+
52
+ # Log settings
53
+ seed = 42
54
+ outputs = "outputs"
55
+ wandb = False
56
+ epochs = 1000
57
+ log_every = 10
58
+ ckpt_every = 50
59
+ save_total_limit = 2
60
+
61
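+ bin_size = 64 # 1GB, 4195 bins (NOTE(review): extract_st_prior_va.py carries the same figures with bin_size = 16; confirm which value they describe)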
62
+ log_time = False
63
+
64
+ # audio settings
65
+ sampling_rate = 16000
66
+ mel_bins = 64
67
+ audio_cfg = {
68
+ "preprocessing": {
69
+ "audio": {
70
+ "sampling_rate": sampling_rate,
71
+ "max_wav_value": 32768.0,
72
+ "duration": 10.24,
73
+ },
74
+ "stft": {
75
+ "filter_length": 1024,
76
+ "hop_length": 160,
77
+ "win_length": 1024,
78
+ },
79
+ "mel": {
80
+ "n_mel_channels": mel_bins,
81
+ "mel_fmin": 0,
82
+ "mel_fmax": 8000,
83
+ }
84
+ },
85
+ "augmentation": {
86
+ "mixup": 0.0,
87
+ }
88
+ }
configs/javisdit-v0-1/train/stage1_audio.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset settings
2
+ audio_only=True
3
+
4
+ dataset = dict(
5
+ type="VariableVideoAudioTextDataset",
6
+ transform_name="resize_crop",
7
+ audio_transform_name="mel_spec_audioldm2",
8
+ audio_only=audio_only
9
+ )
10
+
11
+ # webvid
12
+ bucket_config = { # 5s/it, randomly assigning raw videos to pre-defined and proper buckets
13
+ # resolution : {num_frames : (accept_prob or (accept_prob, accept_prob) pair, batch_size)}
14
+ # # 28G?
15
+ # "144p": {51: (1.0, 96), 102: ((1.0, 0.7), 48), 204: ((1.0, 0.3), 24), 408: ((1.0, 0.5), 12)},
16
+ # # 32G
17
+ # "144p": {51: (1.0, 128), 102: ((1.0, 0.7), 64), 204: ((1.0, 0.3), 32), 408: ((1.0, 0.5), 16)},
18
+ # 45G
19
+ "144p": {51: (1.0, 256), 102: ((1.0, 0.7), 128), 204: ((1.0, 0.3), 64), 408: ((1.0, 0.5), 32)},
20
+ # 60-70G
21
+ # "144p": {51: (1.0, 384), 102: ((1.0, 0.7), 192), 204: ((1.0, 0.3), 96), 408: ((1.0, 0.5), 48)},
22
+ # 80G+
23
+ # "144p": {51: (1.0, 512), 102: ((1.0, 0.7), 256), 204: ((1.0, 0.3), 128), 408: ((1.0, 0.5), 64)},
24
+ }
25
+
26
+ grad_checkpoint = True
27
+
28
+ # Acceleration settings
29
+ num_workers = 16
30
+ num_bucket_build_workers = 8
31
+ dtype = "bf16"
32
+ plugin = "zero2"
33
+
34
+ # Model settings
35
+ model = dict(
36
+ type="VASTDiT3-XL/2",
37
+ weight_init_from=[
38
+ "./checkpoints/OpenSora-STDiT-v3/model.safetensors"
39
+ ],
40
+ qk_norm=True,
41
+ enable_flash_attn=True,
42
+ enable_layernorm_kernel=True,
43
+ # audio generation only
44
+ only_train_audio=True,
45
+ freeze_video_branch=True,
46
+ freeze_y_embedder=False,
47
+ train_st_prior_attn=False,
48
+ train_va_cross_attn=False,
49
+ audio_patch_size=(4, 1)
50
+ )
51
+ vae = dict(
52
+ type="OpenSoraVAE_V1_2",
53
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
54
+ micro_frame_size=17,
55
+ micro_batch_size=4,
56
+ )
57
+ audio_vae = dict(
58
+ type="AudioLDM2",
59
+ from_pretrained="cvssp/audioldm2",
60
+ )
61
+ text_encoder = dict(
62
+ type="t5",
63
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
64
+ model_max_length=300,
65
+ )
66
+ scheduler = dict(
67
+ type="rflow",
68
+ use_timestep_transform=True,
69
+ sample_method="logit-normal",
70
+ )
71
+
72
+ # Log settings
73
+ seed = 42
74
+ outputs = "outputs"
75
+ wandb = False
76
+ epochs = 50
77
+ log_every = 10
78
+ ckpt_every = 250
79
+ save_total_limit = 2
80
+
81
+ # optimization settings
82
+ load = None
83
+ grad_clip = 1.0
84
+ lr = 1e-4
85
+ ema_decay = 0.99
86
+ adam_eps = 1e-15
87
+ warmup_steps = 1000
88
+
89
+ # audio settings
90
+ sampling_rate = 16000
91
+ mel_bins = 64
92
+ audio_cfg = {
93
+ "preprocessing": {
94
+ "audio": {
95
+ "sampling_rate": sampling_rate,
96
+ "max_wav_value": 32768.0,
97
+ "duration": 10.24,
98
+ },
99
+ "stft": {
100
+ "filter_length": 1024,
101
+ "hop_length": 160,
102
+ "win_length": 1024,
103
+ },
104
+ "mel": {
105
+ "n_mel_channels": mel_bins,
106
+ "mel_fmin": 0,
107
+ "mel_fmax": 8000,
108
+ }
109
+ },
110
+ "augmentation": {
111
+ "mixup": 0.0,
112
+ }
113
+ }
configs/javisdit-v0-1/train/stage2_prior.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spatial_token_num = 32
2
+ temporal_token_num = 32
3
+ st_prior_channel = 128
4
+
5
+ # Dataset settings
6
+ dataset = dict(
7
+ type="VariableVideoAudioTextDataset",
8
+ direct_load_video_clip=True,
9
+ transform_name="resize_crop",
10
+ audio_transform_name="mel_spec_audioldm2",
11
+ neg_aug=1,
12
+ neg_aug_kwargs=dict(
13
+ video_augmentation_pool="./data/st_prior/video/SA-V",
14
+ audio_augmentation_pool="./data/st_prior/audio/TAVGBench",
15
+ ),
16
+ )
17
+ load_text_features = False # TODO: text encoder does not take too much time
18
+
19
+ # webvid
20
+ bucket_config = { # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
21
+ # resolution : {num_frames : (accept_prob or (accept_prob, accept_prob) pair, batch_size)}
22
+ "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
23
+ # ---
24
+ "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
25
+ "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
26
+ # ---
27
+ "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
28
+ "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
29
+ # ---
30
+ "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
31
+ # ---
32
+ "720p": {51: (0.03, 1)},
33
+ "1024": {51: (0.03, 1)},
34
+ }
35
+
36
+ # Acceleration settings
37
+ num_workers = 4
38
+ num_bucket_build_workers = 16
39
+ dtype = "bf16"
40
+ grad_checkpoint = True
41
+ plugin = "zero2"
42
+
43
+ # Model settings
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
47
+ micro_frame_size=17,
48
+ micro_batch_size=4,
49
+ )
50
+ audio_vae = dict(
51
+ type="AudioLDM2",
52
+ from_pretrained="cvssp/audioldm2",
53
+ )
54
+ model = dict(
55
+ type="STIBPrior",
56
+ imagebind_ckpt_path="./checkpoints",
57
+ spatial_token_num=spatial_token_num,
58
+ temporal_token_num=temporal_token_num,
59
+ out_dim=st_prior_channel,
60
+ hidden_size=512,
61
+ apply_sampling=True,
62
+ encode_va=True,
63
+ qk_norm=True,
64
+ enable_flash_attn=True,
65
+ enable_layernorm_kernel=True
66
+ )
67
+
68
+ # Log settings
69
+ seed = 42
70
+ outputs = "outputs"
71
+ wandb = False
72
+ epochs = 2
73
+ log_every = 10
74
+ ckpt_every = 200
75
+ save_total_limit = 2
76
+
77
+ # optimization settings
78
+ load = None
79
+ grad_clip = 1.0
80
+ lr = 1e-5
81
+ warmup_steps = 100
82
+
83
+ # audio settings
84
+ sampling_rate = 16000
85
+ mel_bins = 64
86
+ audio_cfg = {
87
+ "preprocessing": {
88
+ "audio": {
89
+ "sampling_rate": sampling_rate,
90
+ "max_wav_value": 32768.0,
91
+ "duration": 10.24,
92
+ },
93
+ "stft": {
94
+ "filter_length": 1024,
95
+ "hop_length": 160,
96
+ "win_length": 1024,
97
+ },
98
+ "mel": {
99
+ "n_mel_channels": mel_bins,
100
+ "mel_fmin": 0,
101
+ "mel_fmax": 8000,
102
+ }
103
+ },
104
+ "augmentation": {
105
+ "mixup": 0.0,
106
+ }
107
+ }
configs/javisdit-v0-1/train/stage2_prior_feat.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spatial_token_num = 32
2
+ temporal_token_num = 32
3
+ st_prior_channel = 128
4
+
5
+ # Dataset settings
6
+ dataset = dict(type="BatchFeatureDataset")
7
+ load_va_features = True
8
+ load_text_features = False # TODO: text encoder does not take too much time
9
+
10
+ # Acceleration settings
11
+ num_workers = 4
12
+ num_bucket_build_workers = 16
13
+ dtype = "bf16"
14
+ grad_checkpoint = True
15
+ plugin = "zero2"
16
+
17
+ # Model settings
18
+ # vae = dict(
19
+ # type="OpenSoraVAE_V1_2",
20
+ # from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
21
+ # micro_frame_size=17,
22
+ # micro_batch_size=4,
23
+ # )
24
+ # audio_vae = dict(
25
+ # type="AudioLDM2",
26
+ # from_pretrained="cvssp/audioldm2",
27
+ # )
28
+ model = dict(
29
+ type="STIBPrior",
30
+ imagebind_ckpt_path="./checkpoints",
31
+ spatial_token_num=spatial_token_num,
32
+ temporal_token_num=temporal_token_num,
33
+ out_dim=st_prior_channel,
34
+ hidden_size=512,
35
+ apply_sampling=True,
36
+ encode_va=True,
37
+ qk_norm=True,
38
+ enable_flash_attn=True,
39
+ enable_layernorm_kernel=True
40
+ )
41
+
42
+ # Log settings
43
+ seed = 42
44
+ outputs = "outputs"
45
+ wandb = False
46
+ epochs = 2
47
+ log_every = 10
48
+ ckpt_every = 200
49
+ save_total_limit = 2
50
+
51
+ # optimization settings
52
+ load = None
53
+ grad_clip = 1.0
54
+ lr = 1e-5
55
+ warmup_steps = 100
56
+
57
+ # audio settings
58
+ sampling_rate = 16000
59
+ mel_bins = 64
60
+ audio_cfg = {
61
+ "preprocessing": {
62
+ "audio": {
63
+ "sampling_rate": sampling_rate,
64
+ "max_wav_value": 32768.0,
65
+ "duration": 10.24,
66
+ },
67
+ "stft": {
68
+ "filter_length": 1024,
69
+ "hop_length": 160,
70
+ "win_length": 1024,
71
+ },
72
+ "mel": {
73
+ "n_mel_channels": mel_bins,
74
+ "mel_fmin": 0,
75
+ "mel_fmax": 8000,
76
+ }
77
+ },
78
+ "augmentation": {
79
+ "mixup": 0.0,
80
+ }
81
+ }
configs/javisdit-v0-1/train/stage3_jav.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoAudioTextDataset",
4
+ direct_load_video_clip=True,
5
+ transform_name="resize_crop",
6
+ audio_transform_name="mel_spec_audioldm2",
7
+ )
8
+ load_text_features = False
9
+
10
+ # webvid
11
+ bucket_config = { # 20s/it, randomly assigning raw videos to pre-defined and proper buckets
12
+ # resolution : {num_frames : (accept_prob or (accept_prob, accept_prob) pair, batch_size)}
13
+ "144p": {51: (1.0, 16), 102: ((1.0, 0.5), 12), 204: ((1.0, 0.5), 6), 408: (1.0, 3)},
14
+ # ---
15
+ "256": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
16
+ "240p": {51: (0.5, 10), 102: ((0.5, 0.5), 4), 204: ((0.5, 0.5), 2), 408: (1.0, 1)},
17
+ # ---
18
+ "360p": {51: (0.3, 4), 102: ((0.3, 0.5), 2), 204: ((0.3, 0.5), 1)},
19
+ "512": {51: (0.2, 4), 102: ((0.2, 0.5), 2), 204: ((0.2, 0.4), 1)},
20
+ # ---
21
+ "480p": {51: (0.2, 2), 102: ((0.2, 0.5), 1)},
22
+ # ---
23
+ "720p": {51: (0.03, 1)},
24
+ "1024": {51: (0.03, 1)},
25
+ }
26
+ grad_checkpoint = True
27
+
28
+ # Acceleration settings
29
+ num_workers = 8
30
+ num_bucket_build_workers = 16
31
+ dtype = "bf16"
32
+ plugin = "zero2"
33
+
34
+ # Model settings
35
+ spatial_prior_len = 32
36
+ temporal_prior_len = 32
37
+ st_prior_channel = 128
38
+ model = dict(
39
+ type="VASTDiT3-XL/2",
40
+ weight_init_from=[
41
+ "./checkpoints/JavisDiT-v0.1-audio",
42
+ "./checkpoints/OpenSora-STDiT-v3/model.safetensors",
43
+ ],
44
+ qk_norm=True,
45
+ enable_flash_attn=True,
46
+ enable_layernorm_kernel=True,
47
+ # video-audio joint generation
48
+ only_train_audio=False,
49
+ freeze_y_embedder=True,
50
+ freeze_video_branch=True,
51
+ freeze_audio_branch=True,
52
+ train_st_prior_attn=True,
53
+ train_va_cross_attn=True,
54
+ spatial_prior_len=spatial_prior_len,
55
+ temporal_prior_len=temporal_prior_len,
56
+ st_prior_channel=st_prior_channel,
57
+ audio_patch_size=(4, 1)
58
+ )
59
+ vae = dict(
60
+ type="OpenSoraVAE_V1_2",
61
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
62
+ micro_frame_size=17,
63
+ micro_batch_size=4,
64
+ )
65
+ audio_vae = dict(
66
+ type="AudioLDM2",
67
+ from_pretrained="cvssp/audioldm2",
68
+ )
69
+ text_encoder = dict(
70
+ type="t5",
71
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
72
+ model_max_length=300,
73
+ # shardformer=True,
74
+ )
75
+ prior_encoder = dict(
76
+ type="STIBPrior",
77
+ imagebind_ckpt_path="./checkpoints",
78
+ from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
79
+ spatial_token_num=spatial_prior_len,
80
+ temporal_token_num=temporal_prior_len,
81
+ out_dim=st_prior_channel,
82
+ apply_sampling=True,
83
+ encode_va=False,
84
+ qk_norm=True,
85
+ enable_flash_attn=True,
86
+ enable_layernorm_kernel=True,
87
+ )
88
+ scheduler = dict(
89
+ type="rflow",
90
+ use_timestep_transform=True,
91
+ sample_method="logit-normal",
92
+ )
93
+
94
+ # Mask settings
95
+ # The per-strategy ratios below sum to 0.30 (30% in total)
96
+ mask_ratios = {
97
+ "random": 0.01,
98
+ "video_to_audio": 0.05, # func1
99
+ "audio_to_video": 0.05, # func2
100
+ "sound_image_animate": 0.03,
101
+ "intepolate": 0.03,
102
+ "quarter_random": 0.005,
103
+ "quarter_head": 0.05, # func3
104
+ "quarter_tail": 0.005,
105
+ "quarter_head_tail": 0.005,
106
+ "image_random": 0.005,
107
+ "image_head": 0.05, # func4
108
+ "image_tail": 0.005,
109
+ "image_head_tail": 0.005,
110
+ }
111
+ # Log settings
112
+ seed = 42
113
+ outputs = "outputs"
114
+ wandb = False
115
+ epochs = 2
116
+ log_every = 10
117
+ ckpt_every = 50
118
+ save_total_limit = 2
119
+
120
+ # optimization settings
121
+ load = None
122
+ grad_clip = 1.0
123
+ lr = 1e-4
124
+ ema_decay = 0.99
125
+ adam_eps = 1e-15
126
+ warmup_steps = 1000
127
+
128
+ # audio settings
129
+ sampling_rate = 16000
130
+ mel_bins = 64
131
+ audio_cfg = {
132
+ "preprocessing": {
133
+ "audio": {
134
+ "sampling_rate": sampling_rate,
135
+ "max_wav_value": 32768.0,
136
+ "duration": 10.24,
137
+ },
138
+ "stft": {
139
+ "filter_length": 1024,
140
+ "hop_length": 160,
141
+ "win_length": 1024,
142
+ },
143
+ "mel": {
144
+ "n_mel_channels": mel_bins,
145
+ "mel_fmin": 0,
146
+ "mel_fmax": 8000,
147
+ }
148
+ },
149
+ "augmentation": {
150
+ "mixup": 0.0,
151
+ }
152
+ }
configs/javisdit-v0-1/train/stage3_jav_feat.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset settings
2
+ dataset = dict(type="BatchFeatureDataset")
3
+ load_va_features = True
4
+ load_text_features = False
5
+
6
+ # Acceleration settings
7
+ num_workers = 8
8
+ grad_checkpoint = True
9
+ dtype = "bf16"
10
+ plugin = "zero2"
11
+
12
+ # Model settings
13
+ spatial_prior_len = 32
14
+ temporal_prior_len = 32
15
+ st_prior_channel = 128
16
+ model = dict(
17
+ type="VASTDiT3-XL/2",
18
+ weight_init_from=[
19
+ "./checkpoints/JavisDiT-v0.1-audio",
20
+ "./checkpoints/OpenSora-STDiT-v3/model.safetensors",
21
+ ],
22
+ qk_norm=True,
23
+ enable_flash_attn=True,
24
+ enable_layernorm_kernel=True,
25
+ # video-audio joint generation
26
+ only_train_audio=False,
27
+ freeze_y_embedder=True,
28
+ freeze_video_branch=True,
29
+ freeze_audio_branch=True,
30
+ train_st_prior_attn=True,
31
+ train_va_cross_attn=True,
32
+ spatial_prior_len=spatial_prior_len,
33
+ temporal_prior_len=temporal_prior_len,
34
+ st_prior_channel=st_prior_channel,
35
+ audio_patch_size=(4, 1)
36
+ )
37
+ # vae = dict(
38
+ # type="OpenSoraVAE_V1_2",
39
+ # from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
40
+ # micro_frame_size=17,
41
+ # micro_batch_size=4,
42
+ # )
43
+ # audio_vae = dict(
44
+ # type="AudioLDM2",
45
+ # from_pretrained="cvssp/audioldm2",
46
+ # )
47
+ text_encoder = dict(
48
+ type="t5",
49
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
50
+ model_max_length=300,
51
+ # shardformer=True,
52
+ )
53
+ prior_encoder = dict(
54
+ type="STIBPrior",
55
+ imagebind_ckpt_path="./checkpoints",
56
+ from_pretrained="JavisDiT/JavisDiT-v0.1-prior",
57
+ spatial_token_num=spatial_prior_len,
58
+ temporal_token_num=temporal_prior_len,
59
+ out_dim=st_prior_channel,
60
+ apply_sampling=True,
61
+ encode_va=False,
62
+ qk_norm=True,
63
+ enable_flash_attn=True,
64
+ enable_layernorm_kernel=True,
65
+ )
66
+ scheduler = dict(
67
+ type="rflow",
68
+ use_timestep_transform=True,
69
+ sample_method="logit-normal",
70
+ )
71
+
72
+ # Mask settings
73
+ # The per-strategy ratios below sum to 0.30 (30% in total)
74
+ mask_ratios = {
75
+ "random": 0.01,
76
+ "video_to_audio": 0.05, # func1
77
+ "audio_to_video": 0.05, # func2
78
+ "sound_image_animate": 0.03,
79
+ "intepolate": 0.03,
80
+ "quarter_random": 0.005,
81
+ "quarter_head": 0.05, # func3
82
+ "quarter_tail": 0.005,
83
+ "quarter_head_tail": 0.005,
84
+ "image_random": 0.005,
85
+ "image_head": 0.05, # func4
86
+ "image_tail": 0.005,
87
+ "image_head_tail": 0.005,
88
+ }
89
+ # Log settings
90
+ seed = 42
91
+ outputs = "outputs"
92
+ wandb = False
93
+ epochs = 2
94
+ log_every = 10
95
+ ckpt_every = 50
96
+ save_total_limit = 2
97
+
98
+ # optimization settings
99
+ load = None
100
+ grad_clip = 1.0
101
+ lr = 1e-4
102
+ ema_decay = 0.99
103
+ adam_eps = 1e-15
104
+ warmup_steps = 1000
105
+
106
+ # audio settings
107
+ sampling_rate = 16000
108
+ mel_bins = 64
109
+ audio_cfg = {
110
+ "preprocessing": {
111
+ "audio": {
112
+ "sampling_rate": sampling_rate,
113
+ "max_wav_value": 32768.0,
114
+ "duration": 10.24,
115
+ },
116
+ "stft": {
117
+ "filter_length": 1024,
118
+ "hop_length": 160,
119
+ "win_length": 1024,
120
+ },
121
+ "mel": {
122
+ "n_mel_channels": mel_bins,
123
+ "mel_fmin": 0,
124
+ "mel_fmax": 8000,
125
+ }
126
+ },
127
+ "augmentation": {
128
+ "mixup": 0.0,
129
+ }
130
+ }
configs/latte/inference/16x256x256-class.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ fps = 8
3
+ image_size = (256, 256)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="Latte-XL/2",
8
+ condition="label_101",
9
+ from_pretrained="Latte-XL-2-256x256-ucf101.pt",
10
+ )
11
+ vae = dict(
12
+ type="VideoAutoencoderKL",
13
+ from_pretrained="stabilityai/sd-vae-ft-ema",
14
+ )
15
+ text_encoder = dict(
16
+ type="classes",
17
+ num_classes=101,
18
+ )
19
+ scheduler = dict(
20
+ type="dpm-solver",
21
+ num_sampling_steps=20,
22
+ cfg_scale=4.0,
23
+ )
24
+ dtype = "bf16"
25
+
26
+ # Others
27
+ batch_size = 2
28
+ seed = 42
29
+ prompt_path = "./assets/texts/ucf101_id.txt"
30
+ save_dir = "./samples/samples/"
configs/latte/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ fps = 8
3
+ image_size = (256, 256)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="Latte-XL/2",
8
+ condition="text",
9
+ from_pretrained="PRETRAINED_MODEL",
10
+ )
11
+ vae = dict(
12
+ type="VideoAutoencoderKL",
13
+ from_pretrained="stabilityai/sd-vae-ft-ema",
14
+ )
15
+ text_encoder = dict(
16
+ type="clip",
17
+ from_pretrained="openai/clip-vit-base-patch32",
18
+ model_max_length=77,
19
+ )
20
+ scheduler = dict(
21
+ type="dpm-solver",
22
+ num_sampling_steps=20,
23
+ cfg_scale=4.0,
24
+ )
25
+ dtype = "bf16"
26
+
27
+ # Others
28
+ batch_size = 2
29
+ seed = 42
30
+ prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/latte/train/16x256x256.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="Latte-XL/2",
20
+ enable_flash_attn=True,
21
+ enable_layernorm_kernel=True,
22
+ )
23
+ vae = dict(
24
+ type="VideoAutoencoderKL",
25
+ from_pretrained="stabilityai/sd-vae-ft-ema",
26
+ )
27
+ text_encoder = dict(
28
+ type="clip",
29
+ from_pretrained="openai/clip-vit-base-patch32",
30
+ model_max_length=77,
31
+ )
32
+ scheduler = dict(
33
+ type="iddpm",
34
+ timestep_respacing="",
35
+ )
36
+
37
+ # Others
38
+ seed = 42
39
+ outputs = "outputs"
40
+ wandb = False
41
+
42
+ epochs = 1000
43
+ log_every = 10
44
+ ckpt_every = 1000
45
+ load = None
46
+
47
+ batch_size = 8
48
+ lr = 2e-5
49
+ grad_clip = 1.0
configs/opensora-v1-1/inference/sample-ref.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Condition
8
+ prompt_path = None
9
+ prompt = [
10
+ 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
11
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
12
+ 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
13
+ 'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
14
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
15
+ '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
16
+ ]
17
+
18
+ loop = 2
19
+ condition_frame_length = 4
20
+ # (
21
+ # loop id, [the loop index of the condition image or video]
22
+ # reference id, [the index of the condition image or video in the reference_path]
23
+ # reference start, [the start frame of the condition image or video]
24
+ # target start, [the location to insert]
25
+ # length, [the number of frames to insert]
26
+ # edit_ratio [the edit rate of the condition image or video]
27
+ # )
28
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
29
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
30
+
31
+ # Define model
32
+ model = dict(
33
+ type="STDiT2-XL/2",
34
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
35
+ input_sq_size=512,
36
+ qk_norm=True,
37
+ qk_norm_legacy=True,
38
+ enable_flash_attn=True,
39
+ enable_layernorm_kernel=True,
40
+ )
41
+ vae = dict(
42
+ type="VideoAutoencoderKL",
43
+ from_pretrained="stabilityai/sd-vae-ft-ema",
44
+ cache_dir=None, # "/mnt/hdd/cached_models",
45
+ micro_batch_size=4,
46
+ )
47
+ text_encoder = dict(
48
+ type="t5",
49
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
50
+ cache_dir=None, # "/mnt/hdd/cached_models",
51
+ model_max_length=200,
52
+ )
53
+ scheduler = dict(
54
+ type="iddpm",
55
+ num_sampling_steps=100,
56
+ cfg_scale=7.0,
57
+ cfg_channel=3, # or None
58
+ )
59
+ dtype = "bf16"
60
+
61
+ # Others
62
+ batch_size = 1
63
+ seed = 42
64
+ save_dir = "./samples/samples/"
configs/opensora-v1-1/inference/sample.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Define model
8
+ model = dict(
9
+ type="STDiT2-XL/2",
10
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
11
+ input_sq_size=512,
12
+ qk_norm=True,
13
+ qk_norm_legacy=True,
14
+ enable_flash_attn=True,
15
+ enable_layernorm_kernel=True,
16
+ )
17
+ vae = dict(
18
+ type="VideoAutoencoderKL",
19
+ from_pretrained="stabilityai/sd-vae-ft-ema",
20
+ cache_dir=None, # "/mnt/hdd/cached_models",
21
+ micro_batch_size=4,
22
+ )
23
+ text_encoder = dict(
24
+ type="t5",
25
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
26
+ cache_dir=None, # "/mnt/hdd/cached_models",
27
+ model_max_length=200,
28
+ )
29
+ scheduler = dict(
30
+ type="iddpm",
31
+ num_sampling_steps=100,
32
+ cfg_scale=7.0,
33
+ cfg_channel=3, # or None
34
+ )
35
+ dtype = "bf16"
36
+
37
+ # Condition
38
+ prompt_path = "./assets/texts/t2v_samples.txt"
39
+ prompt = None # prompt has higher priority than prompt_path
40
+
41
+ # Others
42
+ batch_size = 1
43
+ seed = 42
44
+ save_dir = "./samples/samples/"
configs/opensora-v1-1/train/benchmark.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this file is only for batch size search and is not used for training
2
+
3
+ # Define dataset
4
+ dataset = dict(
5
+ type="VariableVideoTextDataset",
6
+ data_path=None,
7
+ num_frames=None,
8
+ frame_interval=3,
9
+ image_size=(None, None),
10
+ transform_name="resize_crop",
11
+ )
12
+
13
+ # bucket config format:
14
+ # 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
15
+ # 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
16
+ # 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
17
+ # 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
18
+ # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
19
+
20
+ bucket_config = {
21
+ # == manual search ==
22
+ # "240p": {128: (1.0, 2)}, # 4.28s/it
23
+ # "240p": {64: (1.0, 4)},
24
+ # "240p": {32: (1.0, 8)}, # 4.6s/it
25
+ # "240p": {16: (1.0, 16)}, # 4.6s/it
26
+ # "480p": {16: (1.0, 4)}, # 4.6s/it
27
+ # "720p": {16: (1.0, 2)}, # 5.89s/it
28
+ # "256": {1: (1.0, 256)}, # 4.5s/it
29
+ # "512": {1: (1.0, 96)}, # 4.7s/it
30
+ # "512": {1: (1.0, 128)}, # 6.3s/it
31
+ # "480p": {1: (1.0, 50)}, # 4.0s/it
32
+ # "1024": {1: (1.0, 32)}, # 6.8s/it
33
+ # "1024": {1: (1.0, 20)}, # 4.3s/it
34
+ # "1080p": {1: (1.0, 16)}, # 8.6s/it
35
+ # "1080p": {1: (1.0, 8)}, # 4.4s/it
36
+ # == stage 2 ==
37
+ # "240p": {
38
+ # 16: (1.0, (2, 32)),
39
+ # 32: (1.0, (2, 16)),
40
+ # 64: (1.0, (2, 8)),
41
+ # 128: (1.0, (2, 6)),
42
+ # },
43
+ # "256": {1: (1.0, (128, 300))},
44
+ # "512": {1: (0.5, (64, 128))},
45
+ # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
46
+ # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
47
+ # "1024": {1: (0.3, (8, 64))},
48
+ # "1080p": {1: (0.3, (2, 32))},
49
+ # == stage 3 ==
50
+ "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
51
+ }
52
+
53
+
54
+ # Define acceleration
55
+ num_workers = 4
56
+ num_bucket_build_workers = 16
57
+ dtype = "bf16"
58
+ grad_checkpoint = True
59
+ plugin = "zero2"
60
+ sp_size = 1
61
+
62
+ # Define model
63
+ model = dict(
64
+ type="STDiT2-XL/2",
65
+ from_pretrained=None,
66
+ input_sq_size=512, # pretrained model is trained on 512x512
67
+ qk_norm=True,
68
+ qk_norm_legacy=True,
69
+ enable_flash_attn=True,
70
+ enable_layernorm_kernel=True,
71
+ )
72
+ vae = dict(
73
+ type="VideoAutoencoderKL",
74
+ from_pretrained="stabilityai/sd-vae-ft-ema",
75
+ micro_batch_size=4,
76
+ local_files_only=True,
77
+ )
78
+ text_encoder = dict(
79
+ type="t5",
80
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
81
+ model_max_length=200,
82
+ shardformer=True,
83
+ local_files_only=True,
84
+ )
85
+ scheduler = dict(
86
+ type="iddpm",
87
+ timestep_respacing="",
88
+ )
89
+
90
+ # Others
91
+ seed = 42
92
+ outputs = "outputs"
93
+ wandb = False
94
+
95
+ epochs = 1000
96
+ log_every = 10
97
+ ckpt_every = 1000
98
+ load = None
99
+
100
+ batch_size = None
101
+ lr = 2e-5
102
+ grad_clip = 1.0
configs/opensora-v1-1/train/image.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 6s/it
11
+ "256": {1: (1.0, 256)},
12
+ "512": {1: (1.0, 80)},
13
+ "480p": {1: (1.0, 52)},
14
+ "1024": {1: (1.0, 20)},
15
+ "1080p": {1: (1.0, 8)},
16
+ }
17
+
18
+ # Define acceleration
19
+ num_workers = 4
20
+ num_bucket_build_workers = 16
21
+ dtype = "bf16"
22
+ grad_checkpoint = True
23
+ plugin = "zero2"
24
+ sp_size = 1
25
+
26
+ # Define model
27
+ model = dict(
28
+ type="STDiT2-XL/2",
29
+ from_pretrained=None,
30
+ input_sq_size=512, # pretrained model is trained on 512x512
31
+ qk_norm=True,
32
+ qk_norm_legacy=True,
33
+ enable_flash_attn=True,
34
+ enable_layernorm_kernel=True,
35
+ )
36
+ vae = dict(
37
+ type="VideoAutoencoderKL",
38
+ from_pretrained="stabilityai/sd-vae-ft-ema",
39
+ micro_batch_size=4,
40
+ local_files_only=True,
41
+ )
42
+ text_encoder = dict(
43
+ type="t5",
44
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
45
+ model_max_length=200,
46
+ shardformer=True,
47
+ local_files_only=True,
48
+ )
49
+ scheduler = dict(
50
+ type="iddpm",
51
+ timestep_respacing="",
52
+ )
53
+
54
+ # Others
55
+ seed = 42
56
+ outputs = "outputs"
57
+ wandb = False
58
+
59
+ epochs = 1000
60
+ log_every = 10
61
+ ckpt_every = 500
62
+ load = None
63
+
64
+ batch_size = 10 # only for logging
65
+ lr = 2e-5
66
+ grad_clip = 1.0