Spaces:
Running on Zero
Running on Zero
Jack Wu committed on
Commit ·
22b2290
1
Parent(s): 3886668
Remove non-inference files from all three model folders
Browse files
Keep only what is imported at runtime by app.py:
- TARO: remove dataset.py, infer.py, loss.py, train.py, train.sh, preprocess/, README.md
- MMAudio: remove train.py, batch_eval.py, eval_onsets.py, demo.py, gradio_demo.py,
config/, docs/, sets/, training/, README.md, LICENSE, .gitignore
- HunyuanFoley: remove infer.py, gradio_app.py, tests/, assets/, build_package.sh,
download_test_videos.sh, DEVELOPMENT.md, INSTALL.md, LICENSE, MANIFEST.in,
NOTICE, pytest.ini, README.md, .gitattributes, .gitignore, .pre-commit-config.yaml
Update .gitignore to permanently exclude all of the above.
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +47 -0
- HunyuanVideo-Foley/.gitattributes +0 -3
- HunyuanVideo-Foley/.gitignore +0 -159
- HunyuanVideo-Foley/.pre-commit-config.yaml +0 -38
- HunyuanVideo-Foley/DEVELOPMENT.md +0 -187
- HunyuanVideo-Foley/INSTALL.md +0 -203
- HunyuanVideo-Foley/LICENSE +0 -77
- HunyuanVideo-Foley/MANIFEST.in +0 -38
- HunyuanVideo-Foley/NOTICE +0 -27
- HunyuanVideo-Foley/README.md +0 -519
- HunyuanVideo-Foley/build_package.sh +0 -58
- HunyuanVideo-Foley/download_test_videos.sh +0 -11
- HunyuanVideo-Foley/gradio_app.py +0 -834
- HunyuanVideo-Foley/infer.py +0 -304
- HunyuanVideo-Foley/pytest.ini +0 -11
- HunyuanVideo-Foley/tests/__init__.py +0 -1
- HunyuanVideo-Foley/tests/test_config_utils.py +0 -89
- HunyuanVideo-Foley/tests/test_media_utils.py +0 -82
- MMAudio/.gitignore +0 -146
- MMAudio/LICENSE +0 -21
- MMAudio/README.md +0 -198
- MMAudio/batch_eval.py +0 -110
- MMAudio/config/__init__.py +0 -0
- MMAudio/config/base_config.yaml +0 -62
- MMAudio/config/data/base.yaml +0 -70
- MMAudio/config/eval_config.yaml +0 -17
- MMAudio/config/eval_data/base.yaml +0 -22
- MMAudio/config/hydra/job_logging/custom-eval.yaml +0 -32
- MMAudio/config/hydra/job_logging/custom-no-rank.yaml +0 -32
- MMAudio/config/hydra/job_logging/custom-simplest.yaml +0 -26
- MMAudio/config/hydra/job_logging/custom.yaml +0 -33
- MMAudio/config/train_config.yaml +0 -41
- MMAudio/demo.py +0 -141
- MMAudio/docs/EVAL.md +0 -23
- MMAudio/docs/MODELS.md +0 -50
- MMAudio/docs/TRAINING.md +0 -184
- MMAudio/docs/demo.html +0 -81
- MMAudio/docs/images/icon.png +0 -0
- MMAudio/docs/index.html +0 -156
- MMAudio/docs/style.css +0 -78
- MMAudio/docs/style_videos.css +0 -52
- MMAudio/docs/video_gen.html +0 -254
- MMAudio/docs/video_main.html +0 -98
- MMAudio/docs/video_vgg.html +0 -452
- MMAudio/eval_onsets.py +0 -141
- MMAudio/gradio_demo.py +0 -343
- MMAudio/sets/vgg-test.tsv +0 -0
- MMAudio/sets/vgg-train.tsv +0 -0
- MMAudio/sets/vgg-val.tsv +0 -2049
- MMAudio/train.py +0 -209
.gitignore
CHANGED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ---- TARO: training / preprocessing only ----
|
| 2 |
+
TARO/dataset.py
|
| 3 |
+
TARO/infer.py
|
| 4 |
+
TARO/loss.py
|
| 5 |
+
TARO/train.py
|
| 6 |
+
TARO/train.sh
|
| 7 |
+
TARO/preprocess/
|
| 8 |
+
TARO/README.md
|
| 9 |
+
|
| 10 |
+
# ---- MMAudio: training / eval / docs only ----
|
| 11 |
+
MMAudio/batch_eval.py
|
| 12 |
+
MMAudio/eval_onsets.py
|
| 13 |
+
MMAudio/train.py
|
| 14 |
+
MMAudio/demo.py
|
| 15 |
+
MMAudio/gradio_demo.py
|
| 16 |
+
MMAudio/config/
|
| 17 |
+
MMAudio/docs/
|
| 18 |
+
MMAudio/sets/
|
| 19 |
+
MMAudio/training/
|
| 20 |
+
MMAudio/README.md
|
| 21 |
+
MMAudio/.gitignore
|
| 22 |
+
MMAudio/LICENSE
|
| 23 |
+
|
| 24 |
+
# ---- HunyuanFoley: build / test / docs only ----
|
| 25 |
+
HunyuanVideo-Foley/.gitattributes
|
| 26 |
+
HunyuanVideo-Foley/.gitignore
|
| 27 |
+
HunyuanVideo-Foley/.pre-commit-config.yaml
|
| 28 |
+
HunyuanVideo-Foley/assets/
|
| 29 |
+
HunyuanVideo-Foley/build_package.sh
|
| 30 |
+
HunyuanVideo-Foley/download_test_videos.sh
|
| 31 |
+
HunyuanVideo-Foley/gradio_app.py
|
| 32 |
+
HunyuanVideo-Foley/infer.py
|
| 33 |
+
HunyuanVideo-Foley/DEVELOPMENT.md
|
| 34 |
+
HunyuanVideo-Foley/INSTALL.md
|
| 35 |
+
HunyuanVideo-Foley/LICENSE
|
| 36 |
+
HunyuanVideo-Foley/MANIFEST.in
|
| 37 |
+
HunyuanVideo-Foley/NOTICE
|
| 38 |
+
HunyuanVideo-Foley/pytest.ini
|
| 39 |
+
HunyuanVideo-Foley/README.md
|
| 40 |
+
HunyuanVideo-Foley/tests/
|
| 41 |
+
|
| 42 |
+
# ---- Python / IDE ----
|
| 43 |
+
__pycache__/
|
| 44 |
+
*.pyc
|
| 45 |
+
.venv/
|
| 46 |
+
.DS_Store
|
| 47 |
+
.idea/
|
HunyuanVideo-Foley/.gitattributes
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
assets/data_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
assets/model_arch.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/.gitignore
DELETED
|
@@ -1,159 +0,0 @@
|
|
| 1 |
-
# Byte-compiled / optimized / DLL files
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.py[cod]
|
| 4 |
-
*$py.class
|
| 5 |
-
|
| 6 |
-
# C extensions
|
| 7 |
-
*.so
|
| 8 |
-
|
| 9 |
-
# Distribution / packaging
|
| 10 |
-
.Python
|
| 11 |
-
build/
|
| 12 |
-
develop-eggs/
|
| 13 |
-
dist/
|
| 14 |
-
downloads/
|
| 15 |
-
eggs/
|
| 16 |
-
.eggs/
|
| 17 |
-
lib/
|
| 18 |
-
lib64/
|
| 19 |
-
parts/
|
| 20 |
-
sdist/
|
| 21 |
-
var/
|
| 22 |
-
wheels/
|
| 23 |
-
pip-wheel-metadata/
|
| 24 |
-
share/python-wheels/
|
| 25 |
-
*.egg-info/
|
| 26 |
-
.installed.cfg
|
| 27 |
-
*.egg
|
| 28 |
-
MANIFEST
|
| 29 |
-
|
| 30 |
-
# PyInstaller
|
| 31 |
-
# Usually these files are written by a python script from a template
|
| 32 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 33 |
-
*.manifest
|
| 34 |
-
*.spec
|
| 35 |
-
|
| 36 |
-
# Installer logs
|
| 37 |
-
pip-log.txt
|
| 38 |
-
pip-delete-this-directory.txt
|
| 39 |
-
|
| 40 |
-
# Unit test / coverage reports
|
| 41 |
-
htmlcov/
|
| 42 |
-
.tox/
|
| 43 |
-
.nox/
|
| 44 |
-
.coverage
|
| 45 |
-
.coverage.*
|
| 46 |
-
.cache
|
| 47 |
-
nosetests.xml
|
| 48 |
-
coverage.xml
|
| 49 |
-
*.cover
|
| 50 |
-
*.py,cover
|
| 51 |
-
.hypothesis/
|
| 52 |
-
.pytest_cache/
|
| 53 |
-
|
| 54 |
-
# Translations
|
| 55 |
-
*.mo
|
| 56 |
-
*.pot
|
| 57 |
-
|
| 58 |
-
# Django stuff:
|
| 59 |
-
*.log
|
| 60 |
-
local_settings.py
|
| 61 |
-
db.sqlite3
|
| 62 |
-
db.sqlite3-journal
|
| 63 |
-
|
| 64 |
-
# Flask stuff:
|
| 65 |
-
instance/
|
| 66 |
-
.webassets-cache
|
| 67 |
-
|
| 68 |
-
# Scrapy stuff:
|
| 69 |
-
.scrapy
|
| 70 |
-
|
| 71 |
-
# Sphinx documentation
|
| 72 |
-
docs/_build/
|
| 73 |
-
|
| 74 |
-
# PyBuilder
|
| 75 |
-
target/
|
| 76 |
-
|
| 77 |
-
# Jupyter Notebook
|
| 78 |
-
.ipynb_checkpoints
|
| 79 |
-
|
| 80 |
-
# IPython
|
| 81 |
-
profile_default/
|
| 82 |
-
ipython_config.py
|
| 83 |
-
|
| 84 |
-
# pyenv
|
| 85 |
-
.python-version
|
| 86 |
-
|
| 87 |
-
# pipenv
|
| 88 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 89 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 90 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 91 |
-
# install all needed dependencies.
|
| 92 |
-
#Pipfile.lock
|
| 93 |
-
|
| 94 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 95 |
-
__pypackages__/
|
| 96 |
-
|
| 97 |
-
# Celery stuff
|
| 98 |
-
celerybeat-schedule
|
| 99 |
-
celerybeat.pid
|
| 100 |
-
|
| 101 |
-
# SageMath parsed files
|
| 102 |
-
*.sage.py
|
| 103 |
-
|
| 104 |
-
# Environments
|
| 105 |
-
.env
|
| 106 |
-
.venv
|
| 107 |
-
env/
|
| 108 |
-
venv/
|
| 109 |
-
ENV/
|
| 110 |
-
env.bak/
|
| 111 |
-
venv.bak/
|
| 112 |
-
|
| 113 |
-
# Spyder project settings
|
| 114 |
-
.spyderproject
|
| 115 |
-
.spyproject
|
| 116 |
-
|
| 117 |
-
# Rope project settings
|
| 118 |
-
.ropeproject
|
| 119 |
-
|
| 120 |
-
# mkdocs documentation
|
| 121 |
-
/site
|
| 122 |
-
|
| 123 |
-
# mypy
|
| 124 |
-
.mypy_cache/
|
| 125 |
-
.dmypy.json
|
| 126 |
-
dmypy.json
|
| 127 |
-
|
| 128 |
-
# Pyre type checker
|
| 129 |
-
.pyre/
|
| 130 |
-
|
| 131 |
-
# ==========================================
|
| 132 |
-
# Custom settings
|
| 133 |
-
# ==========================================
|
| 134 |
-
|
| 135 |
-
# For MacOS
|
| 136 |
-
.DS_Store
|
| 137 |
-
|
| 138 |
-
# For IDEs
|
| 139 |
-
.idea/
|
| 140 |
-
.vscode/
|
| 141 |
-
pyrightconfig.json
|
| 142 |
-
.cursorignore
|
| 143 |
-
|
| 144 |
-
assets/
|
| 145 |
-
examples/
|
| 146 |
-
|
| 147 |
-
# For global settings
|
| 148 |
-
__*/
|
| 149 |
-
**/my_*
|
| 150 |
-
tmp*.*
|
| 151 |
-
.my*
|
| 152 |
-
# Model checkpoints
|
| 153 |
-
*.pt
|
| 154 |
-
*.ckpt
|
| 155 |
-
*.pth
|
| 156 |
-
*.safetensors
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
CLAUDE.md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/.pre-commit-config.yaml
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
repos:
|
| 2 |
-
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 3 |
-
rev: v4.4.0
|
| 4 |
-
hooks:
|
| 5 |
-
- id: trailing-whitespace
|
| 6 |
-
- id: end-of-file-fixer
|
| 7 |
-
- id: check-yaml
|
| 8 |
-
- id: check-added-large-files
|
| 9 |
-
- id: check-merge-conflict
|
| 10 |
-
- id: debug-statements
|
| 11 |
-
- id: check-docstring-first
|
| 12 |
-
|
| 13 |
-
- repo: https://github.com/psf/black
|
| 14 |
-
rev: 23.3.0
|
| 15 |
-
hooks:
|
| 16 |
-
- id: black
|
| 17 |
-
language_version: python3
|
| 18 |
-
args: [--line-length=120]
|
| 19 |
-
|
| 20 |
-
- repo: https://github.com/pycqa/isort
|
| 21 |
-
rev: 5.12.0
|
| 22 |
-
hooks:
|
| 23 |
-
- id: isort
|
| 24 |
-
args: [--profile, black, --line-length=120]
|
| 25 |
-
|
| 26 |
-
- repo: https://github.com/pycqa/flake8
|
| 27 |
-
rev: 6.0.0
|
| 28 |
-
hooks:
|
| 29 |
-
- id: flake8
|
| 30 |
-
args: [--max-line-length=120]
|
| 31 |
-
additional_dependencies: [flake8-docstrings]
|
| 32 |
-
|
| 33 |
-
- repo: https://github.com/pre-commit/mirrors-mypy
|
| 34 |
-
rev: v1.3.0
|
| 35 |
-
hooks:
|
| 36 |
-
- id: mypy
|
| 37 |
-
additional_dependencies: [types-all]
|
| 38 |
-
args: [--ignore-missing-imports]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/DEVELOPMENT.md
DELETED
|
@@ -1,187 +0,0 @@
|
|
| 1 |
-
# Development Guide
|
| 2 |
-
|
| 3 |
-
This document provides guidelines for developing and contributing to the HunyuanVideo-Foley project.
|
| 4 |
-
|
| 5 |
-
## Code Style and Quality
|
| 6 |
-
|
| 7 |
-
### Code Formatting
|
| 8 |
-
|
| 9 |
-
We use the following tools to maintain consistent code style:
|
| 10 |
-
|
| 11 |
-
- **Black**: Code formatter with 120 character line length
|
| 12 |
-
- **isort**: Import sorter compatible with Black
|
| 13 |
-
- **flake8**: Linting and style checking
|
| 14 |
-
- **mypy**: Static type checking
|
| 15 |
-
|
| 16 |
-
### Pre-commit Hooks
|
| 17 |
-
|
| 18 |
-
Install pre-commit hooks to automatically format code before commits:
|
| 19 |
-
|
| 20 |
-
```bash
|
| 21 |
-
pip install pre-commit
|
| 22 |
-
pre-commit install
|
| 23 |
-
```
|
| 24 |
-
|
| 25 |
-
### Manual Code Formatting
|
| 26 |
-
|
| 27 |
-
Format code manually:
|
| 28 |
-
|
| 29 |
-
```bash
|
| 30 |
-
# Format all Python files
|
| 31 |
-
black --line-length 120 .
|
| 32 |
-
|
| 33 |
-
# Sort imports
|
| 34 |
-
isort --profile black --line-length 120 .
|
| 35 |
-
|
| 36 |
-
# Check code style
|
| 37 |
-
flake8 --max-line-length 120
|
| 38 |
-
|
| 39 |
-
# Type checking
|
| 40 |
-
mypy --ignore-missing-imports .
|
| 41 |
-
```
|
| 42 |
-
|
| 43 |
-
## Project Structure
|
| 44 |
-
|
| 45 |
-
```
|
| 46 |
-
hunyuanvideo_foley/
|
| 47 |
-
├── models/ # Model implementations
|
| 48 |
-
│ ├── hifi_foley.py # Main model
|
| 49 |
-
│ ├── nn/ # Neural network layers
|
| 50 |
-
│ ├── dac_vae/ # Audio VAE
|
| 51 |
-
│ └── synchformer/ # Synchronization model
|
| 52 |
-
├── utils/ # Utilities
|
| 53 |
-
│ ├── config_utils.py # Configuration handling
|
| 54 |
-
│ ├── feature_utils.py # Feature extraction
|
| 55 |
-
│ ├── model_utils.py # Model loading/saving
|
| 56 |
-
│ └── media_utils.py # Audio/video processing
|
| 57 |
-
└── constants.py # Project constants
|
| 58 |
-
```
|
| 59 |
-
|
| 60 |
-
## Coding Standards
|
| 61 |
-
|
| 62 |
-
### Error Handling
|
| 63 |
-
|
| 64 |
-
- Use custom exceptions for domain-specific errors
|
| 65 |
-
- Always validate inputs at function boundaries
|
| 66 |
-
- Log errors with appropriate levels (ERROR, WARNING, INFO)
|
| 67 |
-
- Provide helpful error messages to users
|
| 68 |
-
|
| 69 |
-
### Type Hints
|
| 70 |
-
|
| 71 |
-
- Add type hints to all function parameters and return values
|
| 72 |
-
- Use `Optional[Type]` for nullable parameters
|
| 73 |
-
- Import types from `typing` module
|
| 74 |
-
|
| 75 |
-
### Documentation
|
| 76 |
-
|
| 77 |
-
- Add docstrings to all public functions and classes
|
| 78 |
-
- Use Google-style docstrings
|
| 79 |
-
- Document parameters, return values, and exceptions
|
| 80 |
-
|
| 81 |
-
### Example Function
|
| 82 |
-
|
| 83 |
-
```python
|
| 84 |
-
def process_video(
|
| 85 |
-
video_path: str,
|
| 86 |
-
max_duration: Optional[float] = None
|
| 87 |
-
) -> Tuple[np.ndarray, float]:
|
| 88 |
-
"""
|
| 89 |
-
Process video file and extract frames.
|
| 90 |
-
|
| 91 |
-
Args:
|
| 92 |
-
video_path: Path to input video file
|
| 93 |
-
max_duration: Maximum duration in seconds (optional)
|
| 94 |
-
|
| 95 |
-
Returns:
|
| 96 |
-
Tuple of (frames array, duration in seconds)
|
| 97 |
-
|
| 98 |
-
Raises:
|
| 99 |
-
FileNotFoundError: If video file doesn't exist
|
| 100 |
-
VideoProcessingError: If video processing fails
|
| 101 |
-
"""
|
| 102 |
-
if not os.path.exists(video_path):
|
| 103 |
-
raise FileNotFoundError(f"Video file not found: {video_path}")
|
| 104 |
-
|
| 105 |
-
# Implementation here...
|
| 106 |
-
```
|
| 107 |
-
|
| 108 |
-
## Testing
|
| 109 |
-
|
| 110 |
-
### Running Tests
|
| 111 |
-
|
| 112 |
-
```bash
|
| 113 |
-
# Run all tests
|
| 114 |
-
python -m pytest
|
| 115 |
-
|
| 116 |
-
# Run specific test file
|
| 117 |
-
python -m pytest tests/test_config_utils.py
|
| 118 |
-
|
| 119 |
-
# Run with coverage
|
| 120 |
-
python -m pytest --cov=hunyuanvideo_foley
|
| 121 |
-
```
|
| 122 |
-
|
| 123 |
-
### Writing Tests
|
| 124 |
-
|
| 125 |
-
- Place tests in `tests/` directory
|
| 126 |
-
- Name test files as `test_*.py`
|
| 127 |
-
- Use descriptive test function names
|
| 128 |
-
- Test edge cases and error conditions
|
| 129 |
-
|
| 130 |
-
## Development Workflow
|
| 131 |
-
|
| 132 |
-
1. **Setup Environment**
|
| 133 |
-
```bash
|
| 134 |
-
python -m venv venv
|
| 135 |
-
source venv/bin/activate # Linux/Mac
|
| 136 |
-
# or
|
| 137 |
-
venv\Scripts\activate # Windows
|
| 138 |
-
|
| 139 |
-
pip install -r requirements.txt
|
| 140 |
-
pip install -e .
|
| 141 |
-
```
|
| 142 |
-
|
| 143 |
-
2. **Install Development Tools**
|
| 144 |
-
```bash
|
| 145 |
-
pre-commit install
|
| 146 |
-
```
|
| 147 |
-
|
| 148 |
-
3. **Make Changes**
|
| 149 |
-
- Follow the coding standards above
|
| 150 |
-
- Add tests for new functionality
|
| 151 |
-
- Update documentation as needed
|
| 152 |
-
|
| 153 |
-
4. **Run Quality Checks**
|
| 154 |
-
```bash
|
| 155 |
-
black --check --line-length 120 .
|
| 156 |
-
isort --check-only --profile black --line-length 120 .
|
| 157 |
-
flake8 --max-line-length 120
|
| 158 |
-
mypy --ignore-missing-imports .
|
| 159 |
-
pytest
|
| 160 |
-
```
|
| 161 |
-
|
| 162 |
-
5. **Commit Changes**
|
| 163 |
-
```bash
|
| 164 |
-
git add .
|
| 165 |
-
git commit -m "feat: add new feature"
|
| 166 |
-
```
|
| 167 |
-
|
| 168 |
-
## Performance Considerations
|
| 169 |
-
|
| 170 |
-
- Use `torch.no_grad()` for inference-only code
|
| 171 |
-
- Leverage GPU when available
|
| 172 |
-
- Implement batch processing where possible
|
| 173 |
-
- Profile code to identify bottlenecks
|
| 174 |
-
|
| 175 |
-
## Dependencies
|
| 176 |
-
|
| 177 |
-
- Keep dependencies minimal and well-maintained
|
| 178 |
-
- Pin versions for reproducibility
|
| 179 |
-
- Separate development dependencies from runtime dependencies
|
| 180 |
-
- Document any special installation requirements
|
| 181 |
-
|
| 182 |
-
## Configuration
|
| 183 |
-
|
| 184 |
-
- Use centralized configuration in `constants.py`
|
| 185 |
-
- Support environment variable overrides
|
| 186 |
-
- Provide sensible defaults for all parameters
|
| 187 |
-
- Validate configuration at startup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/INSTALL.md
DELETED
|
@@ -1,203 +0,0 @@
|
|
| 1 |
-
# 安装指南 - HunyuanVideo-Foley
|
| 2 |
-
|
| 3 |
-
本文档提供了将 HunyuanVideo-Foley 作为 Python 包安装和使用的详细指南。
|
| 4 |
-
|
| 5 |
-
## 安装方式
|
| 6 |
-
|
| 7 |
-
### 方式1:从源码安装(推荐)
|
| 8 |
-
|
| 9 |
-
```bash
|
| 10 |
-
# 克隆仓库
|
| 11 |
-
git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
|
| 12 |
-
cd HunyuanVideo-Foley
|
| 13 |
-
|
| 14 |
-
# 安装包(开发模式)
|
| 15 |
-
pip install -e .
|
| 16 |
-
|
| 17 |
-
# 或安装包含所有可选依赖
|
| 18 |
-
pip install -e .[all]
|
| 19 |
-
```
|
| 20 |
-
|
| 21 |
-
### 方式2:直接从GitHub安装
|
| 22 |
-
|
| 23 |
-
```bash
|
| 24 |
-
pip install git+https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git
|
| 25 |
-
```
|
| 26 |
-
|
| 27 |
-
### 方式3:构建wheel包安装
|
| 28 |
-
|
| 29 |
-
```bash
|
| 30 |
-
# 在项目根目录下
|
| 31 |
-
python setup.py bdist_wheel
|
| 32 |
-
pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl
|
| 33 |
-
```
|
| 34 |
-
|
| 35 |
-
## 特殊依赖安装
|
| 36 |
-
|
| 37 |
-
由于某些依赖不在PyPI上,需要单独安装:
|
| 38 |
-
|
| 39 |
-
```bash
|
| 40 |
-
# 安装audiotools(必需)
|
| 41 |
-
pip install git+https://github.com/descriptinc/audiotools
|
| 42 |
-
|
| 43 |
-
# 安装特定版本的transformers(支持SigLIP2)
|
| 44 |
-
pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
|
| 45 |
-
```
|
| 46 |
-
|
| 47 |
-
## 可选依赖安装
|
| 48 |
-
|
| 49 |
-
```bash
|
| 50 |
-
# 安装开发依赖
|
| 51 |
-
pip install hunyuanvideo-foley[dev]
|
| 52 |
-
|
| 53 |
-
# 安装测试依赖
|
| 54 |
-
pip install hunyuanvideo-foley[test]
|
| 55 |
-
|
| 56 |
-
# 安装Gradio界面依赖
|
| 57 |
-
pip install hunyuanvideo-foley[gradio]
|
| 58 |
-
|
| 59 |
-
# 安装所有可选依赖
|
| 60 |
-
pip install hunyuanvideo-foley[all]
|
| 61 |
-
```
|
| 62 |
-
|
| 63 |
-
## 验证安装
|
| 64 |
-
|
| 65 |
-
```bash
|
| 66 |
-
# 检查包是否正确安装
|
| 67 |
-
python -c "import hunyuanvideo_foley; print(hunyuanvideo_foley.__version__)"
|
| 68 |
-
|
| 69 |
-
# 检查命令行工具
|
| 70 |
-
hunyuanvideo-foley --help
|
| 71 |
-
```
|
| 72 |
-
|
| 73 |
-
## 使用方法
|
| 74 |
-
|
| 75 |
-
### 1. 作为Python包使用
|
| 76 |
-
|
| 77 |
-
```python
|
| 78 |
-
import hunyuanvideo_foley as hvf
|
| 79 |
-
|
| 80 |
-
# 加载模型
|
| 81 |
-
model_dict, cfg = hvf.load_model(
|
| 82 |
-
model_path="path/to/model",
|
| 83 |
-
config_path="configs/hunyuanvideo-foley-xxl.yaml"
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
# 处理特征
|
| 87 |
-
visual_feats, text_feats, audio_len = hvf.feature_process(
|
| 88 |
-
video_path="video.mp4",
|
| 89 |
-
prompt="footsteps on gravel",
|
| 90 |
-
model_dict=model_dict,
|
| 91 |
-
cfg=cfg
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
-
# 生成音频
|
| 95 |
-
audio, sample_rate = hvf.denoise_process(
|
| 96 |
-
visual_feats, text_feats, audio_len,
|
| 97 |
-
model_dict, cfg
|
| 98 |
-
)
|
| 99 |
-
```
|
| 100 |
-
|
| 101 |
-
### 2. 使用命令行工具
|
| 102 |
-
|
| 103 |
-
```bash
|
| 104 |
-
# 单个视频处理
|
| 105 |
-
hunyuanvideo-foley \
|
| 106 |
-
--model_path ./pretrained_models \
|
| 107 |
-
--single_video video.mp4 \
|
| 108 |
-
--single_prompt "footsteps on gravel" \
|
| 109 |
-
--output_dir ./outputs
|
| 110 |
-
|
| 111 |
-
# 批量处理
|
| 112 |
-
hunyuanvideo-foley \
|
| 113 |
-
--model_path ./pretrained_models \
|
| 114 |
-
--csv_path batch_videos.csv \
|
| 115 |
-
--output_dir ./outputs
|
| 116 |
-
|
| 117 |
-
# 启动Gradio界面
|
| 118 |
-
hunyuanvideo-foley --gradio --model_path ./pretrained_models
|
| 119 |
-
```
|
| 120 |
-
|
| 121 |
-
### 3. 使用原始脚本(向后兼容)
|
| 122 |
-
|
| 123 |
-
```bash
|
| 124 |
-
# 使用原始infer.py脚本
|
| 125 |
-
python infer.py --model_path ./pretrained_models --single_video video.mp4 --single_prompt "audio description"
|
| 126 |
-
|
| 127 |
-
# 启动Gradio应用
|
| 128 |
-
export HIFI_FOLEY_MODEL_PATH=./pretrained_models
|
| 129 |
-
python gradio_app.py
|
| 130 |
-
```
|
| 131 |
-
|
| 132 |
-
## 开发环境设置
|
| 133 |
-
|
| 134 |
-
如果你想参与开发:
|
| 135 |
-
|
| 136 |
-
```bash
|
| 137 |
-
# 克隆项目
|
| 138 |
-
git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
|
| 139 |
-
cd HunyuanVideo-Foley
|
| 140 |
-
|
| 141 |
-
# 安装开发版本
|
| 142 |
-
pip install -e .[dev]
|
| 143 |
-
|
| 144 |
-
# 安装pre-commit钩子
|
| 145 |
-
pre-commit install
|
| 146 |
-
|
| 147 |
-
# 运行测试
|
| 148 |
-
python -m pytest
|
| 149 |
-
|
| 150 |
-
# 代码格式化
|
| 151 |
-
black --line-length 120 .
|
| 152 |
-
isort --profile black --line-length 120 .
|
| 153 |
-
|
| 154 |
-
# 类型检查
|
| 155 |
-
mypy --ignore-missing-imports .
|
| 156 |
-
```
|
| 157 |
-
|
| 158 |
-
## 系统要求
|
| 159 |
-
|
| 160 |
-
- **Python**: 3.8+
|
| 161 |
-
- **操作系统**: Linux(主要支持),macOS,Windows
|
| 162 |
-
- **GPU内存**: 推荐 ≥24GB VRAM(如RTX 3090/4090)
|
| 163 |
-
- **CUDA版本**: 12.4 或 11.8(推荐)
|
| 164 |
-
|
| 165 |
-
## 故障排除
|
| 166 |
-
|
| 167 |
-
### 常见问题
|
| 168 |
-
|
| 169 |
-
1. **ImportError: No module named 'audiotools'**
|
| 170 |
-
```bash
|
| 171 |
-
pip install git+https://github.com/descriptinc/audiotools
|
| 172 |
-
```
|
| 173 |
-
|
| 174 |
-
2. **CUDA内存不足**
|
| 175 |
-
- 使用较小的批次大小
|
| 176 |
-
- 确保GPU有足够的VRAM(推荐24GB+)
|
| 177 |
-
|
| 178 |
-
3. **transformers版本问题**
|
| 179 |
-
```bash
|
| 180 |
-
pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2
|
| 181 |
-
```
|
| 182 |
-
|
| 183 |
-
### 获取帮助
|
| 184 |
-
|
| 185 |
-
- 查看项目README: [GitHub](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley)
|
| 186 |
-
- 报告问题: [GitHub Issues](https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley/issues)
|
| 187 |
-
- 论文: [arXiv:2508.16930](https://arxiv.org/abs/2508.16930)
|
| 188 |
-
|
| 189 |
-
## 模型下载
|
| 190 |
-
|
| 191 |
-
```bash
|
| 192 |
-
# 使用HuggingFace Hub
|
| 193 |
-
git clone https://huggingface.co/tencent/HunyuanVideo-Foley
|
| 194 |
-
|
| 195 |
-
# 或使用huggingface-cli
|
| 196 |
-
huggingface-cli download tencent/HunyuanVideo-Foley
|
| 197 |
-
```
|
| 198 |
-
|
| 199 |
-
## 配置文件
|
| 200 |
-
|
| 201 |
-
包安装后,配置文件位于:
|
| 202 |
-
- `hunyuanvideo_foley/configs/` 目录
|
| 203 |
-
- 默认配置:`configs/hunyuanvideo-foley-xxl.yaml`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/LICENSE
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
|
| 2 |
-
Tencent HunyuanVideo-Foley Release Date: August 28, 2025
|
| 3 |
-
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
|
| 4 |
-
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
|
| 5 |
-
1. DEFINITIONS.
|
| 6 |
-
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
|
| 7 |
-
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
|
| 8 |
-
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
|
| 9 |
-
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
|
| 10 |
-
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
|
| 11 |
-
f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
|
| 12 |
-
g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
|
| 13 |
-
h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
|
| 14 |
-
i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
|
| 15 |
-
j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo-Foley released at [https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley].
|
| 16 |
-
k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
|
| 17 |
-
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
|
| 18 |
-
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
|
| 19 |
-
n. “including” shall mean including but not limited to.
|
| 20 |
-
2. GRANT OF RIGHTS.
|
| 21 |
-
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
|
| 22 |
-
3. DISTRIBUTION.
|
| 23 |
-
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
|
| 24 |
-
a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
|
| 25 |
-
b. You must cause any modified files to carry prominent notices stating that You changed the files;
|
| 26 |
-
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
|
| 27 |
-
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
|
| 28 |
-
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
|
| 29 |
-
4. ADDITIONAL COMMERCIAL TERMS.
|
| 30 |
-
If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
|
| 31 |
-
5. RULES OF USE.
|
| 32 |
-
a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
|
| 33 |
-
b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
|
| 34 |
-
c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
|
| 35 |
-
6. INTELLECTUAL PROPERTY.
|
| 36 |
-
a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
|
| 37 |
-
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
|
| 38 |
-
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
|
| 39 |
-
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
|
| 40 |
-
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
|
| 41 |
-
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
|
| 42 |
-
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
|
| 43 |
-
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
|
| 44 |
-
8. SURVIVAL AND TERMINATION.
|
| 45 |
-
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
|
| 46 |
-
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
|
| 47 |
-
9. GOVERNING LAW AND JURISDICTION.
|
| 48 |
-
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
|
| 49 |
-
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
|
| 50 |
-
|
| 51 |
-
EXHIBIT A
|
| 52 |
-
ACCEPTABLE USE POLICY
|
| 53 |
-
|
| 54 |
-
Tencent reserves the right to update this Acceptable Use Policy from time to time.
|
| 55 |
-
Last modified: November 5, 2024
|
| 56 |
-
|
| 57 |
-
Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
|
| 58 |
-
1. Outside the Territory;
|
| 59 |
-
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
|
| 60 |
-
3. To harm Yourself or others;
|
| 61 |
-
4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
|
| 62 |
-
5. To override or circumvent the safety guardrails and safeguards We have put in place;
|
| 63 |
-
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
|
| 64 |
-
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
|
| 65 |
-
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
|
| 66 |
-
9. To intentionally defame, disparage or otherwise harass others;
|
| 67 |
-
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
|
| 68 |
-
11. To generate or disseminate personal identifiable information with the purpose of harming others;
|
| 69 |
-
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
|
| 70 |
-
13. To impersonate another individual without consent, authorization, or legal right;
|
| 71 |
-
14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
|
| 72 |
-
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
|
| 73 |
-
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
|
| 74 |
-
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
|
| 75 |
-
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
|
| 76 |
-
19. For military purposes;
|
| 77 |
-
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/MANIFEST.in
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
# Include package metadata and documentation
|
| 2 |
-
include README.md
|
| 3 |
-
include LICENSE
|
| 4 |
-
include NOTICE
|
| 5 |
-
include DEVELOPMENT.md
|
| 6 |
-
include CLAUDE.md
|
| 7 |
-
include requirements.txt
|
| 8 |
-
include pyproject.toml
|
| 9 |
-
include pytest.ini
|
| 10 |
-
|
| 11 |
-
# Include configuration files
|
| 12 |
-
include configs/*.yaml
|
| 13 |
-
include configs/*.yml
|
| 14 |
-
recursive-include hunyuanvideo_foley/configs *.yaml *.yml
|
| 15 |
-
|
| 16 |
-
# Include test assets if any
|
| 17 |
-
include assets/*.csv
|
| 18 |
-
include assets/*.txt
|
| 19 |
-
recursive-include assets/test_videos *
|
| 20 |
-
|
| 21 |
-
# Include example scripts
|
| 22 |
-
include *.py
|
| 23 |
-
include *.sh
|
| 24 |
-
|
| 25 |
-
# Include test files
|
| 26 |
-
recursive-include tests *.py
|
| 27 |
-
|
| 28 |
-
# Exclude unnecessary files
|
| 29 |
-
global-exclude *.pyc
|
| 30 |
-
global-exclude *.pyo
|
| 31 |
-
global-exclude *~
|
| 32 |
-
global-exclude .DS_Store
|
| 33 |
-
global-exclude __pycache__
|
| 34 |
-
prune .git
|
| 35 |
-
prune .github
|
| 36 |
-
prune examples/*/outputs
|
| 37 |
-
prune **/__pycache__
|
| 38 |
-
prune **/*.pyc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/NOTICE
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
Usage and Legal Notices:
|
| 2 |
-
|
| 3 |
-
Tencent is pleased to support the open source community by making Tencent HunyuanVideo-Foley available.
|
| 4 |
-
|
| 5 |
-
Copyright (C) 2025 Tencent. All rights reserved.
|
| 6 |
-
|
| 7 |
-
Tencent HunyuanVideo-Foley is licensed under TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent HunyuanVideo-Foley does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
|
| 8 |
-
|
| 9 |
-
For avoidance of doubts, Tencent HunyuanVideo-Foley means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
Other dependencies and licenses:
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
Open Source Software Licensed under the MIT License:
|
| 16 |
-
--------------------------------------------------------------------
|
| 17 |
-
1. Synchformer
|
| 18 |
-
Copyright (c) 2024 Vladimir Iashin
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
Terms of the MIT License:
|
| 22 |
-
--------------------------------------------------------------------
|
| 23 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
| 24 |
-
|
| 25 |
-
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
| 26 |
-
|
| 27 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/README.md
DELETED
|
@@ -1,519 +0,0 @@
|
|
| 1 |
-
<div align="center">
|
| 2 |
-
|
| 3 |
-
https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
|
| 4 |
-
|
| 5 |
-
<img src="assets/logo.png" alt="HunyuanVideo-Foley Logo" width="400">
|
| 6 |
-
|
| 7 |
-
<h4>Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation</h4>
|
| 8 |
-
|
| 9 |
-
<p align="center">
|
| 10 |
-
<strong>Professional-grade AI sound effect generation for video content creators</strong>
|
| 11 |
-
</p>
|
| 12 |
-
|
| 13 |
-
<div align="center">
|
| 14 |
-
<a href=https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/Code-black.svg?logo=github height=22px></a>
|
| 15 |
-
<a href=https://szczesnys.github.io/hunyuanvideo-foley target="_blank"><img src=https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
|
| 16 |
-
<a href=https://huggingface.co/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
|
| 17 |
-
<a href=https://huggingface.co/spaces/tencent/HunyuanVideo-Foley target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Demo-276cb4.svg height=22px></a>
|
| 18 |
-
<a href=https://arxiv.org/abs/2508.16930 target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
|
| 19 |
-
<a href=https://x.com/TencentHunyuan target="_blank"><img src=https://img.shields.io/badge/Hunyuan-black.svg?logo=x height=22px></a>
|
| 20 |
-
<a href=https://discord.gg/YEyGGn6Bte target="_blank"><img src=https://img.shields.io/badge/Hunyuan-141984.svg?logo=discord height=22px></a>
|
| 21 |
-
</div>
|
| 22 |
-
|
| 23 |
-
</div>
|
| 24 |
-
|
| 25 |
-
---
|
| 26 |
-
|
| 27 |
-
<div align="center">
|
| 28 |
-
|
| 29 |
-
### 👥 **Authors**
|
| 30 |
-
|
| 31 |
-
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0;">
|
| 32 |
-
|
| 33 |
-
**Sizhe Shan**<sup>1,2*</sup> • **Qiulin Li**<sup>1,3*</sup> • **Yutao Cui**<sup>1</sup> • **Miles Yang**<sup>1</sup> • **Yuehai Wang**<sup>2</sup> • **Qun Yang**<sup>3</sup> • **Jin Zhou**<sup>1†</sup> • **Zhao Zhong**<sup>1</sup>
|
| 34 |
-
|
| 35 |
-
</div>
|
| 36 |
-
|
| 37 |
-
<div style="margin-top: 15px; font-size: 14px; color: #666;">
|
| 38 |
-
|
| 39 |
-
🏢 <sup>1</sup>**Tencent Hunyuan** • 🎓 <sup>2</sup>**Zhejiang University** • ✈️ <sup>3</sup>**Nanjing University of Aeronautics and Astronautics**
|
| 40 |
-
|
| 41 |
-
*Equal contribution • †Project lead
|
| 42 |
-
|
| 43 |
-
</div>
|
| 44 |
-
|
| 45 |
-
</div>
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
---
|
| 49 |
-
|
| 50 |
-
## 🔥🔥🔥 **News**
|
| 51 |
-
|
| 52 |
-
<div style="background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); padding: 20px; border-radius: 15px; margin: 20px 0; border-left: 5px solid #2196f3;">
|
| 53 |
-
|
| 54 |
-
- **[2025.9.29]** 🚀 **HunyuanVideo-Foley-XL Model Release** - Released the XL-sized model with offload inference support, significantly reducing VRAM requirements.
|
| 55 |
-
- **[2025.8.28]** 🌟 **HunyuanVideo-Foley Open Source Release** - Inference code and model weights publicly available.
|
| 56 |
-
|
| 57 |
-
</div>
|
| 58 |
-
|
| 59 |
-
---
|
| 60 |
-
|
| 61 |
-
## 🎥 **Demo & Showcase**
|
| 62 |
-
|
| 63 |
-
<div align="center">
|
| 64 |
-
|
| 65 |
-
> **Experience the magic of AI-generated Foley audio in perfect sync with video content!**
|
| 66 |
-
|
| 67 |
-
<div style="border: 3px solid #4A90E2; border-radius: 15px; padding: 10px; margin: 20px 0; background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);">
|
| 68 |
-
|
| 69 |
-
<video src="https://github.com/user-attachments/assets/d6e1b6fd-6980-4a68-8717-74298d064195" width="80%" controls style="border-radius: 10px; box-shadow: 0 8px 32px rgba(0,0,0,0.1);"> </video>
|
| 70 |
-
|
| 71 |
-
<p><em>🎬 Watch how HunyuanVideo-Foley generates immersive sound effects synchronized with video content</em></p>
|
| 72 |
-
|
| 73 |
-
</div>
|
| 74 |
-
|
| 75 |
-
---
|
| 76 |
-
|
| 77 |
-
## 🤝 **Community Contributions**
|
| 78 |
-
|
| 79 |
-
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0; color: #333;">
|
| 80 |
-
|
| 81 |
-
**ComfyUI Integration** - Thanks to the amazing community for creating ComfyUI nodes:
|
| 82 |
-
|
| 83 |
-
- **[if-ai/ComfyUI_HunyuanVideoFoley](https://github.com/if-ai/ComfyUI_HunyuanVideoFoley)** - ComfyUI workflow integration that supports CPU offloading and FP8 quantization
|
| 84 |
-
- **[phazei/ComfyUI-HunyuanVideo-Foley](https://github.com/phazei/ComfyUI-HunyuanVideo-Foley)** - Alternative ComfyUI node implementation which supports different precision modes
|
| 85 |
-
|
| 86 |
-
</div>
|
| 87 |
-
|
| 88 |
-
<div align="center" style="margin: 20px 0;">
|
| 89 |
-
|
| 90 |
-
**🌟 We encourage and appreciate community contributions that make HunyuanVideo-Foley more accessible!**
|
| 91 |
-
|
| 92 |
-
</div>
|
| 93 |
-
|
| 94 |
-
---
|
| 95 |
-
### ✨ **Key Highlights**
|
| 96 |
-
|
| 97 |
-
<table align="center" style="border: none; margin: 20px 0;">
|
| 98 |
-
<tr>
|
| 99 |
-
<td align="center" width="33%">
|
| 100 |
-
|
| 101 |
-
🎭 **Multi-scenario Sync**
|
| 102 |
-
High-quality audio synchronized with complex video scenes
|
| 103 |
-
|
| 104 |
-
</td>
|
| 105 |
-
<td align="center" width="33%">
|
| 106 |
-
|
| 107 |
-
🧠 **Multi-modal Balance**
|
| 108 |
-
Perfect harmony between visual and textual information
|
| 109 |
-
|
| 110 |
-
</td>
|
| 111 |
-
<td align="center" width="33%">
|
| 112 |
-
|
| 113 |
-
🎵 **48kHz Hi-Fi Output**
|
| 114 |
-
Professional-grade audio generation with crystal clarity
|
| 115 |
-
|
| 116 |
-
</td>
|
| 117 |
-
</tr>
|
| 118 |
-
</table>
|
| 119 |
-
|
| 120 |
-
</div>
|
| 121 |
-
|
| 122 |
-
---
|
| 123 |
-
|
| 124 |
-
## 📄 **Abstract**
|
| 125 |
-
|
| 126 |
-
<div align="center" style="background: linear-gradient(135deg, #ffeef8 0%, #f0f8ff 100%); padding: 30px; border-radius: 20px; margin: 20px 0; border-left: 5px solid #ff6b9d; color: #333;">
|
| 127 |
-
|
| 128 |
-
**🚀 Tencent Hunyuan** open-sources **HunyuanVideo-Foley**, an end-to-end video sound effect generation model!
|
| 129 |
-
|
| 130 |
-
*A professional-grade AI tool specifically designed for video content creators, widely applicable to diverse scenarios including short video creation, film production, advertising creativity, and game development.*
|
| 131 |
-
|
| 132 |
-
</div>
|
| 133 |
-
|
| 134 |
-
### 🎯 **Core Highlights**
|
| 135 |
-
|
| 136 |
-
<div style="display: grid; grid-template-columns: 1fr; gap: 15px; margin: 20px 0;">
|
| 137 |
-
|
| 138 |
-
<div style="border-left: 4px solid #4CAF50; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
|
| 139 |
-
|
| 140 |
-
**🎬 Multi-scenario Audio-Visual Synchronization**
|
| 141 |
-
Supports generating high-quality audio that is synchronized and semantically aligned with complex video scenes, enhancing realism and immersive experience for film/TV and gaming applications.
|
| 142 |
-
|
| 143 |
-
</div>
|
| 144 |
-
|
| 145 |
-
<div style="border-left: 4px solid #2196F3; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
|
| 146 |
-
|
| 147 |
-
**⚖️ Multi-modal Semantic Balance**
|
| 148 |
-
Intelligently balances visual and textual information analysis, comprehensively orchestrates sound effect elements, avoids one-sided generation, and meets personalized dubbing requirements.
|
| 149 |
-
|
| 150 |
-
</div>
|
| 151 |
-
|
| 152 |
-
<div style="border-left: 4px solid #FF9800; padding: 15px; background: #f8f9fa; border-radius: 8px; color: #333;">
|
| 153 |
-
|
| 154 |
-
**🎵 High-fidelity Audio Output**
|
| 155 |
-
Self-developed 48kHz audio VAE perfectly reconstructs sound effects, music, and vocals, achieving professional-grade audio generation quality.
|
| 156 |
-
|
| 157 |
-
</div>
|
| 158 |
-
|
| 159 |
-
</div>
|
| 160 |
-
|
| 161 |
-
<div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
|
| 162 |
-
|
| 163 |
-
**🏆 SOTA Performance Achieved**
|
| 164 |
-
|
| 165 |
-
*HunyuanVideo-Foley comprehensively leads the field across multiple evaluation benchmarks, achieving new state-of-the-art levels in audio fidelity, visual-semantic alignment, temporal alignment, and distribution matching - surpassing all open-source solutions!*
|
| 166 |
-
|
| 167 |
-
</div>
|
| 168 |
-
|
| 169 |
-
<div align="center">
|
| 170 |
-
|
| 171 |
-

|
| 172 |
-
*📊 Performance comparison across different evaluation metrics - HunyuanVideo-Foley leads in all categories*
|
| 173 |
-
|
| 174 |
-
</div>
|
| 175 |
-
|
| 176 |
-
---
|
| 177 |
-
|
| 178 |
-
## 🔧 **Technical Architecture**
|
| 179 |
-
|
| 180 |
-
### 📊 **Data Pipeline Design**
|
| 181 |
-
|
| 182 |
-
<div align="center" style="margin: 20px 0; color: #333;">
|
| 183 |
-
|
| 184 |
-

|
| 185 |
-
*🔄 Comprehensive data processing pipeline for high-quality text-video-audio datasets*
|
| 186 |
-
|
| 187 |
-
</div>
|
| 188 |
-
|
| 189 |
-
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #17a2b8; margin: 20px 0;">
|
| 190 |
-
|
| 191 |
-
The **TV2A (Text-Video-to-Audio)** task presents a complex multimodal generation challenge requiring large-scale, high-quality datasets. Our comprehensive data pipeline systematically identifies and excludes unsuitable content to produce robust and generalizable audio generation capabilities.
|
| 192 |
-
|
| 193 |
-
</div>
|
| 194 |
-
|
| 195 |
-
### 🏗️ **Model Architecture**
|
| 196 |
-
|
| 197 |
-
<div align="center" style="margin: 20px 0; color: #333;">
|
| 198 |
-
|
| 199 |
-

|
| 200 |
-
*🧠 HunyuanVideo-Foley hybrid architecture with multimodal and unimodal transformer blocks*
|
| 201 |
-
|
| 202 |
-
</div>
|
| 203 |
-
|
| 204 |
-
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #28a745; margin: 20px 0;">
|
| 205 |
-
|
| 206 |
-
**HunyuanVideo-Foley** employs a sophisticated hybrid architecture:
|
| 207 |
-
|
| 208 |
-
- **🔄 Multimodal Transformer Blocks**: Process visual-audio streams simultaneously
|
| 209 |
-
- **🎵 Unimodal Transformer Blocks**: Focus on audio stream refinement
|
| 210 |
-
- **👁️ Visual Encoding**: Pre-trained encoder extracts visual features from video frames
|
| 211 |
-
- **📝 Text Processing**: Semantic features extracted via pre-trained text encoder
|
| 212 |
-
- **🎧 Audio Encoding**: Latent representations with Gaussian noise perturbation
|
| 213 |
-
- **⏰ Temporal Alignment**: Synchformer-based frame-level synchronization with gated modulation
|
| 214 |
-
|
| 215 |
-
</div>
|
| 216 |
-
|
| 217 |
-
---
|
| 218 |
-
|
| 219 |
-
## 📈 **Performance Benchmarks**
|
| 220 |
-
|
| 221 |
-
### 🎬 **MovieGen-Audio-Bench Results**
|
| 222 |
-
|
| 223 |
-
<div align="center">
|
| 224 |
-
|
| 225 |
-
> *Objective and subjective evaluation results demonstrating superior performance across most metrics*
|
| 226 |
-
|
| 227 |
-
</div>
|
| 228 |
-
|
| 229 |
-
<div style="overflow-x: auto; margin: 20px 0;">
|
| 230 |
-
|
| 231 |
-
| 🏆 **Method** | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ | **MOS-Q** ↑ | **MOS-S** ↑ | **MOS-T** ↑ |
|
| 232 |
-
|:-------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|:------------:|:------------:|:------------:|
|
| 233 |
-
| FoleyGrafter | 6.27 | 2.72 | 3.34 | 5.68 | 0.17 | 1.29 | 0.14 | 3.36±0.78 | 3.54±0.88 | 3.46±0.95 |
|
| 234 |
-
| V-AURA | 5.82 | 4.30 | 3.63 | 5.11 | 0.23 | 1.38 | 0.14 | 2.55±0.97 | 2.60±1.20 | 2.70±1.37 |
|
| 235 |
-
| Frieren | 5.71 | 2.81 | 3.47 | 5.31 | 0.18 | 1.39 | 0.16 | 2.92±0.95 | 2.76±1.20 | 2.94±1.26 |
|
| 236 |
-
| MMAudio | 6.17 | 2.84 | 3.59 | 5.62 | 0.27 | 0.80 | 0.35 | 3.58±0.84 | 3.63±1.00 | 3.47±1.03 |
|
| 237 |
-
| ThinkSound | 6.04 | 3.73 | 3.81 | 5.59 | 0.18 | 0.91 | 0.20 | 3.20±0.97 | 3.01±1.04 | 3.02±1.08 |
|
| 238 |
-
| **HunyuanVideo-Foley (ours)** | **6.59** | **2.74** | **3.88** | **6.13** | **0.35** | **0.74** | **0.33** | **4.14±0.68** | **4.12±0.77** | **4.15±0.75** |
|
| 239 |
-
|
| 240 |
-
</div>
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
### 🎯 **Kling-Audio-Eval Results**
|
| 244 |
-
|
| 245 |
-
<div align="center">
|
| 246 |
-
|
| 247 |
-
> *Comprehensive objective evaluation showcasing state-of-the-art performance*
|
| 248 |
-
|
| 249 |
-
</div>
|
| 250 |
-
|
| 251 |
-
<div style="overflow-x: auto; margin: 20px 0;">
|
| 252 |
-
|
| 253 |
-
| 🏆 **Method** | **FD_PANNs** ↓ | **FD_PASST** ↓ | **KL** ↓ | **IS** ↑ | **PQ** ↑ | **PC** ↓ | **CE** ↑ | **CU** ↑ | **IB** ↑ | **DeSync** ↓ | **CLAP** ↑ |
|
| 254 |
-
|:-------------:|:--------------:|:--------------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:--------:|:-------------:|:-----------:|
|
| 255 |
-
| FoleyGrafter | 22.30 | 322.63 | 2.47 | 7.08 | 6.05 | 2.91 | 3.28 | 5.44 | 0.22 | 1.23 | 0.22 |
|
| 256 |
-
| V-AURA | 33.15 | 474.56 | 3.24 | 5.80 | 5.69 | 3.98 | 3.13 | 4.83 | 0.25 | 0.86 | 0.13 |
|
| 257 |
-
| Frieren | 16.86 | 293.57 | 2.95 | 7.32 | 5.72 | 2.55 | 2.88 | 5.10 | 0.21 | 0.86 | 0.16 |
|
| 258 |
-
| MMAudio | 9.01 | 205.85 | 2.17 | 9.59 | 5.94 | 2.91 | 3.30 | 5.39 | 0.30 | 0.56 | 0.27 |
|
| 259 |
-
| ThinkSound | 9.92 | 228.68 | 2.39 | 6.86 | 5.78 | 3.23 | 3.12 | 5.11 | 0.22 | 0.67 | 0.22 |
|
| 260 |
-
| **HunyuanVideo-Foley (ours)** | **6.07** | **202.12** | **1.89** | **8.30** | **6.12** | **2.76** | **3.22** | **5.53** | **0.38** | **0.54** | **0.24** |
|
| 261 |
-
|
| 262 |
-
</div>
|
| 263 |
-
|
| 264 |
-
<div align="center" style="background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%); color: white; padding: 15px; border-radius: 10px; margin: 20px 0; color: #333;">
|
| 265 |
-
|
| 266 |
-
**🎉 Outstanding Results!** HunyuanVideo-Foley achieves the best scores on **most** evaluation metrics across both benchmarks, demonstrating significant improvements in audio quality, synchronization, and semantic alignment.
|
| 267 |
-
|
| 268 |
-
</div>
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
---
|
| 273 |
-
|
| 274 |
-
## 🚀 **Quick Start**
|
| 275 |
-
|
| 276 |
-
### 📦 **Installation**
|
| 277 |
-
|
| 278 |
-
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
|
| 279 |
-
|
| 280 |
-
**🔧 System Requirements**
|
| 281 |
-
- **CUDA**: 12.4 or 11.8 recommended
|
| 282 |
-
- **Python**: 3.8+
|
| 283 |
-
- **OS**: Linux (primary support)
|
| 284 |
-
- **VRAM**: 20GB for XXL model (or 12GB with `--enable_offload`), 16GB for XL model (or 8GB with `--enable_offload`)
|
| 285 |
-
|
| 286 |
-
</div>
|
| 287 |
-
|
| 288 |
-
#### **Step 1: Clone Repository**
|
| 289 |
-
|
| 290 |
-
```bash
|
| 291 |
-
# 📥 Clone the repository
|
| 292 |
-
git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley
|
| 293 |
-
cd HunyuanVideo-Foley
|
| 294 |
-
```
|
| 295 |
-
|
| 296 |
-
#### **Step 2: Environment Setup**
|
| 297 |
-
|
| 298 |
-
<div style="background: #fff3cd; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0; color: #333;">
|
| 299 |
-
|
| 300 |
-
💡 **Tip**: We recommend using [Conda](https://docs.anaconda.com/free/miniconda/index.html) for Python environment management.
|
| 301 |
-
|
| 302 |
-
</div>
|
| 303 |
-
|
| 304 |
-
```bash
|
| 305 |
-
# 🔧 Install dependencies
|
| 306 |
-
pip install -r requirements.txt
|
| 307 |
-
```
|
| 308 |
-
|
| 309 |
-
#### **Step 3: Download Pretrained Models**
|
| 310 |
-
|
| 311 |
-
<div style="background: #d1ecf1; padding: 15px; border-radius: 8px; border-left: 4px solid #17a2b8; margin: 10px 0;color: #333;">
|
| 312 |
-
|
| 313 |
-
🔗 **Download Model weights from Huggingface**
|
| 314 |
-
```bash
|
| 315 |
-
# using git-lfs
|
| 316 |
-
git clone https://huggingface.co/tencent/HunyuanVideo-Foley
|
| 317 |
-
|
| 318 |
-
# using huggingface-cli
|
| 319 |
-
huggingface-cli download tencent/HunyuanVideo-Foley
|
| 320 |
-
```
|
| 321 |
-
|
| 322 |
-
<!-- 🔗 **Download Model weights from ModelScope** -->
|
| 323 |
-
<!-- ```bash -->
|
| 324 |
-
<!-- # using git-lfs -->
|
| 325 |
-
<!-- git clone https://huggingface.co/tencent/HunyuanVideo-Foley -->
|
| 326 |
-
<!-- -->
|
| 327 |
-
<!-- # using huggingface-cli -->
|
| 328 |
-
<!-- huggingface-cli download tencent/HunyuanVideo-Foley -->
|
| 329 |
-
<!-- ``` -->
|
| 330 |
-
|
| 331 |
-
</div>
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
---
|
| 335 |
-
|
| 336 |
-
## 💻 **Usage**
|
| 337 |
-
|
| 338 |
-
### 📊 **Model Specifications**
|
| 339 |
-
|
| 340 |
-
| Model | Checkpoint | VRAM (Normal) | VRAM (Offload) |
|
| 341 |
-
|-------|------------|---------------|----------------|
|
| 342 |
-
| **XXL** *(Default)* | `hunyuanvideo_foley.pth` | 20GB | 12GB |
|
| 343 |
-
| **XL** | `hunyuanvideo_foley_xl.pth` | 16GB | 8GB |
|
| 344 |
-
|
| 345 |
-
### 🎬 **Single Video Generation**
|
| 346 |
-
|
| 347 |
-
<div style="background: #e8f5e8; padding: 15px; border-radius: 8px; border-left: 4px solid #28a745; margin: 10px 0;color: #333;">
|
| 348 |
-
|
| 349 |
-
Generate Foley audio for a single video file with text description:
|
| 350 |
-
|
| 351 |
-
</div>
|
| 352 |
-
|
| 353 |
-
```bash
|
| 354 |
-
# Use XXL model (default, best quality)
|
| 355 |
-
python3 infer.py \
|
| 356 |
-
--model_path PRETRAINED_MODEL_PATH_DIR \
|
| 357 |
-
--single_video video_path \
|
| 358 |
-
--single_prompt "audio description" \
|
| 359 |
-
--output_dir OUTPUT_DIR \
|
| 360 |
-
# --enable_offload
|
| 361 |
-
|
| 362 |
-
# Use XL model (memory-friendly)
|
| 363 |
-
python3 infer.py \
|
| 364 |
-
--model_path PRETRAINED_MODEL_PATH_DIR \
|
| 365 |
-
--model_size xl \
|
| 366 |
-
--single_video video_path \
|
| 367 |
-
--single_prompt "audio description" \
|
| 368 |
-
--output_dir OUTPUT_DIR \
|
| 369 |
-
# --enable_offload
|
| 370 |
-
```
|
| 371 |
-
|
| 372 |
-
### 📂 **Batch Processing**
|
| 373 |
-
|
| 374 |
-
<div style="background: #fff3e0; padding: 15px; border-radius: 8px; border-left: 4px solid #ff9800; margin: 10px 0;color: #333;">
|
| 375 |
-
|
| 376 |
-
Process multiple videos using a CSV file with video paths and descriptions:
|
| 377 |
-
|
| 378 |
-
</div>
|
| 379 |
-
|
| 380 |
-
```bash
|
| 381 |
-
# Download sample test videos
|
| 382 |
-
bash ./download_test_videos.sh
|
| 383 |
-
|
| 384 |
-
# Batch processing
|
| 385 |
-
python3 infer.py \
|
| 386 |
-
--model_path PRETRAINED_MODEL_PATH_DIR \
|
| 387 |
-
--csv_path assets/test.csv \
|
| 388 |
-
--output_dir OUTPUT_DIR \
|
| 389 |
-
# --enable_offload
|
| 390 |
-
```
|
| 391 |
-
|
| 392 |
-
### 🌐 **Interactive Web Interface**
|
| 393 |
-
|
| 394 |
-
<div style="background: #f3e5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #9c27b0; margin: 10px 0;color: #333;">
|
| 395 |
-
|
| 396 |
-
Launch a user-friendly Gradio web interface for easy interaction:
|
| 397 |
-
|
| 398 |
-
</div>
|
| 399 |
-
|
| 400 |
-
```bash
|
| 401 |
-
# Launch with XXL model (default)
|
| 402 |
-
export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
|
| 403 |
-
python3 gradio_app.py
|
| 404 |
-
|
| 405 |
-
# Launch with XL model (memory-friendly)
|
| 406 |
-
export HIFI_FOLEY_MODEL_PATH=PRETRAINED_MODEL_PATH_DIR
|
| 407 |
-
MODEL_SIZE=xl python3 gradio_app.py
|
| 408 |
-
|
| 409 |
-
# Optional: Enable offload to reduce memory usage
|
| 410 |
-
ENABLE_OFFLOAD=true python3 gradio_app.py
|
| 411 |
-
```
|
| 412 |
-
|
| 413 |
-
<div align="center" style="margin: 20px 0; color: #333;">
|
| 414 |
-
|
| 415 |
-
*🚀 Then open your browser and navigate to the provided local URL to start generating Foley audio!*
|
| 416 |
-
|
| 417 |
-
</div>
|
| 418 |
-
|
| 419 |
-
---
|
| 420 |
-
|
| 421 |
-
## 📚 **Citation**
|
| 422 |
-
|
| 423 |
-
<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #6c757d; margin: 20px 0; color: #333;">
|
| 424 |
-
|
| 425 |
-
If you find **HunyuanVideo-Foley** useful for your research, please consider citing our paper:
|
| 426 |
-
|
| 427 |
-
</div>
|
| 428 |
-
|
| 429 |
-
```bibtex
|
| 430 |
-
@misc{shan2025hunyuanvideofoleymultimodaldiffusionrepresentation,
|
| 431 |
-
title={HunyuanVideo-Foley: Multimodal Diffusion with Representation Alignment for High-Fidelity Foley Audio Generation},
|
| 432 |
-
author={Sizhe Shan and Qiulin Li and Yutao Cui and Miles Yang and Yuehai Wang and Qun Yang and Jin Zhou and Zhao Zhong},
|
| 433 |
-
year={2025},
|
| 434 |
-
eprint={2508.16930},
|
| 435 |
-
archivePrefix={arXiv},
|
| 436 |
-
primaryClass={eess.AS},
|
| 437 |
-
url={https://arxiv.org/abs/2508.16930},
|
| 438 |
-
}
|
| 439 |
-
```
|
| 440 |
-
## Star History
|
| 441 |
-
|
| 442 |
-
[](https://www.star-history.com/#Tencent-Hunyuan/HunyuanVideo-Foley&Date)
|
| 443 |
-
---
|
| 444 |
-
|
| 445 |
-
## 🙏 **Acknowledgements**
|
| 446 |
-
|
| 447 |
-
<div align="center">
|
| 448 |
-
|
| 449 |
-
**We extend our heartfelt gratitude to the open-source community!**
|
| 450 |
-
|
| 451 |
-
</div>
|
| 452 |
-
|
| 453 |
-
<table align="center" style="width: 100%; border: none; margin: 20px 0;">
|
| 454 |
-
<tr>
|
| 455 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 456 |
-
|
| 457 |
-
🎨 **[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)**
|
| 458 |
-
*Foundation diffusion models*
|
| 459 |
-
|
| 460 |
-
</td>
|
| 461 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 462 |
-
|
| 463 |
-
⚡ **[FLUX](https://github.com/black-forest-labs/flux)**
|
| 464 |
-
*Advanced generation techniques*
|
| 465 |
-
|
| 466 |
-
</td>
|
| 467 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 468 |
-
|
| 469 |
-
🎵 **[MMAudio](https://github.com/hkchengrex/MMAudio)**
|
| 470 |
-
*Multimodal audio generation*
|
| 471 |
-
|
| 472 |
-
</td>
|
| 473 |
-
</tr>
|
| 474 |
-
<tr>
|
| 475 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 476 |
-
|
| 477 |
-
🤗 **[HuggingFace](https://huggingface.co)**
|
| 478 |
-
*Platform & diffusers library*
|
| 479 |
-
|
| 480 |
-
</td>
|
| 481 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 482 |
-
|
| 483 |
-
🗜️ **[DAC](https://github.com/descriptinc/descript-audio-codec)**
|
| 484 |
-
*High-Fidelity Audio Compression*
|
| 485 |
-
|
| 486 |
-
</td>
|
| 487 |
-
<td align="center" style="width: 33%; padding: 10px; vertical-align: top;">
|
| 488 |
-
|
| 489 |
-
🔗 **[Synchformer](https://github.com/v-iashin/Synchformer)**
|
| 490 |
-
*Audio-Visual Synchronization*
|
| 491 |
-
|
| 492 |
-
</td>
|
| 493 |
-
</tr>
|
| 494 |
-
</table>
|
| 495 |
-
|
| 496 |
-
<div align="center" style="background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px 0; color: #333;">
|
| 497 |
-
|
| 498 |
-
**🌟 Special thanks to all researchers and developers who contribute to the advancement of AI-generated audio and multimodal learning!**
|
| 499 |
-
|
| 500 |
-
</div>
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
---
|
| 504 |
-
|
| 505 |
-
<div align="center" style="margin: 30px 0;">
|
| 506 |
-
|
| 507 |
-
### 🔗 **Connect with Us**
|
| 508 |
-
|
| 509 |
-
[](https://github.com/Tencent-Hunyuan)
|
| 510 |
-
[](https://twitter.com/Tencent)
|
| 511 |
-
[](https://hunyuan.tencent.com/)
|
| 512 |
-
|
| 513 |
-
<p style="color: #666; margin-top: 15px; font-size: 14px;">
|
| 514 |
-
|
| 515 |
-
© 2025 Tencent Hunyuan. All rights reserved. | Made with ❤️ for the AI community
|
| 516 |
-
|
| 517 |
-
</p>
|
| 518 |
-
|
| 519 |
-
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/build_package.sh
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
# Build script for the HunyuanVideo-Foley Python package.

set -e  # Exit as soon as any command fails

echo "🚀 开始构建 HunyuanVideo-Foley Python 包..."

# Remove artifacts left over from previous builds
echo "🧹 清理之前的构建文件..."
rm -rf build/ dist/ *.egg-info/

# Make sure the required build tools are importable
echo "🔍 检查构建工具..."
python -c "import setuptools, wheel; print('✅ setuptools和wheel已安装')" || {
    echo "❌ 请安装构建工具: pip install setuptools wheel"
    exit 1
}

# Validate setup.py; problems are reported but do not stop the build
echo "🔍 验证setup.py配置..."
python setup.py check --restructuredtext --strict || {
    echo "⚠️ setup.py验证有警告,但继续构建..."
}

# Build the source distribution
echo "📦 构建源码分发包..."
python setup.py sdist

# Build the wheel
echo "🎡 构建wheel包..."
python setup.py bdist_wheel

# Show the build results
echo "✅ 构建完成!生成的包:"
ls -la dist/

# Verify the generated wheels.
# `pip check` only inspects the dependencies of already-installed packages and
# never looks at wheel files, so test the integrity of each archive instead.
# Failures stay best-effort: they print a warning and do not abort the script.
echo "🔍 验证生成的包..."
for wheel_file in dist/*.whl; do
    python -c "import sys, zipfile; sys.exit(1 if zipfile.ZipFile(sys.argv[1]).testzip() else 0)" "$wheel_file" \
        || echo "⚠️ 包验证有警告"
done

# Print installation instructions
echo ""
echo "📝 安装说明:"
echo "# 从wheel文件安装:"
echo "pip install dist/hunyuanvideo_foley-1.0.0-py3-none-any.whl"
echo ""
echo "# 开发模式安装:"
echo "pip install -e ."
echo ""
echo "# 安装所有可选依赖:"
echo "pip install -e .[all]"
echo ""

# Dependencies that have to be installed separately
echo "⚠️ 注意:某些依赖需要单独安装:"
echo "pip install git+https://github.com/descriptinc/audiotools"
echo "pip install git+https://github.com/huggingface/transformers@v4.49.0-SigLIP-2"

echo ""
echo "🎉 构建完成!查看 INSTALL.md 获取详细安装指南。"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/download_test_videos.sh
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
# Download the sample test videos and the Gradio example videos.

set -e  # Stop at the first failed step instead of extracting a bad archive

# Download the 10 MovieGenAudioBenchSfx videos and unpack them into ./assets.
# -f makes curl fail on HTTP errors rather than saving the error page.
curl -fLO https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/MovieGenAudioBenchSfx.tar.gz
mkdir -p ./assets  # tar -C requires the target directory to exist
tar -xzvf MovieGenAudioBenchSfx.tar.gz -C ./assets
rm MovieGenAudioBenchSfx.tar.gz

# Download the Gradio example videos and unpack them in the current directory
curl -fLO https://texttoaudio-train-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuanvideo-foley_demo/examples.tar.gz
tar -xvzf examples.tar.gz
rm examples.tar.gz
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/gradio_app.py
DELETED
|
@@ -1,834 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import tempfile
|
| 3 |
-
import gradio as gr
|
| 4 |
-
import torch
|
| 5 |
-
import torchaudio
|
| 6 |
-
from loguru import logger
|
| 7 |
-
from typing import Optional, Tuple
|
| 8 |
-
import random
|
| 9 |
-
import numpy as np
|
| 10 |
-
|
| 11 |
-
from hunyuanvideo_foley.utils.model_utils import load_model
|
| 12 |
-
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
| 13 |
-
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 14 |
-
from hunyuanvideo_foley.utils.media_utils import merge_audio_video
|
| 15 |
-
|
| 16 |
-
# Shared model state; all three stay None until auto_load_models() fills them.
model_dict = None
cfg = None
device = None

# Runtime configuration, read once from environment variables at import time.
MODEL_PATH = os.getenv("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
ENABLE_OFFLOAD = os.getenv("ENABLE_OFFLOAD", "false").lower() in {"true", "1", "yes"}
MODEL_SIZE = os.getenv("MODEL_SIZE", "xxl")  # xxl is the default model size
CONFIG_PATH = os.getenv("CONFIG_PATH", "")
|
| 26 |
-
|
| 27 |
-
def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
    """Resolve a device request into a ``torch.device`` and log the choice.

    Args:
        device_str: ``"auto"`` to probe CUDA, then MPS, then fall back to CPU;
            ``"cuda"`` to use the GPU numbered ``gpu_id``; any other value is
            handed to ``torch.device`` unchanged.
        gpu_id: CUDA device index used whenever a CUDA device is selected.

    Returns:
        The selected ``torch.device``.
    """
    # Explicit request: no availability probing is done on this path.
    if device_str != "auto":
        target = f"cuda:{gpu_id}" if device_str == "cuda" else device_str
        chosen = torch.device(target)
        logger.info(f"Using specified device: {chosen}")
        return chosen

    # Automatic selection, in order of preference: CUDA, MPS, CPU.
    if torch.cuda.is_available():
        chosen = torch.device(f"cuda:{gpu_id}")
        logger.info(f"Using CUDA device: {chosen}")
    elif torch.backends.mps.is_available():
        chosen = torch.device("mps")
        logger.info("Using MPS device")
    else:
        chosen = torch.device("cpu")
        logger.info("Using CPU device")
    return chosen
|
| 47 |
-
|
| 48 |
-
def auto_load_models() -> str:
    """Load the configured model into the module-level globals.

    Reads MODEL_PATH, CONFIG_PATH, MODEL_SIZE and ENABLE_OFFLOAD, then stores
    the result of ``load_model`` in ``model_dict`` and ``cfg`` and the selected
    device in ``device``.

    Returns:
        A status message: success, or a description of the failure when the
        model directory does not exist or loading raises.
    """
    global model_dict, cfg, device

    try:
        if not os.path.exists(MODEL_PATH):
            return f"❌ Model directory not found: {MODEL_PATH}"

        # Let setup_device pick the device automatically, with GPU index 0
        device = setup_device("auto", 0)

        # An explicit CONFIG_PATH wins; otherwise derive the config from
        # MODEL_SIZE, falling back to the xxl config for unknown sizes
        fallback_config = "configs/hunyuanvideo-foley-xxl.yaml"
        size_to_config = {
            "xl": "configs/hunyuanvideo-foley-xl.yaml",
            "xxl": fallback_config,
        }
        config_path = CONFIG_PATH or size_to_config.get(MODEL_SIZE, fallback_config)

        offload_state = "enabled" if ENABLE_OFFLOAD else "disabled"
        startup_lines = (
            "Auto-loading model...",
            f"Model path: {MODEL_PATH}",
            f"Model size: {MODEL_SIZE}",
            f"Config path: {config_path}",
            f"Offload mode: {offload_state}",
        )
        for line in startup_lines:
            logger.info(line)

        model_dict, cfg = load_model(
            MODEL_PATH,
            config_path,
            device,
            enable_offload=ENABLE_OFFLOAD,
            model_size=MODEL_SIZE,
        )

        success_msg = "✅ Model loaded successfully!"
        logger.info(success_msg)
        return success_msg

    except Exception as e:
        # Report the failure as a status string instead of raising
        failure_msg = f"Model loading failed: {str(e)}"
        logger.error(failure_msg)
        return f"❌ {failure_msg}"
|
| 83 |
-
|
| 84 |
-
def infer_single_video(
    video_file,
    text_prompt: str,
    neg_prompt: str = None,
    guidance_scale: float = 4.5,
    num_inference_steps: int = 50,
    sample_nums: int = 1
) -> Tuple[list, str]:
    """Generate audio for one video and merge each sample back into the video.

    Args:
        video_file: Video to process; ``None`` is rejected with an error message.
        text_prompt: Audio description; ``None`` is treated as an empty prompt
            and surrounding whitespace is stripped.
        neg_prompt: Forwarded to ``feature_process`` as ``neg_prompt``.
        guidance_scale: Forwarded to ``denoise_process``.
        num_inference_steps: Forwarded to ``denoise_process``.
        sample_nums: Number of audio samples to generate (passed as the
            ``batch_size`` of ``denoise_process``).

    Returns:
        A ``(video_paths, status_message)`` pair. ``video_paths`` is empty when
        the model is not loaded, no video was given, or inference raised.
    """
    if model_dict is None or cfg is None:
        return [], "❌ Please load the model first!"

    if video_file is None:
        return [], "❌ Please upload a video file!"

    # An absent prompt is allowed and becomes the empty string
    text_prompt = ("" if text_prompt is None else text_prompt).strip()

    try:
        logger.info(f"Processing video: {video_file}")
        logger.info(f"Text prompt: {text_prompt}")

        # Extract the conditioning features and the target audio length
        visual_feats, text_feats, audio_len_in_s = feature_process(
            video_file, text_prompt, model_dict, cfg, neg_prompt=neg_prompt
        )

        # One denoising pass produces sample_nums audio samples; the result is
        # indexed per sample below
        logger.info(f"Generating {sample_nums} audio samples...")
        audio, sample_rate = denoise_process(
            visual_feats,
            text_feats,
            audio_len_in_s,
            model_dict,
            cfg,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            batch_size=sample_nums,
        )

        # Results are written to a fresh temporary directory, which is not
        # removed here because the returned paths point into it
        out_dir = tempfile.mkdtemp()
        video_outputs = []

        for sample_no in range(1, sample_nums + 1):
            # Save this sample's audio track
            wav_path = os.path.join(out_dir, f"generated_audio_{sample_no}.wav")
            torchaudio.save(wav_path, audio[sample_no - 1], sample_rate)

            # Merge the audio into the input video
            merged_path = os.path.join(out_dir, f"video_with_audio_{sample_no}.mp4")
            merge_audio_video(wav_path, video_file, merged_path)
            video_outputs.append(merged_path)

        logger.info(f"Inference completed! Generated {sample_nums} samples.")
        return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"

    except Exception as e:
        # Report the failure as a status string instead of raising
        failure_msg = f"Inference failed: {str(e)}"
        logger.error(failure_msg)
        return [], f"❌ {failure_msg}"
|
| 155 |
-
|
| 156 |
-
def update_video_outputs(video_list, status_msg):
    """Spread generated video paths over six output slots plus a status message.

    Args:
        video_list: Sliceable sequence of video paths; only the first six are used.
        status_msg: Status message placed after the six video slots.

    Returns:
        A 7-tuple: six entries holding the video paths in order (``None`` for
        unused slots) followed by ``status_msg``.
    """
    shown = list(video_list[:6])  # at most 6 samples are displayed
    unused = [None] * (6 - len(shown))
    return (*shown, *unused, status_msg)
|
| 167 |
-
|
| 168 |
-
def create_gradio_interface():
|
| 169 |
-
"""Create Gradio interface"""
|
| 170 |
-
|
| 171 |
-
# Custom CSS for beautiful interface with better contrast
|
| 172 |
-
css = """
|
| 173 |
-
.gradio-container {
|
| 174 |
-
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
| 175 |
-
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
|
| 176 |
-
min-height: 100vh;
|
| 177 |
-
}
|
| 178 |
-
|
| 179 |
-
.main-header {
|
| 180 |
-
text-align: center;
|
| 181 |
-
padding: 2rem 0;
|
| 182 |
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 183 |
-
border-radius: 20px;
|
| 184 |
-
margin-bottom: 2rem;
|
| 185 |
-
box-shadow: 0 8px 32px rgba(0,0,0,0.15);
|
| 186 |
-
}
|
| 187 |
-
|
| 188 |
-
.main-header h1 {
|
| 189 |
-
color: white;
|
| 190 |
-
font-size: 3rem;
|
| 191 |
-
font-weight: 700;
|
| 192 |
-
margin-bottom: 0.5rem;
|
| 193 |
-
text-shadow: 0 2px 10px rgba(0,0,0,0.3);
|
| 194 |
-
}
|
| 195 |
-
|
| 196 |
-
.main-header p {
|
| 197 |
-
color: rgba(255, 255, 255, 0.95);
|
| 198 |
-
font-size: 1.2rem;
|
| 199 |
-
font-weight: 300;
|
| 200 |
-
}
|
| 201 |
-
|
| 202 |
-
.status-card {
|
| 203 |
-
background: white;
|
| 204 |
-
border-radius: 15px;
|
| 205 |
-
padding: 1rem;
|
| 206 |
-
margin-bottom: 1.5rem;
|
| 207 |
-
border: 1px solid #e1e5e9;
|
| 208 |
-
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
|
| 209 |
-
}
|
| 210 |
-
|
| 211 |
-
.status-card label {
|
| 212 |
-
color: #2d3748 !important;
|
| 213 |
-
font-weight: 600 !important;
|
| 214 |
-
}
|
| 215 |
-
|
| 216 |
-
.usage-guide h3 {
|
| 217 |
-
color: #2d3748 !important;
|
| 218 |
-
font-weight: 600 !important;
|
| 219 |
-
margin-bottom: 0.5rem !important;
|
| 220 |
-
}
|
| 221 |
-
|
| 222 |
-
.usage-guide p {
|
| 223 |
-
color: #4a5568 !important;
|
| 224 |
-
font-size: 1rem !important;
|
| 225 |
-
line-height: 1.6 !important;
|
| 226 |
-
margin: 0.5rem 0 !important;
|
| 227 |
-
}
|
| 228 |
-
|
| 229 |
-
.usage-guide strong {
|
| 230 |
-
color: #1a202c !important;
|
| 231 |
-
font-weight: 700 !important;
|
| 232 |
-
}
|
| 233 |
-
|
| 234 |
-
.usage-guide em {
|
| 235 |
-
color: #1a202c !important;
|
| 236 |
-
font-weight: 700 !important;
|
| 237 |
-
font-style: normal !important;
|
| 238 |
-
}
|
| 239 |
-
|
| 240 |
-
.main-interface {
|
| 241 |
-
margin-bottom: 2rem;
|
| 242 |
-
}
|
| 243 |
-
|
| 244 |
-
.input-section {
|
| 245 |
-
background: white;
|
| 246 |
-
border-radius: 20px;
|
| 247 |
-
padding: 2rem;
|
| 248 |
-
margin-right: 1rem;
|
| 249 |
-
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
| 250 |
-
border: 1px solid #e1e5e9;
|
| 251 |
-
}
|
| 252 |
-
|
| 253 |
-
.input-section h3 {
|
| 254 |
-
color: #2d3748 !important;
|
| 255 |
-
font-weight: 600 !important;
|
| 256 |
-
margin-bottom: 1rem !important;
|
| 257 |
-
}
|
| 258 |
-
|
| 259 |
-
.input-section label {
|
| 260 |
-
color: #4a5568 !important;
|
| 261 |
-
font-weight: 500 !important;
|
| 262 |
-
}
|
| 263 |
-
|
| 264 |
-
.output-section {
|
| 265 |
-
background: white;
|
| 266 |
-
border-radius: 20px;
|
| 267 |
-
padding: 2rem;
|
| 268 |
-
margin-left: 1rem;
|
| 269 |
-
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
| 270 |
-
border: 1px solid #e1e5e9;
|
| 271 |
-
}
|
| 272 |
-
|
| 273 |
-
.output-section h3 {
|
| 274 |
-
color: #2d3748 !important;
|
| 275 |
-
font-weight: 600 !important;
|
| 276 |
-
margin-bottom: 1rem !important;
|
| 277 |
-
}
|
| 278 |
-
|
| 279 |
-
.output-section label {
|
| 280 |
-
color: #4a5568 !important;
|
| 281 |
-
font-weight: 500 !important;
|
| 282 |
-
}
|
| 283 |
-
|
| 284 |
-
.examples-section h3 {
|
| 285 |
-
color: #2d3748 !important;
|
| 286 |
-
font-weight: 600 !important;
|
| 287 |
-
margin-bottom: 1.5rem !important;
|
| 288 |
-
}
|
| 289 |
-
|
| 290 |
-
.generate-btn {
|
| 291 |
-
background: linear-gradient(45deg, #667eea, #764ba2) !important;
|
| 292 |
-
border: none !important;
|
| 293 |
-
color: white !important;
|
| 294 |
-
font-weight: 600 !important;
|
| 295 |
-
font-size: 1.1rem !important;
|
| 296 |
-
padding: 12px 30px !important;
|
| 297 |
-
border-radius: 25px !important;
|
| 298 |
-
box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
|
| 299 |
-
transition: all 0.3s ease !important;
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
.generate-btn:hover {
|
| 303 |
-
transform: translateY(-2px) !important;
|
| 304 |
-
box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6) !important;
|
| 305 |
-
}
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
.examples-section {
|
| 310 |
-
background: white;
|
| 311 |
-
border-radius: 20px;
|
| 312 |
-
padding: 2rem;
|
| 313 |
-
margin-top: 2rem;
|
| 314 |
-
box-shadow: 0 8px 32px rgba(0,0,0,0.1);
|
| 315 |
-
border: 1px solid #e1e5e9;
|
| 316 |
-
}
|
| 317 |
-
|
| 318 |
-
.examples-section p {
|
| 319 |
-
color: #4a5568 !important;
|
| 320 |
-
margin-bottom: 1rem !important;
|
| 321 |
-
}
|
| 322 |
-
|
| 323 |
-
.example-row {
|
| 324 |
-
background: #f8fafc;
|
| 325 |
-
border: 1px solid #e2e8f0;
|
| 326 |
-
border-radius: 15px;
|
| 327 |
-
padding: 1.5rem;
|
| 328 |
-
margin: 1rem 0;
|
| 329 |
-
transition: all 0.3s ease;
|
| 330 |
-
align-items: center;
|
| 331 |
-
}
|
| 332 |
-
|
| 333 |
-
.example-row:hover {
|
| 334 |
-
border-color: #667eea;
|
| 335 |
-
transform: translateY(-2px);
|
| 336 |
-
box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
|
| 337 |
-
}
|
| 338 |
-
|
| 339 |
-
.example-row .markdown {
|
| 340 |
-
color: #2d3748 !important;
|
| 341 |
-
}
|
| 342 |
-
|
| 343 |
-
.example-row .markdown p {
|
| 344 |
-
color: #2d3748 !important;
|
| 345 |
-
margin: 0.5rem 0 !important;
|
| 346 |
-
line-height: 1.5 !important;
|
| 347 |
-
}
|
| 348 |
-
|
| 349 |
-
.example-row .markdown strong {
|
| 350 |
-
color: #1a202c !important;
|
| 351 |
-
font-weight: 600 !important;
|
| 352 |
-
}
|
| 353 |
-
|
| 354 |
-
/* Example grid layout styles */
|
| 355 |
-
.example-grid-row {
|
| 356 |
-
margin: 1rem 0;
|
| 357 |
-
gap: 1rem;
|
| 358 |
-
}
|
| 359 |
-
|
| 360 |
-
.example-item {
|
| 361 |
-
background: #f8fafc;
|
| 362 |
-
border: 1px solid #e2e8f0;
|
| 363 |
-
border-radius: 15px;
|
| 364 |
-
padding: 1rem;
|
| 365 |
-
transition: all 0.3s ease;
|
| 366 |
-
margin: 0.25rem;
|
| 367 |
-
max-width: 250px;
|
| 368 |
-
margin-left: auto;
|
| 369 |
-
margin-right: auto;
|
| 370 |
-
}
|
| 371 |
-
|
| 372 |
-
.example-item:hover {
|
| 373 |
-
border-color: #667eea;
|
| 374 |
-
transform: translateY(-2px);
|
| 375 |
-
box-shadow: 0 4px 20px rgba(102, 126, 234, 0.15);
|
| 376 |
-
}
|
| 377 |
-
|
| 378 |
-
.example-caption {
|
| 379 |
-
margin: 0.5rem 0 !important;
|
| 380 |
-
min-height: 2.8rem !important;
|
| 381 |
-
display: flex !important;
|
| 382 |
-
align-items: flex-start !important;
|
| 383 |
-
}
|
| 384 |
-
|
| 385 |
-
.example-caption p {
|
| 386 |
-
color: #2d3748 !important;
|
| 387 |
-
font-size: 0.9rem !important;
|
| 388 |
-
line-height: 1.4 !important;
|
| 389 |
-
margin: 0.5rem 0 !important;
|
| 390 |
-
}
|
| 391 |
-
|
| 392 |
-
/* Multi-video gallery styles */
|
| 393 |
-
.additional-samples {
|
| 394 |
-
margin-top: 1rem;
|
| 395 |
-
gap: 0.5rem;
|
| 396 |
-
}
|
| 397 |
-
|
| 398 |
-
.additional-samples .gradio-video {
|
| 399 |
-
border-radius: 10px;
|
| 400 |
-
overflow: hidden;
|
| 401 |
-
}
|
| 402 |
-
|
| 403 |
-
/* Video gallery responsive layout */
|
| 404 |
-
.video-gallery {
|
| 405 |
-
display: grid;
|
| 406 |
-
gap: 1rem;
|
| 407 |
-
margin-top: 1rem;
|
| 408 |
-
}
|
| 409 |
-
|
| 410 |
-
.video-gallery.single {
|
| 411 |
-
grid-template-columns: 1fr;
|
| 412 |
-
}
|
| 413 |
-
|
| 414 |
-
.video-gallery.dual {
|
| 415 |
-
grid-template-columns: 1fr 1fr;
|
| 416 |
-
}
|
| 417 |
-
|
| 418 |
-
.video-gallery.multi {
|
| 419 |
-
grid-template-columns: repeat(2, 1fr);
|
| 420 |
-
grid-template-rows: auto auto auto;
|
| 421 |
-
}
|
| 422 |
-
|
| 423 |
-
.footer-text {
|
| 424 |
-
color: #718096 !important;
|
| 425 |
-
text-align: center;
|
| 426 |
-
padding: 2rem;
|
| 427 |
-
font-size: 0.9rem;
|
| 428 |
-
}
|
| 429 |
-
|
| 430 |
-
/* Video component styling for consistent size */
|
| 431 |
-
.input-section video,
|
| 432 |
-
.output-section video,
|
| 433 |
-
.example-row video {
|
| 434 |
-
width: 100% !important;
|
| 435 |
-
height: 300px !important;
|
| 436 |
-
object-fit: contain !important;
|
| 437 |
-
border-radius: 10px !important;
|
| 438 |
-
background-color: #000 !important;
|
| 439 |
-
}
|
| 440 |
-
|
| 441 |
-
.example-row video {
|
| 442 |
-
height: 150px !important;
|
| 443 |
-
}
|
| 444 |
-
|
| 445 |
-
/* Fix for additional samples video display */
|
| 446 |
-
.additional-samples video {
|
| 447 |
-
height: 150px !important;
|
| 448 |
-
object-fit: contain !important;
|
| 449 |
-
border-radius: 10px !important;
|
| 450 |
-
background-color: #000 !important;
|
| 451 |
-
}
|
| 452 |
-
|
| 453 |
-
.additional-samples .gradio-video {
|
| 454 |
-
border-radius: 10px !important;
|
| 455 |
-
overflow: hidden !important;
|
| 456 |
-
background-color: #000 !important;
|
| 457 |
-
}
|
| 458 |
-
|
| 459 |
-
.additional-samples .gradio-video > div {
|
| 460 |
-
background-color: #000 !important;
|
| 461 |
-
border-radius: 10px !important;
|
| 462 |
-
}
|
| 463 |
-
|
| 464 |
-
/* Video container styling */
|
| 465 |
-
.input-section .video-container,
|
| 466 |
-
.output-section .video-container,
|
| 467 |
-
.example-row .video-container {
|
| 468 |
-
background-color: #000 !important;
|
| 469 |
-
border-radius: 10px !important;
|
| 470 |
-
display: flex !important;
|
| 471 |
-
align-items: center !important;
|
| 472 |
-
justify-content: center !important;
|
| 473 |
-
overflow: hidden !important;
|
| 474 |
-
}
|
| 475 |
-
|
| 476 |
-
/* Ensure proper alignment */
|
| 477 |
-
.example-row {
|
| 478 |
-
display: flex !important;
|
| 479 |
-
align-items: stretch !important;
|
| 480 |
-
}
|
| 481 |
-
|
| 482 |
-
.example-row > div {
|
| 483 |
-
display: flex !important;
|
| 484 |
-
flex-direction: column !important;
|
| 485 |
-
justify-content: center !important;
|
| 486 |
-
}
|
| 487 |
-
|
| 488 |
-
/* Video wrapper for better control */
|
| 489 |
-
.video-wrapper {
|
| 490 |
-
position: relative !important;
|
| 491 |
-
width: 100% !important;
|
| 492 |
-
background: #000 !important;
|
| 493 |
-
border-radius: 10px !important;
|
| 494 |
-
overflow: hidden !important;
|
| 495 |
-
display: flex !important;
|
| 496 |
-
align-items: center !important;
|
| 497 |
-
justify-content: center !important;
|
| 498 |
-
}
|
| 499 |
-
"""
|
| 500 |
-
|
| 501 |
-
with gr.Blocks(css=css, title="HunyuanVideo-Foley") as app:
|
| 502 |
-
|
| 503 |
-
# Main header
|
| 504 |
-
with gr.Column(elem_classes=["main-header"]):
|
| 505 |
-
gr.HTML("""
|
| 506 |
-
<h1>🎵 HunyuanVideo-Foley</h1>
|
| 507 |
-
<p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
|
| 508 |
-
""")
|
| 509 |
-
|
| 510 |
-
# Usage Guide
|
| 511 |
-
with gr.Column(elem_classes=["status-card"]):
|
| 512 |
-
gr.Markdown("""
|
| 513 |
-
### 📋 Quick Start Guide
|
| 514 |
-
**1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio
|
| 515 |
-
|
| 516 |
-
💡 For quick start, you can load the prepared examples by clicking the button.
|
| 517 |
-
""", elem_classes=["usage-guide"])
|
| 518 |
-
|
| 519 |
-
# Main inference interface - Input and Results side by side
|
| 520 |
-
with gr.Row(elem_classes=["main-interface"]):
|
| 521 |
-
# Input section
|
| 522 |
-
with gr.Column(scale=1, elem_classes=["input-section"]):
|
| 523 |
-
gr.Markdown("### 📹 Video Input")
|
| 524 |
-
|
| 525 |
-
video_input = gr.Video(
|
| 526 |
-
label="Upload Video",
|
| 527 |
-
info="Supported formats: MP4, AVI, MOV, etc.",
|
| 528 |
-
height=300
|
| 529 |
-
)
|
| 530 |
-
|
| 531 |
-
text_input = gr.Textbox(
|
| 532 |
-
label="🎯 Audio Description (English)",
|
| 533 |
-
placeholder="A person walks on frozen ice",
|
| 534 |
-
lines=3,
|
| 535 |
-
info="Describe the audio you want to generate (optional)"
|
| 536 |
-
)
|
| 537 |
-
|
| 538 |
-
neg_prompt_input = gr.Textbox(
|
| 539 |
-
label="🚫 Negative Prompt",
|
| 540 |
-
placeholder="noisy, harsh",
|
| 541 |
-
lines=2,
|
| 542 |
-
info="Describe what you want to avoid in the generated audio (optional, default: 'noisy, harsh')"
|
| 543 |
-
)
|
| 544 |
-
|
| 545 |
-
with gr.Row():
|
| 546 |
-
guidance_scale = gr.Slider(
|
| 547 |
-
minimum=1.0,
|
| 548 |
-
maximum=10.0,
|
| 549 |
-
value=4.5,
|
| 550 |
-
step=0.1,
|
| 551 |
-
label="🎚️ CFG Scale",
|
| 552 |
-
)
|
| 553 |
-
|
| 554 |
-
inference_steps = gr.Slider(
|
| 555 |
-
minimum=10,
|
| 556 |
-
maximum=100,
|
| 557 |
-
value=50,
|
| 558 |
-
step=5,
|
| 559 |
-
label="⚡ Steps",
|
| 560 |
-
)
|
| 561 |
-
|
| 562 |
-
sample_nums = gr.Slider(
|
| 563 |
-
minimum=1,
|
| 564 |
-
maximum=6,
|
| 565 |
-
value=1,
|
| 566 |
-
step=1,
|
| 567 |
-
label="🎲 Sample Nums",
|
| 568 |
-
)
|
| 569 |
-
|
| 570 |
-
generate_btn = gr.Button(
|
| 571 |
-
"🎵 Generate Audio",
|
| 572 |
-
variant="primary",
|
| 573 |
-
elem_classes=["generate-btn"]
|
| 574 |
-
)
|
| 575 |
-
|
| 576 |
-
# Results section
|
| 577 |
-
with gr.Column(scale=1, elem_classes=["output-section"]):
|
| 578 |
-
gr.Markdown("### 🎥 Generated Results")
|
| 579 |
-
|
| 580 |
-
# Multi-video gallery for displaying multiple generated samples
|
| 581 |
-
with gr.Column():
|
| 582 |
-
# Primary video (Sample 1)
|
| 583 |
-
video_output_1 = gr.Video(
|
| 584 |
-
label="Sample 1",
|
| 585 |
-
height=250,
|
| 586 |
-
visible=True
|
| 587 |
-
)
|
| 588 |
-
|
| 589 |
-
# Additional videos (Samples 2-6) - initially hidden
|
| 590 |
-
with gr.Row(elem_classes=["additional-samples"]):
|
| 591 |
-
with gr.Column(scale=1):
|
| 592 |
-
video_output_2 = gr.Video(
|
| 593 |
-
label="Sample 2",
|
| 594 |
-
height=150,
|
| 595 |
-
visible=False
|
| 596 |
-
)
|
| 597 |
-
video_output_3 = gr.Video(
|
| 598 |
-
label="Sample 3",
|
| 599 |
-
height=150,
|
| 600 |
-
visible=False
|
| 601 |
-
)
|
| 602 |
-
with gr.Column(scale=1):
|
| 603 |
-
video_output_4 = gr.Video(
|
| 604 |
-
label="Sample 4",
|
| 605 |
-
height=150,
|
| 606 |
-
visible=False
|
| 607 |
-
)
|
| 608 |
-
video_output_5 = gr.Video(
|
| 609 |
-
label="Sample 5",
|
| 610 |
-
height=150,
|
| 611 |
-
visible=False
|
| 612 |
-
)
|
| 613 |
-
|
| 614 |
-
# Sample 6 - full width
|
| 615 |
-
video_output_6 = gr.Video(
|
| 616 |
-
label="Sample 6",
|
| 617 |
-
height=150,
|
| 618 |
-
visible=False
|
| 619 |
-
)
|
| 620 |
-
|
| 621 |
-
result_text = gr.Textbox(
|
| 622 |
-
label="Status",
|
| 623 |
-
interactive=False,
|
| 624 |
-
lines=2
|
| 625 |
-
)
|
| 626 |
-
|
| 627 |
-
# Examples section at the bottom
|
| 628 |
-
with gr.Column(elem_classes=["examples-section"]):
|
| 629 |
-
gr.Markdown("### 🌟 Examples")
|
| 630 |
-
gr.Markdown("Click on any example to load it into the interface above")
|
| 631 |
-
|
| 632 |
-
# Define your custom examples here - 8 examples total
|
| 633 |
-
examples_data = [
|
| 634 |
-
# Example 1
|
| 635 |
-
{
|
| 636 |
-
"caption": "A person walks on frozen ice",
|
| 637 |
-
"video_path": "examples/1_video.mp4",
|
| 638 |
-
"result_path": "examples/1_result.mp4"
|
| 639 |
-
},
|
| 640 |
-
# Example 2
|
| 641 |
-
{
|
| 642 |
-
"caption": "With a faint sound as their hands parted, the two embraced, a soft 'mm' escaping between them.",
|
| 643 |
-
"video_path": "examples/2_video.mp4",
|
| 644 |
-
"result_path": "examples/2_result.mp4"
|
| 645 |
-
},
|
| 646 |
-
# Example 3
|
| 647 |
-
{
|
| 648 |
-
"caption": "The sound of the number 3's bouncing footsteps is as light and clear as glass marbles hitting the ground. Each step carries a magical sound.",
|
| 649 |
-
"video_path": "examples/3_video.mp4",
|
| 650 |
-
"result_path": "examples/3_result.mp4"
|
| 651 |
-
},
|
| 652 |
-
# Example 4
|
| 653 |
-
{
|
| 654 |
-
"caption": "gentle gurgling of the stream's current, and music plays in the background which is a beautiful and serene piano solo with a hint of classical charm, evoking a sense of peace and serenity in people's hearts.",
|
| 655 |
-
"video_path": "examples/4_video.mp4",
|
| 656 |
-
"result_path": "examples/4_result.mp4"
|
| 657 |
-
},
|
| 658 |
-
# Example 5 - Add your new examples here
|
| 659 |
-
{
|
| 660 |
-
"caption": "snow crunching under the snowboard's edge.",
|
| 661 |
-
"video_path": "examples/5_video.mp4",
|
| 662 |
-
"result_path": "examples/5_result.mp4"
|
| 663 |
-
},
|
| 664 |
-
# Example 6
|
| 665 |
-
{
|
| 666 |
-
"caption": "The crackling of the fire, the whooshing of the flames, and the occasional crisp popping of charred leaves filled the forest.",
|
| 667 |
-
"video_path": "examples/6_video.mp4",
|
| 668 |
-
"result_path": "examples/6_result.mp4"
|
| 669 |
-
},
|
| 670 |
-
# Example 7
|
| 671 |
-
{
|
| 672 |
-
"caption": "humming of the scooter engine accelerates slowly.",
|
| 673 |
-
"video_path": "examples/7_video.mp4",
|
| 674 |
-
"result_path": "examples/7_result.mp4"
|
| 675 |
-
},
|
| 676 |
-
# Example 8
|
| 677 |
-
{
|
| 678 |
-
"caption": "splash of water and loud thud as person hits the surface.",
|
| 679 |
-
"video_path": "examples/8_video.mp4",
|
| 680 |
-
"result_path": "examples/8_result.mp4"
|
| 681 |
-
}
|
| 682 |
-
]
|
| 683 |
-
|
| 684 |
-
# Create example grid - 4 examples per row, 2 rows total
|
| 685 |
-
example_buttons = []
|
| 686 |
-
for row in range(2): # 2 rows
|
| 687 |
-
with gr.Row(elem_classes=["example-grid-row"]):
|
| 688 |
-
for col in range(4): # 4 columns
|
| 689 |
-
idx = row * 4 + col
|
| 690 |
-
if idx < len(examples_data):
|
| 691 |
-
example = examples_data[idx]
|
| 692 |
-
|
| 693 |
-
with gr.Column(scale=1, elem_classes=["example-item"]):
|
| 694 |
-
# Video thumbnail
|
| 695 |
-
if os.path.exists(example['video_path']):
|
| 696 |
-
example_video = gr.Video(
|
| 697 |
-
value=example['video_path'],
|
| 698 |
-
label=f"Example {idx+1}",
|
| 699 |
-
interactive=False,
|
| 700 |
-
show_label=True,
|
| 701 |
-
height=180
|
| 702 |
-
)
|
| 703 |
-
else:
|
| 704 |
-
example_video = gr.HTML(f"""
|
| 705 |
-
<div style="background: #f0f0f0; padding: 15px; text-align: center; border-radius: 8px; height: 180px; display: flex; align-items: center; justify-content: center;">
|
| 706 |
-
<div>
|
| 707 |
-
<p style="color: #666; margin: 0; font-size: 12px;">📹 Video not found</p>
|
| 708 |
-
<small style="color: #999; font-size: 10px;">{example['video_path']}</small>
|
| 709 |
-
</div>
|
| 710 |
-
</div>
|
| 711 |
-
""")
|
| 712 |
-
|
| 713 |
-
# Caption (truncated for grid layout)
|
| 714 |
-
caption_preview = example['caption'][:60] + "..." if len(example['caption']) > 60 else example['caption']
|
| 715 |
-
gr.Markdown(f"{caption_preview}", elem_classes=["example-caption"])
|
| 716 |
-
|
| 717 |
-
# Load button
|
| 718 |
-
example_btn = gr.Button(
|
| 719 |
-
f"Load Example {idx+1}",
|
| 720 |
-
variant="secondary",
|
| 721 |
-
size="sm"
|
| 722 |
-
)
|
| 723 |
-
example_buttons.append((example_btn, example))
|
| 724 |
-
|
| 725 |
-
# Event handlers
|
| 726 |
-
def process_inference(video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, sample_nums):
|
| 727 |
-
# Generate videos
|
| 728 |
-
video_list, status_msg = infer_single_video(
|
| 729 |
-
video_file, text_prompt, neg_prompt, guidance_scale, inference_steps, int(sample_nums)
|
| 730 |
-
)
|
| 731 |
-
# Update outputs with proper visibility
|
| 732 |
-
return update_video_outputs(video_list, status_msg)
|
| 733 |
-
|
| 734 |
-
# Add dynamic visibility control based on sample_nums
|
| 735 |
-
def update_visibility(sample_nums):
|
| 736 |
-
sample_nums = int(sample_nums)
|
| 737 |
-
return [
|
| 738 |
-
gr.update(visible=True), # Sample 1 always visible
|
| 739 |
-
gr.update(visible=sample_nums >= 2), # Sample 2
|
| 740 |
-
gr.update(visible=sample_nums >= 3), # Sample 3
|
| 741 |
-
gr.update(visible=sample_nums >= 4), # Sample 4
|
| 742 |
-
gr.update(visible=sample_nums >= 5), # Sample 5
|
| 743 |
-
gr.update(visible=sample_nums >= 6), # Sample 6
|
| 744 |
-
]
|
| 745 |
-
|
| 746 |
-
# Update visibility when sample_nums changes
|
| 747 |
-
sample_nums.change(
|
| 748 |
-
fn=update_visibility,
|
| 749 |
-
inputs=[sample_nums],
|
| 750 |
-
outputs=[video_output_1, video_output_2, video_output_3, video_output_4, video_output_5, video_output_6]
|
| 751 |
-
)
|
| 752 |
-
|
| 753 |
-
generate_btn.click(
|
| 754 |
-
fn=process_inference,
|
| 755 |
-
inputs=[video_input, text_input, neg_prompt_input, guidance_scale, inference_steps, sample_nums],
|
| 756 |
-
outputs=[
|
| 757 |
-
video_output_1, # Sample 1 value
|
| 758 |
-
video_output_2, # Sample 2 value
|
| 759 |
-
video_output_3, # Sample 3 value
|
| 760 |
-
video_output_4, # Sample 4 value
|
| 761 |
-
video_output_5, # Sample 5 value
|
| 762 |
-
video_output_6, # Sample 6 value
|
| 763 |
-
result_text
|
| 764 |
-
]
|
| 765 |
-
)
|
| 766 |
-
|
| 767 |
-
# Add click handlers for example buttons
|
| 768 |
-
for btn, example in example_buttons:
|
| 769 |
-
def create_example_handler(ex):
|
| 770 |
-
def handler():
|
| 771 |
-
# Check if files exist, if not, return placeholder message
|
| 772 |
-
if os.path.exists(ex['video_path']):
|
| 773 |
-
video_file = ex['video_path']
|
| 774 |
-
else:
|
| 775 |
-
video_file = None
|
| 776 |
-
|
| 777 |
-
if os.path.exists(ex['result_path']):
|
| 778 |
-
result_video = ex['result_path']
|
| 779 |
-
else:
|
| 780 |
-
result_video = None
|
| 781 |
-
|
| 782 |
-
status_msg = f"✅ Loaded example with caption: {ex['caption'][:50]}..."
|
| 783 |
-
if not video_file:
|
| 784 |
-
status_msg += f"\n⚠️ Video file not found: {ex['video_path']}"
|
| 785 |
-
if not result_video:
|
| 786 |
-
status_msg += f"\n⚠️ Result video not found: {ex['result_path']}"
|
| 787 |
-
|
| 788 |
-
return video_file, ex['caption'], "noisy, harsh", result_video, status_msg
|
| 789 |
-
return handler
|
| 790 |
-
|
| 791 |
-
btn.click(
|
| 792 |
-
fn=create_example_handler(example),
|
| 793 |
-
outputs=[video_input, text_input, neg_prompt_input, video_output_1, result_text]
|
| 794 |
-
)
|
| 795 |
-
|
| 796 |
-
# Footer
|
| 797 |
-
gr.HTML("""
|
| 798 |
-
<div class="footer-text">
|
| 799 |
-
<p>🚀 Powered by HunyuanVideo-Foley | Generate high-quality audio from video and text descriptions</p>
|
| 800 |
-
</div>
|
| 801 |
-
""")
|
| 802 |
-
|
| 803 |
-
return app
|
| 804 |
-
|
| 805 |
-
def set_manual_seed(global_seed):
|
| 806 |
-
random.seed(global_seed)
|
| 807 |
-
np.random.seed(global_seed)
|
| 808 |
-
torch.manual_seed(global_seed)
|
| 809 |
-
|
| 810 |
-
if __name__ == "__main__":
|
| 811 |
-
set_manual_seed(1)
|
| 812 |
-
# Setup logging
|
| 813 |
-
logger.remove()
|
| 814 |
-
logger.add(lambda msg: print(msg, end=''), level="INFO")
|
| 815 |
-
|
| 816 |
-
# Auto-load model
|
| 817 |
-
logger.info("Starting application and loading model...")
|
| 818 |
-
model_load_result = auto_load_models()
|
| 819 |
-
logger.info(model_load_result)
|
| 820 |
-
|
| 821 |
-
# Create and launch Gradio app
|
| 822 |
-
app = create_gradio_interface()
|
| 823 |
-
|
| 824 |
-
# Log completion status
|
| 825 |
-
if "successfully" in model_load_result:
|
| 826 |
-
logger.info("Application ready, model loaded")
|
| 827 |
-
|
| 828 |
-
app.launch(
|
| 829 |
-
server_name="0.0.0.0",
|
| 830 |
-
server_port=8080,
|
| 831 |
-
share=False,
|
| 832 |
-
debug=False,
|
| 833 |
-
show_error=True
|
| 834 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/infer.py
DELETED
|
@@ -1,304 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import argparse
|
| 3 |
-
import random
|
| 4 |
-
import numpy as np
|
| 5 |
-
import torch
|
| 6 |
-
import pandas as pd
|
| 7 |
-
import torchaudio
|
| 8 |
-
from loguru import logger
|
| 9 |
-
from hunyuanvideo_foley.utils.model_utils import load_model
|
| 10 |
-
from hunyuanvideo_foley.utils.feature_utils import feature_process
|
| 11 |
-
from hunyuanvideo_foley.utils.model_utils import denoise_process
|
| 12 |
-
from hunyuanvideo_foley.utils.media_utils import merge_audio_video
|
| 13 |
-
|
| 14 |
-
def set_manual_seed(global_seed):
|
| 15 |
-
random.seed(global_seed)
|
| 16 |
-
np.random.seed(global_seed)
|
| 17 |
-
torch.manual_seed(global_seed)
|
| 18 |
-
|
| 19 |
-
def infer(video_path, prompt, model_dict, cfg, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
|
| 20 |
-
visual_feats, text_feats, audio_len_in_s = feature_process(
|
| 21 |
-
video_path,
|
| 22 |
-
prompt,
|
| 23 |
-
model_dict,
|
| 24 |
-
cfg,
|
| 25 |
-
neg_prompt=neg_prompt
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
audio, sample_rate = denoise_process(
|
| 29 |
-
visual_feats,
|
| 30 |
-
text_feats,
|
| 31 |
-
audio_len_in_s,
|
| 32 |
-
model_dict,
|
| 33 |
-
cfg,
|
| 34 |
-
guidance_scale=guidance_scale,
|
| 35 |
-
num_inference_steps=num_inference_steps
|
| 36 |
-
)
|
| 37 |
-
return audio[0], sample_rate
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def generate_audio(model_dict, cfg, csv_path, output_dir, guidance_scale=4.5, num_inference_steps=50, neg_prompt=None):
|
| 41 |
-
|
| 42 |
-
os.makedirs(output_dir, exist_ok=True)
|
| 43 |
-
test_df = pd.read_csv(csv_path)
|
| 44 |
-
|
| 45 |
-
for index, row in test_df.iterrows():
|
| 46 |
-
video_path = row['video']
|
| 47 |
-
prompt = row['prompt']
|
| 48 |
-
|
| 49 |
-
logger.info(f"Processing video: {video_path}")
|
| 50 |
-
logger.info(f"Prompt: {prompt}")
|
| 51 |
-
|
| 52 |
-
output_audio_path = os.path.join(output_dir, f"{index:04d}.wav")
|
| 53 |
-
output_video_path = os.path.join(output_dir, f"{index:04d}.mp4")
|
| 54 |
-
|
| 55 |
-
if not os.path.exists(output_audio_path) or not os.path.exists(output_video_path):
|
| 56 |
-
audio, sample_rate = infer(video_path, prompt, model_dict, cfg, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, neg_prompt=neg_prompt)
|
| 57 |
-
torchaudio.save(output_audio_path, audio, sample_rate)
|
| 58 |
-
|
| 59 |
-
merge_audio_video(output_audio_path, video_path, output_video_path)
|
| 60 |
-
|
| 61 |
-
logger.info(f"All audio files saved to {output_dir}")
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def parse_args():
    """Build the command-line parser, parse ``sys.argv`` and return the namespace.

    Exactly one of ``--csv_path`` / ``--single_video`` must be given, and
    ``--single_video`` additionally requires ``--single_prompt``. When
    ``--config_path`` is omitted it is derived from ``--model_size``.

    Returns:
        argparse.Namespace: The parsed arguments, with ``config_path`` always
        populated.
    """
    parser = argparse.ArgumentParser(
        description="HunyuanVideo-Foley: Generate audio from video and text prompts",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the pretrained model dir"
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to the configuration file (.yaml file). If not specified, will be inferred from model_size"
    )
    parser.add_argument(
        "--model_size",
        type=str,
        choices=["xl", "xxl"],
        default="xxl",
        help="Model size (xl/xxl). Auto-selects config and model file (default: xxl)"
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--csv_path",
        type=str,
        # The batch path reads row['video'] and row['prompt'].
        help="Path to CSV file containing video paths and text prompts (columns: 'video', 'prompt')"
    )
    input_group.add_argument(
        "--single_video",
        type=str,
        help="Path to a single video file for inference"
    )
    parser.add_argument(
        "--single_prompt",
        type=str,
        help="Text prompt for single video (required when using --single_video)"
    )
    parser.add_argument(
        "--neg_prompt",
        type=str,
        default=None,
        help="Negative prompt to avoid during generation (default: 'noisy, harsh')"
    )

    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory to save generated audio and video files"
    )

    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=4.5,
        help="Guidance scale for classifier-free guidance (higher = more text adherence)"
    )
    parser.add_argument(
        "--num_inference_steps",
        type=int,
        default=50,
        help="Number of denoising steps for diffusion sampling"
    )
    # NOTE(review): --audio_length and --batch_size are parsed but not read by
    # the functions visible in this file — confirm whether they are still needed.
    parser.add_argument(
        "--audio_length",
        type=float,
        default=None,
        help="Maximum audio length in seconds (default: video length)"
    )

    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=["auto", "cpu", "cuda", "mps"],
        help="Device to use for inference"
    )
    parser.add_argument(
        "--gpu_id",
        type=int,
        default=0,
        help="GPU ID to use when device is cuda"
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="Batch size for processing multiple videos"
    )
    parser.add_argument(
        "--skip_existing",
        action="store_true",
        help="Skip processing if output files already exist"
    )
    parser.add_argument(
        "--save_video",
        action="store_true",
        default=True,
        help="Save video with generated audio merged"
    )
    # --save_video defaults to True, so on its own it could never be switched
    # off. This companion flag writes False into the same destination; its
    # default is suppressed so the one declared above stays authoritative.
    parser.add_argument(
        "--no_save_video",
        dest="save_video",
        action="store_false",
        default=argparse.SUPPRESS,
        help="Only write the generated audio; do not merge it into a video"
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level"
    )
    parser.add_argument(
        "--enable_offload",
        action="store_true",
        help="Enable model offloading to reduce peak memory usage (good for small VRAM GPUs)"
    )

    args = parser.parse_args()

    if args.single_video and not args.single_prompt:
        parser.error("--single_prompt is required when using --single_video")

    # Infer config_path from model_size when it was not given explicitly.
    # model_size always has a value (default "xxl", restricted by `choices`),
    # so no separate fallback branch is needed.
    if not args.config_path:
        config_mapping = {
            "xl": "configs/hunyuanvideo-foley-xl.yaml",
            "xxl": "configs/hunyuanvideo-foley-xxl.yaml"
        }
        args.config_path = config_mapping[args.model_size]
        logger.info(f"Auto-selected config for {args.model_size} model: {args.config_path}")

    return args
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
def setup_device(device_str, gpu_id=0):
    """Resolve a device name to a ``torch.device`` and log the choice.

    Args:
        device_str: ``"auto"`` to pick CUDA, then MPS, then CPU depending on
            availability; any other value is used as the device name.
        gpu_id: CUDA device index, applied whenever a CUDA device is chosen.

    Returns:
        torch.device: The selected device.
    """
    if device_str != "auto":
        # An explicit request is honoured as given; only "cuda" gets the index.
        requested = f"cuda:{gpu_id}" if device_str == "cuda" else device_str
        device = torch.device(requested)
        logger.info(f"Using specified device: {device}")
        return device

    if torch.cuda.is_available():
        device = torch.device(f"cuda:{gpu_id}")
        logger.info(f"Using CUDA device: {device}")
        return device

    if torch.backends.mps.is_available():
        device = torch.device("mps")
        logger.info("Using MPS device")
        return device

    device = torch.device("cpu")
    logger.info("Using CPU device")
    return device
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
def process_single_video(video_path, prompt, model_dict, cfg, output_dir, args):
    """Generate audio for one video and optionally merge it back into the video.

    Outputs are ``<output_dir>/<video stem>_generated.wav`` and, when
    ``args.save_video`` is set, ``<output_dir>/<video stem>_with_audio.mp4``.

    With ``args.skip_existing`` an existing audio file is never regenerated:
    if the merged video is wanted but missing, only the merge step runs.

    Args:
        video_path: Path of the input video.
        prompt: Text prompt for the generation.
        model_dict: Loaded models, forwarded to ``infer``.
        cfg: Configuration object, forwarded to ``infer``.
        output_dir: Directory that receives the output files.
        args: Parsed CLI namespace; reads ``skip_existing``, ``save_video``,
            ``guidance_scale``, ``num_inference_steps`` and ``neg_prompt``.
    """
    logger.info(f"Processing single video: {video_path}")
    logger.info(f"Text prompt: {prompt}")

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    output_audio_path = os.path.join(output_dir, f"{video_name}_generated.wav")
    output_video_path = os.path.join(output_dir, f"{video_name}_with_audio.mp4")

    if args.skip_existing and os.path.exists(output_audio_path):
        logger.info(f"Skipping existing audio file: {output_audio_path}")
        if not args.save_video:
            return
        if os.path.exists(output_video_path):
            logger.info(f"Skipping existing video file: {output_video_path}")
            return
        # The audio is already there and only the merged video is missing:
        # reuse the audio instead of running inference again.
        merge_audio_video(output_audio_path, video_path, output_video_path)
        logger.info(f"Video with audio saved to: {output_video_path}")
        return

    audio, sample_rate = infer(
        video_path, prompt, model_dict, cfg,
        guidance_scale=args.guidance_scale,
        num_inference_steps=args.num_inference_steps,
        neg_prompt=args.neg_prompt
    )

    torchaudio.save(output_audio_path, audio, sample_rate)
    logger.info(f"Audio saved to: {output_audio_path}")

    if args.save_video:
        merge_audio_video(output_audio_path, video_path, output_video_path)
        logger.info(f"Video with audio saved to: {output_video_path}")
|
| 253 |
-
|
| 254 |
-
def main():
    """Command-line entry point: parse arguments, load the models, run inference.

    Seeds the random number generators, routes log output to stdout at the
    requested level, checks that all input paths exist, loads the models and
    then processes either a single video or every row of a CSV file.

    Raises:
        SystemExit: With exit status 1 when a required input path is missing.
    """
    set_manual_seed(1)
    args = parse_args()

    # Replace the default log sink with stdout at the requested level.
    logger.remove()
    logger.add(lambda msg: print(msg, end=''), level=args.log_level)

    device = setup_device(args.device, args.gpu_id)

    # Checked in order; the first missing path aborts the run.
    required_paths = [
        ("Model file", args.model_path),
        ("Config file", args.config_path),
    ]
    if args.csv_path:
        required_paths.append(("CSV file", args.csv_path))
    elif args.single_video:
        required_paths.append(("Video file", args.single_video))

    for label, path in required_paths:
        if not os.path.exists(path):
            logger.error(f"{label} not found: {path}")
            # The `exit` builtin is injected by the `site` module and is not
            # guaranteed to exist (e.g. `python -S`); SystemExit always is.
            raise SystemExit(1)

    os.makedirs(args.output_dir, exist_ok=True)
    logger.info(f"Output directory: {args.output_dir}")

    logger.info("Loading models...")
    model_dict, cfg = load_model(args.model_path, args.config_path, device, enable_offload=args.enable_offload, model_size=args.model_size)

    if args.single_video:
        process_single_video(
            args.single_video, args.single_prompt,
            model_dict, cfg, args.output_dir, args
        )
    else:
        generate_audio(
            model_dict, cfg,
            args.csv_path, args.output_dir,
            guidance_scale=args.guidance_scale,
            num_inference_steps=args.num_inference_steps,
            neg_prompt=args.neg_prompt
        )

    logger.info("Processing completed!")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/pytest.ini
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
# In pytest.ini the section must be named [pytest]; [tool:pytest] is only
# recognised in setup.cfg, so under that name these settings were ignored.
[pytest]
testpaths = tests
python_files = test_*.py
python_functions = test_*
addopts =
    --verbose
    --tb=short
    --strict-markers
    --disable-warnings
markers =
    slow: marks tests as slow (deselect with '-m "not slow"')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/tests/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# Test suite for HunyuanVideo-Foley
|
|
|
|
|
|
HunyuanVideo-Foley/tests/test_config_utils.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
"""Tests for configuration utilities."""
|
| 2 |
-
|
| 3 |
-
import pytest
|
| 4 |
-
import tempfile
|
| 5 |
-
import yaml
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
from hunyuanvideo_foley.utils.config_utils import AttributeDict, load_yaml
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class TestAttributeDict:
    """Checks subscript, attribute and mapping-style access on AttributeDict."""

    def test_dict_access(self):
        """Top-level and nested values are reachable with ``[]``."""
        wrapped = AttributeDict({"key1": "value1", "key2": {"nested": "value2"}})

        assert wrapped["key1"] == "value1"
        assert wrapped["key2"]["nested"] == "value2"

    def test_attribute_access(self):
        """Top-level and nested values are reachable as attributes."""
        wrapped = AttributeDict({"key1": "value1", "key2": {"nested": "value2"}})

        assert wrapped.key1 == "value1"
        assert wrapped.key2.nested == "value2"

    def test_list_handling(self):
        """A list payload is indexable and dicts inside it are wrapped too."""
        wrapped = AttributeDict([1, 2, {"nested": "value"}])

        assert wrapped[0] == 1
        assert wrapped[2].nested == "value"

    def test_keys_method(self):
        """``keys()`` yields every top-level key."""
        wrapped = AttributeDict({"key1": "value1", "key2": "value2"})

        key_names = list(wrapped.keys())
        for expected in ("key1", "key2"):
            assert expected in key_names

    def test_get_method(self):
        """``get()`` returns the stored value, or the fallback for a missing key."""
        wrapped = AttributeDict({"key1": "value1"})

        assert wrapped.get("key1") == "value1"
        assert wrapped.get("nonexistent", "default") == "default"
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
class TestLoadYaml:
    """Test cases for the load_yaml function.

    Files are created under pytest's ``tmp_path`` fixture, so they are
    removed automatically and the tests never depend on the working directory.
    """

    def test_load_valid_yaml(self, tmp_path):
        """A well-formed file is loaded and exposes nested attribute access."""
        data = {"model": {"name": "test_model", "params": {"lr": 0.001}}}
        yaml_path = tmp_path / "valid.yaml"
        yaml_path.write_text(yaml.dump(data), encoding="utf-8")

        result = load_yaml(str(yaml_path))

        assert result.model.name == "test_model"
        assert result.model.params.lr == 0.001

    def test_load_nonexistent_file(self, tmp_path):
        """A missing file raises FileNotFoundError."""
        missing_path = tmp_path / "nonexistent.yaml"

        with pytest.raises(FileNotFoundError):
            load_yaml(str(missing_path))

    def test_load_invalid_yaml(self, tmp_path):
        """Malformed YAML raises yaml.YAMLError."""
        yaml_path = tmp_path / "invalid.yaml"
        yaml_path.write_text("invalid: yaml: content: [\n", encoding="utf-8")  # Invalid YAML

        with pytest.raises(yaml.YAMLError):
            load_yaml(str(yaml_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
HunyuanVideo-Foley/tests/test_media_utils.py
DELETED
|
@@ -1,82 +0,0 @@
|
|
| 1 |
-
"""Tests for media utilities."""
|
| 2 |
-
|
| 3 |
-
import pytest
|
| 4 |
-
import tempfile
|
| 5 |
-
import os
|
| 6 |
-
from unittest.mock import patch, MagicMock
|
| 7 |
-
|
| 8 |
-
from hunyuanvideo_foley.utils.media_utils import merge_audio_video, MediaProcessingError
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class TestMergeAudioVideo:
    """Test cases for the merge_audio_video function.

    All files live under pytest's ``tmp_path`` fixture, so nothing is left
    behind when an assertion fails and no test touches the working directory.
    ``subprocess.Popen`` is mocked wherever ffmpeg would be started.
    """

    @staticmethod
    def _empty_file(directory, name):
        """Create an empty file called *name* in *directory* and return its path as str."""
        path = directory / name
        path.touch()
        return str(path)

    def test_invalid_audio_path(self, tmp_path):
        """A non-existent audio file is rejected."""
        with pytest.raises(MediaProcessingError, match="Audio file not found"):
            merge_audio_video(
                str(tmp_path / "nonexistent.wav"),
                str(tmp_path / "video.mp4"),
                str(tmp_path / "output.mp4"),
            )

    def test_invalid_video_path(self, tmp_path):
        """A non-existent video file is rejected once the audio file exists."""
        audio_path = self._empty_file(tmp_path, "audio.wav")

        with pytest.raises(MediaProcessingError, match="Video file not found"):
            merge_audio_video(
                audio_path,
                str(tmp_path / "nonexistent.mp4"),
                str(tmp_path / "output.mp4"),
            )

    @patch('subprocess.Popen')
    def test_successful_merge(self, mock_popen, tmp_path):
        """A zero exit status from ffmpeg returns the output path."""
        audio_path = self._empty_file(tmp_path, "audio.wav")
        video_path = self._empty_file(tmp_path, "video.mp4")
        # The output file exists before the call, as it did when this test
        # used a pre-created temporary file.
        output_path = self._empty_file(tmp_path, "output.mp4")

        # Mock successful subprocess
        mock_process = MagicMock()
        mock_process.returncode = 0
        mock_process.communicate.return_value = ("", "")
        mock_popen.return_value = mock_process

        result = merge_audio_video(audio_path, video_path, output_path)

        assert result == output_path
        mock_popen.assert_called_once()

    @patch('subprocess.Popen')
    def test_ffmpeg_failure(self, mock_popen, tmp_path):
        """A non-zero exit status from ffmpeg raises MediaProcessingError."""
        audio_path = self._empty_file(tmp_path, "audio.wav")
        video_path = self._empty_file(tmp_path, "video.mp4")

        # Mock failed subprocess
        mock_process = MagicMock()
        mock_process.returncode = 1
        mock_process.communicate.return_value = ("", "FFmpeg error")
        mock_popen.return_value = mock_process

        with pytest.raises(MediaProcessingError, match="FFmpeg failed"):
            merge_audio_video(audio_path, video_path, str(tmp_path / "output.mp4"))

    @patch('subprocess.Popen', side_effect=FileNotFoundError)
    def test_ffmpeg_not_found(self, mock_popen, tmp_path):
        """A missing ffmpeg executable surfaces as FileNotFoundError."""
        audio_path = self._empty_file(tmp_path, "audio.wav")
        video_path = self._empty_file(tmp_path, "video.mp4")

        with pytest.raises(FileNotFoundError, match="ffmpeg not found"):
            merge_audio_video(audio_path, video_path, str(tmp_path / "output.mp4"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/.gitignore
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
run_*.sh
|
| 2 |
-
log/
|
| 3 |
-
saves
|
| 4 |
-
saves/
|
| 5 |
-
weights/
|
| 6 |
-
weights
|
| 7 |
-
output/
|
| 8 |
-
output
|
| 9 |
-
pretrained/
|
| 10 |
-
workspace
|
| 11 |
-
workspace/
|
| 12 |
-
ext_weights/
|
| 13 |
-
ext_weights
|
| 14 |
-
.checkpoints/
|
| 15 |
-
.vscode/
|
| 16 |
-
training/example_output/
|
| 17 |
-
|
| 18 |
-
# Byte-compiled / optimized / DLL files
|
| 19 |
-
__pycache__/
|
| 20 |
-
*.py[cod]
|
| 21 |
-
*$py.class
|
| 22 |
-
|
| 23 |
-
# C extensions
|
| 24 |
-
*.so
|
| 25 |
-
|
| 26 |
-
# Distribution / packaging
|
| 27 |
-
.Python
|
| 28 |
-
build/
|
| 29 |
-
develop-eggs/
|
| 30 |
-
dist/
|
| 31 |
-
downloads/
|
| 32 |
-
eggs/
|
| 33 |
-
.eggs/
|
| 34 |
-
lib/
|
| 35 |
-
lib64/
|
| 36 |
-
parts/
|
| 37 |
-
sdist/
|
| 38 |
-
var/
|
| 39 |
-
wheels/
|
| 40 |
-
pip-wheel-metadata/
|
| 41 |
-
share/python-wheels/
|
| 42 |
-
*.egg-info/
|
| 43 |
-
.installed.cfg
|
| 44 |
-
*.egg
|
| 45 |
-
MANIFEST
|
| 46 |
-
|
| 47 |
-
# PyInstaller
|
| 48 |
-
# Usually these files are written by a python script from a template
|
| 49 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 50 |
-
*.manifest
|
| 51 |
-
*.spec
|
| 52 |
-
|
| 53 |
-
# Installer logs
|
| 54 |
-
pip-log.txt
|
| 55 |
-
pip-delete-this-directory.txt
|
| 56 |
-
|
| 57 |
-
# Unit test / coverage reports
|
| 58 |
-
htmlcov/
|
| 59 |
-
.tox/
|
| 60 |
-
.nox/
|
| 61 |
-
.coverage
|
| 62 |
-
.coverage.*
|
| 63 |
-
.cache
|
| 64 |
-
nosetests.xml
|
| 65 |
-
coverage.xml
|
| 66 |
-
*.cover
|
| 67 |
-
*.py,cover
|
| 68 |
-
.hypothesis/
|
| 69 |
-
.pytest_cache/
|
| 70 |
-
|
| 71 |
-
# Translations
|
| 72 |
-
*.mo
|
| 73 |
-
*.pot
|
| 74 |
-
|
| 75 |
-
# Django stuff:
|
| 76 |
-
*.log
|
| 77 |
-
local_settings.py
|
| 78 |
-
db.sqlite3
|
| 79 |
-
db.sqlite3-journal
|
| 80 |
-
|
| 81 |
-
# Flask stuff:
|
| 82 |
-
instance/
|
| 83 |
-
.webassets-cache
|
| 84 |
-
|
| 85 |
-
# Scrapy stuff:
|
| 86 |
-
.scrapy
|
| 87 |
-
|
| 88 |
-
# Sphinx documentation
|
| 89 |
-
docs/_build/
|
| 90 |
-
|
| 91 |
-
# PyBuilder
|
| 92 |
-
target/
|
| 93 |
-
|
| 94 |
-
# Jupyter Notebook
|
| 95 |
-
.ipynb_checkpoints
|
| 96 |
-
|
| 97 |
-
# IPython
|
| 98 |
-
profile_default/
|
| 99 |
-
ipython_config.py
|
| 100 |
-
|
| 101 |
-
# pyenv
|
| 102 |
-
.python-version
|
| 103 |
-
|
| 104 |
-
# pipenv
|
| 105 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 106 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 107 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 108 |
-
# install all needed dependencies.
|
| 109 |
-
#Pipfile.lock
|
| 110 |
-
|
| 111 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
| 112 |
-
__pypackages__/
|
| 113 |
-
|
| 114 |
-
# Celery stuff
|
| 115 |
-
celerybeat-schedule
|
| 116 |
-
celerybeat.pid
|
| 117 |
-
|
| 118 |
-
# SageMath parsed files
|
| 119 |
-
*.sage.py
|
| 120 |
-
|
| 121 |
-
# Environments
|
| 122 |
-
.env
|
| 123 |
-
.venv
|
| 124 |
-
env/
|
| 125 |
-
venv/
|
| 126 |
-
ENV/
|
| 127 |
-
env.bak/
|
| 128 |
-
venv.bak/
|
| 129 |
-
|
| 130 |
-
# Spyder project settings
|
| 131 |
-
.spyderproject
|
| 132 |
-
.spyproject
|
| 133 |
-
|
| 134 |
-
# Rope project settings
|
| 135 |
-
.ropeproject
|
| 136 |
-
|
| 137 |
-
# mkdocs documentation
|
| 138 |
-
/site
|
| 139 |
-
|
| 140 |
-
# mypy
|
| 141 |
-
.mypy_cache/
|
| 142 |
-
.dmypy.json
|
| 143 |
-
dmypy.json
|
| 144 |
-
|
| 145 |
-
# Pyre type checker
|
| 146 |
-
.pyre/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
MIT License
|
| 2 |
-
|
| 3 |
-
Copyright (c) 2024 Sony Research Inc.
|
| 4 |
-
|
| 5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
-
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
-
in the Software without restriction, including without limitation the rights
|
| 8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
-
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
-
furnished to do so, subject to the following conditions:
|
| 11 |
-
|
| 12 |
-
The above copyright notice and this permission notice shall be included in all
|
| 13 |
-
copies or substantial portions of the Software.
|
| 14 |
-
|
| 15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/README.md
DELETED
|
@@ -1,198 +0,0 @@
|
|
| 1 |
-
<div align="center">
|
| 2 |
-
|
| 3 |
-
https://github.com/hkchengrex/MMAudio
|
| 4 |
-
|
| 5 |
-
<p align="center">
|
| 6 |
-
<h2>MMAudio</h2>
|
| 7 |
-
<a href="https://arxiv.org/abs/2412.15322">Paper</a> | <a href="https://hkchengrex.github.io/MMAudio">Webpage</a> | <a href="https://huggingface.co/hkchengrex/MMAudio/tree/main">Models</a> | <a href="https://huggingface.co/spaces/hkchengrex/MMAudio"> Huggingface Demo</a> | <a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">Colab Demo</a> | <a href="https://replicate.com/zsxkib/mmaudio">Replicate Demo</a>
|
| 8 |
-
</p>
|
| 9 |
-
</div>
|
| 10 |
-
|
| 11 |
-
## [Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis](https://hkchengrex.github.io/MMAudio)
|
| 12 |
-
|
| 13 |
-
[Ho Kei Cheng](https://hkchengrex.github.io/), [Masato Ishii](https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ), [Akio Hayakawa](https://scholar.google.com/citations?user=sXAjHFIAAAAJ), [Takashi Shibuya](https://scholar.google.com/citations?user=XCRO260AAAAJ), [Alexander Schwing](https://www.alexander-schwing.de/), [Yuki Mitsufuji](https://www.yukimitsufuji.com/)
|
| 14 |
-
|
| 15 |
-
University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
|
| 16 |
-
|
| 17 |
-
CVPR 2025
|
| 18 |
-
|
| 19 |
-
## Highlight
|
| 20 |
-
|
| 21 |
-
MMAudio generates synchronized audio given video and/or text inputs.
|
| 22 |
-
Our key innovation is multimodal joint training which allows training on a wide range of audio-visual and audio-text datasets.
|
| 23 |
-
Moreover, a synchronization module aligns the generated audio with the video frames.
|
| 24 |
-
|
| 25 |
-
Check out this fun video:
|
| 26 |
-
|
| 27 |
-
[](https://youtu.be/SLz3NWLyHxg)
|
| 28 |
-
|
| 29 |
-
[[Does Your Voice Match Your Face? https://youtu.be/SLz3NWLyHxg]](https://youtu.be/SLz3NWLyHxg)
|
| 30 |
-
|
| 31 |
-
## Results
|
| 32 |
-
|
| 33 |
-
(All audio from our algorithm MMAudio)
|
| 34 |
-
|
| 35 |
-
Videos from Sora:
|
| 36 |
-
|
| 37 |
-
https://github.com/user-attachments/assets/82afd192-0cee-48a1-86ca-bd39b8c8f330
|
| 38 |
-
|
| 39 |
-
Videos from Veo 2:
|
| 40 |
-
|
| 41 |
-
https://github.com/user-attachments/assets/8a11419e-fee2-46e0-9e67-dfb03c48d00e
|
| 42 |
-
|
| 43 |
-
Videos from MovieGen/Hunyuan Video/VGGSound:
|
| 44 |
-
|
| 45 |
-
https://github.com/user-attachments/assets/29230d4e-21c1-4cf8-a221-c28f2af6d0ca
|
| 46 |
-
|
| 47 |
-
For more results, visit https://hkchengrex.com/MMAudio/video_main.html.
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
## Installation
|
| 51 |
-
|
| 52 |
-
We have only tested this on Ubuntu.
|
| 53 |
-
|
| 54 |
-
### Prerequisites
|
| 55 |
-
|
| 56 |
-
We recommend using a [miniforge](https://github.com/conda-forge/miniforge) environment.
|
| 57 |
-
|
| 58 |
-
- Python 3.9+
|
| 59 |
-
- PyTorch **2.5.1+** and corresponding torchvision/torchaudio (pick your CUDA version https://pytorch.org/, pip install recommended)
|
| 60 |
-
<!-- - ffmpeg<7 ([this is required by torchaudio](https://pytorch.org/audio/master/installation.html#optional-dependencies), you can install it in a miniforge environment with `conda install -c conda-forge 'ffmpeg<7'`) -->
|
| 61 |
-
|
| 62 |
-
**1. Install prerequisite if not yet met:**
|
| 63 |
-
|
| 64 |
-
```bash
|
| 65 |
-
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade
|
| 66 |
-
```
|
| 67 |
-
|
| 68 |
-
(Or any other CUDA versions that your GPUs/driver support)
|
| 69 |
-
|
| 70 |
-
<!-- ```
|
| 71 |
-
conda install -c conda-forge 'ffmpeg<7'
|
| 72 |
-
```
|
| 73 |
-
(Optional, if you use miniforge and don't already have the appropriate ffmpeg) -->
|
| 74 |
-
|
| 75 |
-
**2. Clone our repository:**
|
| 76 |
-
|
| 77 |
-
```bash
|
| 78 |
-
git clone https://github.com/hkchengrex/MMAudio.git
|
| 79 |
-
```
|
| 80 |
-
|
| 81 |
-
**3. Install with pip (install pytorch first before attempting this!):**
|
| 82 |
-
|
| 83 |
-
```bash
|
| 84 |
-
cd MMAudio
|
| 85 |
-
pip install -e .
|
| 86 |
-
```
|
| 87 |
-
|
| 88 |
-
(If you encounter the File "setup.py" not found error, upgrade your pip with pip install --upgrade pip)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
**Pretrained models:**
|
| 92 |
-
|
| 93 |
-
The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
|
| 94 |
-
The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
|
| 95 |
-
See [MODELS.md](docs/MODELS.md) for more details.
|
| 96 |
-
|
| 97 |
-
## Demo
|
| 98 |
-
|
| 99 |
-
By default, these scripts use the `large_44k_v2` model.
|
| 100 |
-
In our experiments, inference only takes around 6GB of GPU memory (in 16-bit mode) which should fit in most modern GPUs.
|
| 101 |
-
|
| 102 |
-
### Command-line interface
|
| 103 |
-
|
| 104 |
-
With `demo.py`
|
| 105 |
-
|
| 106 |
-
```bash
|
| 107 |
-
python demo.py --duration=8 --video=<path to video> --prompt "your prompt"
|
| 108 |
-
```
|
| 109 |
-
|
| 110 |
-
The output (audio in `.flac` format, and video in `.mp4` format) will be saved in `./output`.
|
| 111 |
-
See the file for more options.
|
| 112 |
-
Simply omit the `--video` option for text-to-audio synthesis.
|
| 113 |
-
The default output (and training) duration is 8 seconds. Longer/shorter durations could also work, but a large deviation from the training duration may result in a lower quality.
|
| 114 |
-
|
| 115 |
-
### Gradio interface
|
| 116 |
-
|
| 117 |
-
Supports video-to-audio and text-to-audio synthesis.
|
| 118 |
-
You can also try experimental image-to-audio synthesis which duplicates the input image to a video for processing. This might be interesting to some but it is not something MMAudio has been trained for.
|
| 119 |
-
Use [port forwarding](https://unix.stackexchange.com/questions/115897/whats-ssh-port-forwarding-and-whats-the-difference-between-ssh-local-and-remot) (e.g., `ssh -L 7860:localhost:7860 server`) if necessary. The default port is `7860` which you can specify with `--port`.
|
| 120 |
-
|
| 121 |
-
```bash
|
| 122 |
-
python gradio_demo.py
|
| 123 |
-
```
|
| 124 |
-
|
| 125 |
-
### FAQ
|
| 126 |
-
|
| 127 |
-
1. Video processing
|
| 128 |
-
- Processing higher-resolution videos takes longer due to encoding and decoding (which can take >95% of the processing time!), but it does not improve the quality of results.
|
| 129 |
-
- The CLIP encoder resizes input frames to 384×384 pixels.
|
| 130 |
-
- Synchformer resizes the shorter edge to 224 pixels and applies a center crop, focusing only on the central square of each frame.
|
| 131 |
-
2. Frame rates
|
| 132 |
-
- The CLIP model operates at 8 FPS, while Synchformer works at 25 FPS.
|
| 133 |
-
- Frame rate conversion happens on-the-fly via the video reader.
|
| 134 |
-
- For input videos with a frame rate below 25 FPS, frames will be duplicated to match the required rate.
|
| 135 |
-
3. Failure cases
|
| 136 |
-
As with most models of this type, failures can occur, and the reasons are not always clear. Below are some known failure modes. If you notice a failure mode or believe there’s a bug, feel free to open an issue in the repository.
|
| 137 |
-
4. Performance variations
|
| 138 |
-
We notice that there can be subtle performance variations in different hardware and software environments. Some of the reasons include using/not using `torch.compile`, video reader library/backend, inference precision, batch sizes, random seeds, etc. We (will) provide pre-computed results on standard benchmark for reference. Results obtained from this codebase should be similar but might not be exactly the same.
|
| 139 |
-
|
| 140 |
-
### Known limitations
|
| 141 |
-
|
| 142 |
-
1. The model sometimes generates unintelligible human speech-like sounds
|
| 143 |
-
2. The model sometimes generates background music (without explicit training, it would not be high quality)
|
| 144 |
-
3. The model struggles with unfamiliar concepts, e.g., it can generate "gunfires" but not "RPG firing".
|
| 145 |
-
|
| 146 |
-
We believe all of these three limitations can be addressed with more high-quality training data.
|
| 147 |
-
|
| 148 |
-
## Training
|
| 149 |
-
|
| 150 |
-
See [TRAINING.md](docs/TRAINING.md).
|
| 151 |
-
|
| 152 |
-
## Evaluation
|
| 153 |
-
|
| 154 |
-
See [EVAL.md](docs/EVAL.md).
|
| 155 |
-
|
| 156 |
-
## Training Datasets
|
| 157 |
-
|
| 158 |
-
MMAudio was trained on several datasets, including [AudioSet](https://research.google.com/audioset/), [Freesound](https://github.com/LAION-AI/audio-dataset/blob/main/laion-audio-630k/README.md), [VGGSound](https://www.robots.ox.ac.uk/~vgg/data/vggsound/), [AudioCaps](https://audiocaps.github.io/), and [WavCaps](https://github.com/XinhaoMei/WavCaps). These datasets are subject to specific licenses, which can be accessed on their respective websites. We do not guarantee that the pre-trained models are suitable for commercial use. Please use them at your own risk.
|
| 159 |
-
|
| 160 |
-
## Update Logs
|
| 161 |
-
|
| 162 |
-
- 2025-03-09: Uploaded the corrected tsv files. See [TRAINING.md](docs/TRAINING.md).
|
| 163 |
-
- 2025-02-27: Disabled the GradScaler by default to improve training stability. See #49.
|
| 164 |
-
- 2024-12-23: Added training and batch evaluation scripts.
|
| 165 |
-
- 2024-12-14: Removed the `ffmpeg<7` requirement for the demos by replacing `torio.io.StreamingMediaDecoder` with `pyav` for reading frames. The read frames are also cached, so we are not reading the same frames again during reconstruction. This should speed things up and make installation less of a hassle.
|
| 166 |
-
- 2024-12-13: Improved for-loop processing in CLIP/Sync feature extraction by introducing a batch size multiplier. We can approximately use 40x batch size for CLIP/Sync without using more memory, thereby speeding up processing. Removed VAE encoder during inference -- we don't need it.
|
| 167 |
-
- 2024-12-11: Replaced `torio.io.StreamingMediaDecoder` with `pyav` for reading framerate when reconstructing the input video. `torio.io.StreamingMediaDecoder` does not work reliably in huggingface ZeroGPU's environment, and I suspect that it might not work in some other environments as well.
|
| 168 |
-
|
| 169 |
-
## Citation
|
| 170 |
-
|
| 171 |
-
```bibtex
|
| 172 |
-
@inproceedings{cheng2025taming,
|
| 173 |
-
title={{MMAudio}: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis},
|
| 174 |
-
author={Cheng, Ho Kei and Ishii, Masato and Hayakawa, Akio and Shibuya, Takashi and Schwing, Alexander and Mitsufuji, Yuki},
|
| 175 |
-
booktitle={CVPR},
|
| 176 |
-
year={2025}
|
| 177 |
-
}
|
| 178 |
-
```
|
| 179 |
-
|
| 180 |
-
## Relevant Repositories
|
| 181 |
-
|
| 182 |
-
- [av-benchmark](https://github.com/hkchengrex/av-benchmark) for benchmarking results.
|
| 183 |
-
|
| 184 |
-
## License
|
| 185 |
-
- The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
|
| 186 |
-
- The checkpoints are released on Hugging Face under the CC-BY-NC 4.0 license as found at [https://creativecommons.org/licenses/by-nc/4.0/](https://creativecommons.org/licenses/by-nc/4.0/).
|
| 187 |
-
|
| 188 |
-
## Disclaimer
|
| 189 |
-
|
| 190 |
-
We have no affiliation with and have no knowledge of the party behind the domain "mmaudio.net".
|
| 191 |
-
|
| 192 |
-
## Acknowledgement
|
| 193 |
-
|
| 194 |
-
Many thanks to:
|
| 195 |
-
- [Make-An-Audio 2](https://github.com/bytedance/Make-An-Audio-2) for the 16kHz BigVGAN pretrained model and the VAE architecture
|
| 196 |
-
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
| 197 |
-
- [Synchformer](https://github.com/v-iashin/Synchformer)
|
| 198 |
-
- [EDM2](https://github.com/NVlabs/edm2) for the magnitude-preserving VAE network architecture
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/batch_eval.py
DELETED
|
@@ -1,110 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import os
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
import hydra
|
| 6 |
-
import torch
|
| 7 |
-
import torch.distributed as distributed
|
| 8 |
-
import torchaudio
|
| 9 |
-
from hydra.core.hydra_config import HydraConfig
|
| 10 |
-
from omegaconf import DictConfig
|
| 11 |
-
from tqdm import tqdm
|
| 12 |
-
|
| 13 |
-
from mmaudio.data.data_setup import setup_eval_dataset
|
| 14 |
-
from mmaudio.eval_utils import ModelConfig, all_model_cfg, generate
|
| 15 |
-
from mmaudio.model.flow_matching import FlowMatching
|
| 16 |
-
from mmaudio.model.networks import MMAudio, get_my_mmaudio
|
| 17 |
-
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 18 |
-
|
| 19 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
| 20 |
-
torch.backends.cudnn.allow_tf32 = True
|
| 21 |
-
|
| 22 |
-
local_rank = int(os.environ['LOCAL_RANK'])
|
| 23 |
-
world_size = int(os.environ['WORLD_SIZE'])
|
| 24 |
-
log = logging.getLogger()
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
@torch.inference_mode()
@hydra.main(version_base='1.3.2', config_path='config', config_name='eval_config.yaml')
def main(cfg: DictConfig):
    """Generate audio for every batch of an evaluation dataset and save it as FLAC.

    The Hydra config selects the model variant, dataset, clip duration,
    sampling settings and seed. One ``<name>.flac`` file is written per sample
    into a sub-directory of the Hydra run directory.

    Args:
        cfg: Hydra/OmegaConf configuration loaded from ``eval_config.yaml``.

    Raises:
        ValueError: If ``cfg.model`` is not a key of ``all_model_cfg``.
    """
    device = 'cuda'
    # `local_rank` is the module-level value read from the LOCAL_RANK env var.
    torch.cuda.set_device(local_rank)

    # Resolve the requested variant and make sure its weights are present.
    if cfg.model not in all_model_cfg:
        raise ValueError(f'Unknown model variant: {cfg.model}')
    model_cfg: ModelConfig = all_model_cfg[cfg.model]
    model_cfg.download_if_needed()
    seq_cfg = model_cfg.seq_cfg

    # Outputs go to <run dir>/<dataset>, optionally suffixed with "-<output_name>".
    hydra_run_dir = Path(HydraConfig.get().run.dir)
    if cfg.output_name is None:
        subdir_name = cfg.dataset
    else:
        subdir_name = f'{cfg.dataset}-{cfg.output_name}'
    output_dir = hydra_run_dir / subdir_name
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load the pretrained network; the duration is set before the network is built.
    seq_cfg.duration = cfg.duration_s
    net: MMAudio = get_my_mmaudio(cfg.model)
    net = net.to(device).eval()
    state_dict = torch.load(model_cfg.model_path, map_location=device, weights_only=True)
    net.load_weights(state_dict)
    log.info(f'Loaded weights from {model_cfg.model_path}')
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
    log.info(f'Latent seq len: {seq_cfg.latent_seq_len}')
    log.info(f'Clip seq len: {seq_cfg.clip_seq_len}')
    log.info(f'Sync seq len: {seq_cfg.sync_seq_len}')

    # Seeded generator and flow-matching sampler.
    rng = torch.Generator(device=device).manual_seed(cfg.seed)
    sampling = cfg.sampling
    fm = FlowMatching(sampling.min_sigma,
                      inference_mode=sampling.method,
                      num_steps=sampling.num_steps)

    # Feature extractors / decoders; the VAE encoder is not requested.
    feature_utils = FeaturesUtils(tod_vae_ckpt=model_cfg.vae_path,
                                  synchformer_ckpt=model_cfg.synchformer_ckpt,
                                  enable_conditions=True,
                                  mode=model_cfg.mode,
                                  bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
                                  need_vae_encoder=False).to(device).eval()

    if cfg.compile:
        net.preprocess_conditions = torch.compile(net.preprocess_conditions)
        net.predict_flow = torch.compile(net.predict_flow)
        feature_utils.compile()

    # Only the loader is used below; the dataset object itself is not needed.
    _dataset, loader = setup_eval_dataset(cfg.dataset, cfg)

    autocast = torch.amp.autocast(enabled=cfg.amp, dtype=torch.bfloat16, device_type=device)
    with autocast:
        for batch in tqdm(loader):
            # Modalities missing from the batch are passed to generate() as None.
            generated = generate(batch.get('clip_video', None),
                                 batch.get('sync_video', None),
                                 batch.get('caption', None),
                                 feature_utils=feature_utils,
                                 net=net,
                                 fm=fm,
                                 rng=rng,
                                 cfg_strength=cfg.cfg_strength,
                                 clip_batch_size_multiplier=64,
                                 sync_batch_size_multiplier=64)
            generated = generated.float().cpu()
            for waveform, sample_name in zip(generated, batch['name']):
                torchaudio.save(output_dir / f'{sample_name}.flac', waveform,
                                seq_cfg.sampling_rate)
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
def distributed_setup():
    """Initialise the NCCL process group and report this process's place in it.

    Returns:
        tuple: ``(rank, world_size)`` as returned by ``distributed.get_rank()``
        and ``distributed.get_world_size()``.
    """
    distributed.init_process_group(backend="nccl")
    rank, size = distributed.get_rank(), distributed.get_world_size()
    # The value logged under the "local_rank" label is what get_rank() returned.
    log.info(f'Initialized: local_rank={rank}, world_size={size}')
    return rank, size
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if __name__ == '__main__':
    # Script entry point: set up the process group, run the evaluation, tear down.
    distributed_setup()
    try:
        main()
    finally:
        # clean-up: runs even when main() raises or exits early, so the
        # process group is always destroyed once it has been created.
        distributed.destroy_process_group()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/__init__.py
DELETED
|
File without changes
|
MMAudio/config/base_config.yaml
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
defaults:
|
| 2 |
-
- data: base
|
| 3 |
-
- eval_data: base
|
| 4 |
-
- override hydra/job_logging: custom-simplest
|
| 5 |
-
- _self_
|
| 6 |
-
|
| 7 |
-
hydra:
|
| 8 |
-
run:
|
| 9 |
-
dir: ./output/${exp_id}
|
| 10 |
-
output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
|
| 11 |
-
|
| 12 |
-
enable_email: False
|
| 13 |
-
|
| 14 |
-
model: small_16k
|
| 15 |
-
|
| 16 |
-
exp_id: default
|
| 17 |
-
debug: False
|
| 18 |
-
cudnn_benchmark: True
|
| 19 |
-
compile: True
|
| 20 |
-
amp: True
|
| 21 |
-
weights: null
|
| 22 |
-
checkpoint: null
|
| 23 |
-
seed: 14159265
|
| 24 |
-
num_workers: 10 # per-GPU
|
| 25 |
-
pin_memory: False # set to True if your system can handle it, i.e., have enough memory
|
| 26 |
-
|
| 27 |
-
# NOTE: This DOES NOT affect the model during inference in any way
|
| 28 |
-
# they are just for the dataloader to fill in the missing data in multi-modal loading
|
| 29 |
-
# to change the sequence length for the model, see networks.py
|
| 30 |
-
data_dim:
|
| 31 |
-
text_seq_len: 77
|
| 32 |
-
clip_dim: 1024
|
| 33 |
-
sync_dim: 768
|
| 34 |
-
text_dim: 1024
|
| 35 |
-
|
| 36 |
-
# ema configuration
|
| 37 |
-
ema:
|
| 38 |
-
enable: True
|
| 39 |
-
sigma_rels: [0.05, 0.1]
|
| 40 |
-
update_every: 1
|
| 41 |
-
checkpoint_every: 5_000
|
| 42 |
-
checkpoint_folder: ${hydra:run.dir}/ema_ckpts
|
| 43 |
-
default_output_sigma: 0.05
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
# sampling
|
| 47 |
-
sampling:
|
| 48 |
-
mean: 0.0
|
| 49 |
-
scale: 1.0
|
| 50 |
-
min_sigma: 0.0
|
| 51 |
-
method: euler
|
| 52 |
-
num_steps: 25
|
| 53 |
-
|
| 54 |
-
# classifier-free guidance
|
| 55 |
-
null_condition_probability: 0.1
|
| 56 |
-
cfg_strength: 4.5
|
| 57 |
-
|
| 58 |
-
# checkpoint paths to external modules
|
| 59 |
-
vae_16k_ckpt: ./ext_weights/v1-16.pth
|
| 60 |
-
vae_44k_ckpt: ./ext_weights/v1-44.pth
|
| 61 |
-
bigvgan_vocoder_ckpt: ./ext_weights/best_netG.pt
|
| 62 |
-
synchformer_ckpt: ./ext_weights/synchformer_state_dict.pth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/data/base.yaml
DELETED
|
@@ -1,70 +0,0 @@
|
|
| 1 |
-
VGGSound:
|
| 2 |
-
root: ../data/video
|
| 3 |
-
subset_name: sets/vgg3-train.tsv
|
| 4 |
-
fps: 8
|
| 5 |
-
height: 384
|
| 6 |
-
width: 384
|
| 7 |
-
sample_duration_sec: 8.0
|
| 8 |
-
|
| 9 |
-
VGGSound_test:
|
| 10 |
-
root: ../data/video
|
| 11 |
-
subset_name: sets/vgg3-test.tsv
|
| 12 |
-
fps: 8
|
| 13 |
-
height: 384
|
| 14 |
-
width: 384
|
| 15 |
-
sample_duration_sec: 8.0
|
| 16 |
-
|
| 17 |
-
VGGSound_val:
|
| 18 |
-
root: ../data/video
|
| 19 |
-
subset_name: sets/vgg3-val.tsv
|
| 20 |
-
fps: 8
|
| 21 |
-
height: 384
|
| 22 |
-
width: 384
|
| 23 |
-
sample_duration_sec: 8.0
|
| 24 |
-
|
| 25 |
-
ExtractedVGG:
|
| 26 |
-
tsv: ../data/v1-16-memmap/vgg-train.tsv
|
| 27 |
-
memmap_dir: ../data/v1-16-memmap/vgg-train
|
| 28 |
-
|
| 29 |
-
ExtractedVGG_test:
|
| 30 |
-
tag: test
|
| 31 |
-
gt_cache: ../data/eval-cache/vggsound-test
|
| 32 |
-
output_subdir: null
|
| 33 |
-
tsv: ../data/v1-16-memmap/vgg-test.tsv
|
| 34 |
-
memmap_dir: ../data/v1-16-memmap/vgg-test
|
| 35 |
-
|
| 36 |
-
ExtractedVGG_val:
|
| 37 |
-
tag: val
|
| 38 |
-
gt_cache: ../data/eval-cache/vggsound-val
|
| 39 |
-
output_subdir: val
|
| 40 |
-
tsv: ../data/v1-16-memmap/vgg-val.tsv
|
| 41 |
-
memmap_dir: ../data/v1-16-memmap/vgg-val
|
| 42 |
-
|
| 43 |
-
AudioCaps:
|
| 44 |
-
tsv: ../data/v1-16-memmap/audiocaps.tsv
|
| 45 |
-
memmap_dir: ../data/v1-16-memmap/audiocaps
|
| 46 |
-
|
| 47 |
-
AudioSetSL:
|
| 48 |
-
tsv: ../data/v1-16-memmap/audioset_sl.tsv
|
| 49 |
-
memmap_dir: ../data/v1-16-memmap/audioset_sl
|
| 50 |
-
|
| 51 |
-
BBCSound:
|
| 52 |
-
tsv: ../data/v1-16-memmap/bbcsound.tsv
|
| 53 |
-
memmap_dir: ../data/v1-16-memmap/bbcsound
|
| 54 |
-
|
| 55 |
-
FreeSound:
|
| 56 |
-
tsv: ../data/v1-16-memmap/freesound.tsv
|
| 57 |
-
memmap_dir: ../data/v1-16-memmap/freesound
|
| 58 |
-
|
| 59 |
-
Clotho:
|
| 60 |
-
tsv: ../data/v1-16-memmap/clotho.tsv
|
| 61 |
-
memmap_dir: ../data/v1-16-memmap/clotho
|
| 62 |
-
|
| 63 |
-
Example_video:
|
| 64 |
-
tsv: ./training/example_output/memmap/vgg-example.tsv
|
| 65 |
-
memmap_dir: ./training/example_output/memmap/vgg-example
|
| 66 |
-
|
| 67 |
-
Example_audio:
|
| 68 |
-
tsv: ./training/example_output/memmap/audio-example.tsv
|
| 69 |
-
memmap_dir: ./training/example_output/memmap/audio-example
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/eval_config.yaml
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
defaults:
|
| 2 |
-
- base_config
|
| 3 |
-
- override hydra/job_logging: custom-simplest
|
| 4 |
-
- _self_
|
| 5 |
-
|
| 6 |
-
hydra:
|
| 7 |
-
run:
|
| 8 |
-
dir: ./output/${exp_id}
|
| 9 |
-
output_subdir: eval-${now:%Y-%m-%d_%H-%M-%S}-hydra
|
| 10 |
-
|
| 11 |
-
exp_id: ${model}
|
| 12 |
-
dataset: audiocaps
|
| 13 |
-
duration_s: 8.0
|
| 14 |
-
|
| 15 |
-
# for inference, this is the per-GPU batch size
|
| 16 |
-
batch_size: 16
|
| 17 |
-
output_name: null
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/eval_data/base.yaml
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
AudioCaps:
|
| 2 |
-
audio_path: ../data/AudioCaps-test-audioldm-ver
|
| 3 |
-
# a csv file, with a header row of 'name' and 'caption'
|
| 4 |
-
# name should match the audio file name without extension
|
| 5 |
-
# Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_audioldm_data.csv
|
| 6 |
-
csv_path: ../data/AudioCaps-test-audioldm-ver/data.csv
|
| 7 |
-
|
| 8 |
-
AudioCaps_full:
|
| 9 |
-
audio_path: ../data/AudioCaps-test-full-ver
|
| 10 |
-
# a csv file, with a header row of 'name' and 'caption'
|
| 11 |
-
# name should match the audio file name without extension
|
| 12 |
-
# Can be downloaded here: https://github.com/hkchengrex/MMAudio/releases/download/v0.1/AudioCaps_full_data.csv
|
| 13 |
-
csv_path: ../data/AudioCaps-test-full-ver/data.csv
|
| 14 |
-
|
| 15 |
-
MovieGen:
|
| 16 |
-
video_path: ../data/MovieGen/MovieGenAudioBenchSfx/video_with_audio
|
| 17 |
-
jsonl_path: ../data/MovieGen/MovieGenAudioBenchSfx/metadata
|
| 18 |
-
|
| 19 |
-
VGGSound:
|
| 20 |
-
video_path: ../data/test-videos
|
| 21 |
-
# from the officially released csv file
|
| 22 |
-
csv_path: ../data/vggsound.csv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/hydra/job_logging/custom-eval.yaml
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
# python logging configuration for tasks
|
| 2 |
-
version: 1
|
| 3 |
-
formatters:
|
| 4 |
-
simple:
|
| 5 |
-
format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
|
| 6 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 7 |
-
colorlog:
|
| 8 |
-
'()': 'colorlog.ColoredFormatter'
|
| 9 |
-
format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
|
| 10 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 11 |
-
log_colors:
|
| 12 |
-
DEBUG: purple
|
| 13 |
-
INFO: green
|
| 14 |
-
WARNING: yellow
|
| 15 |
-
ERROR: red
|
| 16 |
-
CRITICAL: red
|
| 17 |
-
handlers:
|
| 18 |
-
console:
|
| 19 |
-
class: logging.StreamHandler
|
| 20 |
-
formatter: colorlog
|
| 21 |
-
stream: ext://sys.stdout
|
| 22 |
-
file:
|
| 23 |
-
class: logging.FileHandler
|
| 24 |
-
formatter: simple
|
| 25 |
-
# absolute file path
|
| 26 |
-
filename: ${hydra.runtime.output_dir}/eval-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
|
| 27 |
-
mode: w
|
| 28 |
-
root:
|
| 29 |
-
level: INFO
|
| 30 |
-
handlers: [console, file]
|
| 31 |
-
|
| 32 |
-
disable_existing_loggers: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/hydra/job_logging/custom-no-rank.yaml
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
# python logging configuration for tasks
|
| 2 |
-
version: 1
|
| 3 |
-
formatters:
|
| 4 |
-
simple:
|
| 5 |
-
format: '[%(asctime)s][%(levelname)s] - %(message)s'
|
| 6 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 7 |
-
colorlog:
|
| 8 |
-
'()': 'colorlog.ColoredFormatter'
|
| 9 |
-
format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
|
| 10 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 11 |
-
log_colors:
|
| 12 |
-
DEBUG: purple
|
| 13 |
-
INFO: green
|
| 14 |
-
WARNING: yellow
|
| 15 |
-
ERROR: red
|
| 16 |
-
CRITICAL: red
|
| 17 |
-
handlers:
|
| 18 |
-
console:
|
| 19 |
-
class: logging.StreamHandler
|
| 20 |
-
formatter: colorlog
|
| 21 |
-
stream: ext://sys.stdout
|
| 22 |
-
file:
|
| 23 |
-
class: logging.FileHandler
|
| 24 |
-
formatter: simple
|
| 25 |
-
# absolute file path
|
| 26 |
-
filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
|
| 27 |
-
mode: w
|
| 28 |
-
root:
|
| 29 |
-
level: INFO
|
| 30 |
-
handlers: [console, file]
|
| 31 |
-
|
| 32 |
-
disable_existing_loggers: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/hydra/job_logging/custom-simplest.yaml
DELETED
|
@@ -1,26 +0,0 @@
|
|
| 1 |
-
# python logging configuration for tasks
|
| 2 |
-
version: 1
|
| 3 |
-
formatters:
|
| 4 |
-
simple:
|
| 5 |
-
format: '[%(asctime)s][%(levelname)s] - %(message)s'
|
| 6 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 7 |
-
colorlog:
|
| 8 |
-
'()': 'colorlog.ColoredFormatter'
|
| 9 |
-
format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
|
| 10 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 11 |
-
log_colors:
|
| 12 |
-
DEBUG: purple
|
| 13 |
-
INFO: green
|
| 14 |
-
WARNING: yellow
|
| 15 |
-
ERROR: red
|
| 16 |
-
CRITICAL: red
|
| 17 |
-
handlers:
|
| 18 |
-
console:
|
| 19 |
-
class: logging.StreamHandler
|
| 20 |
-
formatter: colorlog
|
| 21 |
-
stream: ext://sys.stdout
|
| 22 |
-
root:
|
| 23 |
-
level: INFO
|
| 24 |
-
handlers: [console]
|
| 25 |
-
|
| 26 |
-
disable_existing_loggers: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/hydra/job_logging/custom.yaml
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
# @package hydra.job_logging
|
| 2 |
-
# python logging configuration for tasks
|
| 3 |
-
version: 1
|
| 4 |
-
formatters:
|
| 5 |
-
simple:
|
| 6 |
-
format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
|
| 7 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 8 |
-
colorlog:
|
| 9 |
-
'()': 'colorlog.ColoredFormatter'
|
| 10 |
-
format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)sr${oc.env:LOCAL_RANK}%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
|
| 11 |
-
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 12 |
-
log_colors:
|
| 13 |
-
DEBUG: purple
|
| 14 |
-
INFO: green
|
| 15 |
-
WARNING: yellow
|
| 16 |
-
ERROR: red
|
| 17 |
-
CRITICAL: red
|
| 18 |
-
handlers:
|
| 19 |
-
console:
|
| 20 |
-
class: logging.StreamHandler
|
| 21 |
-
formatter: colorlog
|
| 22 |
-
stream: ext://sys.stdout
|
| 23 |
-
file:
|
| 24 |
-
class: logging.FileHandler
|
| 25 |
-
formatter: simple
|
| 26 |
-
# absolute file path
|
| 27 |
-
filename: ${hydra.runtime.output_dir}/train-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
|
| 28 |
-
mode: w
|
| 29 |
-
root:
|
| 30 |
-
level: INFO
|
| 31 |
-
handlers: [console, file]
|
| 32 |
-
|
| 33 |
-
disable_existing_loggers: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/config/train_config.yaml
DELETED
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
defaults:
|
| 2 |
-
- base_config
|
| 3 |
-
- override data: base
|
| 4 |
-
- override hydra/job_logging: custom
|
| 5 |
-
- _self_
|
| 6 |
-
|
| 7 |
-
hydra:
|
| 8 |
-
run:
|
| 9 |
-
dir: ./output/${exp_id}
|
| 10 |
-
output_subdir: train-${now:%Y-%m-%d_%H-%M-%S}-hydra
|
| 11 |
-
|
| 12 |
-
ema:
|
| 13 |
-
start: 0
|
| 14 |
-
|
| 15 |
-
mini_train: False
|
| 16 |
-
example_train: False
|
| 17 |
-
enable_grad_scaler: False
|
| 18 |
-
vgg_oversample_rate: 5
|
| 19 |
-
|
| 20 |
-
log_text_interval: 200
|
| 21 |
-
log_extra_interval: 20_000
|
| 22 |
-
val_interval: 5_000
|
| 23 |
-
eval_interval: 20_000
|
| 24 |
-
save_eval_interval: 40_000
|
| 25 |
-
save_weights_interval: 10_000
|
| 26 |
-
save_checkpoint_interval: 10_000
|
| 27 |
-
save_copy_iterations: []
|
| 28 |
-
|
| 29 |
-
batch_size: 512
|
| 30 |
-
eval_batch_size: 256 # per-GPU
|
| 31 |
-
|
| 32 |
-
num_iterations: 300_000
|
| 33 |
-
learning_rate: 1.0e-4
|
| 34 |
-
linear_warmup_steps: 1_000
|
| 35 |
-
|
| 36 |
-
lr_schedule: step
|
| 37 |
-
lr_schedule_steps: [240_000, 270_000]
|
| 38 |
-
lr_schedule_gamma: 0.1
|
| 39 |
-
|
| 40 |
-
clip_grad_norm: 1.0
|
| 41 |
-
weight_decay: 1.0e-6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/demo.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
from argparse import ArgumentParser
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
import torch
|
| 6 |
-
import torchaudio
|
| 7 |
-
|
| 8 |
-
from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
|
| 9 |
-
setup_eval_logging)
|
| 10 |
-
from mmaudio.model.flow_matching import FlowMatching
|
| 11 |
-
from mmaudio.model.networks import MMAudio, get_my_mmaudio
|
| 12 |
-
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 13 |
-
|
| 14 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
| 15 |
-
torch.backends.cudnn.allow_tf32 = True
|
| 16 |
-
|
| 17 |
-
log = logging.getLogger()
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
@torch.inference_mode()
def main():
    """Command-line demo: generate audio for one video and/or text prompt.

    Parses the command-line arguments, loads the selected pretrained model,
    generates one audio clip and saves it as ``.flac`` in the output directory.
    When a video is given and ``--skip_video_composite`` is not set, an
    ``.mp4`` combining the video with the generated audio is written as well.

    Raises:
        ValueError: If ``--variant`` is not a key of ``all_model_cfg``.
    """
    setup_eval_logging()

    parser = ArgumentParser()
    parser.add_argument('--variant',
                        type=str,
                        default='large_44k_v2',
                        help='small_16k, small_44k, medium_44k, large_44k, large_44k_v2')
    parser.add_argument('--video', type=Path, help='Path to the video file')
    parser.add_argument('--prompt', type=str, help='Input prompt', default='')
    parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
    parser.add_argument('--duration', type=float, default=8.0)
    parser.add_argument('--cfg_strength', type=float, default=4.5)
    parser.add_argument('--num_steps', type=int, default=25)

    parser.add_argument('--mask_away_clip', action='store_true')

    parser.add_argument('--output', type=Path, help='Output directory', default='./output')
    parser.add_argument('--seed', type=int, help='Random seed', default=42)
    parser.add_argument('--skip_video_composite', action='store_true')
    parser.add_argument('--full_precision', action='store_true')

    args = parser.parse_args()

    if args.variant not in all_model_cfg:
        raise ValueError(f'Unknown model variant: {args.variant}')
    model: ModelConfig = all_model_cfg[args.variant]
    model.download_if_needed()
    seq_cfg = model.seq_cfg

    if args.video:
        video_path: Path = Path(args.video).expanduser()
    else:
        video_path = None
    prompt: str = args.prompt
    negative_prompt: str = args.negative_prompt
    # argparse converts --output (including its string default) with type=Path.
    output_dir: Path = args.output.expanduser()
    seed: int = args.seed
    num_steps: int = args.num_steps
    duration: float = args.duration
    cfg_strength: float = args.cfg_strength
    skip_video_composite: bool = args.skip_video_composite
    mask_away_clip: bool = args.mask_away_clip

    # Device preference: CUDA, then MPS, otherwise CPU with a warning.
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    elif torch.backends.mps.is_available():
        device = 'mps'
    else:
        log.warning('CUDA/MPS are not available, running on CPU')
    dtype = torch.float32 if args.full_precision else torch.bfloat16

    output_dir.mkdir(parents=True, exist_ok=True)

    # load a pretrained model
    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
    log.info(f'Loaded weights from {model.model_path}')

    # misc setup
    rng = torch.Generator(device=device)
    rng.manual_seed(seed)
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
                                  synchformer_ckpt=model.synchformer_ckpt,
                                  enable_conditions=True,
                                  mode=model.mode,
                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
                                  need_vae_encoder=False)
    feature_utils = feature_utils.to(device, dtype).eval()

    if video_path is not None:
        log.info(f'Using video {video_path}')
        video_info = load_video(video_path, duration)
        clip_frames = video_info.clip_frames
        sync_frames = video_info.sync_frames
        # From here on the duration reported by load_video() is used.
        duration = video_info.duration_sec
        if mask_away_clip:
            clip_frames = None
        else:
            clip_frames = clip_frames.unsqueeze(0)
        sync_frames = sync_frames.unsqueeze(0)
    else:
        log.info('No video provided -- text-to-audio mode')
        clip_frames = sync_frames = None

    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    log.info(f'Prompt: {prompt}')
    log.info(f'Negative prompt: {negative_prompt}')

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]
    if video_path is not None:
        save_path = output_dir / f'{video_path.stem}.flac'
    else:
        # Derive a file name from the prompt: spaces and slashes become
        # underscores, dots are dropped.
        safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
        save_path = output_dir / f'{safe_filename}.flac'
    torchaudio.save(save_path, audio, seq_cfg.sampling_rate)

    log.info(f'Audio saved to {save_path}')
    if video_path is not None and not skip_video_composite:
        video_save_path = output_dir / f'{video_path.stem}.mp4'
        make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
        # video_save_path already contains output_dir; joining it again would
        # report a doubled path (e.g. output/output/x.mp4) for relative dirs.
        log.info(f'Video saved to {video_save_path}')

    log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))


if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/EVAL.md
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
# Evaluation
|
| 2 |
-
|
| 3 |
-
## Batch Evaluation
|
| 4 |
-
|
| 5 |
-
To evaluate the model on a dataset, use the `batch_eval.py` script. It is significantly more efficient in large-scale evaluation compared to `demo.py`, supporting batched inference, multi-GPU inference, torch compilation, and skipping video compositions.
|
| 6 |
-
|
| 7 |
-
An example of running this script with four GPUs is as follows:
|
| 8 |
-
|
| 9 |
-
```bash
|
| 10 |
-
OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=4 batch_eval.py duration_s=8 dataset=vggsound model=small_16k num_workers=8
|
| 11 |
-
```
|
| 12 |
-
|
| 13 |
-
You may need to update the data paths in `config/eval_data/base.yaml`.
|
| 14 |
-
More configuration options can be found in `config/base_config.yaml` and `config/eval_config.yaml`.
|
| 15 |
-
You might also want to change the dataset definition if you are not evaluating on VGGSound: https://github.com/hkchengrex/MMAudio/blob/main/mmaudio/data/eval/video_dataset.py
|
| 16 |
-
|
| 17 |
-
## Precomputed Results
|
| 18 |
-
|
| 19 |
-
Precomputed results for VGGSound, AudioCaps, and MovieGen are available here: https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results
|
| 20 |
-
|
| 21 |
-
## Obtaining Quantitative Metrics
|
| 22 |
-
|
| 23 |
-
Our evaluation code is available here: https://github.com/hkchengrex/av-benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/MODELS.md
DELETED
|
@@ -1,50 +0,0 @@
|
|
| 1 |
-
# Pretrained models
|
| 2 |
-
|
| 3 |
-
The models will be downloaded automatically when you run the demo script. MD5 checksums are provided in `mmaudio/utils/download_utils.py`.
|
| 4 |
-
The models are also available at https://huggingface.co/hkchengrex/MMAudio/tree/main
|
| 5 |
-
|
| 6 |
-
| Model | Download link | File size |
|
| 7 |
-
| -------- | ------- | ------- |
|
| 8 |
-
| Flow prediction network, small 16kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_16k.pth" download="mmaudio_small_16k.pth">mmaudio_small_16k.pth</a> | 601M |
|
| 9 |
-
| Flow prediction network, small 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_small_44k.pth" download="mmaudio_small_44k.pth">mmaudio_small_44k.pth</a> | 601M |
|
| 10 |
-
| Flow prediction network, medium 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_medium_44k.pth" download="mmaudio_medium_44k.pth">mmaudio_medium_44k.pth</a> | 2.4G |
|
| 11 |
-
| Flow prediction network, large 44.1kHz | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k.pth" download="mmaudio_large_44k.pth">mmaudio_large_44k.pth</a> | 3.9G |
|
| 12 |
-
| Flow prediction network, large 44.1kHz, v2 **(recommended)** | <a href="https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth" download="mmaudio_large_44k_v2.pth">mmaudio_large_44k_v2.pth</a> | 3.9G |
|
| 13 |
-
| 16kHz VAE | <a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-16.pth">v1-16.pth</a> | 655M |
|
| 14 |
-
| 16kHz BigVGAN vocoder (from Make-An-Audio 2) |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/best_netG.pt">best_netG.pt</a> | 429M |
|
| 15 |
-
| 44.1kHz VAE |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/v1-44.pth">v1-44.pth</a> | 1.2G |
|
| 16 |
-
| Synchformer visual encoder |<a href="https://github.com/hkchengrex/MMAudio/releases/download/v0.1/synchformer_state_dict.pth">synchformer_state_dict.pth</a> | 907M |
|
| 17 |
-
|
| 18 |
-
To run the model, you need four components: a flow prediction network, visual feature extractors (Synchformer and CLIP, CLIP will be downloaded automatically), a VAE, and a vocoder. VAEs and vocoders are specific to the sampling rate (16kHz or 44.1kHz) and not model sizes.
|
| 19 |
-
The 44.1kHz vocoder will be downloaded automatically.
|
| 20 |
-
The `_v2` model performs worse in benchmarking (e.g., in Fréchet distance), but, in my experience, generalizes better to new data.
|
| 21 |
-
|
| 22 |
-
The expected directory structure (full):
|
| 23 |
-
|
| 24 |
-
```bash
|
| 25 |
-
MMAudio
|
| 26 |
-
├── ext_weights
|
| 27 |
-
│ ├── best_netG.pt
|
| 28 |
-
│ ├── synchformer_state_dict.pth
|
| 29 |
-
│ ├── v1-16.pth
|
| 30 |
-
│ └── v1-44.pth
|
| 31 |
-
├── weights
|
| 32 |
-
│ ├── mmaudio_small_16k.pth
|
| 33 |
-
│ ├── mmaudio_small_44k.pth
|
| 34 |
-
│ ├── mmaudio_medium_44k.pth
|
| 35 |
-
│ ├── mmaudio_large_44k.pth
|
| 36 |
-
│ └── mmaudio_large_44k_v2.pth
|
| 37 |
-
└── ...
|
| 38 |
-
```
|
| 39 |
-
|
| 40 |
-
The expected directory structure (minimal, for the recommended model only):
|
| 41 |
-
|
| 42 |
-
```bash
|
| 43 |
-
MMAudio
|
| 44 |
-
├── ext_weights
|
| 45 |
-
│ ├── synchformer_state_dict.pth
|
| 46 |
-
│ └── v1-44.pth
|
| 47 |
-
├── weights
|
| 48 |
-
│ └── mmaudio_large_44k_v2.pth
|
| 49 |
-
└── ...
|
| 50 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/TRAINING.md
DELETED
|
@@ -1,184 +0,0 @@
|
|
| 1 |
-
# Training
|
| 2 |
-
|
| 3 |
-
## Overview
|
| 4 |
-
|
| 5 |
-
We have put a large emphasis on making training as fast as possible.
|
| 6 |
-
Consequently, some pre-processing steps are required.
|
| 7 |
-
|
| 8 |
-
Namely, before starting any training, we
|
| 9 |
-
|
| 10 |
-
1. Obtain training data as videos, audios, and captions.
|
| 11 |
-
2. Encode training audios into spectrograms and then with VAE into mean/std
|
| 12 |
-
3. Extract CLIP and synchronization features from videos
|
| 13 |
-
4. Extract CLIP features from text (captions)
|
| 14 |
-
5. Encode all extracted features into [MemoryMappedTensors](https://pytorch.org/tensordict/main/reference/generated/tensordict.MemoryMappedTensor.html) with [TensorDict](https://pytorch.org/tensordict/main/reference/tensordict.html)
|
| 15 |
-
|
| 16 |
-
**NOTE:** for maximum training speed (e.g., when training the base model with 2*H100s), you would need around 3~5 GB/s of random read speed. Spinning disks would not be able to catch up and most consumer-grade SSDs would struggle. In my experience, the best bet is to have a large enough system memory such that the OS can cache the data. This way, the data is read from RAM instead of disk.
|
| 17 |
-
|
| 18 |
-
The current training script does not support `_v2` training.
|
| 19 |
-
|
| 20 |
-
## Recommended Hardware Configuration
|
| 21 |
-
|
| 22 |
-
These are what I recommend for a smooth and efficient training experience. These are not minimum requirements.
|
| 23 |
-
|
| 24 |
-
- Single-node machine. We did not implement multi-node training
|
| 25 |
-
- GPUs: for the small model, two 80G-H100s or above; for the large model, eight 80G-H100s or above
|
| 26 |
-
- System memory: for 16kHz training, 600GB+; for 44kHz training, 700GB+
|
| 27 |
-
- Storage: >2TB of fast NVMe storage. If you have enough system memory, OS caching will help and the storage does not need to be as fast.
|
| 28 |
-
|
| 29 |
-
## Prerequisites
|
| 30 |
-
|
| 31 |
-
1. Install [av-benchmark](https://github.com/hkchengrex/av-benchmark). We use this library to automatically evaluate on the validation set during training, and on the test set after training.
|
| 32 |
-
2. Extract features for evaluation using [av-benchmark](https://github.com/hkchengrex/av-benchmark) for the validation and test set as a [validation cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L38) and a [test cache](https://github.com/hkchengrex/MMAudio/blob/34bf089fdd2e457cd5ef33be96c0e1c8a0412476/config/data/base.yaml#L31). You can also download the precomputed evaluation cache [here](https://huggingface.co/datasets/hkchengrex/MMAudio-precomputed-results/tree/main).
|
| 33 |
-
|
| 34 |
-
3. You will need ffmpeg to extract frames from videos. Note that `torchaudio` imposes a maximum version limit (`ffmpeg<7`). You can install it as follows:
|
| 35 |
-
|
| 36 |
-
```bash
|
| 37 |
-
conda install -c conda-forge 'ffmpeg<7'
|
| 38 |
-
```
|
| 39 |
-
|
| 40 |
-
4. Download the training datasets. We used [VGGSound](https://arxiv.org/abs/2004.14368), [AudioCaps](https://audiocaps.github.io/), [WavCaps](https://arxiv.org/abs/2303.17395), and [Clotho](https://arxiv.org/abs/1910.09387) (paper to be updated). Note that the audio files in the huggingface release of WavCaps have been downsampled to 32kHz. To the best of our ability, we located the original (high-sampling rate) audio files and used them instead to prevent artifacts during 44.1kHz training. We did not use the "SoundBible" portion of WavCaps, since it is a small set with many short audio clips unsuitable for our training.
|
| 41 |
-
|
| 42 |
-
5. Download the corresponding VAE (`v1-16.pth` for 16kHz training, and `v1-44.pth` for 44.1kHz training), vocoder models (`best_netG.pt` for 16kHz training; the vocoder for 44.1kHz training will be downloaded automatically), the [empty string encoding](https://github.com/hkchengrex/MMAudio/releases/download/v0.1/empty_string.pth), and Synchformer weights from [MODELS.md](https://github.com/hkchengrex/MMAudio/blob/main/docs/MODELS.md), and place them in `ext_weights/`.
|
| 43 |
-
|
| 44 |
-
### Helpful links for downloading the datasets
|
| 45 |
-
|
| 46 |
-
We cannot redistribute the datasets for copyright reasons, but we do find some links helpful and they might be helpful to you as well.
|
| 47 |
-
|
| 48 |
-
- https://huggingface.co/datasets/Meranti/CLAP_freesound
|
| 49 |
-
- https://huggingface.co/datasets/agkphysics/AudioSet
|
| 50 |
-
- https://sound-effects.bbcrewind.co.uk/
|
| 51 |
-
|
| 52 |
-
For certain sources of VGGSound, you might notice desynchronization between the audio and the video. This happens because the video keyframes do not always align with the start of the audio, and what happens during playback is player-dependent. We used PyTorch's decoder, which can correctly handle these cases.
|
| 53 |
-
|
| 54 |
-
## Preparing Audio-Video-Text Features
|
| 55 |
-
|
| 56 |
-
We have prepared some example data in `training/example_videos`.
|
| 57 |
-
`training/extract_video_training_latents.py` extracts audio, video, and text features and saves them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
|
| 58 |
-
|
| 59 |
-
To run this script, use the `torchrun` utility:
|
| 60 |
-
|
| 61 |
-
```bash
|
| 62 |
-
torchrun --standalone training/extract_video_training_latents.py
|
| 63 |
-
```
|
| 64 |
-
|
| 65 |
-
You can run this script with multiple GPUs (with `--nproc_per_node=<n>` after `--standalone` and before the script name) to speed up extraction.
|
| 66 |
-
Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
|
| 67 |
-
Change the data path definitions in `data_cfg` if necessary.
|
| 68 |
-
|
| 69 |
-
Arguments:
|
| 70 |
-
|
| 71 |
-
- `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
|
| 72 |
-
- `output_dir` -- where TensorDict and the metadata file are saved.
|
| 73 |
-
|
| 74 |
-
Outputs produced in `output_dir`:
|
| 75 |
-
|
| 76 |
-
1. A directory named `vgg-{split}` (i.e., in the TensorDict format), containing
|
| 77 |
-
a. `mean.memmap` mean values predicted by the VAE encoder (number of videos X sequence length X channel size)
|
| 78 |
-
b. `std.memmap` standard deviation values predicted by the VAE encoder (number of videos X sequence length X channel size)
|
| 79 |
-
c. `text_features.memmap` text features extracted from CLIP (number of videos X 77 (sequence length) X 1024)
|
| 80 |
-
d. `clip_features.memmap` clip features extracted from CLIP (number of videos X 64 (8 fps) X 1024)
|
| 81 |
-
e. `sync_features.memmap` synchronization features extracted from Synchformer (number of videos X 192 (24 fps) X 768)
|
| 82 |
-
f. `meta.json` that contains the metadata for the above memory mappings
|
| 83 |
-
2. A tab-separated values file named `vgg-{split}.tsv` that contains two columns: `id` containing video file names without extension, and `label` containing corresponding text labels (i.e., captions)
|
| 84 |
-
|
| 85 |
-
## Preparing Audio-Text Features
|
| 86 |
-
|
| 87 |
-
We have prepared some example data in `training/example_audios`.
|
| 88 |
-
|
| 89 |
-
1. Run `training/partition_clips.py` to partition each audio file into clips (by finding start and end points; we do not save the partitioned audio onto the disk to save disk space)
|
| 90 |
-
2. Run `training/extract_audio_training_latents.py` to extract each clip's audio and text features and save them as a `TensorDict` with a `.tsv` file containing metadata to `output_dir`.
|
| 91 |
-
|
| 92 |
-
### Partitioning the audio files
|
| 93 |
-
|
| 94 |
-
Run
|
| 95 |
-
|
| 96 |
-
```bash
|
| 97 |
-
python training/partition_clips.py
|
| 98 |
-
```
|
| 99 |
-
|
| 100 |
-
Arguments:
|
| 101 |
-
|
| 102 |
-
- `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`)
|
| 103 |
-
- `output_dir` -- path to the output `.csv` file
|
| 104 |
-
- `start` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the beginning of the chunk to be processed
|
| 105 |
-
- `end` -- optional; useful when you need to run multiple processes to speed up processing -- this defines the end of the chunk to be processed
|
| 106 |
-
|
| 107 |
-
### Extracting audio and text features
|
| 108 |
-
|
| 109 |
-
Run
|
| 110 |
-
|
| 111 |
-
```bash
|
| 112 |
-
torchrun --standalone training/extract_audio_training_latents.py
|
| 113 |
-
```
|
| 114 |
-
|
| 115 |
-
You can run this with multiple GPUs (with `--nproc_per_node=<n>`) to speed up extraction.
|
| 116 |
-
Modify the definitions near the top of the script to switch between 16kHz/44.1kHz extraction.
|
| 117 |
-
|
| 118 |
-
Arguments:
|
| 119 |
-
|
| 120 |
-
- `data_dir` -- path to a directory containing the audio files (`.flac` or `.wav`), same as the previous step
|
| 121 |
-
- `captions_tsv` -- path to the captions file, a tab-separated values (tsv) file at least with columns `id` and `caption`
|
| 122 |
-
- `clips_tsv` -- path to the clips file, generated in the last step
|
| 123 |
-
- `latent_dir` -- where intermediate latent outputs are saved. It is safe to delete this directory afterwards.
|
| 124 |
-
- `output_dir` -- where TensorDict and the metadata file are saved.
|
| 125 |
-
|
| 126 |
-
Outputs produced in `output_dir`:
|
| 127 |
-
|
| 128 |
-
1. A directory named `{basename(output_dir)}` (i.e., in the TensorDict format), containing
|
| 129 |
-
a. `mean.memmap` mean values predicted by the VAE encoder (number of audios X sequence length X channel size)
|
| 130 |
-
b. `std.memmap` standard deviation values predicted by the VAE encoder (number of audios X sequence length X channel size)
|
| 131 |
-
c. `text_features.memmap` text features extracted from CLIP (number of audios X 77 (sequence length) X 1024)
|
| 132 |
-
d. `meta.json` that contains the metadata for the above memory mappings
|
| 133 |
-
2. A tab-separated values file named `{basename(output_dir)}.tsv` that contains two columns: `id` containing audio file names without extension, and `label` containing corresponding text labels (i.e., captions)
|
| 134 |
-
|
| 135 |
-
### Reference tsv files (with overlaps removed as mentioned in the paper)
|
| 136 |
-
|
| 137 |
-
The reference tsv files can be found [here](https://github.com/hkchengrex/MMAudio/releases/tag/v0.1).
|
| 138 |
-
|
| 139 |
-
Note that these reference tsv files are the **outputs** of `extract_audio_training_latents.py`, which means the `id` column might contain duplicate entries (one per clip). You can still use it as the `captions_tsv` input though -- the script will handle duplicates gracefully.
|
| 140 |
-
Among these reference tsv files, `audioset_sl.tsv`, `bbcsound.tsv`, and `freesound.tsv` are subsets that are parts of WavCaps. These subsets might be smaller than the original datasets.
|
| 141 |
-
The Clotho data contains both the development set and the validation set.
|
| 142 |
-
|
| 143 |
-
**Update (Mar 9, 2025)**:
|
| 144 |
-
We have uploaded a corrected set of reference tsv files. The previous tsv files contained some (<1%) corrupted captions (i.e., mismatches between audio and caption; see https://github.com/hkchengrex/MMAudio/issues/56). The tsv files for VGGSound are unaffected. The reason for this error is unknown, but I cannot reproduce this error in the latest version of the code. Our pre-trained models are trained with **uncorrected** tsv files. For future training, I recommend using the corrected tsv files.
|
| 145 |
-
|
| 146 |
-
The error statistics are as follows:
|
| 147 |
-
|
| 148 |
-
- AudioCaps (170/43824), 0.39%
|
| 149 |
-
- Freesound: (1670/180636), 0.92%
|
| 150 |
-
- AudioSet: (290/100776), 0.29%
|
| 151 |
-
- BBCSound: (3/29975), 0.01%
|
| 152 |
-
- Clotho: (8/24332), 0.03%
|
| 153 |
-
|
| 154 |
-
## Training on Extracted Features
|
| 155 |
-
|
| 156 |
-
We use Distributed Data Parallel (DDP) for training.
|
| 157 |
-
First, specify the data path in `config/data/base.yaml`. If you used the default parameters in the scripts above to extract features for the example data, the `Example_video` and `Example_audio` items should already be correct.
|
| 158 |
-
|
| 159 |
-
To run training on the example data, use the following command:
|
| 160 |
-
|
| 161 |
-
```bash
|
| 162 |
-
OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=1 train.py exp_id=debug compile=False debug=True example_train=True batch_size=1
|
| 163 |
-
```
|
| 164 |
-
|
| 165 |
-
This will not train a useful model, but it will check if everything is set up correctly.
|
| 166 |
-
|
| 167 |
-
For full training on the base model with two GPUs, use the following command:
|
| 168 |
-
|
| 169 |
-
```bash
|
| 170 |
-
OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=2 train.py exp_id=exp_1 model=small_16k
|
| 171 |
-
```
|
| 172 |
-
|
| 173 |
-
Any outputs from training will be stored in `output/<exp_id>`.
|
| 174 |
-
|
| 175 |
-
More configuration options can be found in `config/base_config.yaml` and `config/train_config.yaml`.
|
| 176 |
-
For the medium and large models, specify `vgg_oversample_rate` to be `3` to reduce overfitting.
|
| 177 |
-
|
| 178 |
-
## Checkpoints
|
| 179 |
-
|
| 180 |
-
Model checkpoints, including optimizer states and the latest EMA weights, are available here: https://huggingface.co/hkchengrex/MMAudio
|
| 181 |
-
|
| 182 |
-
---
|
| 183 |
-
|
| 184 |
-
Godspeed!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/demo.html
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<!-- Google tag (gtag.js) -->
|
| 5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
| 6 |
-
<script>
|
| 7 |
-
window.dataLayer = window.dataLayer || [];
|
| 8 |
-
function gtag(){dataLayer.push(arguments);}
|
| 9 |
-
gtag('js', new Date());
|
| 10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
| 11 |
-
</script>
|
| 12 |
-
|
| 13 |
-
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
|
| 14 |
-
<meta charset="UTF-8">
|
| 15 |
-
<title>MMAudio</title>
|
| 16 |
-
|
| 17 |
-
<link rel="icon" type="image/png" href="images/icon.png">
|
| 18 |
-
|
| 19 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 20 |
-
<!-- CSS only -->
|
| 21 |
-
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
|
| 22 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
| 23 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
|
| 24 |
-
|
| 25 |
-
<link rel="stylesheet" href="style_videos.css">
|
| 26 |
-
</head>
|
| 27 |
-
<body>
|
| 28 |
-
|
| 29 |
-
<div id="moviegen_all">
|
| 30 |
-
<h2 id="moviegen" style="text-align: center;">Supplementary Videos</h2>
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
<div class="row g-1">
|
| 34 |
-
<div class="col-12 col-md-4">
|
| 35 |
-
<div class="video-header" style="font-size: large;">Golf; ground-truth</div>
|
| 36 |
-
<div class="video-container">
|
| 37 |
-
<iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
|
| 38 |
-
</div>
|
| 39 |
-
</div>
|
| 40 |
-
<div class="col-12 col-md-4">
|
| 41 |
-
<div class="video-header" style="font-size: large;">Golf; FoleyCrafter</div>
|
| 42 |
-
<div class="video-container">
|
| 43 |
-
<iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
|
| 44 |
-
</div>
|
| 45 |
-
</div>
|
| 46 |
-
<div class="col-12 col-md-4">
|
| 47 |
-
<div class="video-header" style="font-size: large;">Golf; Ours</div>
|
| 48 |
-
<div class="video-container">
|
| 49 |
-
<iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
|
| 50 |
-
</div>
|
| 51 |
-
</div>
|
| 52 |
-
</div>
|
| 53 |
-
<br>
|
| 54 |
-
|
| 55 |
-
<div class="row g-1">
|
| 56 |
-
<div class="col-12 col-md-4">
|
| 57 |
-
<div class="video-header" style="font-size: large;">Waves; Ours</div>
|
| 58 |
-
<div class="video-container">
|
| 59 |
-
<iframe src="https://youtube.com/embed/7zQzDEuFnfI"></iframe>
|
| 60 |
-
</div>
|
| 61 |
-
</div>
|
| 62 |
-
<div class="col-12 col-md-4">
|
| 63 |
-
<div class="video-header" style="font-size: large;">Featured MMAudio</div>
|
| 64 |
-
<div class="video-container">
|
| 65 |
-
<iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
|
| 66 |
-
</div>
|
| 67 |
-
</div>
|
| 68 |
-
<div class="col-12 col-md-4">
|
| 69 |
-
<div class="video-header" style="font-size: large;">Failure case</div>
|
| 70 |
-
<div class="video-container">
|
| 71 |
-
<iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
|
| 72 |
-
</div>
|
| 73 |
-
</div>
|
| 74 |
-
</div>
|
| 75 |
-
<br>
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
</div>
|
| 79 |
-
|
| 80 |
-
</body>
|
| 81 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/images/icon.png
DELETED
|
Binary file (163 Bytes)
|
|
|
MMAudio/docs/index.html
DELETED
|
@@ -1,156 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<!-- Google tag (gtag.js) -->
|
| 5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
| 6 |
-
<script>
|
| 7 |
-
window.dataLayer = window.dataLayer || [];
|
| 8 |
-
function gtag(){dataLayer.push(arguments);}
|
| 9 |
-
gtag('js', new Date());
|
| 10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
| 11 |
-
</script>
|
| 12 |
-
|
| 13 |
-
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 14 |
-
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 15 |
-
<link href="https://fonts.googleapis.com/css2?family=Source+Sans+3&display=swap" rel="stylesheet">
|
| 16 |
-
<meta charset="UTF-8">
|
| 17 |
-
<title>MMAudio</title>
|
| 18 |
-
|
| 19 |
-
<link rel="icon" type="image/png" href="images/icon.png">
|
| 20 |
-
|
| 21 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 22 |
-
<!-- CSS only -->
|
| 23 |
-
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
|
| 24 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
| 25 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
| 26 |
-
|
| 27 |
-
<link rel="stylesheet" href="style.css">
|
| 28 |
-
</head>
|
| 29 |
-
<body>
|
| 30 |
-
|
| 31 |
-
<body>
|
| 32 |
-
<br><br><br><br>
|
| 33 |
-
<div class="container">
|
| 34 |
-
<div class="row text-center" style="font-size:38px">
|
| 35 |
-
<div class="col strong">
|
| 36 |
-
MMAudio: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis
|
| 37 |
-
</div>
|
| 38 |
-
</div>
|
| 39 |
-
|
| 40 |
-
<br>
|
| 41 |
-
<div class="row text-center" style="font-size:28px">
|
| 42 |
-
<div class="col">
|
| 43 |
-
CVPR 2025
|
| 44 |
-
</div>
|
| 45 |
-
</div>
|
| 46 |
-
<br>
|
| 47 |
-
|
| 48 |
-
<div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
|
| 49 |
-
<div class="col-sm-auto px-lg-2">
|
| 50 |
-
<a href="https://hkchengrex.github.io/">Ho Kei Cheng<sup>1</sup></a>
|
| 51 |
-
</div>
|
| 52 |
-
<div class="col-sm-auto px-lg-2">
|
| 53 |
-
<nobr><a href="https://scholar.google.co.jp/citations?user=RRIO1CcAAAAJ">Masato Ishii<sup>2</sup></a></nobr>
|
| 54 |
-
</div>
|
| 55 |
-
<div class="col-sm-auto px-lg-2">
|
| 56 |
-
<nobr><a href="https://scholar.google.com/citations?user=sXAjHFIAAAAJ">Akio Hayakawa<sup>2</sup></a></nobr>
|
| 57 |
-
</div>
|
| 58 |
-
<div class="col-sm-auto px-lg-2">
|
| 59 |
-
<nobr><a href="https://scholar.google.com/citations?user=XCRO260AAAAJ">Takashi Shibuya<sup>2</sup></a></nobr>
|
| 60 |
-
</div>
|
| 61 |
-
<div class="col-sm-auto px-lg-2">
|
| 62 |
-
<nobr><a href="https://www.alexander-schwing.de/">Alexander Schwing<sup>1</sup></a></nobr>
|
| 63 |
-
</div>
|
| 64 |
-
<div class="col-sm-auto px-lg-2" >
|
| 65 |
-
<nobr><a href="https://www.yukimitsufuji.com/">Yuki Mitsufuji<sup>2,3</sup></a></nobr>
|
| 66 |
-
</div>
|
| 67 |
-
</div>
|
| 68 |
-
|
| 69 |
-
<div class="h-100 row text-center heavy justify-content-md-center" style="font-size:22px;">
|
| 70 |
-
<div class="col-sm-auto px-lg-2">
|
| 71 |
-
<sup>1</sup>University of Illinois Urbana-Champaign
|
| 72 |
-
</div>
|
| 73 |
-
<div class="col-sm-auto px-lg-2">
|
| 74 |
-
<sup>2</sup>Sony AI
|
| 75 |
-
</div>
|
| 76 |
-
<div class="col-sm-auto px-lg-2">
|
| 77 |
-
<sup>3</sup>Sony Group Corporation
|
| 78 |
-
</div>
|
| 79 |
-
</div>
|
| 80 |
-
|
| 81 |
-
<br>
|
| 82 |
-
|
| 83 |
-
<br>
|
| 84 |
-
|
| 85 |
-
<div class="h-100 row text-center justify-content-md-center" style="font-size:20px;">
|
| 86 |
-
<div class="col-sm-2">
|
| 87 |
-
<a href="https://arxiv.org/abs/2412.15322">[Paper]</a>
|
| 88 |
-
</div>
|
| 89 |
-
<div class="col-sm-2">
|
| 90 |
-
<a href="https://github.com/hkchengrex/MMAudio">[Code]</a>
|
| 91 |
-
</div>
|
| 92 |
-
<div class="col-sm-3">
|
| 93 |
-
<a href="https://huggingface.co/spaces/hkchengrex/MMAudio">[Huggingface Demo]</a>
|
| 94 |
-
</div>
|
| 95 |
-
<div class="col-sm-2">
|
| 96 |
-
<a href="https://colab.research.google.com/drive/1TAaXCY2-kPk4xE4PwKB3EqFbSnkUuzZ8?usp=sharing">[Colab Demo]</a>
|
| 97 |
-
</div>
|
| 98 |
-
<div class="col-sm-3">
|
| 99 |
-
<a href="https://replicate.com/zsxkib/mmaudio">[Replicate Demo]</a>
|
| 100 |
-
</div>
|
| 101 |
-
</div>
|
| 102 |
-
|
| 103 |
-
<br>
|
| 104 |
-
|
| 105 |
-
<hr>
|
| 106 |
-
|
| 107 |
-
<div class="row" style="font-size:32px">
|
| 108 |
-
<div class="col strong">
|
| 109 |
-
TL;DR
|
| 110 |
-
</div>
|
| 111 |
-
</div>
|
| 112 |
-
<br>
|
| 113 |
-
<div class="row">
|
| 114 |
-
<div class="col">
|
| 115 |
-
<p class="light" style="text-align: left;">
|
| 116 |
-
MMAudio generates synchronized audio given video and/or text inputs.
|
| 117 |
-
</p>
|
| 118 |
-
|
| 119 |
-
<p>
|
| 120 |
-
Check out this fun video!
|
| 121 |
-
<div class="video-container" style="text-align: center;">
|
| 122 |
-
<iframe src="https://youtube.com/embed/SLz3NWLyHxg"></iframe>
|
| 123 |
-
</div>
|
| 124 |
-
</p>
|
| 125 |
-
</div>
|
| 126 |
-
</div>
|
| 127 |
-
|
| 128 |
-
<br>
|
| 129 |
-
<hr>
|
| 130 |
-
<br>
|
| 131 |
-
|
| 132 |
-
<div class="row" style="font-size:32px">
|
| 133 |
-
<div class="col strong">
|
| 134 |
-
Demo
|
| 135 |
-
</div>
|
| 136 |
-
</div>
|
| 137 |
-
<br>
|
| 138 |
-
<div class="row" style="font-size:48px">
|
| 139 |
-
<div class="col strong text-center">
|
| 140 |
-
<a href="video_main.html" style="text-decoration: underline;"><More results></a>
|
| 141 |
-
</div>
|
| 142 |
-
</div>
|
| 143 |
-
<br>
|
| 144 |
-
<div class="video-container" style="text-align: center;">
|
| 145 |
-
<iframe src="https://youtube.com/embed/YElewUT2M4M"></iframe>
|
| 146 |
-
</div>
|
| 147 |
-
|
| 148 |
-
<br>
|
| 149 |
-
|
| 150 |
-
<br><br>
|
| 151 |
-
<br><br>
|
| 152 |
-
|
| 153 |
-
</div>
|
| 154 |
-
|
| 155 |
-
</body>
|
| 156 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/style.css
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
font-family: 'Source Sans 3', sans-serif;
|
| 3 |
-
font-size: 18px;
|
| 4 |
-
margin-left: auto;
|
| 5 |
-
margin-right: auto;
|
| 6 |
-
font-weight: 400;
|
| 7 |
-
height: 100%;
|
| 8 |
-
max-width: 1000px;
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
table {
|
| 12 |
-
width: 100%;
|
| 13 |
-
border-collapse: collapse;
|
| 14 |
-
}
|
| 15 |
-
th, td {
|
| 16 |
-
border: 1px solid #ddd;
|
| 17 |
-
padding: 8px;
|
| 18 |
-
text-align: center;
|
| 19 |
-
}
|
| 20 |
-
th {
|
| 21 |
-
background-color: #f2f2f2;
|
| 22 |
-
}
|
| 23 |
-
video {
|
| 24 |
-
width: 100%;
|
| 25 |
-
height: auto;
|
| 26 |
-
}
|
| 27 |
-
p {
|
| 28 |
-
font-size: 28px;
|
| 29 |
-
}
|
| 30 |
-
h2 {
|
| 31 |
-
font-size: 36px;
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
.strong {
|
| 35 |
-
font-weight: 700;
|
| 36 |
-
}
|
| 37 |
-
|
| 38 |
-
.light {
|
| 39 |
-
font-weight: 100;
|
| 40 |
-
}
|
| 41 |
-
|
| 42 |
-
.heavy {
|
| 43 |
-
font-weight: 900;
|
| 44 |
-
}
|
| 45 |
-
|
| 46 |
-
.column {
|
| 47 |
-
float: left;
|
| 48 |
-
}
|
| 49 |
-
|
| 50 |
-
a:link,
|
| 51 |
-
a:visited {
|
| 52 |
-
color: #05538f;
|
| 53 |
-
text-decoration: none;
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
a:hover {
|
| 57 |
-
color: #63cbdd;
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
hr {
|
| 61 |
-
border: 0;
|
| 62 |
-
height: 1px;
|
| 63 |
-
background-image: linear-gradient(to right, rgba(0, 0, 0, 0), rgba(0, 0, 0, 0.75), rgba(0, 0, 0, 0));
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
.video-container {
|
| 67 |
-
position: relative;
|
| 68 |
-
padding-bottom: 56.25%; /* 16:9 */
|
| 69 |
-
height: 0;
|
| 70 |
-
}
|
| 71 |
-
|
| 72 |
-
.video-container iframe {
|
| 73 |
-
position: absolute;
|
| 74 |
-
top: 0;
|
| 75 |
-
left: 0;
|
| 76 |
-
width: 100%;
|
| 77 |
-
height: 100%;
|
| 78 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/style_videos.css
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
font-family: 'Source Sans 3', sans-serif;
|
| 3 |
-
font-size: 1.5vh;
|
| 4 |
-
font-weight: 400;
|
| 5 |
-
}
|
| 6 |
-
|
| 7 |
-
table {
|
| 8 |
-
width: 100%;
|
| 9 |
-
border-collapse: collapse;
|
| 10 |
-
}
|
| 11 |
-
th, td {
|
| 12 |
-
border: 1px solid #ddd;
|
| 13 |
-
padding: 8px;
|
| 14 |
-
text-align: center;
|
| 15 |
-
}
|
| 16 |
-
th {
|
| 17 |
-
background-color: #f2f2f2;
|
| 18 |
-
}
|
| 19 |
-
video {
|
| 20 |
-
width: 100%;
|
| 21 |
-
height: auto;
|
| 22 |
-
}
|
| 23 |
-
p {
|
| 24 |
-
font-size: 1.5vh;
|
| 25 |
-
font-weight: bold;
|
| 26 |
-
}
|
| 27 |
-
h2 {
|
| 28 |
-
font-size: 2vh;
|
| 29 |
-
font-weight: bold;
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
.video-container {
|
| 33 |
-
position: relative;
|
| 34 |
-
padding-bottom: 56.25%; /* 16:9 */
|
| 35 |
-
height: 0;
|
| 36 |
-
}
|
| 37 |
-
|
| 38 |
-
.video-container iframe {
|
| 39 |
-
position: absolute;
|
| 40 |
-
top: 0;
|
| 41 |
-
left: 0;
|
| 42 |
-
width: 100%;
|
| 43 |
-
height: 100%;
|
| 44 |
-
}
|
| 45 |
-
|
| 46 |
-
.video-header {
|
| 47 |
-
background-color: #f2f2f2;
|
| 48 |
-
text-align: center;
|
| 49 |
-
font-size: 1.5vh;
|
| 50 |
-
font-weight: bold;
|
| 51 |
-
padding: 8px;
|
| 52 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/video_gen.html
DELETED
|
@@ -1,254 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<!-- Google tag (gtag.js) -->
|
| 5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
| 6 |
-
<script>
|
| 7 |
-
window.dataLayer = window.dataLayer || [];
|
| 8 |
-
function gtag(){dataLayer.push(arguments);}
|
| 9 |
-
gtag('js', new Date());
|
| 10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
| 11 |
-
</script>
|
| 12 |
-
|
| 13 |
-
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
|
| 14 |
-
<meta charset="UTF-8">
|
| 15 |
-
<title>MMAudio</title>
|
| 16 |
-
|
| 17 |
-
<link rel="icon" type="image/png" href="images/icon.png">
|
| 18 |
-
|
| 19 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 20 |
-
<!-- CSS only -->
|
| 21 |
-
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
|
| 22 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
| 23 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
|
| 24 |
-
|
| 25 |
-
<link rel="stylesheet" href="style_videos.css">
|
| 26 |
-
</head>
|
| 27 |
-
<body>
|
| 28 |
-
|
| 29 |
-
<div id="moviegen_all">
|
| 30 |
-
<h2 id="moviegen" style="text-align: center;">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</h2>
|
| 31 |
-
<p id="moviegen1" style="overflow: hidden;">
|
| 32 |
-
Example 1: Ice cracking with sharp snapping sound, and metal tool scraping against the ice surface.
|
| 33 |
-
<span style="float: right;"><a href="#index">Back to index</a></span>
|
| 34 |
-
</p>
|
| 35 |
-
|
| 36 |
-
<div class="row g-1">
|
| 37 |
-
<div class="col-sm-6">
|
| 38 |
-
<div class="video-header">Movie Gen Audio</div>
|
| 39 |
-
<div class="video-container">
|
| 40 |
-
<iframe src="https://youtube.com/embed/d7Lb0ihtGcE"></iframe>
|
| 41 |
-
</div>
|
| 42 |
-
</div>
|
| 43 |
-
<div class="col-sm-6">
|
| 44 |
-
<div class="video-header">Ours</div>
|
| 45 |
-
<div class="video-container">
|
| 46 |
-
<iframe src="https://youtube.com/embed/F4JoJ2r2m8U"></iframe>
|
| 47 |
-
</div>
|
| 48 |
-
</div>
|
| 49 |
-
</div>
|
| 50 |
-
<br>
|
| 51 |
-
|
| 52 |
-
<!-- <p id="moviegen2">Example 2: Rhythmic splashing and lapping of water. <span style="float:right;"><a href="#index">Back to index</a></span> </p>
|
| 53 |
-
|
| 54 |
-
<table>
|
| 55 |
-
<thead>
|
| 56 |
-
<tr>
|
| 57 |
-
<th>Movie Gen Audio</th>
|
| 58 |
-
<th>Ours</th>
|
| 59 |
-
</tr>
|
| 60 |
-
</thead>
|
| 61 |
-
<tbody>
|
| 62 |
-
<tr>
|
| 63 |
-
<td width="50%">
|
| 64 |
-
<div class="video-container">
|
| 65 |
-
<iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
|
| 66 |
-
</div>
|
| 67 |
-
</td>
|
| 68 |
-
<td width="50%">
|
| 69 |
-
<div class="video-container">
|
| 70 |
-
<iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
|
| 71 |
-
</div>
|
| 72 |
-
</td>
|
| 73 |
-
</tr>
|
| 74 |
-
</tbody>
|
| 75 |
-
</table> -->
|
| 76 |
-
|
| 77 |
-
<p id="moviegen2" style="overflow: hidden;">
|
| 78 |
-
Example 2: Rhythmic splashing and lapping of water.
|
| 79 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 80 |
-
</p>
|
| 81 |
-
<div class="row g-1">
|
| 82 |
-
<div class="col-sm-6">
|
| 83 |
-
<div class="video-header">Movie Gen Audio</div>
|
| 84 |
-
<div class="video-container">
|
| 85 |
-
<iframe src="https://youtube.com/embed/5gQNPK99CIk"></iframe>
|
| 86 |
-
</div>
|
| 87 |
-
</div>
|
| 88 |
-
<div class="col-sm-6">
|
| 89 |
-
<div class="video-header">Ours</div>
|
| 90 |
-
<div class="video-container">
|
| 91 |
-
<iframe src="https://youtube.com/embed/AbwnTzG-BpA"></iframe>
|
| 92 |
-
</div>
|
| 93 |
-
</div>
|
| 94 |
-
</div>
|
| 95 |
-
<br>
|
| 96 |
-
|
| 97 |
-
<p id="moviegen3" style="overflow: hidden;">
|
| 98 |
-
Example 3: Shovel scrapes against dry earth.
|
| 99 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 100 |
-
</p>
|
| 101 |
-
<div class="row g-1">
|
| 102 |
-
<div class="col-sm-6">
|
| 103 |
-
<div class="video-header">Movie Gen Audio</div>
|
| 104 |
-
<div class="video-container">
|
| 105 |
-
<iframe src="https://youtube.com/embed/PUKGyEve7XQ"></iframe>
|
| 106 |
-
</div>
|
| 107 |
-
</div>
|
| 108 |
-
<div class="col-sm-6">
|
| 109 |
-
<div class="video-header">Ours</div>
|
| 110 |
-
<div class="video-container">
|
| 111 |
-
<iframe src="https://youtube.com/embed/CNn7i8VNkdc"></iframe>
|
| 112 |
-
</div>
|
| 113 |
-
</div>
|
| 114 |
-
</div>
|
| 115 |
-
<br>
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
<p id="moviegen4" style="overflow: hidden;">
|
| 119 |
-
(Failure case) Example 4: Creamy sound of mashed potatoes being scooped.
|
| 120 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 121 |
-
</p>
|
| 122 |
-
<div class="row g-1">
|
| 123 |
-
<div class="col-sm-6">
|
| 124 |
-
<div class="video-header">Movie Gen Audio</div>
|
| 125 |
-
<div class="video-container">
|
| 126 |
-
<iframe src="https://youtube.com/embed/PJv1zxR9JjQ"></iframe>
|
| 127 |
-
</div>
|
| 128 |
-
</div>
|
| 129 |
-
<div class="col-sm-6">
|
| 130 |
-
<div class="video-header">Ours</div>
|
| 131 |
-
<div class="video-container">
|
| 132 |
-
<iframe src="https://youtube.com/embed/c3-LJ1lNsPQ"></iframe>
|
| 133 |
-
</div>
|
| 134 |
-
</div>
|
| 135 |
-
</div>
|
| 136 |
-
<br>
|
| 137 |
-
|
| 138 |
-
</div>
|
| 139 |
-
|
| 140 |
-
<div id="hunyuan_sora_all">
|
| 141 |
-
|
| 142 |
-
<h2 id="hunyuan" style="text-align: center;">Results on Videos Generated by Hunyuan</h2>
|
| 143 |
-
<p style="overflow: hidden;">
|
| 144 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 145 |
-
</p>
|
| 146 |
-
<div class="row g-1">
|
| 147 |
-
<div class="col-sm-6">
|
| 148 |
-
<div class="video-header">Typing</div>
|
| 149 |
-
<div class="video-container">
|
| 150 |
-
<iframe src="https://youtube.com/embed/8ln_9hhH_nk"></iframe>
|
| 151 |
-
</div>
|
| 152 |
-
</div>
|
| 153 |
-
<div class="col-sm-6">
|
| 154 |
-
<div class="video-header">Water is rushing down a stream and pouring</div>
|
| 155 |
-
<div class="video-container">
|
| 156 |
-
<iframe src="https://youtube.com/embed/5df1FZFQj30"></iframe>
|
| 157 |
-
</div>
|
| 158 |
-
</div>
|
| 159 |
-
</div>
|
| 160 |
-
<div class="row g-1">
|
| 161 |
-
<div class="col-sm-6">
|
| 162 |
-
<div class="video-header">Waves on beach</div>
|
| 163 |
-
<div class="video-container">
|
| 164 |
-
<iframe src="https://youtube.com/embed/7wQ9D5WgpFc"></iframe>
|
| 165 |
-
</div>
|
| 166 |
-
</div>
|
| 167 |
-
<div class="col-sm-6">
|
| 168 |
-
<div class="video-header">Water droplet</div>
|
| 169 |
-
<div class="video-container">
|
| 170 |
-
<iframe src="https://youtube.com/embed/q7M2nsalGjM"></iframe>
|
| 171 |
-
</div>
|
| 172 |
-
</div>
|
| 173 |
-
</div>
|
| 174 |
-
<br>
|
| 175 |
-
|
| 176 |
-
<h2 id="sora" style="text-align: center;">Results on Videos Generated by Sora</h2>
|
| 177 |
-
<p style="overflow: hidden;">
|
| 178 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 179 |
-
</p>
|
| 180 |
-
<div class="row g-1">
|
| 181 |
-
<div class="col-sm-6">
|
| 182 |
-
<div class="video-header">Ships riding waves</div>
|
| 183 |
-
<div class="video-container">
|
| 184 |
-
<iframe src="https://youtube.com/embed/JbgQzHHytk8"></iframe>
|
| 185 |
-
</div>
|
| 186 |
-
</div>
|
| 187 |
-
<div class="col-sm-6">
|
| 188 |
-
<div class="video-header">Train (no text prompt given)</div>
|
| 189 |
-
<div class="video-container">
|
| 190 |
-
<iframe src="https://youtube.com/embed/xOW7zrjpWC8"></iframe>
|
| 191 |
-
</div>
|
| 192 |
-
</div>
|
| 193 |
-
</div>
|
| 194 |
-
<div class="row g-1">
|
| 195 |
-
<div class="col-sm-6">
|
| 196 |
-
<div class="video-header">Seashore (no text prompt given)</div>
|
| 197 |
-
<div class="video-container">
|
| 198 |
-
<iframe src="https://youtube.com/embed/fIuw5Y8ZZ9E"></iframe>
|
| 199 |
-
</div>
|
| 200 |
-
</div>
|
| 201 |
-
<div class="col-sm-6">
|
| 202 |
-
<div class="video-header">Surfing (failure: unprompted music)</div>
|
| 203 |
-
<div class="video-container">
|
| 204 |
-
<iframe src="https://youtube.com/embed/UcSTk-v0M_s"></iframe>
|
| 205 |
-
</div>
|
| 206 |
-
</div>
|
| 207 |
-
</div>
|
| 208 |
-
<br>
|
| 209 |
-
|
| 210 |
-
<div id="mochi_ltx_all">
|
| 211 |
-
<h2 id="mochi" style="text-align: center;">Results on Videos Generated by Mochi 1</h2>
|
| 212 |
-
<p style="overflow: hidden;">
|
| 213 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 214 |
-
</p>
|
| 215 |
-
<div class="row g-1">
|
| 216 |
-
<div class="col-sm-6">
|
| 217 |
-
<div class="video-header">Magical fire and lightning (no text prompt given)</div>
|
| 218 |
-
<div class="video-container">
|
| 219 |
-
<iframe src="https://youtube.com/embed/tTlRZaSMNwY"></iframe>
|
| 220 |
-
</div>
|
| 221 |
-
</div>
|
| 222 |
-
<div class="col-sm-6">
|
| 223 |
-
<div class="video-header">Storm (no text prompt given)</div>
|
| 224 |
-
<div class="video-container">
|
| 225 |
-
<iframe src="https://youtube.com/embed/4hrZTMJUy3w"></iframe>
|
| 226 |
-
</div>
|
| 227 |
-
</div>
|
| 228 |
-
</div>
|
| 229 |
-
<br>
|
| 230 |
-
|
| 231 |
-
<h2 id="ltx" style="text-align: center;">Results on Videos Generated by LTX-Video</h2>
|
| 232 |
-
<p style="overflow: hidden;">
|
| 233 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 234 |
-
</p>
|
| 235 |
-
<div class="row g-1">
|
| 236 |
-
<div class="col-sm-6">
|
| 237 |
-
<div class="video-header">Firewood burning and cracking</div>
|
| 238 |
-
<div class="video-container">
|
| 239 |
-
<iframe src="https://youtube.com/embed/P7_DDpgev0g"></iframe>
|
| 240 |
-
</div>
|
| 241 |
-
</div>
|
| 242 |
-
<div class="col-sm-6">
|
| 243 |
-
<div class="video-header">Waterfall, water splashing</div>
|
| 244 |
-
<div class="video-container">
|
| 245 |
-
<iframe src="https://youtube.com/embed/4MvjceYnIO0"></iframe>
|
| 246 |
-
</div>
|
| 247 |
-
</div>
|
| 248 |
-
</div>
|
| 249 |
-
<br>
|
| 250 |
-
|
| 251 |
-
</div>
|
| 252 |
-
|
| 253 |
-
</body>
|
| 254 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/video_main.html
DELETED
|
@@ -1,98 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<!-- Google tag (gtag.js) -->
|
| 5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
| 6 |
-
<script>
|
| 7 |
-
window.dataLayer = window.dataLayer || [];
|
| 8 |
-
function gtag(){dataLayer.push(arguments);}
|
| 9 |
-
gtag('js', new Date());
|
| 10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
| 11 |
-
</script>
|
| 12 |
-
|
| 13 |
-
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
|
| 14 |
-
<meta charset="UTF-8">
|
| 15 |
-
<title>MMAudio</title>
|
| 16 |
-
|
| 17 |
-
<link rel="icon" type="image/png" href="images/icon.png">
|
| 18 |
-
|
| 19 |
-
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">
|
| 20 |
-
<!-- CSS only -->
|
| 21 |
-
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
|
| 22 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
| 23 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
|
| 24 |
-
|
| 25 |
-
<link rel="stylesheet" href="style_videos.css">
|
| 26 |
-
|
| 27 |
-
<script type="text/javascript">
|
| 28 |
-
$(document).ready(function(){
|
| 29 |
-
$("#content").load("video_gen.html #moviegen_all");
|
| 30 |
-
$("#load_moveigen").click(function(){
|
| 31 |
-
$("#content").load("video_gen.html #moviegen_all");
|
| 32 |
-
});
|
| 33 |
-
$("#load_hunyuan_sora").click(function(){
|
| 34 |
-
$("#content").load("video_gen.html #hunyuan_sora_all");
|
| 35 |
-
});
|
| 36 |
-
$("#load_mochi_ltx").click(function(){
|
| 37 |
-
$("#content").load("video_gen.html #mochi_ltx_all");
|
| 38 |
-
});
|
| 39 |
-
$("#load_vgg1").click(function(){
|
| 40 |
-
$("#content").load("video_vgg.html #vgg1");
|
| 41 |
-
});
|
| 42 |
-
$("#load_vgg2").click(function(){
|
| 43 |
-
$("#content").load("video_vgg.html #vgg2");
|
| 44 |
-
});
|
| 45 |
-
$("#load_vgg3").click(function(){
|
| 46 |
-
$("#content").load("video_vgg.html #vgg3");
|
| 47 |
-
});
|
| 48 |
-
$("#load_vgg4").click(function(){
|
| 49 |
-
$("#content").load("video_vgg.html #vgg4");
|
| 50 |
-
});
|
| 51 |
-
$("#load_vgg5").click(function(){
|
| 52 |
-
$("#content").load("video_vgg.html #vgg5");
|
| 53 |
-
});
|
| 54 |
-
$("#load_vgg6").click(function(){
|
| 55 |
-
$("#content").load("video_vgg.html #vgg6");
|
| 56 |
-
});
|
| 57 |
-
$("#load_vgg_extra").click(function(){
|
| 58 |
-
$("#content").load("video_vgg.html #vgg_extra");
|
| 59 |
-
});
|
| 60 |
-
});
|
| 61 |
-
</script>
|
| 62 |
-
</head>
|
| 63 |
-
<body>
|
| 64 |
-
<h1 id="index" style="text-align: center;">Index</h1>
|
| 65 |
-
<p><b>(Click on the links to load the corresponding videos)</b> <span style="float:right;"><a href="index.html">Back to project page</a></span></p>
|
| 66 |
-
|
| 67 |
-
<ol>
|
| 68 |
-
<li>
|
| 69 |
-
<a href="#" id="load_moveigen">Comparisons with Movie Gen Audio on Videos Generated by MovieGen</a>
|
| 70 |
-
</li>
|
| 71 |
-
<li>
|
| 72 |
-
<a href="#" id="load_hunyuan_sora">Results on Videos Generated by Hunyuan and Sora</a>
|
| 73 |
-
</li>
|
| 74 |
-
<li>
|
| 75 |
-
<a href="#" id="load_mochi_ltx">Results on Videos Generated by Mochi 1 and LTX-Video</a>
|
| 76 |
-
</li>
|
| 77 |
-
<li>
|
| 78 |
-
On VGGSound
|
| 79 |
-
<ol>
|
| 80 |
-
<li><a id='load_vgg1' href="#">Example 1: Wolf howling</a></li>
|
| 81 |
-
<li><a id='load_vgg2' href="#">Example 2: Striking a golf ball</a></li>
|
| 82 |
-
<li><a id='load_vgg3' href="#">Example 3: Hitting a drum</a></li>
|
| 83 |
-
<li><a id='load_vgg4' href="#">Example 4: Dog barking</a></li>
|
| 84 |
-
<li><a id='load_vgg5' href="#">Example 5: Playing a string instrument</a></li>
|
| 85 |
-
<li><a id='load_vgg6' href="#">Example 6: A group of people playing tambourines</a></li>
|
| 86 |
-
<li><a id='load_vgg_extra' href="#">Extra results & failure cases</a></li>
|
| 87 |
-
</ol>
|
| 88 |
-
</li>
|
| 89 |
-
</ol>
|
| 90 |
-
|
| 91 |
-
<div id="content" class="container-fluid">
|
| 92 |
-
|
| 93 |
-
</div>
|
| 94 |
-
<br>
|
| 95 |
-
<br>
|
| 96 |
-
|
| 97 |
-
</body>
|
| 98 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/docs/video_vgg.html
DELETED
|
@@ -1,452 +0,0 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<!-- Google tag (gtag.js) -->
|
| 5 |
-
<script async src="https://www.googletagmanager.com/gtag/js?id=G-0JKBJ3WRJZ"></script>
|
| 6 |
-
<script>
|
| 7 |
-
window.dataLayer = window.dataLayer || [];
|
| 8 |
-
function gtag(){dataLayer.push(arguments);}
|
| 9 |
-
gtag('js', new Date());
|
| 10 |
-
gtag('config', 'G-0JKBJ3WRJZ');
|
| 11 |
-
</script>
|
| 12 |
-
|
| 13 |
-
<link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
|
| 14 |
-
<meta charset="UTF-8">
|
| 15 |
-
<title>MMAudio</title>
|
| 16 |
-
|
| 17 |
-
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 18 |
-
<!-- CSS only -->
|
| 19 |
-
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet"
|
| 20 |
-
integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
|
| 21 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
| 22 |
-
|
| 23 |
-
<link rel="stylesheet" href="style_videos.css">
|
| 24 |
-
</head>
|
| 25 |
-
<body>
|
| 26 |
-
|
| 27 |
-
<div id="vgg1">
|
| 28 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 29 |
-
<p style="overflow: hidden;">
|
| 30 |
-
Example 1: Wolf howling.
|
| 31 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 32 |
-
</p>
|
| 33 |
-
<div class="row g-1">
|
| 34 |
-
<div class="col-sm-3">
|
| 35 |
-
<div class="video-header">Ground-truth</div>
|
| 36 |
-
<div class="video-container">
|
| 37 |
-
<iframe src="https://youtube.com/embed/9J_V74gqMUA"></iframe>
|
| 38 |
-
</div>
|
| 39 |
-
</div>
|
| 40 |
-
<div class="col-sm-3">
|
| 41 |
-
<div class="video-header">Ours</div>
|
| 42 |
-
<div class="video-container">
|
| 43 |
-
<iframe src="https://youtube.com/embed/P6O8IpjErPc"></iframe>
|
| 44 |
-
</div>
|
| 45 |
-
</div>
|
| 46 |
-
<div class="col-sm-3">
|
| 47 |
-
<div class="video-header">V2A-Mapper</div>
|
| 48 |
-
<div class="video-container">
|
| 49 |
-
<iframe src="https://youtube.com/embed/w-5eyqepvTk"></iframe>
|
| 50 |
-
</div>
|
| 51 |
-
</div>
|
| 52 |
-
<div class="col-sm-3">
|
| 53 |
-
<div class="video-header">FoleyCrafter</div>
|
| 54 |
-
<div class="video-container">
|
| 55 |
-
<iframe src="https://youtube.com/embed/VOLfoZlRkzo"></iframe>
|
| 56 |
-
</div>
|
| 57 |
-
</div>
|
| 58 |
-
</div>
|
| 59 |
-
<div class="row g-1">
|
| 60 |
-
<div class="col-sm-3">
|
| 61 |
-
<div class="video-header">Frieren</div>
|
| 62 |
-
<div class="video-container">
|
| 63 |
-
<iframe src="https://youtube.com/embed/49owKyA5Pa8"></iframe>
|
| 64 |
-
</div>
|
| 65 |
-
</div>
|
| 66 |
-
<div class="col-sm-3">
|
| 67 |
-
<div class="video-header">VATT</div>
|
| 68 |
-
<div class="video-container">
|
| 69 |
-
<iframe src="https://youtube.com/embed/QVtrFgbeGDM"></iframe>
|
| 70 |
-
</div>
|
| 71 |
-
</div>
|
| 72 |
-
<div class="col-sm-3">
|
| 73 |
-
<div class="video-header">V-AURA</div>
|
| 74 |
-
<div class="video-container">
|
| 75 |
-
<iframe src="https://youtube.com/embed/8r0uEfSNjvI"></iframe>
|
| 76 |
-
</div>
|
| 77 |
-
</div>
|
| 78 |
-
<div class="col-sm-3">
|
| 79 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 80 |
-
<div class="video-container">
|
| 81 |
-
<iframe src="https://youtube.com/embed/bn-sLg2qulk"></iframe>
|
| 82 |
-
</div>
|
| 83 |
-
</div>
|
| 84 |
-
</div>
|
| 85 |
-
</div>
|
| 86 |
-
|
| 87 |
-
<div id="vgg2">
|
| 88 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 89 |
-
<p style="overflow: hidden;">
|
| 90 |
-
Example 2: Striking a golf ball.
|
| 91 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 92 |
-
</p>
|
| 93 |
-
|
| 94 |
-
<div class="row g-1">
|
| 95 |
-
<div class="col-sm-3">
|
| 96 |
-
<div class="video-header">Ground-truth</div>
|
| 97 |
-
<div class="video-container">
|
| 98 |
-
<iframe src="https://youtube.com/embed/1hwSu42kkho"></iframe>
|
| 99 |
-
</div>
|
| 100 |
-
</div>
|
| 101 |
-
<div class="col-sm-3">
|
| 102 |
-
<div class="video-header">Ours</div>
|
| 103 |
-
<div class="video-container">
|
| 104 |
-
<iframe src="https://youtube.com/embed/kZibDoDCNxI"></iframe>
|
| 105 |
-
</div>
|
| 106 |
-
</div>
|
| 107 |
-
<div class="col-sm-3">
|
| 108 |
-
<div class="video-header">V2A-Mapper</div>
|
| 109 |
-
<div class="video-container">
|
| 110 |
-
<iframe src="https://youtube.com/embed/jgKfLBLhh7Y"></iframe>
|
| 111 |
-
</div>
|
| 112 |
-
</div>
|
| 113 |
-
<div class="col-sm-3">
|
| 114 |
-
<div class="video-header">FoleyCrafter</div>
|
| 115 |
-
<div class="video-container">
|
| 116 |
-
<iframe src="https://youtube.com/embed/Lfsx8mOPcJo"></iframe>
|
| 117 |
-
</div>
|
| 118 |
-
</div>
|
| 119 |
-
</div>
|
| 120 |
-
<div class="row g-1">
|
| 121 |
-
<div class="col-sm-3">
|
| 122 |
-
<div class="video-header">Frieren</div>
|
| 123 |
-
<div class="video-container">
|
| 124 |
-
<iframe src="https://youtube.com/embed/tz-LpbB0MBc"></iframe>
|
| 125 |
-
</div>
|
| 126 |
-
</div>
|
| 127 |
-
<div class="col-sm-3">
|
| 128 |
-
<div class="video-header">VATT</div>
|
| 129 |
-
<div class="video-container">
|
| 130 |
-
<iframe src="https://youtube.com/embed/RTDUHMi08n4"></iframe>
|
| 131 |
-
</div>
|
| 132 |
-
</div>
|
| 133 |
-
<div class="col-sm-3">
|
| 134 |
-
<div class="video-header">V-AURA</div>
|
| 135 |
-
<div class="video-container">
|
| 136 |
-
<iframe src="https://youtube.com/embed/N-3TDOsPnZQ"></iframe>
|
| 137 |
-
</div>
|
| 138 |
-
</div>
|
| 139 |
-
<div class="col-sm-3">
|
| 140 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 141 |
-
<div class="video-container">
|
| 142 |
-
<iframe src="https://youtube.com/embed/QnsHnLn4gB0"></iframe>
|
| 143 |
-
</div>
|
| 144 |
-
</div>
|
| 145 |
-
</div>
|
| 146 |
-
</div>
|
| 147 |
-
|
| 148 |
-
<div id="vgg3">
|
| 149 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 150 |
-
<p style="overflow: hidden;">
|
| 151 |
-
Example 3: Hitting a drum.
|
| 152 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 153 |
-
</p>
|
| 154 |
-
|
| 155 |
-
<div class="row g-1">
|
| 156 |
-
<div class="col-sm-3">
|
| 157 |
-
<div class="video-header">Ground-truth</div>
|
| 158 |
-
<div class="video-container">
|
| 159 |
-
<iframe src="https://youtube.com/embed/0oeIwq77w0Q"></iframe>
|
| 160 |
-
</div>
|
| 161 |
-
</div>
|
| 162 |
-
<div class="col-sm-3">
|
| 163 |
-
<div class="video-header">Ours</div>
|
| 164 |
-
<div class="video-container">
|
| 165 |
-
<iframe src="https://youtube.com/embed/-UtPV9ohuIM"></iframe>
|
| 166 |
-
</div>
|
| 167 |
-
</div>
|
| 168 |
-
<div class="col-sm-3">
|
| 169 |
-
<div class="video-header">V2A-Mapper</div>
|
| 170 |
-
<div class="video-container">
|
| 171 |
-
<iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
|
| 172 |
-
</div>
|
| 173 |
-
</div>
|
| 174 |
-
<div class="col-sm-3">
|
| 175 |
-
<div class="video-header">FoleyCrafter</div>
|
| 176 |
-
<div class="video-container">
|
| 177 |
-
<iframe src="https://youtube.com/embed/kkCsXPOlBvY"></iframe>
|
| 178 |
-
</div>
|
| 179 |
-
</div>
|
| 180 |
-
</div>
|
| 181 |
-
<div class="row g-1">
|
| 182 |
-
<div class="col-sm-3">
|
| 183 |
-
<div class="video-header">Frieren</div>
|
| 184 |
-
<div class="video-container">
|
| 185 |
-
<iframe src="https://youtube.com/embed/MbNKsVsuvig"></iframe>
|
| 186 |
-
</div>
|
| 187 |
-
</div>
|
| 188 |
-
<div class="col-sm-3">
|
| 189 |
-
<div class="video-header">VATT</div>
|
| 190 |
-
<div class="video-container">
|
| 191 |
-
<iframe src="https://youtube.com/embed/2yYviBjrpBw"></iframe>
|
| 192 |
-
</div>
|
| 193 |
-
</div>
|
| 194 |
-
<div class="col-sm-3">
|
| 195 |
-
<div class="video-header">V-AURA</div>
|
| 196 |
-
<div class="video-container">
|
| 197 |
-
<iframe src="https://youtube.com/embed/9yivkgN-zwc"></iframe>
|
| 198 |
-
</div>
|
| 199 |
-
</div>
|
| 200 |
-
<div class="col-sm-3">
|
| 201 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 202 |
-
<div class="video-container">
|
| 203 |
-
<iframe src="https://youtube.com/embed/6dnyQt4Fuhs"></iframe>
|
| 204 |
-
</div>
|
| 205 |
-
</div>
|
| 206 |
-
</div>
|
| 207 |
-
</div>
|
| 208 |
-
</div>
|
| 209 |
-
|
| 210 |
-
<div id="vgg4">
|
| 211 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 212 |
-
<p style="overflow: hidden;">
|
| 213 |
-
Example 4: Dog barking.
|
| 214 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 215 |
-
</p>
|
| 216 |
-
|
| 217 |
-
<div class="row g-1">
|
| 218 |
-
<div class="col-sm-3">
|
| 219 |
-
<div class="video-header">Ground-truth</div>
|
| 220 |
-
<div class="video-container">
|
| 221 |
-
<iframe src="https://youtube.com/embed/ckaqvTyMYAw"></iframe>
|
| 222 |
-
</div>
|
| 223 |
-
</div>
|
| 224 |
-
<div class="col-sm-3">
|
| 225 |
-
<div class="video-header">Ours</div>
|
| 226 |
-
<div class="video-container">
|
| 227 |
-
<iframe src="https://youtube.com/embed/_aRndFZzZ-I"></iframe>
|
| 228 |
-
</div>
|
| 229 |
-
</div>
|
| 230 |
-
<div class="col-sm-3">
|
| 231 |
-
<div class="video-header">V2A-Mapper</div>
|
| 232 |
-
<div class="video-container">
|
| 233 |
-
<iframe src="https://youtube.com/embed/mNCISP3LBl0"></iframe>
|
| 234 |
-
</div>
|
| 235 |
-
</div>
|
| 236 |
-
<div class="col-sm-3">
|
| 237 |
-
<div class="video-header">FoleyCrafter</div>
|
| 238 |
-
<div class="video-container">
|
| 239 |
-
<iframe src="https://youtube.com/embed/phZBQ3L7foE"></iframe>
|
| 240 |
-
</div>
|
| 241 |
-
</div>
|
| 242 |
-
</div>
|
| 243 |
-
<div class="row g-1">
|
| 244 |
-
<div class="col-sm-3">
|
| 245 |
-
<div class="video-header">Frieren</div>
|
| 246 |
-
<div class="video-container">
|
| 247 |
-
<iframe src="https://youtube.com/embed/Sb5Mg1-ORao"></iframe>
|
| 248 |
-
</div>
|
| 249 |
-
</div>
|
| 250 |
-
<div class="col-sm-3">
|
| 251 |
-
<div class="video-header">VATT</div>
|
| 252 |
-
<div class="video-container">
|
| 253 |
-
<iframe src="https://youtube.com/embed/eHmAGOmtDDg"></iframe>
|
| 254 |
-
</div>
|
| 255 |
-
</div>
|
| 256 |
-
<div class="col-sm-3">
|
| 257 |
-
<div class="video-header">V-AURA</div>
|
| 258 |
-
<div class="video-container">
|
| 259 |
-
<iframe src="https://youtube.com/embed/NEGa3krBrm0"></iframe>
|
| 260 |
-
</div>
|
| 261 |
-
</div>
|
| 262 |
-
<div class="col-sm-3">
|
| 263 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 264 |
-
<div class="video-container">
|
| 265 |
-
<iframe src="https://youtube.com/embed/aO0EAXlwE7A"></iframe>
|
| 266 |
-
</div>
|
| 267 |
-
</div>
|
| 268 |
-
</div>
|
| 269 |
-
</div>
|
| 270 |
-
|
| 271 |
-
<div id="vgg5">
|
| 272 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 273 |
-
<p style="overflow: hidden;">
|
| 274 |
-
Example 5: Playing a string instrument.
|
| 275 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 276 |
-
</p>
|
| 277 |
-
|
| 278 |
-
<div class="row g-1">
|
| 279 |
-
<div class="col-sm-3">
|
| 280 |
-
<div class="video-header">Ground-truth</div>
|
| 281 |
-
<div class="video-container">
|
| 282 |
-
<iframe src="https://youtube.com/embed/KP1QhWauIOc"></iframe>
|
| 283 |
-
</div>
|
| 284 |
-
</div>
|
| 285 |
-
<div class="col-sm-3">
|
| 286 |
-
<div class="video-header">Ours</div>
|
| 287 |
-
<div class="video-container">
|
| 288 |
-
<iframe src="https://youtube.com/embed/ovaJhWSquYE"></iframe>
|
| 289 |
-
</div>
|
| 290 |
-
</div>
|
| 291 |
-
<div class="col-sm-3">
|
| 292 |
-
<div class="video-header">V2A-Mapper</div>
|
| 293 |
-
<div class="video-container">
|
| 294 |
-
<iframe src="https://youtube.com/embed/N723FS9lcy8"></iframe>
|
| 295 |
-
</div>
|
| 296 |
-
</div>
|
| 297 |
-
<div class="col-sm-3">
|
| 298 |
-
<div class="video-header">FoleyCrafter</div>
|
| 299 |
-
<div class="video-container">
|
| 300 |
-
<iframe src="https://youtube.com/embed/t0N4ZAAXo58"></iframe>
|
| 301 |
-
</div>
|
| 302 |
-
</div>
|
| 303 |
-
</div>
|
| 304 |
-
<div class="row g-1">
|
| 305 |
-
<div class="col-sm-3">
|
| 306 |
-
<div class="video-header">Frieren</div>
|
| 307 |
-
<div class="video-container">
|
| 308 |
-
<iframe src="https://youtube.com/embed/8YSRs03QNNA"></iframe>
|
| 309 |
-
</div>
|
| 310 |
-
</div>
|
| 311 |
-
<div class="col-sm-3">
|
| 312 |
-
<div class="video-header">VATT</div>
|
| 313 |
-
<div class="video-container">
|
| 314 |
-
<iframe src="https://youtube.com/embed/vOpMz55J1kY"></iframe>
|
| 315 |
-
</div>
|
| 316 |
-
</div>
|
| 317 |
-
<div class="col-sm-3">
|
| 318 |
-
<div class="video-header">V-AURA</div>
|
| 319 |
-
<div class="video-container">
|
| 320 |
-
<iframe src="https://youtube.com/embed/9JHC75vr9h0"></iframe>
|
| 321 |
-
</div>
|
| 322 |
-
</div>
|
| 323 |
-
<div class="col-sm-3">
|
| 324 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 325 |
-
<div class="video-container">
|
| 326 |
-
<iframe src="https://youtube.com/embed/9w0JckNzXmY"></iframe>
|
| 327 |
-
</div>
|
| 328 |
-
</div>
|
| 329 |
-
</div>
|
| 330 |
-
</div>
|
| 331 |
-
|
| 332 |
-
<div id="vgg6">
|
| 333 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 334 |
-
<p style="overflow: hidden;">
|
| 335 |
-
Example 6: A group of people playing tambourines.
|
| 336 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 337 |
-
</p>
|
| 338 |
-
|
| 339 |
-
<div class="row g-1">
|
| 340 |
-
<div class="col-sm-3">
|
| 341 |
-
<div class="video-header">Ground-truth</div>
|
| 342 |
-
<div class="video-container">
|
| 343 |
-
<iframe src="https://youtube.com/embed/mx6JLxzUkRc"></iframe>
|
| 344 |
-
</div>
|
| 345 |
-
</div>
|
| 346 |
-
<div class="col-sm-3">
|
| 347 |
-
<div class="video-header">Ours</div>
|
| 348 |
-
<div class="video-container">
|
| 349 |
-
<iframe src="https://youtube.com/embed/oLirHhP9Su8"></iframe>
|
| 350 |
-
</div>
|
| 351 |
-
</div>
|
| 352 |
-
<div class="col-sm-3">
|
| 353 |
-
<div class="video-header">V2A-Mapper</div>
|
| 354 |
-
<div class="video-container">
|
| 355 |
-
<iframe src="https://youtube.com/embed/HkLkHMqptv0"></iframe>
|
| 356 |
-
</div>
|
| 357 |
-
</div>
|
| 358 |
-
<div class="col-sm-3">
|
| 359 |
-
<div class="video-header">FoleyCrafter</div>
|
| 360 |
-
<div class="video-container">
|
| 361 |
-
<iframe src="https://youtube.com/embed/rpHiiODjmNU"></iframe>
|
| 362 |
-
</div>
|
| 363 |
-
</div>
|
| 364 |
-
</div>
|
| 365 |
-
<div class="row g-1">
|
| 366 |
-
<div class="col-sm-3">
|
| 367 |
-
<div class="video-header">Frieren</div>
|
| 368 |
-
<div class="video-container">
|
| 369 |
-
<iframe src="https://youtube.com/embed/1mVD3fJ0LpM"></iframe>
|
| 370 |
-
</div>
|
| 371 |
-
</div>
|
| 372 |
-
<div class="col-sm-3">
|
| 373 |
-
<div class="video-header">VATT</div>
|
| 374 |
-
<div class="video-container">
|
| 375 |
-
<iframe src="https://youtube.com/embed/yjVFnJiEJlw"></iframe>
|
| 376 |
-
</div>
|
| 377 |
-
</div>
|
| 378 |
-
<div class="col-sm-3">
|
| 379 |
-
<div class="video-header">V-AURA</div>
|
| 380 |
-
<div class="video-container">
|
| 381 |
-
<iframe src="https://youtube.com/embed/neVeMSWtRkU"></iframe>
|
| 382 |
-
</div>
|
| 383 |
-
</div>
|
| 384 |
-
<div class="col-sm-3">
|
| 385 |
-
<div class="video-header">Seeing and Hearing</div>
|
| 386 |
-
<div class="video-container">
|
| 387 |
-
<iframe src="https://youtube.com/embed/EUE7YwyVWz8"></iframe>
|
| 388 |
-
</div>
|
| 389 |
-
</div>
|
| 390 |
-
</div>
|
| 391 |
-
</div>
|
| 392 |
-
|
| 393 |
-
<div id="vgg_extra">
|
| 394 |
-
<h2 style="text-align: center;">Comparisons with state-of-the-art methods in VGGSound</h2>
|
| 395 |
-
<p style="overflow: hidden;">
|
| 396 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 397 |
-
</p>
|
| 398 |
-
|
| 399 |
-
<div class="row g-1">
|
| 400 |
-
<div class="col-sm-3">
|
| 401 |
-
<div class="video-header">Moving train</div>
|
| 402 |
-
<div class="video-container">
|
| 403 |
-
<iframe src="https://youtube.com/embed/Ta6H45rBzJc"></iframe>
|
| 404 |
-
</div>
|
| 405 |
-
</div>
|
| 406 |
-
<div class="col-sm-3">
|
| 407 |
-
<div class="video-header">Water splashing</div>
|
| 408 |
-
<div class="video-container">
|
| 409 |
-
<iframe src="https://youtube.com/embed/hl6AtgHXpb4"></iframe>
|
| 410 |
-
</div>
|
| 411 |
-
</div>
|
| 412 |
-
<div class="col-sm-3">
|
| 413 |
-
<div class="video-header">Skateboarding</div>
|
| 414 |
-
<div class="video-container">
|
| 415 |
-
<iframe src="https://youtube.com/embed/n4sCNi_9buI"></iframe>
|
| 416 |
-
</div>
|
| 417 |
-
</div>
|
| 418 |
-
<div class="col-sm-3">
|
| 419 |
-
<div class="video-header">Synchronized clapping</div>
|
| 420 |
-
<div class="video-container">
|
| 421 |
-
<iframe src="https://youtube.com/embed/oxexfpLn7FE"></iframe>
|
| 422 |
-
</div>
|
| 423 |
-
</div>
|
| 424 |
-
</div>
|
| 425 |
-
|
| 426 |
-
<br><br>
|
| 427 |
-
|
| 428 |
-
<div id="extra-failure">
|
| 429 |
-
<h2 style="text-align: center;">Failure cases</h2>
|
| 430 |
-
<p style="overflow: hidden;">
|
| 431 |
-
<span style="float:right;"><a href="#index">Back to index</a></span>
|
| 432 |
-
</p>
|
| 433 |
-
|
| 434 |
-
<div class="row g-1">
|
| 435 |
-
<div class="col-sm-6">
|
| 436 |
-
<div class="video-header">Human speech</div>
|
| 437 |
-
<div class="video-container">
|
| 438 |
-
<iframe src="https://youtube.com/embed/nx0CyrDu70Y"></iframe>
|
| 439 |
-
</div>
|
| 440 |
-
</div>
|
| 441 |
-
<div class="col-sm-6">
|
| 442 |
-
<div class="video-header">Unfamiliar vision input</div>
|
| 443 |
-
<div class="video-container">
|
| 444 |
-
<iframe src="https://youtube.com/embed/hfnAqmK3X7w"></iframe>
|
| 445 |
-
</div>
|
| 446 |
-
</div>
|
| 447 |
-
</div>
|
| 448 |
-
</div>
|
| 449 |
-
</div>
|
| 450 |
-
|
| 451 |
-
</body>
|
| 452 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/eval_onsets.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
# Modified from https://github.com/XYPB/CondFoleyGen/blob/main/predict_onset.py
|
| 2 |
-
|
| 3 |
-
import argparse
|
| 4 |
-
import copy
|
| 5 |
-
import os
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
import librosa
|
| 9 |
-
import numpy as np
|
| 10 |
-
from sklearn.metrics import (average_precision_score, f1_score, precision_recall_curve)
|
| 11 |
-
from tqdm import tqdm
|
| 12 |
-
|
| 13 |
-
# Sampling rate (Hz) the audio is loaded at; onset positions in this script
# are expressed in samples at this rate.
sample_rate = 22050
# Half-width, in samples (50 ms), of the window around an onset from which its
# confidence is read. Derived from sample_rate so the two cannot drift apart.
conf_interval = int(0.05 * sample_rate)
# Only the first `duration` seconds of audio and ground truth are evaluated.
duration = 8
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def onset_nms(onsets, wav_norm, window=0.05):
    """Greedy non-maximum suppression over detected onsets.

    Onsets are visited from highest to lowest confidence, where an onset's
    confidence is the peak of ``wav_norm`` within ``conf_interval`` samples on
    either side of it. Every kept onset suppresses all remaining onsets that
    lie closer than ``window`` seconds to it.

    Args:
        onsets: 1-D integer array of onset positions, in samples.
        wav_norm: waveform array indexed by sample position.
        window: suppression radius in seconds.

    Returns:
        The surviving onset positions as a sorted array.
    """
    # Clamp the window start at 0: a negative start would be interpreted as an
    # index from the end of the signal and yield an empty or unrelated slice
    # for onsets within the first ``conf_interval`` samples.
    confidence = [
        np.max(wav_norm[max(o - conf_interval, 0):o + conf_interval]) for o in onsets
    ]

    min_gap = window * sample_rate
    remaining = onsets.tolist()
    kept = []
    for idx in np.argsort(confidence)[::-1]:
        cur = onsets[idx]
        if cur not in remaining:
            continue
        kept.append(cur)
        # Rebuild the list instead of removing entries while iterating over
        # it; in-place removal skips the element that follows each removed
        # one, which let onsets inside the suppression radius survive.
        remaining = [o for o in remaining if o != cur and abs(cur - o) >= min_gap]
    return np.array(sorted(kept))
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def predict_audio(audio_path: Path, delta: float) -> 'tuple[np.ndarray, np.ndarray]':
    """Detect onsets in the first ``duration`` seconds of an audio file.

    Args:
        audio_path: audio file to load; librosa is asked to load it at
            ``sample_rate``.
        delta: forwarded to ``librosa.onset.onset_detect`` as its ``delta``
            argument.

    Returns:
        A pair ``(onsets, wav_norm)``: the detected onset positions in samples
        and the trimmed waveform after min-max normalization.
    """
    wav, _ = librosa.load(audio_path, sr=sample_rate)
    wav = wav[:duration * sample_rate]
    onsets = librosa.onset.onset_detect(y=wav, sr=sample_rate, units='samples', delta=delta)
    # Min-max normalize; the epsilon keeps the division finite when the
    # signal is constant (max == min).
    wav_norm = (wav - wav.min()) / (wav.max() - wav.min() + 1e-6)

    return onsets, wav_norm
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def read_gt(gt_file: Path) -> np.ndarray:
    """Read ground-truth onset timestamps from a text file.

    The first whitespace-separated field of each non-blank line is parsed as a
    float; anything after it on the line is ignored. Reading stops at the
    first timestamp that is ``>= duration``.

    Args:
        gt_file: path of the annotation file.

    Returns:
        The timestamps read, as a 1-D array.
        NOTE(review): presumably seconds, since the caller multiplies the
        result by ``sample_rate`` -- confirm against the annotation format.

    Raises:
        ValueError: if the first field of a non-blank line is not a number.
    """
    all_times = []
    with open(gt_file, 'r') as f:
        for line in f:
            # split() with no argument tolerates tabs, repeated spaces and
            # leading whitespace, and yields [] for blank lines.
            fields = line.split()
            if not fields:
                continue
            onset_time = float(fields[0])
            if onset_time >= duration:
                break
            all_times.append(onset_time)
    return np.array(all_times)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def _peak_confidence(wav_norm, onset):
    """Return the peak of ``wav_norm`` within ``conf_interval`` samples of ``onset``."""
    # Clamp at 0 so an onset near the start of the clip does not produce a
    # negative (wrap-around) slice start.
    return np.max(wav_norm[max(onset - conf_interval, 0):onset + conf_interval])


def main():
    """Evaluate predicted onsets of every audio file in a directory.

    For each ``.flac`` / ``.wav`` file in ``--input_dir`` the matching
    ground-truth file ``<stem with '_denoised' -> '_times'>.txt`` is read from
    ``--gt_dir``, onsets are detected and de-duplicated, and each ground-truth
    onset is greedily matched to the unmatched prediction with the highest
    waveform value inside the tolerance window. Per-file accuracy, average
    precision and F1 are averaged over all files, printed, and written to
    ``<input_dir>/eval_results.txt``.

    ``--delta`` is used both as the librosa onset-detection ``delta`` and as
    the matching tolerance in seconds.
    """
    parser = argparse.ArgumentParser()
    # Both directories are mandatory: without them the script cannot do
    # anything useful and previously failed later with an unrelated error.
    parser.add_argument('--input_dir', type=Path, required=True)
    parser.add_argument('--gt_dir', type=Path, required=True)
    parser.add_argument('--delta', type=float, default=0.3)
    args = parser.parse_args()

    input_dir = args.input_dir
    gt_dir = args.gt_dir
    delta = args.delta

    audio_files = sorted(f for f in os.listdir(input_dir) if f.endswith(('.flac', '.wav')))
    if not audio_files:
        # Avoid a ZeroDivisionError when averaging over zero files.
        raise SystemExit(f'No .flac or .wav files found in {input_dir}')

    overall_acc = 0
    overall_ap = 0
    overall_f1 = 0

    # Matching tolerance in samples.
    match_window = delta * sample_rate

    for audio_file in tqdm(audio_files):
        base_name = Path(audio_file).stem
        gt_name = base_name.replace('_denoised', '_times')
        gt_file = gt_dir / f'{gt_name}.txt'
        gt_times = read_gt(gt_file) * sample_rate

        onsets, wav_norm = predict_audio(input_dir / audio_file, delta)
        onsets = onset_nms(onsets, wav_norm)

        # Predictions not yet assigned to a ground-truth onset.
        unmatched = onsets.tolist()

        y_gt = []
        y_pred = []
        hit_cnt = 0
        for gt_onset in gt_times:
            candidates = [
                i for i, pred_onset in enumerate(unmatched)
                if abs(pred_onset - gt_onset) < match_window
            ]
            if not candidates:
                # Missed ground-truth onset.
                y_gt.append(1)
                y_pred.append(0)
                continue

            # Candidate indices refer to ``unmatched``, so the waveform must be
            # looked up through ``unmatched`` as well; indexing the full
            # ``onsets`` array picks the wrong onset once anything has been
            # removed from ``unmatched``.
            conf_in_window = [wav_norm[unmatched[i]] for i in candidates]
            match_idx = candidates[np.argsort(conf_in_window)[-1]]
            matched_onset = unmatched.pop(match_idx)

            hit_cnt += 1
            y_gt.append(1)
            y_pred.append(_peak_confidence(wav_norm, matched_onset))

            # NOTE(review): once every prediction has been consumed, the
            # remaining ground-truth onsets are not scored as misses; confirm
            # this is the intended metric definition.
            if not unmatched:
                break

        # Leftover predictions are false positives.
        for o in unmatched:
            y_gt.append(0)
            y_pred.append(_peak_confidence(wav_norm, o))

        acc = hit_cnt / len(gt_times) if len(gt_times) != 0 else 0
        ap = average_precision_score(y_gt, y_pred)
        f1 = f1_score(y_gt, [1 if p > 0 else 0 for p in y_pred])

        overall_acc += acc
        overall_ap += ap
        overall_f1 += f1

    overall_acc /= len(audio_files)
    overall_ap /= len(audio_files)
    overall_f1 /= len(audio_files)

    summary = [
        f'Overall accuracy: {overall_acc:.4f}',
        f'Overall AP: {overall_ap:.4f}',
        f'Overall F1: {overall_f1:.4f}',
    ]
    for line in summary:
        print(line)

    # write to file
    with open(input_dir / 'eval_results.txt', 'w') as f:
        f.writelines(f'{line}\n' for line in summary)
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
# Run the evaluation only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/gradio_demo.py
DELETED
|
@@ -1,343 +0,0 @@
|
|
| 1 |
-
import gc
|
| 2 |
-
import logging
|
| 3 |
-
from argparse import ArgumentParser
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
from fractions import Fraction
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
import gradio as gr
|
| 9 |
-
import torch
|
| 10 |
-
import torchaudio
|
| 11 |
-
|
| 12 |
-
from mmaudio.eval_utils import (ModelConfig, VideoInfo, all_model_cfg, generate, load_image,
|
| 13 |
-
load_video, make_video, setup_eval_logging)
|
| 14 |
-
from mmaudio.model.flow_matching import FlowMatching
|
| 15 |
-
from mmaudio.model.networks import MMAudio, get_my_mmaudio
|
| 16 |
-
from mmaudio.model.sequence_config import SequenceConfig
|
| 17 |
-
from mmaudio.model.utils.features_utils import FeaturesUtils
|
| 18 |
-
|
| 19 |
-
# Allow TF32 for CUDA matmuls and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Root logger, shared by the whole module.
log = logging.getLogger()

# Device selection: prefer CUDA, then Apple MPS, otherwise fall back to CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    log.warning('CUDA/MPS are not available, running on CPU')
# dtype the network and the feature utilities are cast to in get_model().
dtype = torch.bfloat16

# Model variant served by this demo.
model: ModelConfig = all_model_cfg['large_44k_v2']
# Runs at import time; presumably fetches missing checkpoints -- see
# ModelConfig.download_if_needed.
model.download_if_needed()
# Directory the generated .mp4 / .flac files are written to; it is also passed
# to Gradio as an allowed path at launch.
output_dir = Path('./output/gradio')

setup_eval_logging()
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Instantiate the network and feature utilities for the configured ``model``.

    The network is created from ``model.model_name``, moved to the module-level
    ``device`` / ``dtype``, put in eval mode and loaded with the weights stored
    at ``model.model_path``. ``FeaturesUtils`` is built from the checkpoint
    paths on ``model`` and moved to the same device and dtype.

    Returns:
        ``(net, feature_utils, seq_cfg)`` where ``seq_cfg`` is
        ``model.seq_cfg``.
    """
    sequence_config = model.seq_cfg

    network: MMAudio = get_my_mmaudio(model.model_name)
    network = network.to(device, dtype).eval()
    state_dict = torch.load(model.model_path, map_location=device, weights_only=True)
    network.load_weights(state_dict)
    log.info(f'Loaded weights from {model.model_path}')

    extractor = FeaturesUtils(
        tod_vae_ckpt=model.vae_path,
        synchformer_ckpt=model.synchformer_ckpt,
        enable_conditions=True,
        mode=model.mode,
        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
        need_vae_encoder=False,
    )
    extractor = extractor.to(device, dtype).eval()

    return network, extractor, sequence_config


# Loaded once at import time and shared by every request handler below.
net, feature_utils, seq_cfg = get_model()
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for an uploaded video and return the path of the result.

    Args:
        video: the video input received from the Gradio component.
        prompt: text prompt passed to ``generate``.
        negative_prompt: negative text prompt passed to ``generate``.
        seed: RNG seed; a negative value draws a fresh random seed.
        num_steps: number of Euler steps for flow matching.
        cfg_strength: guidance strength passed to ``generate``.
        duration: requested duration in seconds; replaced by the duration
            reported by ``load_video``.

    Returns:
        Path of the ``.mp4`` file passed to ``make_video`` inside
        ``output_dir``.

    Raises:
        gr.Error: if no video was provided.
    """
    if video is None:
        # Surface a readable message in the UI instead of failing inside
        # load_video with an unrelated error.
        raise gr.Error('Please provide an input video.')

    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    video_info = load_video(video, duration)
    clip_frames = video_info.clip_frames.unsqueeze(0)
    sync_frames = video_info.sync_frames.unsqueeze(0)
    duration = video_info.duration_sec

    # NOTE(review): seq_cfg and net are module-level objects mutated per
    # request; requests running concurrently with different durations could
    # interfere -- confirm that calls are serialized.
    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    # Microsecond resolution so two requests finishing within the same second
    # do not overwrite each other's output file.
    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    output_dir.mkdir(exist_ok=True, parents=True)
    video_save_path = output_dir / f'{current_time_string}.mp4'
    make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
    gc.collect()
    return video_save_path
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
@torch.inference_mode()
def image_to_audio(image: gr.Image, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for a still image and return a video combining both.

    Args:
        image: the image input received from the Gradio component.
        prompt: text prompt passed to ``generate``.
        negative_prompt: negative text prompt passed to ``generate``.
        seed: RNG seed; a negative value draws a fresh random seed.
        num_steps: number of Euler steps for flow matching.
        cfg_strength: guidance strength passed to ``generate``.
        duration: duration in seconds of the generated clip.

    Returns:
        Path of the ``.mp4`` file passed to ``make_video`` inside
        ``output_dir``.

    Raises:
        gr.Error: if no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of failing inside
        # load_image with an unrelated error.
        raise gr.Error('Please provide an input image.')

    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    image_info = load_image(image)
    clip_frames = image_info.clip_frames.unsqueeze(0)
    sync_frames = image_info.sync_frames.unsqueeze(0)

    # NOTE(review): seq_cfg and net are module-level objects mutated per
    # request; requests running concurrently with different durations could
    # interfere -- confirm that calls are serialized.
    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength,
                      image_input=True)
    audio = audios.float().cpu()[0]

    # Microsecond resolution so two requests finishing within the same second
    # do not overwrite each other's output file.
    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    output_dir.mkdir(exist_ok=True, parents=True)
    video_save_path = output_dir / f'{current_time_string}.mp4'
    # The still image is turned into a 1 fps video of the requested duration.
    video_info = VideoInfo.from_image_info(image_info, duration, fps=Fraction(1))
    make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
    gc.collect()
    return video_save_path
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
@torch.inference_mode()
def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
                  duration: float):
    """Generate audio from a text prompt only and return the saved file path.

    Args:
        prompt: text prompt passed to ``generate``.
        negative_prompt: negative text prompt passed to ``generate``.
        seed: RNG seed; a negative value draws a fresh random seed.
        num_steps: number of Euler steps for flow matching.
        cfg_strength: guidance strength passed to ``generate``.
        duration: duration in seconds of the generated audio.

    Returns:
        Path of the ``.flac`` file saved inside ``output_dir``.
    """
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    # No visual conditioning in text-to-audio mode.
    clip_frames = sync_frames = None

    # NOTE(review): seq_cfg and net are module-level objects mutated per
    # request; requests running concurrently with different durations could
    # interfere -- confirm that calls are serialized.
    seq_cfg.duration = duration
    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    # Microsecond resolution so two requests finishing within the same second
    # do not overwrite each other's output file.
    current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    output_dir.mkdir(exist_ok=True, parents=True)
    audio_save_path = output_dir / f'{current_time_string}.flac'
    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
    gc.collect()
    return audio_save_path
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
# Video-to-audio tab. Every row in `examples` follows the order of `inputs`:
# video URL, prompt, negative prompt, seed, num steps, guidance strength,
# duration in seconds.
video_to_audio_tab = gr.Interface(
    title='MMAudio — Video-to-Audio Synthesis',
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>

    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
    Doing so does not improve results.
    """,
    fn=video_to_audio,
    inputs=[
        gr.Video(),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt', value='music'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
    examples=[
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
         'waves, seagulls', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
         '', 'music', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
         'bubbles', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
         'Indian holy music', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
         'galloping', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
         'waves, storm', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
         'storm', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
         '', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
         'typing', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
         '', '', 0, 25, 4.5, 10],
        ['https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
         '', '', 0, 25, 4.5, 10],
    ],
)
|
| 293 |
-
|
| 294 |
-
# Text-to-audio tab: same numeric controls as the video tab, no media input,
# and the result is returned as an audio file.
text_to_audio_tab = gr.Interface(
    title='MMAudio — Text-to-Audio Synthesis',
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
    """,
    fn=text_to_audio,
    inputs=[
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='audio',
    cache_examples=False,
)
|
| 312 |
-
|
| 313 |
-
# Image-to-audio tab (experimental): the image is handed to the handler as a
# file path and the result is returned as a playable video.
image_to_audio_tab = gr.Interface(
    title='MMAudio — Image-to-Audio Synthesis (experimental)',
    description="""
    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>

    NOTE: It takes longer to process high-resolution images (>384 px on the shorter side).
    Doing so does not improve results.
    """,
    fn=image_to_audio,
    inputs=[
        gr.Image(type='filepath'),
        gr.Text(label='Prompt'),
        gr.Text(label='Negative prompt'),
        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
        gr.Number(label='Duration (sec)', value=8, minimum=1),
    ],
    outputs='playable_video',
    cache_examples=False,
)
|
| 335 |
-
|
| 336 |
-
# Script entry point: parse the port and serve the three tabs in one app.
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--port', type=int, default=7860)
    args = parser.parse_args()

    tabs = [video_to_audio_tab, text_to_audio_tab, image_to_audio_tab]
    tab_names = ['Video-to-Audio', 'Text-to-Audio', 'Image-to-Audio (experimental)']
    demo = gr.TabbedInterface(tabs, tab_names)
    # output_dir must be allowed explicitly so Gradio can serve the generated
    # files from it.
    demo.launch(server_port=args.port, allowed_paths=[output_dir])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/sets/vgg-test.tsv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
MMAudio/sets/vgg-train.tsv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
MMAudio/sets/vgg-val.tsv
DELETED
|
@@ -1,2049 +0,0 @@
|
|
| 1 |
-
id label
|
| 2 |
-
--96EN9NUQM_000242 alarm clock ringing
|
| 3 |
-
-2toZf00LvI_000012 bowling impact
|
| 4 |
-
-8OE7Vydkl4_000221 bowling impact
|
| 5 |
-
-AEZuuoyJug_000030 playing violin, fiddle
|
| 6 |
-
-CUgrFw8TEI_000045 dog whimpering
|
| 7 |
-
-CexapzRAPQ_000051 ferret dooking
|
| 8 |
-
-DHGwygUsQc_000030 skateboarding
|
| 9 |
-
-G-o-Y4WuaU_000139 playing harmonica
|
| 10 |
-
-G_2v0L4U_s_000078 playing tennis
|
| 11 |
-
-HIPq7T3eFI_000011 driving motorcycle
|
| 12 |
-
-I8C3cRr5TY_000030 female singing
|
| 13 |
-
-K232jBK8VQ_000030 car passing by
|
| 14 |
-
-L_RH-nw11I_000025 vacuum cleaner cleaning floors
|
| 15 |
-
-MfBpxtGQmE_000020 ambulance siren
|
| 16 |
-
-NYZDjBz60I_000085 child singing
|
| 17 |
-
-QWcNg6FCgE_000022 playing bass guitar
|
| 18 |
-
-T06kz4MI20_000030 female singing
|
| 19 |
-
-UJJsEdgqMQ_000011 horse clip-clop
|
| 20 |
-
-VyLmfnIc5Q_000162 driving snowmobile
|
| 21 |
-
-W3y3qz3yp8_000256 people eating crisps
|
| 22 |
-
-WBvJuF2UOk_000030 playing acoustic guitar
|
| 23 |
-
-Yep0TGjWmc_000140 subway, metro, underground
|
| 24 |
-
-YrSxLTPdcA_000004 underwater bubbling
|
| 25 |
-
-YwZOeyAQC8_000002 baby laughter
|
| 26 |
-
-Zd-ZSnZ3so_000159 playing banjo
|
| 27 |
-
-_mqzXgg5eQ_000046 ripping paper
|
| 28 |
-
-c7lpU-_-V8_000030 motorboat, speedboat acceleration
|
| 29 |
-
-c96lccP5nc_000200 skidding
|
| 30 |
-
-eqkzAKGBZg_000030 playing drum kit
|
| 31 |
-
-geN4ECfl0Q_000030 playing bass guitar
|
| 32 |
-
-ibjrtJo9rY_000030 duck quacking
|
| 33 |
-
-nEg1olBLcw_000030 male singing
|
| 34 |
-
-s2G3Kto0Gw_000030 typing on computer keyboard
|
| 35 |
-
-s6dPB8fyQQ_000030 playing electric guitar
|
| 36 |
-
-tGOjLdrF6g_000087 playing squash
|
| 37 |
-
-v12qcLw5u0_000187 machine gun shooting
|
| 38 |
-
-vC3oqlxf4I_000010 slot machine
|
| 39 |
-
-vY141CdTc4_000030 playing bass guitar
|
| 40 |
-
-vmyjjovGXM_000116 cattle, bovinae cowbell
|
| 41 |
-
-vra5dNsP4w_000080 playing bass guitar
|
| 42 |
-
-w7WfMgSBD4_000047 lighting firecrackers
|
| 43 |
-
-wJ_UfBsiR0_000280 playing accordion
|
| 44 |
-
-xzWsDpVEiE_000060 child speech, kid speaking
|
| 45 |
-
-yby37u00N4_000030 playing violin, fiddle
|
| 46 |
-
-zHk3s6BkpA_000030 chainsawing trees
|
| 47 |
-
-zZR-ps0nJY_000137 hail
|
| 48 |
-
0-fd-lvizrY_000024 yodelling
|
| 49 |
-
00eb49xIULo_000030 female speech, woman speaking
|
| 50 |
-
01LPFe-13Aw_000030 playing electric guitar
|
| 51 |
-
01W8XIz7KDM_000007 donkey, ass braying
|
| 52 |
-
02t6zmS4RAk_000102 playing didgeridoo
|
| 53 |
-
038-gneOcks_000309 people eating crisps
|
| 54 |
-
04m_7jCGHko_000030 wind noise
|
| 55 |
-
04sf3v7xOzo_000005 cat meowing
|
| 56 |
-
055LCXe4pR8_000012 people whistling
|
| 57 |
-
09qDi4Auiyo_000030 playing electric guitar
|
| 58 |
-
0Ca2CTVwOxs_000019 cuckoo bird calling
|
| 59 |
-
0CvAFdtyVlo_000023 underwater bubbling
|
| 60 |
-
0G0mSrzOZ2M_000400 driving buses
|
| 61 |
-
0IvNbabusiY_000030 playing flute
|
| 62 |
-
0JPlNHX2HQ8_000049 playing accordion
|
| 63 |
-
0Lro_JzyUX0_000030 male speech, man speaking
|
| 64 |
-
0McmdH07r7w_000050 playing flute
|
| 65 |
-
0OHWW60khJ4_000030 playing bass guitar
|
| 66 |
-
0PZQL-Msz0s_000030 horse clip-clop
|
| 67 |
-
0RFEHUrGOP0_000170 playing acoustic guitar
|
| 68 |
-
0SsaL_YNyjY_000030 waterfall burbling
|
| 69 |
-
0T4gZQwzyKY_000030 people crowd
|
| 70 |
-
0U_Q9JTATCk_000044 owl hooting
|
| 71 |
-
0WIzNXqWrZk_000204 playing hockey
|
| 72 |
-
0XzJKHmoN6w_000019 duck quacking
|
| 73 |
-
0cMnDz8SSwQ_000014 disc scratching
|
| 74 |
-
0dkhsBmUZSY_000030 people cheering
|
| 75 |
-
0fQJ9nShofs_000093 dinosaurs bellowing
|
| 76 |
-
0hCiGC4c97g_000033 crow cawing
|
| 77 |
-
0hWyQpwHNDU_000030 motorboat, speedboat acceleration
|
| 78 |
-
0iVM2GY3R_c_000030 ambulance siren
|
| 79 |
-
0kar1O-1Ckk_000114 playing french horn
|
| 80 |
-
0m3kYCMUuCk_000000 cattle, bovinae cowbell
|
| 81 |
-
0sY8RR7V_q4_000220 female singing
|
| 82 |
-
0tJevlglhe4_000010 railroad car, train wagon
|
| 83 |
-
0uHGQmkKMr0_000223 people marching
|
| 84 |
-
0yAboI4QC6k_000109 hail
|
| 85 |
-
1-2zGkXe070_000098 rope skipping
|
| 86 |
-
10fjkn2eM_M_000050 slot machine
|
| 87 |
-
12tsmtyIALQ_000009 cat meowing
|
| 88 |
-
13LB6yibhQ8_000009 scuba diving
|
| 89 |
-
1CIxzqH4zzM_000040 ice cracking
|
| 90 |
-
1Fp6zPswdjI_000233 tapping guitar
|
| 91 |
-
1JMgZaCb9WM_000204 playing steelpan
|
| 92 |
-
1MCjHVRBDTk_000055 slot machine
|
| 93 |
-
1MLUEfkJDSw_000001 beat boxing
|
| 94 |
-
1MPwoS-R83A_000030 cat meowing
|
| 95 |
-
1Mx2iDMsZj8_000018 playing french horn
|
| 96 |
-
1NTsWn1Gir4_000103 playing snare drum
|
| 97 |
-
1NvpdqTAf3U_000030 skidding
|
| 98 |
-
1NwFHr4VHS0_000090 playing clarinet
|
| 99 |
-
1RB0gsxkPBo_000020 lions growling
|
| 100 |
-
1RSK3TFru0g_000000 sailing
|
| 101 |
-
1T1PLOWu65c_000250 skiing
|
| 102 |
-
1TARmg2FYJQ_000010 people whistling
|
| 103 |
-
1V65GzuCqaw_000030 bird chirping, tweeting
|
| 104 |
-
1Vn7SftZxS4_000030 rowboat, canoe, kayak rowing
|
| 105 |
-
1WaTnza9cn0_000160 playing violin, fiddle
|
| 106 |
-
1YGJDa3aCGo_000289 fire truck siren
|
| 107 |
-
1_CC87jIhXk_000382 swimming
|
| 108 |
-
1acVFuCvOJg_000512 canary calling
|
| 109 |
-
1bBdyTowO-M_000041 parrot talking
|
| 110 |
-
1dO7fONpkvE_000000 people farting
|
| 111 |
-
1eYmBacWt3k_000027 civil defense siren
|
| 112 |
-
1f9IgOjZjn4_000037 rapping
|
| 113 |
-
1gVugA2dsi4_000332 dinosaurs bellowing
|
| 114 |
-
1gXDaVse3SQ_000387 planing timber
|
| 115 |
-
1inu4aoQFKM_000164 planing timber
|
| 116 |
-
1kdGia7plHk_000030 playing electric guitar
|
| 117 |
-
1nDhQKLRJbg_000030 playing marimba, xylophone
|
| 118 |
-
1p8YDM6gG6Y_000014 dog howling
|
| 119 |
-
1t63KIS6F4I_000070 people sobbing
|
| 120 |
-
1x7wVFMW4dk_000030 playing acoustic guitar
|
| 121 |
-
1zWc46eeWLU_000167 playing sitar
|
| 122 |
-
2-Ipq91ns0k_000036 playing bass drum
|
| 123 |
-
21OWtKgJlIE_000270 canary calling
|
| 124 |
-
23ky1UGWeKg_000190 playing bass guitar
|
| 125 |
-
26KmPM2YkmQ_000004 ambulance siren
|
| 126 |
-
2A5eS9kMm-U_000018 owl hooting
|
| 127 |
-
2CebaASg1m4_000030 male singing
|
| 128 |
-
2EeOU7PgSck_000030 female singing
|
| 129 |
-
2F2NSNlc6dQ_000030 male singing
|
| 130 |
-
2FNZwK-4sUA_000030 female speech, woman speaking
|
| 131 |
-
2Jt4iqSqNTg_000012 bird chirping, tweeting
|
| 132 |
-
2LBEllUpWiA_000000 volcano explosion
|
| 133 |
-
2MDjnJzuUaU_000015 skidding
|
| 134 |
-
2NIaPAfScHM_000030 motorboat, speedboat acceleration
|
| 135 |
-
2NjwuyNgNoE_000050 playing hammond organ
|
| 136 |
-
2P7ZXBq5r04_000274 playing cornet
|
| 137 |
-
2RPPKMapBWY_000036 ice cream truck, ice cream van
|
| 138 |
-
2SlVaOyh69w_000219 cattle mooing
|
| 139 |
-
2Sto24aXwao_000097 baltimore oriole calling
|
| 140 |
-
2VdOQylRl08_000002 playing lacrosse
|
| 141 |
-
2YIZLARm8sI_000201 parrot talking
|
| 142 |
-
2d43OFDr5aI_000001 frog croaking
|
| 143 |
-
2ehs70MWQTs_000050 waterfall burbling
|
| 144 |
-
2fCC4BkdMT0_000106 basketball bounce
|
| 145 |
-
2fn6GFSwTEw_000096 cap gun shooting
|
| 146 |
-
2iwPgYGH_Ew_000400 railroad car, train wagon
|
| 147 |
-
2jy1b77hxXc_000136 playing bass guitar
|
| 148 |
-
2lALVOKDQNM_000059 dog howling
|
| 149 |
-
2myGIZCgZ2g_000018 tractor digging
|
| 150 |
-
2rSFLrwcvcY_000020 pheasant crowing
|
| 151 |
-
2szJ9STQPUk_000030 male singing
|
| 152 |
-
2w6jRF1Ekhs_000130 playing sitar
|
| 153 |
-
2xlWTgqPUOA_000004 beat boxing
|
| 154 |
-
2yeuzECPVUI_000033 playing badminton
|
| 155 |
-
2zev5MpJKPc_000039 chicken clucking
|
| 156 |
-
33NCPZjFuLE_000056 playing oboe
|
| 157 |
-
35MtyyqqQyw_000030 playing acoustic guitar
|
| 158 |
-
35c4EPiZ8JM_000030 horse clip-clop
|
| 159 |
-
35iGp2g_U6A_000000 church bell ringing
|
| 160 |
-
37Tl9YROdbA_000077 playing trombone
|
| 161 |
-
3EcAiTE0JyE_000052 playing theremin
|
| 162 |
-
3JyLYEjo4ok_000000 people giggling
|
| 163 |
-
3LfWg5Be60Q_000163 people burping
|
| 164 |
-
3MOG_CAcWkw_000142 playing badminton
|
| 165 |
-
3NcIWxDdTW0_000050 dog growling
|
| 166 |
-
3O8InHTYtk0_000020 male singing
|
| 167 |
-
3Okx0T5vpFc_000192 airplane flyby
|
| 168 |
-
3OxJ7KtIb2A_000100 playing saxophone
|
| 169 |
-
3QHNbJ_XATY_000036 civil defense siren
|
| 170 |
-
3S2-TODd__k_000090 train horning
|
| 171 |
-
3VK-nOg0-RQ_000046 pheasant crowing
|
| 172 |
-
3VSUuTABb3U_000074 wind chime
|
| 173 |
-
3WUTEMZv3EI_000046 slot machine
|
| 174 |
-
3YuBzhAU_Yc_000000 race car, auto racing
|
| 175 |
-
3cMrwXYnjd4_000026 air horn
|
| 176 |
-
3d5tPNd4Olk_000020 wind noise
|
| 177 |
-
3dBQbWPOjjI_000030 playing acoustic guitar
|
| 178 |
-
3djcJkGeJK8_000293 running electric fan
|
| 179 |
-
3e8ECt9wF5Y_000015 playing saxophone
|
| 180 |
-
3en9IzSPnNU_000027 driving snowmobile
|
| 181 |
-
3gTMehPiQ9s_000150 playing harpsichord
|
| 182 |
-
3kXROE2wcRA_000069 bowling impact
|
| 183 |
-
3p9aVzs8aYA_000030 female singing
|
| 184 |
-
3u3iunnXAOs_000432 playing hammond organ
|
| 185 |
-
3wboiuBfavA_000172 people nose blowing
|
| 186 |
-
3yolbg1tH9U_000030 male singing
|
| 187 |
-
4-_AWdbZnzE_000005 playing trombone
|
| 188 |
-
42Iss6TfcpQ_000742 lip smacking
|
| 189 |
-
433xsSMNLf4_000070 playing electronic organ
|
| 190 |
-
43ijm8y4z2o_000030 horse clip-clop
|
| 191 |
-
44UMQ5ZFuuY_000030 engine accelerating, revving, vroom
|
| 192 |
-
457yRHL0f2E_000030 female singing
|
| 193 |
-
45iXudFVQ4E_000000 subway, metro, underground
|
| 194 |
-
46LjKw-7mU0_000030 male singing
|
| 195 |
-
47QYxqXGZ3w_000244 people shuffling
|
| 196 |
-
47SP2azKv8Q_000030 playing electric guitar
|
| 197 |
-
47YlecLyyK0_000030 playing acoustic guitar
|
| 198 |
-
47y5k6vaUxE_000089 francolin calling
|
| 199 |
-
49gi-iYJ1F0_000107 tap dancing
|
| 200 |
-
4CLnZSI8aPs_000092 hair dryer drying
|
| 201 |
-
4DcOTOS_LE0_000454 sliding door
|
| 202 |
-
4DzuWR9ekko_000000 playing bugle
|
| 203 |
-
4E6mA8Y2Be0_000060 using sewing machines
|
| 204 |
-
4FOFcRJR9go_000084 playing glockenspiel
|
| 205 |
-
4H29LCZTMBs_000050 using sewing machines
|
| 206 |
-
4K345_DRFRk_000056 playing volleyball
|
| 207 |
-
4Ofe_ManxZc_000047 playing french horn
|
| 208 |
-
4OxCr981HvY_000016 ice cracking
|
| 209 |
-
4SlcVylJxxk_000297 arc welding
|
| 210 |
-
4WGMFP00rIg_000030 playing acoustic guitar
|
| 211 |
-
4YnMOFstVnk_000066 parrot talking
|
| 212 |
-
4_QGupz8UNA_000189 hail
|
| 213 |
-
4aFirNGu_P8_000381 planing timber
|
| 214 |
-
4dhyddSUAWg_000175 police radio chatter
|
| 215 |
-
4dkU-c4g1VM_000111 dog barking
|
| 216 |
-
4h9o2iL6nps_000050 child speech, kid speaking
|
| 217 |
-
4hU6jqQQUto_000009 playing harpsichord
|
| 218 |
-
4iBqpFUnPoA_000170 fireworks banging
|
| 219 |
-
4j7GbxZQjB8_000024 car engine knocking
|
| 220 |
-
4jHrFbnaVRc_000294 firing muskets
|
| 221 |
-
4kvqtJEFqjw_000190 playing bagpipes
|
| 222 |
-
4ldID97D-oU_000020 people coughing
|
| 223 |
-
4n657Imjmjo_000015 sheep bleating
|
| 224 |
-
4o2IRyXi-aY_000667 playing harpsichord
|
| 225 |
-
4rehS_cPodk_000020 female speech, woman speaking
|
| 226 |
-
4t_Qz9RyUm8_000006 alarm clock ringing
|
| 227 |
-
4yUvIrchOzQ_000280 playing saxophone
|
| 228 |
-
4zf3qRiZ3Ok_000030 child singing
|
| 229 |
-
4zsLfdNLUD4_000033 cat hissing
|
| 230 |
-
50OgBbJZUUc_000064 typing on typewriter
|
| 231 |
-
50jxPCLUFdU_000002 cricket chirping
|
| 232 |
-
53ohFLBl0iE_000052 alarm clock ringing
|
| 233 |
-
542uea0zO1I_000036 sea lion barking
|
| 234 |
-
54XBPEFJQc4_000076 playing djembe
|
| 235 |
-
574NjiOGi5s_000030 female singing
|
| 236 |
-
58KzLvK1OYs_000144 dog growling
|
| 237 |
-
5901zjV6oAo_000006 swimming
|
| 238 |
-
5AFKEd8nSpg_000050 people sniggering
|
| 239 |
-
5CtoZvJaGAM_000096 woodpecker pecking tree
|
| 240 |
-
5D201VjroT0_000229 sharpen knife
|
| 241 |
-
5D2E7s9bEf0_000010 basketball bounce
|
| 242 |
-
5EPnuy_sKHI_000010 singing bowl
|
| 243 |
-
5IZv217s4_E_000049 playing badminton
|
| 244 |
-
5KRxqVykvvI_000030 printer printing
|
| 245 |
-
5S3QDnRCnOQ_000003 tapping guitar
|
| 246 |
-
5Sv97J7mksY_000030 playing electric guitar
|
| 247 |
-
5UqwkZ1XK18_000050 helicopter
|
| 248 |
-
5VyCTHzLVdU_000011 playing bongo
|
| 249 |
-
5WVhslWt1wU_000030 female singing
|
| 250 |
-
5Wb1zMq_DiU_000020 fireworks banging
|
| 251 |
-
5XK1Vgiwllc_000073 playing mandolin
|
| 252 |
-
5X_B2L1-4Bc_000030 playing electric guitar
|
| 253 |
-
5briopN06L8_000000 playing piano
|
| 254 |
-
5eHlhJ-ZOpg_000030 playing hammond organ
|
| 255 |
-
5fZn_7LbKSI_000020 people burping
|
| 256 |
-
5hi4T4Gp6v4_000002 air horn
|
| 257 |
-
5hjKe0FWq9E_000002 horse neighing
|
| 258 |
-
5iEbFJkG6Xg_000557 bird squawking
|
| 259 |
-
5jQLK4Z1EH4_000020 wind noise
|
| 260 |
-
5jt7lR8WY3g_000172 playing castanets
|
| 261 |
-
5lV59hZgwRM_000009 scuba diving
|
| 262 |
-
5mBCF05DV5s_000280 church bell ringing
|
| 263 |
-
5mJ7_05tlhs_000005 crow cawing
|
| 264 |
-
5nscL4EBrXA_000030 male singing
|
| 265 |
-
5r1zW38AWvs_000057 wind chime
|
| 266 |
-
5rP9Z4jEq6s_000024 cap gun shooting
|
| 267 |
-
5xJdFysNSf4_000110 race car, auto racing
|
| 268 |
-
5xefixXFNwk_000020 playing bass guitar
|
| 269 |
-
64eXDlUgPoA_000079 lighting firecrackers
|
| 270 |
-
64lQIoDGX6o_000040 playing marimba, xylophone
|
| 271 |
-
64ollREPrUw_000132 raining
|
| 272 |
-
64zPbHPyiwE_000030 male speech, man speaking
|
| 273 |
-
659mhmSPXWA_000276 bee, wasp, etc. buzzing
|
| 274 |
-
65u3pwOEcBg_000002 frog croaking
|
| 275 |
-
67hDkeQalow_000030 motorboat, speedboat acceleration
|
| 276 |
-
68mXCuRvQkw_000045 people burping
|
| 277 |
-
6ARTjahUaYY_000030 playing electric guitar
|
| 278 |
-
6BQgJ0tvUkc_000162 baby babbling
|
| 279 |
-
6CYhRsU4F34_000000 people whistling
|
| 280 |
-
6EcmHiscsOc_000287 lighting firecrackers
|
| 281 |
-
6GsamqJ5tFU_000075 airplane flyby
|
| 282 |
-
6IMlkVOKxJw_000032 cap gun shooting
|
| 283 |
-
6IQkdce9a7Q_000184 slot machine
|
| 284 |
-
6KO3eMyEeOg_000000 race car, auto racing
|
| 285 |
-
6LB-qRj_zW4_000030 horse clip-clop
|
| 286 |
-
6LKFDTu9vRQ_000018 playing french horn
|
| 287 |
-
6NxeHScEnJE_000000 dog bow-wow
|
| 288 |
-
6OhTwJrVxXs_000028 playing timbales
|
| 289 |
-
6RGa6DvWpt0_000035 people marching
|
| 290 |
-
6UcuQgsHFCA_000142 playing french horn
|
| 291 |
-
6Y6CvX7EP68_000030 singing choir
|
| 292 |
-
6ZbVXBeNsX8_000125 playing didgeridoo
|
| 293 |
-
6aYfccsgIjk_000094 baby crying
|
| 294 |
-
6gTR_Avjz6g_000170 playing cymbal
|
| 295 |
-
6j2g_OZnW74_000189 missile launch
|
| 296 |
-
6mE_v9a5dbM_000030 male singing
|
| 297 |
-
6o0mZVMfKss_000140 people clapping
|
| 298 |
-
6of3tx7IOik_000030 wind noise
|
| 299 |
-
6shIFnN-LsY_000141 playing flute
|
| 300 |
-
6v53uAVpXC4_000071 people babbling
|
| 301 |
-
6wpifZcwOJU_000023 underwater bubbling
|
| 302 |
-
6xAClSJ21qA_000491 rapping
|
| 303 |
-
6xgTrufXcCM_000126 wood thrush calling
|
| 304 |
-
6yBZH5cV7GE_000030 playing electric guitar
|
| 305 |
-
6z_pfZ6Rvfs_000023 playing table tennis
|
| 306 |
-
7-7r-FRwp_w_000041 playing glockenspiel
|
| 307 |
-
72d2TsdeSg8_000000 tap dancing
|
| 308 |
-
75FLwnGZJTc_000125 playing oboe
|
| 309 |
-
75m0cvRBGY0_000030 vehicle horn, car horn, honking
|
| 310 |
-
76F-K-7HUXE_000010 lions roaring
|
| 311 |
-
77rq4-p4vV8_000030 wind noise
|
| 312 |
-
78hdsP0edMg_000030 railroad car, train wagon
|
| 313 |
-
7Ck8cfF2rl0_000200 otter growling
|
| 314 |
-
7ELF2dbWe5w_000010 female singing
|
| 315 |
-
7I_wdG-eOc0_000106 playing hammond organ
|
| 316 |
-
7JT43yyNGkk_000003 black capped chickadee calling
|
| 317 |
-
7JX-Bx0BETQ_000205 rapping
|
| 318 |
-
7LMkG7uISis_000102 playing gong
|
| 319 |
-
7MuetSj86N0_000490 bird squawking
|
| 320 |
-
7NyPcaVKao4_000025 dog growling
|
| 321 |
-
7Odi8SKArQI_000030 playing saxophone
|
| 322 |
-
7P-1-qzwyYA_000055 magpie calling
|
| 323 |
-
7Qr1ncg86N4_000007 lions roaring
|
| 324 |
-
7TMOCRG4EBA_000030 female singing
|
| 325 |
-
7U5V5Teqo8Q_000000 dog barking
|
| 326 |
-
7V6NAsZ86xw_000000 beat boxing
|
| 327 |
-
7VT8p9Er3n8_000020 mynah bird singing
|
| 328 |
-
7Y3u8Aj8UV4_000010 driving motorcycle
|
| 329 |
-
7YGUQYRwnHs_000019 horse neighing
|
| 330 |
-
7YTsyqVSEeI_000006 child singing
|
| 331 |
-
7b__KH3VA_o_000035 people booing
|
| 332 |
-
7caL9c6N1zc_000122 child singing
|
| 333 |
-
7gdSJ30FfNU_000490 people hiccup
|
| 334 |
-
7h7_U2q-VwY_000276 dog baying
|
| 335 |
-
7hdXzJpOXiY_000018 police car (siren)
|
| 336 |
-
7iT77hG1X18_000063 playing erhu
|
| 337 |
-
7kIhqlZok8c_000074 running electric fan
|
| 338 |
-
7kIhqlZok8c_000241 running electric fan
|
| 339 |
-
7lz-THXCwi8_000030 male speech, man speaking
|
| 340 |
-
7ogdSWU90s4_000100 opening or closing drawers
|
| 341 |
-
7pc3c5ZGbwo_000030 ocean burbling
|
| 342 |
-
7rLRSpEqgZk_000253 playing sitar
|
| 343 |
-
7tsuYUeV7_k_000191 airplane flyby
|
| 344 |
-
7vF2Qq0Pg6w_000024 ice cream truck, ice cream van
|
| 345 |
-
7xZxYm27FdA_000020 toilet flushing
|
| 346 |
-
7xaNqQ8FAwI_000100 mynah bird singing
|
| 347 |
-
7yBOHsPAJgw_000040 vacuum cleaner cleaning floors
|
| 348 |
-
7yXROxIZfeo_000053 raining
|
| 349 |
-
80KT6bYCFkg_000077 playing tuning fork
|
| 350 |
-
81ACguOEqoM_000042 electric shaver, electric razor shaving
|
| 351 |
-
82ic2Xisrqg_000030 car engine knocking
|
| 352 |
-
83mmLOdwZlA_000081 air conditioning noise
|
| 353 |
-
85Nd7APr5Os_000028 ice cracking
|
| 354 |
-
85Nd7APr5Os_000052 ice cracking
|
| 355 |
-
87-ZrpDyRHE_000238 cat hissing
|
| 356 |
-
88oLbuKd7Rg_000030 car passing by
|
| 357 |
-
8906Y6i-h10_000102 playing cymbal
|
| 358 |
-
89NzFtLSRSo_000030 engine accelerating, revving, vroom
|
| 359 |
-
8DDro-N5-54_000029 swimming
|
| 360 |
-
8GTkmen1bBg_000110 playing piano
|
| 361 |
-
8IdUE6nhR3E_000030 playing violin, fiddle
|
| 362 |
-
8N0GxZtk9wE_000051 playing didgeridoo
|
| 363 |
-
8NMHjXutgVs_000333 electric shaver, electric razor shaving
|
| 364 |
-
8Rh7NvJDexA_000068 tapping guitar
|
| 365 |
-
8VEqGk0W4xY_000192 playing darts
|
| 366 |
-
8XH7xIWWC6c_000090 people cheering
|
| 367 |
-
8Y9VKxl-1gE_000063 rapping
|
| 368 |
-
8ZWKl-_qHM0_000010 driving buses
|
| 369 |
-
8_xdWIziFpI_000030 baby crying
|
| 370 |
-
8b2ASj5nmos_000251 playing darts
|
| 371 |
-
8c9PJLozdtA_000020 playing bass drum
|
| 372 |
-
8jkr7bOR8ck_000146 playing table tennis
|
| 373 |
-
8lIh0qRN7PE_000220 cattle mooing
|
| 374 |
-
8m7VIFtS4gc_000000 typing on typewriter
|
| 375 |
-
8n76LfbY3qo_000232 chainsawing trees
|
| 376 |
-
8ngu3TPmfZQ_000110 playing drum kit
|
| 377 |
-
8ugZkKeLL7Y_000030 male speech, man speaking
|
| 378 |
-
8vE2wod7rhE_000030 horse neighing
|
| 379 |
-
8ytjUazIdno_000023 playing glockenspiel
|
| 380 |
-
9-N8v-cC0Tg_000002 air horn
|
| 381 |
-
9-xW047dMpk_000125 missile launch
|
| 382 |
-
913ItBzDHLQ_000124 playing synthesizer
|
| 383 |
-
91kWVMnyKxA_000019 bird squawking
|
| 384 |
-
92G0bdxj5ck_000091 mouse pattering
|
| 385 |
-
93a7wS41kLc_000060 splashing water
|
| 386 |
-
93rlsDDmFYo_000110 playing cymbal
|
| 387 |
-
95UKs8K92C4_000110 playing timpani
|
| 388 |
-
96wdXcwIbgk_001238 playing bongo
|
| 389 |
-
97svDuqFctI_000139 playing steel guitar, slide guitar
|
| 390 |
-
98Nc3x8U1JI_000187 playing badminton
|
| 391 |
-
993A2y5lv-s_000030 bird chirping, tweeting
|
| 392 |
-
99WZAe6QKUc_000030 people whispering
|
| 393 |
-
99cGCS0ko2Q_000120 playing saxophone
|
| 394 |
-
99ylFYthGcI_000004 donkey, ass braying
|
| 395 |
-
9A8hgZdD__g_000030 horse clip-clop
|
| 396 |
-
9CFR1VdlIMc_000142 rope skipping
|
| 397 |
-
9CmEsDtIz_Q_000060 playing accordion
|
| 398 |
-
9D9sfe1eaK8_000000 frog croaking
|
| 399 |
-
9HtYErt1moA_000100 playing drum kit
|
| 400 |
-
9IwjfATt51Y_000030 playing french horn
|
| 401 |
-
9JwaE3BmICE_000061 female speech, woman speaking
|
| 402 |
-
9LFGIpAO3NE_000161 tapping guitar
|
| 403 |
-
9LY2BJ2fqts_000090 people gargling
|
| 404 |
-
9Q1RM-pY2yY_000180 playing bongo
|
| 405 |
-
9UvWyax1fEU_000000 people booing
|
| 406 |
-
9Y3ausHODlk_000557 playing electronic organ
|
| 407 |
-
9ZCgk2e7wZM_000269 woodpecker pecking tree
|
| 408 |
-
9ZE18L9NN1Y_000105 striking pool
|
| 409 |
-
9af_fvuAY8E_000167 barn swallow calling
|
| 410 |
-
9exZEq85L1k_000260 playing trombone
|
| 411 |
-
9fvJeyH-4II_000100 fireworks banging
|
| 412 |
-
9gJ4NQYcakk_000030 male speech, man speaking
|
| 413 |
-
9k0OwVahe5Y_000066 playing cymbal
|
| 414 |
-
9pBp5wd9rpw_000043 firing cannon
|
| 415 |
-
9s6jvP1V56w_000080 people crowd
|
| 416 |
-
9t7YT0OKpaM_000130 playing cello
|
| 417 |
-
9tyf9HGsIe4_000000 people finger snapping
|
| 418 |
-
9zriIjvwqJw_000480 people burping
|
| 419 |
-
A08MfFxzmxo_000030 playing accordion
|
| 420 |
-
A0tXM5fSFrw_000062 alligators, crocodiles hissing
|
| 421 |
-
A1-T0wdI8Nw_000070 sloshing water
|
| 422 |
-
A1vf6We9a_Q_000290 mouse pattering
|
| 423 |
-
A2GEU2r5KnQ_000030 playing acoustic guitar
|
| 424 |
-
A551qSirV68_000298 alligators, crocodiles hissing
|
| 425 |
-
A55rHYLkwQk_000000 singing bowl
|
| 426 |
-
A95vqV9oM6g_000081 people marching
|
| 427 |
-
A9pIMNKQWCk_000005 footsteps on snow
|
| 428 |
-
AApTo3l6NfA_000030 train horning
|
| 429 |
-
AFmF56HVvVg_000151 cat purring
|
| 430 |
-
AI3om1uyCH0_000009 fire truck siren
|
| 431 |
-
AJhEl41TC5s_000443 lathe spinning
|
| 432 |
-
AM8-hH1Oahw_000510 driving buses
|
| 433 |
-
AOPgsB4hsH8_000206 electric grinder grinding
|
| 434 |
-
ARFBC4LeCFY_000019 chicken clucking
|
| 435 |
-
ARrb06s5a0Y_000082 donkey, ass braying
|
| 436 |
-
AUufm-TAVg8_000101 playing french horn
|
| 437 |
-
AW-JhveJXFw_000030 typing on typewriter
|
| 438 |
-
AXDomj6KnkE_000141 playing tabla
|
| 439 |
-
AZgG_6NE8j4_000240 parrot talking
|
| 440 |
-
Ac0OxSV8Nqk_000030 female speech, woman speaking
|
| 441 |
-
ActIkLSW20Y_000350 railroad car, train wagon
|
| 442 |
-
AefLmdFYR6k_000136 playing tambourine
|
| 443 |
-
AfrcYQw5mXw_000010 telephone bell ringing
|
| 444 |
-
Agl-AQmIYBE_000030 rowboat, canoe, kayak rowing
|
| 445 |
-
AhUYTb14QZU_000020 chimpanzee pant-hooting
|
| 446 |
-
AiriN8WOgiI_000123 playing bass drum
|
| 447 |
-
AjD1BiY0o8E_000001 pheasant crowing
|
| 448 |
-
AlTNj6IWey4_000112 airplane flyby
|
| 449 |
-
AlebU-Vdy18_000022 playing theremin
|
| 450 |
-
ApMojxDfms0_000273 magpie calling
|
| 451 |
-
AteFCZfJfLY_000011 vehicle horn, car horn, honking
|
| 452 |
-
AvguIvLb0GY_000027 electric shaver, electric razor shaving
|
| 453 |
-
Aw5BwrqdmHc_000010 canary calling
|
| 454 |
-
Aw9arRIoBR4_000030 sliding door
|
| 455 |
-
Aygl9-ur8NU_000001 gibbon howling
|
| 456 |
-
Az1M0iLYjIg_000030 ice cream truck, ice cream van
|
| 457 |
-
B1ax5dX6XrU_000215 wood thrush calling
|
| 458 |
-
B7cF_In3_-c_000030 horse neighing
|
| 459 |
-
B7zgWPjx8hg_000026 wind noise
|
| 460 |
-
B9Mk5n5Zwjg_000240 driving buses
|
| 461 |
-
BBumD37-y80_000110 train horning
|
| 462 |
-
BC4LglYv70Q_000108 playing oboe
|
| 463 |
-
BH8QYqAvO2k_000020 playing vibraphone
|
| 464 |
-
BJ31LCL3Dy4_000100 crow cawing
|
| 465 |
-
BLWCHd07ATw_000080 playing electronic organ
|
| 466 |
-
BM1fw080pSs_000030 female speech, woman speaking
|
| 467 |
-
BM4YyahEm8Q_000078 spraying water
|
| 468 |
-
BO1K4wXy2CI_000299 mosquito buzzing
|
| 469 |
-
BPD7Qj1U_Bo_000131 playing theremin
|
| 470 |
-
BPxW7nP4loQ_000003 eagle screaming
|
| 471 |
-
BTdIM1mncyA_000030 female speech, woman speaking
|
| 472 |
-
BWw6dgq07Qo_000053 playing clarinet
|
| 473 |
-
BYJ2UHIHCLU_000009 hail
|
| 474 |
-
BbUBFko93XE_000031 people sneezing
|
| 475 |
-
BbfDej2cM2I_000001 volcano explosion
|
| 476 |
-
BdIWeKYKIzk_000000 train horning
|
| 477 |
-
BfyYYuE12dw_000006 goose honking
|
| 478 |
-
BkXyLmdb8Yw_000290 playing hammond organ
|
| 479 |
-
BkwStRX3xE0_000010 fireworks banging
|
| 480 |
-
BmsTQHrCwB8_000215 heart sounds, heartbeat
|
| 481 |
-
BnRtoIC87Po_000030 female speech, woman speaking
|
| 482 |
-
BniijKHywXM_000103 playing tambourine
|
| 483 |
-
Bo271H1XM40_000127 arc welding
|
| 484 |
-
BpV7n-YUtos_000248 rope skipping
|
| 485 |
-
BpfM3evN6H8_000009 people eating apple
|
| 486 |
-
BsHgr_sj6ec_000058 playing bongo
|
| 487 |
-
BvdvbeIdUtk_000072 people booing
|
| 488 |
-
C1fdGRZRPtU_000040 barn swallow calling
|
| 489 |
-
C2kKMjYETRQ_000050 playing acoustic guitar
|
| 490 |
-
C5ik_rcugw8_000040 people marching
|
| 491 |
-
C7zLftUgskY_000035 playing harp
|
| 492 |
-
C85lxZAStBk_000056 playing hammond organ
|
| 493 |
-
CA3sbGHEE3c_000001 people screaming
|
| 494 |
-
CFazHdGsxcU_000155 lighting firecrackers
|
| 495 |
-
CG-2XtQI6sM_000000 cat hissing
|
| 496 |
-
CG_qvz_V1Jo_000374 playing gong
|
| 497 |
-
CJjTs72p1gI_000002 alarm clock ringing
|
| 498 |
-
CKwtP-eN1Zk_000014 striking pool
|
| 499 |
-
CUY3hob5V_o_000010 opening or closing car doors
|
| 500 |
-
CUqga7lwvfM_000080 subway, metro, underground
|
| 501 |
-
CVVrs_KA6sU_000030 rowboat, canoe, kayak rowing
|
| 502 |
-
CViouHw-mfQ_000122 playing cello
|
| 503 |
-
CYTTsSPohw0_000122 planing timber
|
| 504 |
-
CdcfD8mg-k4_000030 singing choir
|
| 505 |
-
CeD6RlRSr8M_000099 cat purring
|
| 506 |
-
Cg3XzrFzzpM_000060 playing accordion
|
| 507 |
-
CgevwvZLE3c_000219 running electric fan
|
| 508 |
-
CheeUmf4IhE_000030 fireworks banging
|
| 509 |
-
ChhTVgWMxiI_000030 duck quacking
|
| 510 |
-
CiLwbeRDj8E_000000 people crowd
|
| 511 |
-
CjwqjkkoJHY_000199 car engine starting
|
| 512 |
-
CkYUBci5xEM_000070 using sewing machines
|
| 513 |
-
Cntxv6aE3DY_000030 sliding door
|
| 514 |
-
Co1qXvuwkes_000146 arc welding
|
| 515 |
-
CpW7umx_bi0_000067 playing mandolin
|
| 516 |
-
CqgPmVXNdNQ_000058 striking pool
|
| 517 |
-
Csr7c9uFvQk_000028 dog whimpering
|
| 518 |
-
CvN_oC0AGvM_000340 toilet flushing
|
| 519 |
-
Cvgc82TDNnE_000025 lions roaring
|
| 520 |
-
CvxL2n9DX6w_000251 lighting firecrackers
|
| 521 |
-
CyW4FoAJ1MU_000260 police car (siren)
|
| 522 |
-
CzGGyIj84Hs_000030 pigeon, dove cooing
|
| 523 |
-
D-HXQTcZNGU_000130 female speech, woman speaking
|
| 524 |
-
D109-sQNo1k_000028 sliding door
|
| 525 |
-
D6BCygx6jcs_000000 dog howling
|
| 526 |
-
D7kL3EEOyR4_000050 helicopter
|
| 527 |
-
DAy_bV1d9c4_000046 playing squash
|
| 528 |
-
DGU-HbuX6rs_000230 people crowd
|
| 529 |
-
DJan9OSSF7c_000060 plastic bottle crushing
|
| 530 |
-
DO-9yuU9brk_000028 sea lion barking
|
| 531 |
-
DOi5UxxTknA_000041 driving snowmobile
|
| 532 |
-
DPpo_Whnuqc_000022 missile launch
|
| 533 |
-
DQelhAtUyHY_000030 playing electric guitar
|
| 534 |
-
DR7TdSc2ahQ_000030 sliding door
|
| 535 |
-
DSThhOKXU-c_000250 playing bass guitar
|
| 536 |
-
DSgKhbtDWWo_000400 playing flute
|
| 537 |
-
DX5_AglGFMw_000349 metronome
|
| 538 |
-
DXfTYgSGLac_000177 alligators, crocodiles hissing
|
| 539 |
-
DZo15IMYpmA_000206 vacuum cleaner cleaning floors
|
| 540 |
-
DaMG8zJSkuw_000100 playing trumpet
|
| 541 |
-
DbgRhWmYTJk_000002 frog croaking
|
| 542 |
-
Dbi2L5z8U-w_000020 driving motorcycle
|
| 543 |
-
DdZ6PSUQoQA_000050 female speech, woman speaking
|
| 544 |
-
DfZmOeeF_CI_000024 lathe spinning
|
| 545 |
-
DhaOFNnOC8o_000102 playing steel guitar, slide guitar
|
| 546 |
-
DhxWWDGdF8I_000159 frog croaking
|
| 547 |
-
Dj3sIimPrCk_000330 pigeon, dove cooing
|
| 548 |
-
DmSsL0Xde-I_000005 missile launch
|
| 549 |
-
DpIKdB4c_JU_000030 sloshing water
|
| 550 |
-
DqnMEAN1GVc_000098 baby laughter
|
| 551 |
-
DrPa82cqlSM_000008 playing mandolin
|
| 552 |
-
DroorVxOn5s_000030 engine accelerating, revving, vroom
|
| 553 |
-
DsVtCIaWv-Y_000377 hedge trimmer running
|
| 554 |
-
DtRqBLRUTRo_000069 playing clarinet
|
| 555 |
-
Dtiv9RNaA4U_000106 train horning
|
| 556 |
-
Duk5ikgbUfU_000030 playing violin, fiddle
|
| 557 |
-
DuyL15HJn6M_000036 elk bugling
|
| 558 |
-
DvascfU3OM4_000233 playing bongo
|
| 559 |
-
DwE0cQ3Xz70_000030 chainsawing trees
|
| 560 |
-
Dxxg6NenmBQ_000153 playing oboe
|
| 561 |
-
E0ocfyjk1lw_000129 scuba diving
|
| 562 |
-
E22HBR9rEkI_000030 lawn mowing
|
| 563 |
-
E22UuQ6SRf4_000001 fire truck siren
|
| 564 |
-
E4IHTinI-3k_000010 people whistling
|
| 565 |
-
E4dvhMWr7K0_000140 playing didgeridoo
|
| 566 |
-
E5ICgH7JVFI_000003 driving snowmobile
|
| 567 |
-
E67GhkgB8Jc_000033 cell phone buzzing
|
| 568 |
-
E6tu_4cO7ok_000107 playing cornet
|
| 569 |
-
E8LoFlcAC-M_000051 playing vibraphone
|
| 570 |
-
EDtJ88ZJtWo_000008 playing bagpipes
|
| 571 |
-
EEJp_Ssp0No_000004 dog howling
|
| 572 |
-
EG2bfvkpzjk_000136 playing steelpan
|
| 573 |
-
EGKE_rOo-Gg_000030 playing violin, fiddle
|
| 574 |
-
EHHBn9EAtg4_000040 people booing
|
| 575 |
-
EHHefsog-aM_000069 black capped chickadee calling
|
| 576 |
-
EHkkma0y1T8_000030 people sneezing
|
| 577 |
-
EJudk9RWsZI_000000 car engine starting
|
| 578 |
-
EKkFWhdVAOU_000032 woodpecker pecking tree
|
| 579 |
-
ETcwLdOldMg_000000 blowtorch igniting
|
| 580 |
-
EU3OmHbOUo0_000000 cattle mooing
|
| 581 |
-
EbnPPw9P3MQ_000409 snake rattling
|
| 582 |
-
Ee1Glgpx3YE_000038 scuba diving
|
| 583 |
-
EeUHgSkCSi8_000666 turkey gobbling
|
| 584 |
-
EhDl29RiF74_000085 black capped chickadee calling
|
| 585 |
-
EhaE7gijT78_000119 baby laughter
|
| 586 |
-
EkbcNbEn1Z8_000063 opening or closing car doors
|
| 587 |
-
EoubRuwDlrw_000038 canary calling
|
| 588 |
-
ErdH1gc3ZmU_000003 playing cornet
|
| 589 |
-
F186zkBSFjE_000110 helicopter
|
| 590 |
-
F1ZVQSywml4_000040 skateboarding
|
| 591 |
-
F3yETAYfYZg_000009 playing theremin
|
| 592 |
-
F6xLA2AA2GA_000090 people crowd
|
| 593 |
-
FAdeuN1uc-M_000230 subway, metro, underground
|
| 594 |
-
FCir2lQei8M_000030 playing harpsichord
|
| 595 |
-
FEltES9TUEU_000008 hammering nails
|
| 596 |
-
FGWcwpr_SeM_000133 fire truck siren
|
| 597 |
-
FGoXt7LIK3U_000010 police car (siren)
|
| 598 |
-
FHz8YQy4q5A_000027 tractor digging
|
| 599 |
-
FIPu0jd8I28_000030 people screaming
|
| 600 |
-
FIpCyWCy9Qc_000030 playing violin, fiddle
|
| 601 |
-
FRxNI559-Xs_000280 railroad car, train wagon
|
| 602 |
-
FUVXK29tUwQ_000000 owl hooting
|
| 603 |
-
FWuYLFTe3_8_000000 playing trombone
|
| 604 |
-
FXzP5bUz-Lo_000017 horse clip-clop
|
| 605 |
-
FauD2eg73V8_000030 playing electronic organ
|
| 606 |
-
Fd_SXrGw6ag_000030 playing marimba, xylophone
|
| 607 |
-
Fe9YJozRi78_000148 child singing
|
| 608 |
-
FfpD5XC8b5w_000137 playing bongo
|
| 609 |
-
FglnuP1jpRY_000030 playing cello
|
| 610 |
-
FhHBIlZ_5T8_000035 wood thrush calling
|
| 611 |
-
Fj34VCzy_Og_000030 horse clip-clop
|
| 612 |
-
Fpqf057G_SY_000000 chipmunk chirping
|
| 613 |
-
FpwtNUX45qU_000047 snake hissing
|
| 614 |
-
Frs4_Uf8Tq4_000127 ice cracking
|
| 615 |
-
FtCT62fiyrU_000270 church bell ringing
|
| 616 |
-
FtNV_Gq62l8_000019 cat meowing
|
| 617 |
-
FudSk5EUbAY_000156 playing ukulele
|
| 618 |
-
FvZqgCIbO2Q_000003 hail
|
| 619 |
-
FyszP9lfbDk_000001 playing didgeridoo
|
| 620 |
-
G-5AgMNzjv4_000017 vehicle horn, car horn, honking
|
| 621 |
-
G-Eokh465wM_000030 printer printing
|
| 622 |
-
G-IdABSxeHI_000097 dinosaurs bellowing
|
| 623 |
-
G-jsAK9ITwM_000030 ocean burbling
|
| 624 |
-
G6FhQuR3_88_000000 playing congas
|
| 625 |
-
G6nSnVQCxBQ_000095 elephant trumpeting
|
| 626 |
-
G7E7D2Z_Juo_000070 people burping
|
| 627 |
-
G7F8HVNw1lI_000081 scuba diving
|
| 628 |
-
G9AKWSzZtWI_000030 people eating
|
| 629 |
-
G9F38sObAns_000025 playing harpsichord
|
| 630 |
-
GAFJeF_AqZA_000086 hail
|
| 631 |
-
GBf5DgubSuE_000030 wind noise
|
| 632 |
-
GD8dVFZaWNU_000030 skateboarding
|
| 633 |
-
GDQjuDpqnJI_000030 wind noise
|
| 634 |
-
GL1TqKjpv1Q_000047 playing theremin
|
| 635 |
-
GLA-upuVPSA_000057 police radio chatter
|
| 636 |
-
GLtFkIbCZOY_000140 pigeon, dove cooing
|
| 637 |
-
GMNJCJ0ykfc_000050 male singing
|
| 638 |
-
GOFDdcvXq40_000030 goose honking
|
| 639 |
-
GPl4twCSrLQ_000001 coyote howling
|
| 640 |
-
GS_JqZCyqOc_000050 stream burbling
|
| 641 |
-
GT2frI8BMMM_000013 vehicle horn, car horn, honking
|
| 642 |
-
GTZkjw4aVn0_000030 engine accelerating, revving, vroom
|
| 643 |
-
GUSlicDnqIA_000045 playing congas
|
| 644 |
-
GX4kLN3hW4Y_000149 planing timber
|
| 645 |
-
GXIPKWMIVhs_000072 playing oboe
|
| 646 |
-
GXRHmy5Bqas_000008 vehicle horn, car horn, honking
|
| 647 |
-
GYJCyn2piCc_000329 lip smacking
|
| 648 |
-
GZoVDjx9ltQ_000235 playing erhu
|
| 649 |
-
GZoypVKRpCo_000003 cuckoo bird calling
|
| 650 |
-
G_hP5gvRfNw_000033 cat growling
|
| 651 |
-
GaFqib8bCLM_000019 tapping guitar
|
| 652 |
-
GbTzdC4mOtQ_000030 machine gun shooting
|
| 653 |
-
GbUoljsX3lg_000672 people gargling
|
| 654 |
-
GgUkhedV5e0_000190 female speech, woman speaking
|
| 655 |
-
GhizOxu0ZpI_000060 people belly laughing
|
| 656 |
-
GidBfE5JU3s_000005 vehicle horn, car horn, honking
|
| 657 |
-
GjKjnplphn4_000200 playing acoustic guitar
|
| 658 |
-
Gjide6V8U-E_000039 dog growling
|
| 659 |
-
GoMH9AL7YRA_000050 ambulance siren
|
| 660 |
-
GvPc1ncg0OY_000138 people booing
|
| 661 |
-
Gwp62TNrER0_000014 barn swallow calling
|
| 662 |
-
GxUovR3d2aM_000019 car engine knocking
|
| 663 |
-
GzAdcTtwkM0_000011 missile launch
|
| 664 |
-
H-ZKdWCEhbI_000140 fire crackling
|
| 665 |
-
H-jnsSCa-c8_000090 playing lacrosse
|
| 666 |
-
H-rd3O5haG8_000070 playing bass guitar
|
| 667 |
-
H0zmJjMoV-4_000012 playing squash
|
| 668 |
-
H1lx8lLLceQ_000120 machine gun shooting
|
| 669 |
-
H2r4JHm00Vg_000260 sheep bleating
|
| 670 |
-
H6onyc5r6os_000024 heart sounds, heartbeat
|
| 671 |
-
H6z_gPH8m2A_000055 people crowd
|
| 672 |
-
H7BcUVlPDsg_000026 parrot talking
|
| 673 |
-
HCPKDz63_s4_000000 child speech, kid speaking
|
| 674 |
-
HCuORBJf-Ho_000027 playing cornet
|
| 675 |
-
HL36YvzbFYs_000210 goose honking
|
| 676 |
-
HL_E1j069EI_000030 female speech, woman speaking
|
| 677 |
-
HOD29VAXJD8_000030 car engine knocking
|
| 678 |
-
HOI0ZaKLAMM_000030 fireworks banging
|
| 679 |
-
HOI7KapLzz4_000030 playing violin, fiddle
|
| 680 |
-
HOupeg-QhHk_000073 yodelling
|
| 681 |
-
HQSafj2aCNI_000100 playing banjo
|
| 682 |
-
HQlV2jYCz5k_000030 playing violin, fiddle
|
| 683 |
-
HR35d67Dhts_000108 singing bowl
|
| 684 |
-
HRaGv5q3P3E_000000 opening or closing drawers
|
| 685 |
-
HTRRMT1NQOc_000060 playing saxophone
|
| 686 |
-
HTqUtEGJ0As_000030 people whistling
|
| 687 |
-
HUP72tlgzyE_000066 playing badminton
|
| 688 |
-
HUWvhtKby-A_000033 car engine knocking
|
| 689 |
-
HW2o3t3fE_k_000062 francolin calling
|
| 690 |
-
HX2ccFGAuMU_000163 electric shaver, electric razor shaving
|
| 691 |
-
HX5BeffFwV0_000008 smoke detector beeping
|
| 692 |
-
HaABMNzUOvo_000030 wind rustling leaves
|
| 693 |
-
Hakqd6g2jaY_000110 helicopter
|
| 694 |
-
HcO60nHH4W0_000023 playing bass drum
|
| 695 |
-
HckqMrtU3dg_000133 playing double bass
|
| 696 |
-
HebxWsaO-LA_000115 train whistling
|
| 697 |
-
HkCt4hh_x58_000030 rowboat, canoe, kayak rowing
|
| 698 |
-
HlUvoEXQZYk_000007 playing tambourine
|
| 699 |
-
Hlp5qKMfdYk_000180 playing bass guitar
|
| 700 |
-
Honj-TQHx3U_000129 airplane
|
| 701 |
-
Hqhi7LioGyM_000030 playing marimba, xylophone
|
| 702 |
-
HsCj9l5Barg_000045 fire truck siren
|
| 703 |
-
HsX5XlPFOWI_000380 lawn mowing
|
| 704 |
-
Hum53_V1zw8_000001 wind noise
|
| 705 |
-
Hwp_62TYhDk_000110 playing marimba, xylophone
|
| 706 |
-
I-WMZh-ieC8_000280 playing harp
|
| 707 |
-
I-qeWJGSXuQ_000083 playing bassoon
|
| 708 |
-
I4ffG1Bh-d8_000156 playing oboe
|
| 709 |
-
I5wV1AFabIA_000029 frog croaking
|
| 710 |
-
I6_30m_TQ2o_000000 playing tuning fork
|
| 711 |
-
IBy30oL3yxw_000399 playing harpsichord
|
| 712 |
-
ICajcUYAan8_000410 people babbling
|
| 713 |
-
IEiseWb8Tao_000080 playing acoustic guitar
|
| 714 |
-
IF92YmTMtdk_000089 cattle, bovinae cowbell
|
| 715 |
-
IFGbGcs3bQQ_000034 chinchilla barking
|
| 716 |
-
IHaWOJuekYY_000109 tap dancing
|
| 717 |
-
IINqN6L2NsY_000285 tapping guitar
|
| 718 |
-
IJvYFkrfjBg_000049 tornado roaring
|
| 719 |
-
IKj9E33H8e8_000012 pig oinking
|
| 720 |
-
ILBWV9AFKDU_000115 playing ukulele
|
| 721 |
-
IN-9DFoS3fM_000007 bird squawking
|
| 722 |
-
IWVztd9QsXg_000005 owl hooting
|
| 723 |
-
IWhgJgeUQuA_000090 playing bagpipes
|
| 724 |
-
IYhq5aun18M_000181 police radio chatter
|
| 725 |
-
IZAasx5KIKE_000010 fireworks banging
|
| 726 |
-
IaAKobKeOtU_000271 people marching
|
| 727 |
-
IeD5tKVhuI4_000030 playing synthesizer
|
| 728 |
-
IeK6EDl8Z_k_000033 people clapping
|
| 729 |
-
IeW36MTcnBs_000117 dog growling
|
| 730 |
-
Ieca4fwxfyY_000049 tractor digging
|
| 731 |
-
IicM8tOXAFg_000146 pheasant crowing
|
| 732 |
-
Ik40yoz30vE_000068 woodpecker pecking tree
|
| 733 |
-
Il82kphC6es_000172 dinosaurs bellowing
|
| 734 |
-
IlLCyGNjG3M_000060 playing harp
|
| 735 |
-
InxgcOFzxWY_000070 chicken clucking
|
| 736 |
-
IpDU10kKguU_000311 vacuum cleaner cleaning floors
|
| 737 |
-
IqCRbzhPkvU_000000 lawn mowing
|
| 738 |
-
IrcX151sayY_000098 tapping guitar
|
| 739 |
-
IrkyGrHjygY_000020 tapping guitar
|
| 740 |
-
Irx-WWFsQYU_000667 people eating
|
| 741 |
-
ItnOPd_CktY_000020 people coughing
|
| 742 |
-
IuTgZQVcMBg_000007 sloshing water
|
| 743 |
-
Ivho6H4q1zk_000017 typing on typewriter
|
| 744 |
-
Iylzuk-0j64_000163 slot machine
|
| 745 |
-
J0ZBjy_EEtg_000015 people clapping
|
| 746 |
-
J18R3qBnJtA_000120 waterfall burbling
|
| 747 |
-
J1kAKMeULF8_000500 subway, metro, underground
|
| 748 |
-
J3K5HEX3gko_000030 playing banjo
|
| 749 |
-
J4VeWujsLJg_000030 typing on computer keyboard
|
| 750 |
-
J5ugw2GUbnY_000001 dog whimpering
|
| 751 |
-
J7fVkoC-Ha8_000711 people eating crisps
|
| 752 |
-
J82OaPeyioI_000030 horse clip-clop
|
| 753 |
-
JC33o6YxH9c_000220 playing piano
|
| 754 |
-
JHNBF0WJ-EM_000029 people belly laughing
|
| 755 |
-
JIdUC1zZb9M_000060 rowboat, canoe, kayak rowing
|
| 756 |
-
JK4YikH2myA_000161 playing vibraphone
|
| 757 |
-
JKrghKg6UBU_000260 ocean burbling
|
| 758 |
-
JKxdjXEI9Wc_000015 eagle screaming
|
| 759 |
-
JLPpMZlBOEI_000038 playing accordion
|
| 760 |
-
JQ3bFZbatGk_000030 people running
|
| 761 |
-
JQr-BRXrjN4_000002 airplane
|
| 762 |
-
JVevxopJjU8_000823 playing tabla
|
| 763 |
-
JXi1ZtJecYo_000001 bowling impact
|
| 764 |
-
J_k6z7_YVJU_000090 playing piano
|
| 765 |
-
Jbiig_IQdIo_000282 cap gun shooting
|
| 766 |
-
JcXhB_4B32o_000090 playing clarinet
|
| 767 |
-
JeJGThFGm80_000001 lighting firecrackers
|
| 768 |
-
JfAjUMKjoVI_000460 playing harp
|
| 769 |
-
JfiFq8tn5Pk_000009 playing steel guitar, slide guitar
|
| 770 |
-
Jk-SBbw7Afg_000140 driving buses
|
| 771 |
-
K-B9CIVeQ_U_000030 horse clip-clop
|
| 772 |
-
K-MCXLQmnFA_000004 playing banjo
|
| 773 |
-
K3KsP-m_c5I_000353 basketball bounce
|
| 774 |
-
K5HBK1c7noI_000010 cat meowing
|
| 775 |
-
KBc_FdBzN2U_000017 wind chime
|
| 776 |
-
KFFJI_TZmoY_000047 crow cawing
|
| 777 |
-
KJwga4gMEzU_000239 people slurping
|
| 778 |
-
KJxSJR3v6oE_000013 church bell ringing
|
| 779 |
-
KKd2qSxww1o_000002 typing on typewriter
|
| 780 |
-
KM_VudA7hgo_000030 people running
|
| 781 |
-
KOzRB30gxpE_000362 planing timber
|
| 782 |
-
KQbCjNzlYPs_000082 writing on blackboard with chalk
|
| 783 |
-
KU5WQZsoKRE_000079 child singing
|
| 784 |
-
K_8tBU1LYxU_000000 chicken crowing
|
| 785 |
-
Kbc8ioemPlA_000081 tractor digging
|
| 786 |
-
KdD8xho7ymw_000037 cat purring
|
| 787 |
-
KfqdB93utIg_000000 waterfall burbling
|
| 788 |
-
KfyYM6nq--A_000011 playing vibraphone
|
| 789 |
-
KnwgxGWxp7Y_000025 people whistling
|
| 790 |
-
Kp0W7S-oExs_000030 driving buses
|
| 791 |
-
Kq0Dbp3C4d0_000017 dog howling
|
| 792 |
-
KrUuPSM4LxM_000215 magpie calling
|
| 793 |
-
KsuQWEN0COQ_000199 playing darts
|
| 794 |
-
Kus5SmqOIrA_000024 mynah bird singing
|
| 795 |
-
Kwha8UYndzI_000090 playing didgeridoo
|
| 796 |
-
Kz4Jm9_iFeg_000038 hail
|
| 797 |
-
KzK6d6Qpu_o_000010 dog barking
|
| 798 |
-
KztFbSJPxg0_000197 planing timber
|
| 799 |
-
L4u9LOjcXoE_000000 people sobbing
|
| 800 |
-
LAaJfzvvlTI_000053 lions roaring
|
| 801 |
-
LAx_fanEB_g_000168 arc welding
|
| 802 |
-
LB2EbSmDSKw_000007 baby laughter
|
| 803 |
-
LBH_D9h18bw_000042 rope skipping
|
| 804 |
-
LCcPzeH_Cn4_000160 sailing
|
| 805 |
-
LE49c8e5VMU_000049 mynah bird singing
|
| 806 |
-
LEpzp8DnWyY_000026 sharpen knife
|
| 807 |
-
LGMZ9c7q8tE_000168 cat purring
|
| 808 |
-
LHYHo8wJF74_000342 playing oboe
|
| 809 |
-
LJsSbG5A1y0_000000 lighting firecrackers
|
| 810 |
-
LL618LsL2zY_000030 pig oinking
|
| 811 |
-
LMbyOx04l9E_000036 vehicle horn, car horn, honking
|
| 812 |
-
LNl3ANFth4Y_000021 mynah bird singing
|
| 813 |
-
LOLFOiNiS1o_000067 sharpen knife
|
| 814 |
-
LPTsZZVr06o_000030 people eating
|
| 815 |
-
LSsYBN_RvPc_000122 rapping
|
| 816 |
-
LWrztDg2BGI_000245 playing synthesizer
|
| 817 |
-
LYUkVukRObA_000236 pigeon, dove cooing
|
| 818 |
-
L_Da1Sv1iKU_000028 playing didgeridoo
|
| 819 |
-
L_OvLmH_feU_000021 dog growling
|
| 820 |
-
LaGhL-3ctOc_000048 playing double bass
|
| 821 |
-
LbjkUR-ERQw_000049 opening or closing drawers
|
| 822 |
-
LciaPQ1XV3c_000217 playing badminton
|
| 823 |
-
Lfmcj5VW6VE_000050 playing acoustic guitar
|
| 824 |
-
LgdtTzvKnT4_000030 rowboat, canoe, kayak rowing
|
| 825 |
-
Lj4Ngu0ars8_000138 electric shaver, electric razor shaving
|
| 826 |
-
LlRZR8xPOEw_000021 frog croaking
|
| 827 |
-
Lmp51YN-7wc_000466 people marching
|
| 828 |
-
LtqXpk2YGls_000010 chainsawing trees
|
| 829 |
-
LuxrhiicesU_000000 donkey, ass braying
|
| 830 |
-
LvzMerRGbCE_000099 bouncing on trampoline
|
| 831 |
-
LxdOWpwSzi0_000400 mouse pattering
|
| 832 |
-
Lz8Ytz12MrU_000120 chopping wood
|
| 833 |
-
M0EaEBlx5fk_000126 yodelling
|
| 834 |
-
M9cNmb9HKPc_000110 turkey gobbling
|
| 835 |
-
MApvC99wovc_000159 car engine starting
|
| 836 |
-
MEtdxR3RdEA_000180 playing piano
|
| 837 |
-
MFEhejrPVmw_000040 dog barking
|
| 838 |
-
MMTvsiahcsc_000002 fire crackling
|
| 839 |
-
MMjEIFDYQvc_000117 yodelling
|
| 840 |
-
MQPNvRDVuUs_000100 playing french horn
|
| 841 |
-
MQPggq37uX8_000003 scuba diving
|
| 842 |
-
MQcS6DqCjKQ_000030 playing vibraphone
|
| 843 |
-
MRnnE9MTm64_000052 driving snowmobile
|
| 844 |
-
MTD6-1mrtP8_000072 owl hooting
|
| 845 |
-
MVmJujaAocY_000030 baby crying
|
| 846 |
-
MXmetP4F-EU_000019 door slamming
|
| 847 |
-
MdUG2H5K5eg_000117 roller coaster running
|
| 848 |
-
MenAsca8z6s_000137 slot machine
|
| 849 |
-
Mf6bCl5HKgc_000000 wind chime
|
| 850 |
-
MfSXrFJt6d4_000007 motorboat, speedboat acceleration
|
| 851 |
-
Mhzz75z8mbY_000166 playing ukulele
|
| 852 |
-
Mk8fhA3DAsA_000030 turkey gobbling
|
| 853 |
-
MkrFhq3F_z4_000100 playing accordion
|
| 854 |
-
MlX7I-OZIyk_000062 playing timpani
|
| 855 |
-
Mmyr6Gpclbk_000070 bird chirping, tweeting
|
| 856 |
-
Mnv4KVEt18I_000018 people giggling
|
| 857 |
-
Msh94MTYC6A_000290 chainsawing trees
|
| 858 |
-
MshXUve673A_000363 elk bugling
|
| 859 |
-
Mvn2oFoKxwI_000128 people booing
|
| 860 |
-
Mvue0y_EsDU_000000 orchestra
|
| 861 |
-
MwVghEDjyQM_000030 people sobbing
|
| 862 |
-
MwsoiJOqg_g_000030 duck quacking
|
| 863 |
-
MwyzEfk2xbA_000054 playing double bass
|
| 864 |
-
Mzc3DajWA0k_000030 ice cracking
|
| 865 |
-
Mzgas545UXU_000090 playing snare drum
|
| 866 |
-
N09QFSbvIC4_000150 playing electronic organ
|
| 867 |
-
N2DQWIePoLs_000030 playing violin, fiddle
|
| 868 |
-
N3_jZV1ejnA_000030 crow cawing
|
| 869 |
-
N5CNEOKptjo_000000 splashing water
|
| 870 |
-
N8cNWpCL0Rs_000183 owl hooting
|
| 871 |
-
N9cM9BdATNs_000081 people booing
|
| 872 |
-
NAETplWD64g_000030 playing harpsichord
|
| 873 |
-
NAk-PU3X_DQ_000026 mynah bird singing
|
| 874 |
-
NBeonGAqO84_000032 playing bugle
|
| 875 |
-
NCdkXluu-D8_000350 playing harpsichord
|
| 876 |
-
NFd5Zot-0_c_000006 heart sounds, heartbeat
|
| 877 |
-
NJfJ4E9EVoM_000120 people whispering
|
| 878 |
-
NN6mOUDBjEM_000042 dog howling
|
| 879 |
-
NWSsGcjVRDw_000245 playing tabla
|
| 880 |
-
NZs6RgHZOoI_000013 firing muskets
|
| 881 |
-
NfcCnLiHlqU_000134 playing erhu
|
| 882 |
-
NhO6B0zM9Pc_000030 playing electronic organ
|
| 883 |
-
NjKRF79wl5Y_000110 wind noise
|
| 884 |
-
Nkz9_eGsHKY_000057 people booing
|
| 885 |
-
NmdqThtOVro_000160 beat boxing
|
| 886 |
-
NnNm_oqkG0o_000050 people sobbing
|
| 887 |
-
NniPHshHj9M_000068 playing didgeridoo
|
| 888 |
-
NqxCX4G3N2g_000107 playing volleyball
|
| 889 |
-
NrCNo4V7RVM_000030 lawn mowing
|
| 890 |
-
NrWxMrh7cGw_000210 playing cornet
|
| 891 |
-
NtJQ6W2o0EI_000075 canary calling
|
| 892 |
-
NwIDavS0llk_000010 chicken crowing
|
| 893 |
-
O-C9p_sK_eI_000030 horse clip-clop
|
| 894 |
-
O0QV4_JRM0M_000002 car engine knocking
|
| 895 |
-
O15FUv56iCc_000040 playing cymbal
|
| 896 |
-
O3geFV-GoqM_000031 fire truck siren
|
| 897 |
-
O5LFB39yCA4_000085 missile launch
|
| 898 |
-
O5TMWyFd1DQ_000180 playing vibraphone
|
| 899 |
-
O6_sGC3v96g_000006 wood thrush calling
|
| 900 |
-
O7KCtFRaWck_000080 alarm clock ringing
|
| 901 |
-
OEu8pZpN8ZA_000000 using sewing machines
|
| 902 |
-
OIqUka8BOS8_021217 warbler chirping
|
| 903 |
-
OLtTuBhG-og_000075 playing squash
|
| 904 |
-
OTVFQoNRQTs_000060 playing bass guitar
|
| 905 |
-
OWlCVuOznw0_000019 arc welding
|
| 906 |
-
OZ14CiqpJL8_000010 child speech, kid speaking
|
| 907 |
-
Ocdu7Lz0IuU_000003 francolin calling
|
| 908 |
-
OdGHvGlSUcM_000157 playing timpani
|
| 909 |
-
Of59qi5xxkM_000050 playing drum kit
|
| 910 |
-
OiIaJb68Haw_000050 playing banjo
|
| 911 |
-
OjOQ0K6lza8_000018 playing tuning fork
|
| 912 |
-
Okd7ksWR-fc_000547 swimming
|
| 913 |
-
Oljdv3iSTBc_000047 people eating noodle
|
| 914 |
-
Om-Uc7ia1f0_000320 playing bass guitar
|
| 915 |
-
OrueZOVOAD8_000010 motorboat, speedboat acceleration
|
| 916 |
-
OtDVd-1zaqU_000030 motorboat, speedboat acceleration
|
| 917 |
-
Ow1ZEhmP3qU_000116 typing on typewriter
|
| 918 |
-
OyoJ99jDQdo_000147 playing didgeridoo
|
| 919 |
-
P1eMMIK0cTs_000011 mynah bird singing
|
| 920 |
-
P2taxpwuzcw_000053 wood thrush calling
|
| 921 |
-
P2wbv4C6bBA_000210 barn swallow calling
|
| 922 |
-
P35m_Rn7HbA_000030 motorboat, speedboat acceleration
|
| 923 |
-
P5Y1D-fSVfg_000054 lip smacking
|
| 924 |
-
P6sG1m6C4zI_000081 mouse pattering
|
| 925 |
-
PavKY6YlSl4_000026 people whistling
|
| 926 |
-
PawUc0pqf9M_000260 car engine starting
|
| 927 |
-
PbomocKzqKU_000109 splashing water
|
| 928 |
-
PeJxiP0CPn4_000025 playing didgeridoo
|
| 929 |
-
PfwBOCxEst8_000243 cheetah chirrup
|
| 930 |
-
PjbxRjKvzw4_000030 wind noise
|
| 931 |
-
Pll-TpbHen4_000067 airplane
|
| 932 |
-
Pp61sP7bols_000076 cap gun shooting
|
| 933 |
-
PrIQbadXX74_000692 playing oboe
|
| 934 |
-
PsmihTl5Cx8_000060 pig oinking
|
| 935 |
-
Pu4BCOv6e5Q_000020 fireworks banging
|
| 936 |
-
PvE48Ub_CgA_000034 bird chirping, tweeting
|
| 937 |
-
PvpA8y7-ZC4_000101 people burping
|
| 938 |
-
Pvt8VUQ_Bso_000030 playing vibraphone
|
| 939 |
-
PxEpiEid_c8_000177 slot machine
|
| 940 |
-
Py5s1uL46L0_000100 male singing
|
| 941 |
-
Pz618GchhGI_000001 otter growling
|
| 942 |
-
Pz9BhPMUzv8_000258 lathe spinning
|
| 943 |
-
Q38lPvwj5Gw_000234 swimming
|
| 944 |
-
Q57DFiTwcM4_000221 people eating noodle
|
| 945 |
-
Q5jnMD1z86k_000287 people eating noodle
|
| 946 |
-
Q7X3fyId2U0_000090 tornado roaring
|
| 947 |
-
Q7ZPnRQraJk_000200 playing clarinet
|
| 948 |
-
Q9AvyaxgRRo_000141 playing steel guitar, slide guitar
|
| 949 |
-
QBFaKTDXCCQ_000120 playing acoustic guitar
|
| 950 |
-
QBfcf-k5U28_000007 vehicle horn, car horn, honking
|
| 951 |
-
QCX3H9wXgpo_000053 cap gun shooting
|
| 952 |
-
QEcQtxP1fdg_000056 playing bongo
|
| 953 |
-
QGkUBiVG8-Y_000034 owl hooting
|
| 954 |
-
QH5ZtCI9Hts_000125 chopping wood
|
| 955 |
-
QL6Ws4i07is_000040 goat bleating
|
| 956 |
-
QO9sbXhMq08_000220 people hiccup
|
| 957 |
-
QOFuXRetSLI_000064 arc welding
|
| 958 |
-
QT1nE5lR7wA_000035 cat growling
|
| 959 |
-
QTqN9c6661s_000000 forging swords
|
| 960 |
-
QUMzyZRYpWs_000019 playing steel guitar, slide guitar
|
| 961 |
-
QXEq7sE7dqg_000030 police car (siren)
|
| 962 |
-
QXjaLCotbpY_000055 playing zither
|
| 963 |
-
QZ-cG6VdBHM_000070 helicopter
|
| 964 |
-
QaBmzAFivPQ_000096 people marching
|
| 965 |
-
QcL-X7hJJYQ_000000 people whistling
|
| 966 |
-
Qd9_UMNMhcA_000010 typing on computer keyboard
|
| 967 |
-
QdEYMboSweA_000001 playing oboe
|
| 968 |
-
Qdrcv-ZjC-g_000037 car engine starting
|
| 969 |
-
QgEi6pAW36g_000150 male speech, man speaking
|
| 970 |
-
QgHYiH6ES08_000120 basketball bounce
|
| 971 |
-
QgQKqaMqRgs_000062 playing bongo
|
| 972 |
-
QhRcayuLZ48_000390 playing piano
|
| 973 |
-
Qlj2HEcX05Q_000136 playing saxophone
|
| 974 |
-
QpA1_cezBwA_000123 mouse clicking
|
| 975 |
-
Qr2PeUXBJu4_000197 playing erhu
|
| 976 |
-
Qs8RjZlOcdU_000030 car passing by
|
| 977 |
-
QsHeqaa4Ckc_000072 people whistling
|
| 978 |
-
QsNHM92SIvo_000000 people whistling
|
| 979 |
-
QvEMTs9_RQE_000010 lawn mowing
|
| 980 |
-
Qwxa7ZCEBQs_000006 cricket chirping
|
| 981 |
-
QyRrtn5AoSg_000280 playing saxophone
|
| 982 |
-
QypTigdvLWU_000017 firing cannon
|
| 983 |
-
R0YwusOkMx0_000008 bowling impact
|
| 984 |
-
R1h5rRHM3oI_000000 donkey, ass braying
|
| 985 |
-
R29qwv_mh4E_000018 playing bassoon
|
| 986 |
-
R3VnztSX-k8_000200 ice cream truck, ice cream van
|
| 987 |
-
R7KnzEqUGAc_000040 playing cymbal
|
| 988 |
-
R7bSeIfRG-Y_000590 eating with cutlery
|
| 989 |
-
R8cWq9GoEpE_000037 pheasant crowing
|
| 990 |
-
RBHqcDacio0_000182 beat boxing
|
| 991 |
-
RCIMcizSSZU_000044 francolin calling
|
| 992 |
-
RDuDqEmKucQ_000030 motorboat, speedboat acceleration
|
| 993 |
-
RJNjaPizyKg_000099 playing theremin
|
| 994 |
-
RKZmAYXXWbg_000247 canary calling
|
| 995 |
-
RM6uf-sdVQI_000043 playing bass drum
|
| 996 |
-
RN96eLdMN_I_000005 bull bellowing
|
| 997 |
-
ROsAOQe62gs_000050 playing electronic organ
|
| 998 |
-
RVqCdL7_G2Y_000030 car engine knocking
|
| 999 |
-
RWnvolYKQ2o_000414 lip smacking
|
| 1000 |
-
R_SNrPUIa1A_000140 playing bassoon
|
| 1001 |
-
R_yW6SKe_-M_000080 people booing
|
| 1002 |
-
RaawVrMvP7k_000048 pheasant crowing
|
| 1003 |
-
Rb0IEIeJTKY_000002 basketball bounce
|
| 1004 |
-
Rc_exQXrUG0_000100 skidding
|
| 1005 |
-
ReZUlDwGaLY_000080 playing marimba, xylophone
|
| 1006 |
-
Ria-XrpfgsA_000089 people marching
|
| 1007 |
-
Rifu8nB2cCs_000043 cat purring
|
| 1008 |
-
Riu9TpsQ_mk_000009 pig oinking
|
| 1009 |
-
RmGGiQMURcQ_000022 people sniggering
|
| 1010 |
-
RoLNzNAv-Ig_000030 motorboat, speedboat acceleration
|
| 1011 |
-
RrpMoJrp4AY_000180 people crowd
|
| 1012 |
-
RsYAulhucVI_000011 lions roaring
|
| 1013 |
-
RtHMCINXA0s_000052 cricket chirping
|
| 1014 |
-
Rur-IfwPZho_000051 dog howling
|
| 1015 |
-
RwE9JAktTvU_000580 people coughing
|
| 1016 |
-
RyV40yhlOeU_000419 people marching
|
| 1017 |
-
S29c6T__5HU_000003 playing timpani
|
| 1018 |
-
S3Ipyd9HHLk_000185 magpie calling
|
| 1019 |
-
S45cdr4x-mc_000080 chainsawing trees
|
| 1020 |
-
S9fw7NHd2eo_000380 playing electric guitar
|
| 1021 |
-
SBYzwBhUpYs_000166 playing badminton
|
| 1022 |
-
SBwOIJoGChM_000116 hammering nails
|
| 1023 |
-
SCjdlZSW8nY_000111 playing table tennis
|
| 1024 |
-
SGkzdDWFIHI_000085 playing bass drum
|
| 1025 |
-
SHebWHn0c2Y_000005 chopping wood
|
| 1026 |
-
SPnZIDCnKwM_000030 orchestra
|
| 1027 |
-
SS6iMabGB1Y_000020 chimpanzee pant-hooting
|
| 1028 |
-
ST33aEP5Hbc_000006 train horning
|
| 1029 |
-
SXC13GS87Co_000031 woodpecker pecking tree
|
| 1030 |
-
SXHYr-7nPaw_000030 playing drum kit
|
| 1031 |
-
SYDQX7Whjm4_000061 woodpecker pecking tree
|
| 1032 |
-
SYWqIfMOmGE_000051 hammering nails
|
| 1033 |
-
S_0v5j4S100_000039 cat purring
|
| 1034 |
-
S_qPgRNSkIw_000370 people clapping
|
| 1035 |
-
SbXyRN0DD-g_000080 dog bow-wow
|
| 1036 |
-
Sc-Ld96kbN0_000144 playing synthesizer
|
| 1037 |
-
SdCzaAUA6Xs_000005 playing djembe
|
| 1038 |
-
SeZm-iy9n8M_000150 playing electronic organ
|
| 1039 |
-
Sf0aZczIZVU_000040 playing cello
|
| 1040 |
-
SgYh5Lb7tlM_000130 playing flute
|
| 1041 |
-
SifYJFmSSRw_000123 playing marimba, xylophone
|
| 1042 |
-
Sl4weBj8xfc_000030 typing on computer keyboard
|
| 1043 |
-
SoVEYhxQabk_000103 canary calling
|
| 1044 |
-
Spm_zrjedzk_000392 cap gun shooting
|
| 1045 |
-
Sqq2dUA8t3A_000586 playing harmonica
|
| 1046 |
-
SvJ0kUY22C8_000055 turkey gobbling
|
| 1047 |
-
Sw6qDVMsR5M_000030 playing violin, fiddle
|
| 1048 |
-
SwQie7apk78_000198 playing darts
|
| 1049 |
-
SyEVBFw_9oE_000120 people screaming
|
| 1050 |
-
SyfyWK7dKXA_000021 playing squash
|
| 1051 |
-
SzmORuHD4g4_000059 wind chime
|
| 1052 |
-
T-AN31N4LD0_000050 people screaming
|
| 1053 |
-
T0NMgZC7CDU_000011 chipmunk chirping
|
| 1054 |
-
T19Xf5-OTHw_000130 playing piano
|
| 1055 |
-
T2zZbnu_NtM_000029 playing table tennis
|
| 1056 |
-
T4KEGH_8lY8_000119 playing timpani
|
| 1057 |
-
TAdH0kUJj9k_000050 helicopter
|
| 1058 |
-
TCUnK4k7QZ0_000000 telephone bell ringing
|
| 1059 |
-
TDh8_ixGzIo_000030 printer printing
|
| 1060 |
-
TGngN3n7EMw_000024 airplane flyby
|
| 1061 |
-
TLSmnnnyhEk_000030 people shuffling
|
| 1062 |
-
TMyd50KWyNo_000311 people slurping
|
| 1063 |
-
TNCcQfbselM_000120 francolin calling
|
| 1064 |
-
TQapWHNS5FE_000024 car engine knocking
|
| 1065 |
-
TRW01xXMMqg_000210 playing accordion
|
| 1066 |
-
TRt_14JcRWQ_000080 playing bass drum
|
| 1067 |
-
TTElms_ZWqI_000428 hair dryer drying
|
| 1068 |
-
TTstWFDMmqc_000030 people whistling
|
| 1069 |
-
TUPEF6PQxow_000132 rapping
|
| 1070 |
-
T_iuImHtqUI_000010 people sobbing
|
| 1071 |
-
Ta__Ev0mkBk_000030 chainsawing trees
|
| 1072 |
-
TakDv24Tiq0_000032 plastic bottle crushing
|
| 1073 |
-
TcN0QofoTvg_000221 playing erhu
|
| 1074 |
-
TdkhMZZvdgc_000006 owl hooting
|
| 1075 |
-
Tdyh5ziqH-U_000007 lions roaring
|
| 1076 |
-
TiaGOZ-ibxw_000411 people booing
|
| 1077 |
-
TriRWR9YiNk_000016 frog croaking
|
| 1078 |
-
Tse5rzNV5dk_000084 pheasant crowing
|
| 1079 |
-
Tze9ybKops4_000020 playing synthesizer
|
| 1080 |
-
U3-h9ZARqD4_000264 police radio chatter
|
| 1081 |
-
U34oQw93afs_000219 playing tambourine
|
| 1082 |
-
U3zsgbf9WHQ_000194 horse neighing
|
| 1083 |
-
U4RRMpX2wCU_000010 toilet flushing
|
| 1084 |
-
U55bYLMVKiw_000193 pheasant crowing
|
| 1085 |
-
U6vVDGaKL3Q_000354 bouncing on trampoline
|
| 1086 |
-
U9qUXBqIoZ0_000106 dog howling
|
| 1087 |
-
UA62hwIBgGY_000020 chicken clucking
|
| 1088 |
-
UFIi1OuMx0o_000302 rope skipping
|
| 1089 |
-
UGwl5VOHuaw_000200 playing accordion
|
| 1090 |
-
UIFxlzHYPBM_000060 gibbon howling
|
| 1091 |
-
UJ1lZOY9LSY_000035 playing didgeridoo
|
| 1092 |
-
UM1j8kFaxi8_000020 motorboat, speedboat acceleration
|
| 1093 |
-
UOL-hbkzUN4_000010 barn swallow calling
|
| 1094 |
-
UOlwg402_r4_000070 people clapping
|
| 1095 |
-
UPUwaW8jfhA_000030 ice cream truck, ice cream van
|
| 1096 |
-
UQonGRRRpv4_000024 goose honking
|
| 1097 |
-
UUKyUUjv8qg_000030 church bell ringing
|
| 1098 |
-
UZAB21OSorM_000007 electric shaver, electric razor shaving
|
| 1099 |
-
UZYfRXafn9I_000005 ferret dooking
|
| 1100 |
-
UZp0AcdimvA_000021 cattle, bovinae cowbell
|
| 1101 |
-
UeCkRYU_SuM_000100 playing accordion
|
| 1102 |
-
Uf2j1VbOk8c_000055 pheasant crowing
|
| 1103 |
-
UfG4dP0szuY_000040 fireworks banging
|
| 1104 |
-
UjTYiJ0dm8s_000002 vehicle horn, car horn, honking
|
| 1105 |
-
UkdS0cwAGYE_000010 car engine starting
|
| 1106 |
-
UnGLtJX29Hc_000043 planing timber
|
| 1107 |
-
UoFgJXGWJXA_000111 playing congas
|
| 1108 |
-
UpWivODbpIY_000059 owl hooting
|
| 1109 |
-
UsJAb6aftq8_000580 playing bagpipes
|
| 1110 |
-
UuuQH-TFxMo_000034 missile launch
|
| 1111 |
-
UzKZijSs4-A_000004 fox barking
|
| 1112 |
-
UzPSMiqeH3Y_000118 singing choir
|
| 1113 |
-
V-ZbY0SL2XI_000040 people sniggering
|
| 1114 |
-
V1ALglq7_x8_000018 dog growling
|
| 1115 |
-
V6lQVpw888U_000590 machine gun shooting
|
| 1116 |
-
V6y-jCli4I4_000000 cuckoo bird calling
|
| 1117 |
-
V7SGeTSJz9w_000090 skateboarding
|
| 1118 |
-
V82SmRI0GHY_000030 playing clarinet
|
| 1119 |
-
V83lIhKVraY_000125 playing darts
|
| 1120 |
-
VCEicqV_2Xw_000030 ambulance siren
|
| 1121 |
-
VDXN0xwWgRA_000083 playing bass guitar
|
| 1122 |
-
VDzkPfnI1g4_000093 playing djembe
|
| 1123 |
-
VEER910vqMk_000002 duck quacking
|
| 1124 |
-
VEhmvrgrZb0_000000 chicken clucking
|
| 1125 |
-
VFj1vFMV3dQ_000025 playing darts
|
| 1126 |
-
VGrI3TMjWog_000120 playing vibraphone
|
| 1127 |
-
VHQjG81NcXE_000030 crow cawing
|
| 1128 |
-
VS9R3iOc4Vk_000027 pheasant crowing
|
| 1129 |
-
VU9W8Y1E5u4_000030 bouncing on trampoline
|
| 1130 |
-
VdxslFvStdo_000370 female speech, woman speaking
|
| 1131 |
-
VfXlyIjtfo4_000117 baby babbling
|
| 1132 |
-
Vgs_XjEqKl0_000020 people sobbing
|
| 1133 |
-
Vh4E5JPTMBM_000146 typing on typewriter
|
| 1134 |
-
VhLn9pUFwXw_000039 chopping wood
|
| 1135 |
-
VhUG4vTpPUo_000324 ripping paper
|
| 1136 |
-
VhsFniEZO-k_000026 mynah bird singing
|
| 1137 |
-
Vkbp8VmL3pM_000040 people sobbing
|
| 1138 |
-
VkgLWYydiPE_000125 tractor digging
|
| 1139 |
-
VlGuwiKwJAM_000027 playing sitar
|
| 1140 |
-
VlkgwzKAamE_000051 ripping paper
|
| 1141 |
-
Vnnw7lK63rg_000041 playing snare drum
|
| 1142 |
-
Vt3qBXzyS5k_000280 eating with cutlery
|
| 1143 |
-
VwZ8gzI3qNE_000106 people slapping
|
| 1144 |
-
VwqcV76E6Nk_000000 people booing
|
| 1145 |
-
VwqqmiiznQU_000028 woodpecker pecking tree
|
| 1146 |
-
Vxs0xCJI92Y_000080 driving motorcycle
|
| 1147 |
-
Vzb427ZmWvw_000220 fireworks banging
|
| 1148 |
-
W0PwVllBxkI_000114 playing steel guitar, slide guitar
|
| 1149 |
-
W1o_XgU8lec_000050 skateboarding
|
| 1150 |
-
W2_8zRHaEPk_000150 playing vibraphone
|
| 1151 |
-
W2gkFTFR8mw_000047 rope skipping
|
| 1152 |
-
W4eT7fj-aIA_000201 driving snowmobile
|
| 1153 |
-
W5oXrz8dqBk_000030 playing piano
|
| 1154 |
-
W5wBkCwEEmY_000140 playing banjo
|
| 1155 |
-
W7OJevEgq7w_000000 dog bow-wow
|
| 1156 |
-
W7u5kEt-q-8_000000 playing tennis
|
| 1157 |
-
W9L5rTbcMFA_000004 people eating noodle
|
| 1158 |
-
WABbXpAT_UA_000049 playing bagpipes
|
| 1159 |
-
WAhoodHHm2w_000001 playing squash
|
| 1160 |
-
WBOqGIqUwGg_000090 people sniggering
|
| 1161 |
-
WD0aVtBqoxo_000120 goose honking
|
| 1162 |
-
WDmJ4ZtLuNU_000102 playing timbales
|
| 1163 |
-
WGHTlOM4-3w_000050 sheep bleating
|
| 1164 |
-
WH7LBLKyEkA_000241 playing mandolin
|
| 1165 |
-
WIWRYG4vJC4_000020 people burping
|
| 1166 |
-
WIZTFH-LGpo_000001 planing timber
|
| 1167 |
-
WJQ27fShKvk_000000 playing tennis
|
| 1168 |
-
WQFZLDitkkM_000067 eletric blender running
|
| 1169 |
-
WQuoH_HyUAk_000030 playing cello
|
| 1170 |
-
WRvPzjj5uoE_000134 ice cream truck, ice cream van
|
| 1171 |
-
WWzD6E9Wp_k_000260 playing cornet
|
| 1172 |
-
WXMt58sLsf8_000028 zebra braying
|
| 1173 |
-
WZ568vdA7bU_000070 plastic bottle crushing
|
| 1174 |
-
We-E7-Sx3Zo_000260 barn swallow calling
|
| 1175 |
-
Wg86ercBjY0_000002 playing clarinet
|
| 1176 |
-
Wh8A7CAuLe0_000028 barn swallow calling
|
| 1177 |
-
Whjk5Fvue1o_000030 singing choir
|
| 1178 |
-
Wj0qIPUjTfE_000008 lions roaring
|
| 1179 |
-
WqKP-0cSKgs_000030 dog bow-wow
|
| 1180 |
-
WvRkqVmRH0g_000088 playing harp
|
| 1181 |
-
WvcM0ueEjfo_000050 people burping
|
| 1182 |
-
Ww3CMatNd84_000721 cat purring
|
| 1183 |
-
WxQHtaD0Yqg_000028 tractor digging
|
| 1184 |
-
X-o1Twh5SFY_000032 playing steelpan
|
| 1185 |
-
X0gT3reH8A8_000120 people sniggering
|
| 1186 |
-
X17lq90OIO8_000020 dog barking
|
| 1187 |
-
X5C9NY9MjA4_000105 train whistling
|
| 1188 |
-
X7EGSxA-aCI_000132 child singing
|
| 1189 |
-
XBAwcPvVSoA_000068 lathe spinning
|
| 1190 |
-
XDMTylVtYx4_000190 race car, auto racing
|
| 1191 |
-
XEOUYLlaef4_000003 rope skipping
|
| 1192 |
-
XJnKU_SXYlM_000049 playing tabla
|
| 1193 |
-
XK4Ws-xvt10_000267 vacuum cleaner cleaning floors
|
| 1194 |
-
XKp4HCxVmaI_000017 vehicle horn, car horn, honking
|
| 1195 |
-
XLTqSk1Z3D0_000000 police radio chatter
|
| 1196 |
-
XM6eeVHjmLk_000001 dog growling
|
| 1197 |
-
XNgq-cDV7FI_000101 dinosaurs bellowing
|
| 1198 |
-
XOTSovKwxLk_000030 child speech, kid speaking
|
| 1199 |
-
XSJzshsMz30_000030 chainsawing trees
|
| 1200 |
-
XTDo4OaFapg_000100 hammering nails
|
| 1201 |
-
XU8dCEdiGWc_000010 crow cawing
|
| 1202 |
-
XUyBxCbiv7A_000073 playing bassoon
|
| 1203 |
-
XVveRibUh18_000023 frog croaking
|
| 1204 |
-
XWp8qMpnD00_000026 electric shaver, electric razor shaving
|
| 1205 |
-
XYZ4Nd4qV-I_000101 people humming
|
| 1206 |
-
XdSCT_cQDbE_000010 splashing water
|
| 1207 |
-
Xgm17YbPztk_000022 playing didgeridoo
|
| 1208 |
-
XiExpKM1Hpo_000160 playing trombone
|
| 1209 |
-
XlJ-tAbzzSg_000234 alligators, crocodiles hissing
|
| 1210 |
-
XtExs7nIzts_000034 people booing
|
| 1211 |
-
Xv4AVT2QYhA_000100 rowboat, canoe, kayak rowing
|
| 1212 |
-
Xxq7CElxJLc_000063 singing choir
|
| 1213 |
-
Y798EuJZaPU_000017 playing squash
|
| 1214 |
-
Y9Oee-VRfVA_000339 airplane
|
| 1215 |
-
YC_k4W1YaDw_000030 race car, auto racing
|
| 1216 |
-
YD41QET24SM_000125 playing badminton
|
| 1217 |
-
YD7jTek7yVU_000206 arc welding
|
| 1218 |
-
YEatlg_b0BY_000054 people burping
|
| 1219 |
-
YISopDKuQ0k_000050 playing accordion
|
| 1220 |
-
YJ5xLJ85AwM_000106 tractor digging
|
| 1221 |
-
YOTnbp40tf4_000030 male singing
|
| 1222 |
-
YOrImbuhsQ8_000027 lions roaring
|
| 1223 |
-
YS_zTwf-FRo_000092 playing ukulele
|
| 1224 |
-
YU78jPcU6FI_000070 playing trumpet
|
| 1225 |
-
YUXZVAQ1iJ4_000007 volcano explosion
|
| 1226 |
-
YUcdJy-rpD8_000590 raining
|
| 1227 |
-
YVOmkmjoT40_000030 ocean burbling
|
| 1228 |
-
YYgYiO9DjEY_000161 tap dancing
|
| 1229 |
-
YbALYr-5WpM_000000 playing harmonica
|
| 1230 |
-
YbOztklOkF0_000023 goose honking
|
| 1231 |
-
YcvHv44MYiU_000027 barn swallow calling
|
| 1232 |
-
YdjsatpizhE_000023 airplane flyby
|
| 1233 |
-
Ye72yJyWxs8_000021 airplane flyby
|
| 1234 |
-
YeEySSrxwpg_000078 barn swallow calling
|
| 1235 |
-
YfZp5C7xrKs_000181 playing bassoon
|
| 1236 |
-
YgySYOAi8JQ_000396 skiing
|
| 1237 |
-
YhJwTBFij48_000015 motorboat, speedboat acceleration
|
| 1238 |
-
YjCLRifFCj0_000010 skateboarding
|
| 1239 |
-
YjJioclqdQ8_000150 wind noise
|
| 1240 |
-
Ys1P04EjGH4_000196 playing bassoon
|
| 1241 |
-
Ys9j6IBcFBo_000024 opening or closing car doors
|
| 1242 |
-
YvBCKb1LbCk_000095 fire truck siren
|
| 1243 |
-
Yvq8WrFpXhE_000057 people crowd
|
| 1244 |
-
YwNdDHEhm2g_000005 duck quacking
|
| 1245 |
-
YwTFxcWCac8_000381 electric grinder grinding
|
| 1246 |
-
YyqqXEmYPIA_000020 ambulance siren
|
| 1247 |
-
YzBaTwjmikc_000018 hammering nails
|
| 1248 |
-
Z-V-1iUbMWI_000520 lions growling
|
| 1249 |
-
Z1BhAXfiZtU_000037 vacuum cleaner cleaning floors
|
| 1250 |
-
Z4QR8uvx_Wk_000169 reversing beeps
|
| 1251 |
-
Z5SyUJSDCOA_000562 ripping paper
|
| 1252 |
-
Z7Hzc1Yw2aY_000060 sloshing water
|
| 1253 |
-
Z93pTtHnDXo_000110 playing vibraphone
|
| 1254 |
-
Z9nG2fIh214_000075 chinchilla barking
|
| 1255 |
-
ZALP7Di4HaM_000180 playing saxophone
|
| 1256 |
-
ZAZZ1wImM9M_000010 singing choir
|
| 1257 |
-
ZCA_NapBTlg_000060 dog barking
|
| 1258 |
-
ZDDnEdzjyrE_000597 playing tambourine
|
| 1259 |
-
ZFGcmmpt1bs_000094 playing bagpipes
|
| 1260 |
-
ZL_MxixlnHE_000079 reversing beeps
|
| 1261 |
-
ZNboftBNdyY_000406 cap gun shooting
|
| 1262 |
-
ZPODO-Ehl_M_000030 male singing
|
| 1263 |
-
ZQO_uhrJPNA_000110 playing violin, fiddle
|
| 1264 |
-
ZUjum5gZMKM_000140 playing accordion
|
| 1265 |
-
Z_Bk_CnpWsY_000198 people sneezing
|
| 1266 |
-
Z_sW4UxpbbY_000050 using sewing machines
|
| 1267 |
-
ZbtuNDtoyOI_000030 sliding door
|
| 1268 |
-
ZcskQV2A2cQ_000030 playing flute
|
| 1269 |
-
ZdtaSkUkrIE_000256 police radio chatter
|
| 1270 |
-
ZeDa5hT2ffk_000071 police radio chatter
|
| 1271 |
-
Zgbuj3y2iuY_000210 cattle mooing
|
| 1272 |
-
Zh2whhvFWsM_000016 pigeon, dove cooing
|
| 1273 |
-
ZhLwVzOZziA_000368 blowtorch igniting
|
| 1274 |
-
Zi3FOnx4nuk_000001 playing table tennis
|
| 1275 |
-
Zj73Wh6LEiU_000120 skateboarding
|
| 1276 |
-
ZjN9CL7B-9I_000239 playing timbales
|
| 1277 |
-
ZkfUo4l9ruc_000090 chainsawing trees
|
| 1278 |
-
Zl_ZWSLB8Ic_000024 sheep bleating
|
| 1279 |
-
Zs8liAFeuuQ_000058 smoke detector beeping
|
| 1280 |
-
ZtPoTqVxVvU_000050 helicopter
|
| 1281 |
-
Zu0BpngzT_Q_000007 bowling impact
|
| 1282 |
-
ZuwSkX0RQQY_000343 playing tennis
|
| 1283 |
-
ZxmKMSUpbvc_000065 car engine idling
|
| 1284 |
-
ZxpiZiSAm9I_000060 turkey gobbling
|
| 1285 |
-
Zy70U6w0yXw_000088 mynah bird singing
|
| 1286 |
-
ZyUqhIDVuNc_000541 scuba diving
|
| 1287 |
-
Zz0fhQuHZEE_000012 penguins braying
|
| 1288 |
-
_0iRtZRG6UA_000047 woodpecker pecking tree
|
| 1289 |
-
_4RRKzDUd60_000079 lathe spinning
|
| 1290 |
-
_7GnnuKVVCM_000023 engine accelerating, revving, vroom
|
| 1291 |
-
_8FhgH9k7Rw_000120 vacuum cleaner cleaning floors
|
| 1292 |
-
_9wN5d1Z1ak_000024 lions roaring
|
| 1293 |
-
_CF34A0RrPs_000018 horse neighing
|
| 1294 |
-
_Cks36T64zE_000061 striking pool
|
| 1295 |
-
_DdVu5sPsjk_000490 people whispering
|
| 1296 |
-
_GaEZe-Z73k_000233 fire crackling
|
| 1297 |
-
_HRn4aOhjhU_000016 canary calling
|
| 1298 |
-
_H_W34UobYU_000459 bouncing on trampoline
|
| 1299 |
-
_HcIHVLRzpM_000450 female singing
|
| 1300 |
-
_NShiXyBmsY_000270 train wheels squealing
|
| 1301 |
-
_Ow1h1eTNk0_000178 playing trombone
|
| 1302 |
-
_SfaPFwwJHs_000026 train wheels squealing
|
| 1303 |
-
_T0iCBHWKt0_000101 pig oinking
|
| 1304 |
-
_T5ZUrmRiQI_000108 playing ukulele
|
| 1305 |
-
_Uyw_Legahg_000045 tap dancing
|
| 1306 |
-
_VOx5BWJsyQ_000030 raining
|
| 1307 |
-
_WQQ3QvGrYw_000340 child speech, kid speaking
|
| 1308 |
-
_WUAz2RAZZc_000201 planing timber
|
| 1309 |
-
_YF3aFSsgUk_000093 playing steel guitar, slide guitar
|
| 1310 |
-
_YhSeML8rQo_000109 alligators, crocodiles hissing
|
| 1311 |
-
_aX_UzkXRd0_000140 helicopter
|
| 1312 |
-
_cvucKdFb5I_000043 people booing
|
| 1313 |
-
_dIzu78Ld2w_000166 lathe spinning
|
| 1314 |
-
_gQFB_Utuf0_000077 cat caterwauling
|
| 1315 |
-
_j8zzvBts98_000000 splashing water
|
| 1316 |
-
_m6lwfMU8Eo_000272 electric shaver, electric razor shaving
|
| 1317 |
-
_pSMw5FKHX0_000040 people sobbing
|
| 1318 |
-
_pfccpy7Cqc_000180 typing on typewriter
|
| 1319 |
-
_ru-n--PRNA_000030 police car (siren)
|
| 1320 |
-
_t-Abwz6JG4_000031 baby babbling
|
| 1321 |
-
_t259gootxc_000190 female speech, woman speaking
|
| 1322 |
-
_u9zUuBdo1k_000000 cat growling
|
| 1323 |
-
_vkXDgupDN8_000250 sailing
|
| 1324 |
-
_wvB2HlVn1I_000050 engine accelerating, revving, vroom
|
| 1325 |
-
_xGLwynjhSs_000010 playing french horn
|
| 1326 |
-
_xq-9GZBfrg_000014 pheasant crowing
|
| 1327 |
-
_yVgX3hi1OQ_000195 driving snowmobile
|
| 1328 |
-
_zTmqhuLwAM_000001 donkey, ass braying
|
| 1329 |
-
a0LIemH5Cw0_000010 people clapping
|
| 1330 |
-
a3ZAFViNYyk_000000 swimming
|
| 1331 |
-
a57DUeBMeHY_000320 rowboat, canoe, kayak rowing
|
| 1332 |
-
a6CPpulnJ2A_000420 stream burbling
|
| 1333 |
-
a8fa79w2aIQ_000023 lighting firecrackers
|
| 1334 |
-
aC3nlLHFOfk_000030 playing violin, fiddle
|
| 1335 |
-
aCnLa_H0-P0_000000 magpie calling
|
| 1336 |
-
aDXQSTbKlIc_000010 playing cornet
|
| 1337 |
-
aE32elV-Jtk_000210 people crowd
|
| 1338 |
-
aG1wGSIqGR4_000013 frog croaking
|
| 1339 |
-
aHzkCSXsrqg_000038 vacuum cleaner cleaning floors
|
| 1340 |
-
aJ41sea1s0U_000080 people farting
|
| 1341 |
-
aNArqTW4cbc_000025 vehicle horn, car horn, honking
|
| 1342 |
-
aNOELrfjAYY_000000 vehicle horn, car horn, honking
|
| 1343 |
-
aRI4l67ZlYQ_000063 planing timber
|
| 1344 |
-
aSYCwv_hda8_000030 subway, metro, underground
|
| 1345 |
-
aSleAKgkDDk_000000 playing accordion
|
| 1346 |
-
aVs2QBhLIhY_000162 playing didgeridoo
|
| 1347 |
-
acYp_SYmHs8_000164 running electric fan
|
| 1348 |
-
aclGsdr83pM_000400 playing saxophone
|
| 1349 |
-
aezIOAga5V8_000070 child speech, kid speaking
|
| 1350 |
-
agMolFR_pFc_000075 train whistling
|
| 1351 |
-
agrdgrC2cdI_001076 dinosaurs bellowing
|
| 1352 |
-
ah5cSy0yXs0_000178 slot machine
|
| 1353 |
-
aiTXGmkpfnk_000030 playing trumpet
|
| 1354 |
-
ainzK7QuseU_000001 dog whimpering
|
| 1355 |
-
aj6kdMafoek_000693 hair dryer drying
|
| 1356 |
-
aju2z1N0aOo_000030 wind rustling leaves
|
| 1357 |
-
ap3PdrjChdo_000040 playing bassoon
|
| 1358 |
-
apTvGua1-FY_000271 playing guiro
|
| 1359 |
-
asXWEB_SBEI_000060 playing cello
|
| 1360 |
-
atT7DPwTkds_000130 people clapping
|
| 1361 |
-
auHL-4XCFAk_000030 driving buses
|
| 1362 |
-
b-8lh_tfhLQ_000124 hair dryer drying
|
| 1363 |
-
b-gza98ikBo_000020 playing snare drum
|
| 1364 |
-
b2bpNgK0Cnc_000250 orchestra
|
| 1365 |
-
b4Bu0AHwBWs_000084 woodpecker pecking tree
|
| 1366 |
-
b4WK1A7DK18_000018 crow cawing
|
| 1367 |
-
b8q6Z7dtRvg_000030 playing flute
|
| 1368 |
-
bBMcsO6IeDE_000021 lions roaring
|
| 1369 |
-
bF89h31EEzg_000000 golf driving
|
| 1370 |
-
bFmIV3pNJPY_000001 basketball bounce
|
| 1371 |
-
bI_4_x735PA_000020 typing on computer keyboard
|
| 1372 |
-
bJtu55jpzNc_000140 playing violin, fiddle
|
| 1373 |
-
bJzkn2kRh8g_000070 helicopter
|
| 1374 |
-
bLAz_kbihLE_000147 elk bugling
|
| 1375 |
-
bMNcdb3Eeds_000064 civil defense siren
|
| 1376 |
-
bN9fXjHalIY_000065 playing timpani
|
| 1377 |
-
bPNt6iVmemQ_000504 playing bongo
|
| 1378 |
-
bPfP2rjJfDY_000609 playing ukulele
|
| 1379 |
-
bQV7q5VRaH0_000174 car engine knocking
|
| 1380 |
-
bT8QfAM9NRA_000197 cutting hair with electric trimmers
|
| 1381 |
-
bVdI6laTOXI_000480 people screaming
|
| 1382 |
-
bVskpqAJF8E_000116 people eating crisps
|
| 1383 |
-
bYT-N-_u448_000217 civil defense siren
|
| 1384 |
-
bZUN1tQnuDQ_000001 child singing
|
| 1385 |
-
b_C-fNIS8aI_000000 cat purring
|
| 1386 |
-
baVILr18Y9A_000015 civil defense siren
|
| 1387 |
-
bd-swxc3o4w_000260 playing hammond organ
|
| 1388 |
-
bo9sSwEqnzs_000030 orchestra
|
| 1389 |
-
bokQgOSQ2OA_000001 playing squash
|
| 1390 |
-
bpF6KhK8El0_000030 police car (siren)
|
| 1391 |
-
bsM-z2joYss_000030 child speech, kid speaking
|
| 1392 |
-
bsUBSFHXY0g_000040 helicopter
|
| 1393 |
-
bukJZ1FxymQ_000390 male speech, man speaking
|
| 1394 |
-
bw3GIZLj6kM_000000 playing piano
|
| 1395 |
-
bx5BUbiIXFw_000107 child singing
|
| 1396 |
-
bzxjT3h2ir8_000105 lip smacking
|
| 1397 |
-
c3UPyEZ1yQY_000070 typing on computer keyboard
|
| 1398 |
-
c4M3JIyAPcM_000020 playing bass drum
|
| 1399 |
-
c5dPZoWwmC0_000020 driving motorcycle
|
| 1400 |
-
c6e4pxgoCls_000105 magpie calling
|
| 1401 |
-
c84w0ECD-Lc_000010 ocean burbling
|
| 1402 |
-
cAI0pcOwk2g_000346 elk bugling
|
| 1403 |
-
cEddS8Y-qZc_000510 people clapping
|
| 1404 |
-
cFNcpddGRno_000340 fireworks banging
|
| 1405 |
-
cIHKR2E1uiQ_000303 smoke detector beeping
|
| 1406 |
-
cJSWXGTJMcc_000018 rowboat, canoe, kayak rowing
|
| 1407 |
-
cL_nCiBnlbk_000001 playing bugle
|
| 1408 |
-
cMdnie91zp4_000000 playing trombone
|
| 1409 |
-
cNUIc68WpD4_000075 people marching
|
| 1410 |
-
cRiW0u0QY18_000030 playing trumpet
|
| 1411 |
-
cSym5f2jySA_000005 chicken crowing
|
| 1412 |
-
cUBHfozbsao_000044 playing harp
|
| 1413 |
-
cV4QlanVa9w_000070 basketball bounce
|
| 1414 |
-
cVhWB3IniBo_000014 playing tuning fork
|
| 1415 |
-
cZfuBCVV6n8_000390 eating with cutlery
|
| 1416 |
-
ces9pc_r6Wo_000036 child singing
|
| 1417 |
-
ckwEyopmfKs_000024 crow cawing
|
| 1418 |
-
cmkEW0KJDYI_000165 arc welding
|
| 1419 |
-
cp-ZI_fQ1l0_000154 airplane flyby
|
| 1420 |
-
cwQY1bck2G8_000070 playing bagpipes
|
| 1421 |
-
cx4QSvep_wE_000009 train horning
|
| 1422 |
-
cxFdK2G6wq0_000030 playing bagpipes
|
| 1423 |
-
d-UQr-8UEUY_000069 playing saxophone
|
| 1424 |
-
d05lXeFKDn0_000275 pheasant crowing
|
| 1425 |
-
d4yBeEbVp1Y_000030 typing on computer keyboard
|
| 1426 |
-
d5HmVBPY1Qc_000230 playing saxophone
|
| 1427 |
-
d66pNyYB6WY_000013 people burping
|
| 1428 |
-
d8gWsmBdBhE_000097 playing sitar
|
| 1429 |
-
dBivnkxNOOc_000175 playing vibraphone
|
| 1430 |
-
dECLS-JHWYA_000000 vacuum cleaner cleaning floors
|
| 1431 |
-
dK46EdcZFzg_000030 playing trumpet
|
| 1432 |
-
dNMCURn41wU_000179 playing djembe
|
| 1433 |
-
dN_EzmXbsu8_000016 playing bass drum
|
| 1434 |
-
dSeWq0Qd9Hs_000318 playing tambourine
|
| 1435 |
-
dVg4IEbk-l8_000010 cat meowing
|
| 1436 |
-
d_OIBYBwexQ_000160 playing accordion
|
| 1437 |
-
daHwPM2azrc_000036 wood thrush calling
|
| 1438 |
-
dfr1OFz20sI_000000 goat bleating
|
| 1439 |
-
dgSOnxqNtFE_000246 people coughing
|
| 1440 |
-
dgS_Fy1FiNA_000110 people burping
|
| 1441 |
-
dhG_GSGW_RI_000004 volcano explosion
|
| 1442 |
-
dlJm9R5t_qg_000030 playing hammond organ
|
| 1443 |
-
dlWrMn_RDg0_000120 playing bassoon
|
| 1444 |
-
dqymshfwGEE_000030 playing saxophone
|
| 1445 |
-
duca08sjlbQ_000001 playing bongo
|
| 1446 |
-
dugd_OSzghs_000203 ice cracking
|
| 1447 |
-
e0LMGLr-T-I_000029 air conditioning noise
|
| 1448 |
-
e3ZJnO3s53o_000016 child singing
|
| 1449 |
-
eANsaSAzHm8_000010 driving motorcycle
|
| 1450 |
-
eBFPD8YrqiA_000140 driving buses
|
| 1451 |
-
eCpA_7B-k94_000030 dog bow-wow
|
| 1452 |
-
eDqfHtuB8Hk_000015 snake rattling
|
| 1453 |
-
eEUsoUKPxy8_000187 basketball bounce
|
| 1454 |
-
eFaLkcfCzos_000140 playing cello
|
| 1455 |
-
eK97_rb6BsY_000072 playing gong
|
| 1456 |
-
eLyQDSo2NAM_000129 opening or closing drawers
|
| 1457 |
-
eOJQsk_kdWI_000032 ice cracking
|
| 1458 |
-
eS8Tf1hfwxk_000205 sea waves
|
| 1459 |
-
eSEIPV-qSj0_000020 squishing water
|
| 1460 |
-
e_3GUZmPFBI_000020 playing erhu
|
| 1461 |
-
ebhtW1tIXRY_000002 donkey, ass braying
|
| 1462 |
-
ecTDu-EX3WE_000019 car engine knocking
|
| 1463 |
-
ecq96FWbCF0_000037 gibbon howling
|
| 1464 |
-
ed4wVB_RhHw_000011 baby crying
|
| 1465 |
-
ehw6y3_g-8A_000757 ripping paper
|
| 1466 |
-
ej6jlkTeobU_000002 car engine knocking
|
| 1467 |
-
el3i-oj08Q4_000173 playing oboe
|
| 1468 |
-
f-XD-BgLWk0_000000 skidding
|
| 1469 |
-
f5c5KuWylig_000343 vacuum cleaner cleaning floors
|
| 1470 |
-
f6Wl-9pzib0_000032 cattle mooing
|
| 1471 |
-
f8bMURZiPiU_000019 people whistling
|
| 1472 |
-
f9U7g3g4voA_000026 golf driving
|
| 1473 |
-
f9c9YZ8WgjM_000037 bull bellowing
|
| 1474 |
-
fD362l9P3u8_000041 tornado roaring
|
| 1475 |
-
fFn2P7ZRIeM_000480 playing clarinet
|
| 1476 |
-
fNlGlh1GaeA_000013 heart sounds, heartbeat
|
| 1477 |
-
fS17RfJYjS4_000001 pig oinking
|
| 1478 |
-
fTT_D_d_5FA_000080 people clapping
|
| 1479 |
-
f_55S5G8M2s_000000 playing harmonica
|
| 1480 |
-
faFCcN6y-C8_000020 ferret dooking
|
| 1481 |
-
fcyUlEGvMdc_000037 playing volleyball
|
| 1482 |
-
fj0qlDdWt1M_000158 playing squash
|
| 1483 |
-
fknz5hZg_3I_000295 playing darts
|
| 1484 |
-
fmc6hwse-IA_000085 skiing
|
| 1485 |
-
g-CydtX7btM_000086 eagle screaming
|
| 1486 |
-
g-u5YOJu_gY_000230 lawn mowing
|
| 1487 |
-
g1n-ZaW0QHQ_000095 reversing beeps
|
| 1488 |
-
g5OBeqvOmRU_000001 bathroom ventilation fan running
|
| 1489 |
-
g8E9gBfe8B4_000180 female speech, woman speaking
|
| 1490 |
-
gH25X_mj6mc_000210 ocean burbling
|
| 1491 |
-
gJbMwvsUyA8_000000 driving motorcycle
|
| 1492 |
-
gL2i_DTGUEY_000028 cattle, bovinae cowbell
|
| 1493 |
-
gLTvwzBktxE_000015 airplane
|
| 1494 |
-
gLj93C9rRsg_000055 playing tambourine
|
| 1495 |
-
gLokxx-ruH8_000230 playing piano
|
| 1496 |
-
gM9WSjAPDVc_000030 people babbling
|
| 1497 |
-
gPwtTVH44OY_000030 people shuffling
|
| 1498 |
-
gW-1oOsNGJs_000010 playing harp
|
| 1499 |
-
g_axbxP7Amc_000071 playing harp
|
| 1500 |
-
gaFtxq1hBU4_000118 spraying water
|
| 1501 |
-
gcBUpboDmjc_000033 hammering nails
|
| 1502 |
-
gfgv17hOPIM_000040 playing marimba, xylophone
|
| 1503 |
-
gjJ4nqwlgnE_000010 playing hammond organ
|
| 1504 |
-
gm0HkvshnPk_000340 cattle mooing
|
| 1505 |
-
goS6rwhPth4_000026 mynah bird singing
|
| 1506 |
-
goz-IQ8s6uk_000050 skidding
|
| 1507 |
-
gpaX15tTUoc_000017 cat growling
|
| 1508 |
-
guvnNwCkhcs_000030 people sniggering
|
| 1509 |
-
gwzqjVCFNqA_000040 playing clarinet
|
| 1510 |
-
gyioxO7fWzI_000046 lions roaring
|
| 1511 |
-
gyjZ7tnnZeA_000132 playing theremin
|
| 1512 |
-
gyt54t3R_BU_000032 blowtorch igniting
|
| 1513 |
-
gzqq0knK2FA_000003 cat meowing
|
| 1514 |
-
h-Z5cTyu4LE_000150 wind rustling leaves
|
| 1515 |
-
h0025UfxME0_000308 sharpen knife
|
| 1516 |
-
h0V51dolEjA_000194 airplane flyby
|
| 1517 |
-
h5Gq0y3qkX0_000063 volcano explosion
|
| 1518 |
-
h7EWw2n5D5I_000050 train horning
|
| 1519 |
-
h7pz6niHZuw_000006 donkey, ass braying
|
| 1520 |
-
hEePXITb26o_000042 playing harp
|
| 1521 |
-
hFVd2Em9-cc_000100 toilet flushing
|
| 1522 |
-
hQvIg0t546Q_000146 playing vibraphone
|
| 1523 |
-
hUlqIdQFuxE_000030 basketball bounce
|
| 1524 |
-
hV_CjOK-mME_000030 people sniggering
|
| 1525 |
-
hW-RxgLN2l0_000007 owl hooting
|
| 1526 |
-
h_tdr4t6unw_000300 typing on typewriter
|
| 1527 |
-
ha-5LhgpVmQ_000946 playing tympani
|
| 1528 |
-
hc9aQ8VL9o0_000083 playing steel guitar, slide guitar
|
| 1529 |
-
hcR4BiG8sZs_000150 playing clarinet
|
| 1530 |
-
hdphUn6ihrA_000450 lawn mowing
|
| 1531 |
-
hdqCHBTwnuQ_000253 playing badminton
|
| 1532 |
-
hgcLJFz2WKQ_000512 police radio chatter
|
| 1533 |
-
hlKkLqHpJ_s_000400 chicken crowing
|
| 1534 |
-
ht3jNf66nbo_000286 missile launch
|
| 1535 |
-
htRB8f0r2rg_000027 playing bassoon
|
| 1536 |
-
hvoOSCZo2-E_000030 cattle, bovinae cowbell
|
| 1537 |
-
hy87-XUmhkE_000004 playing timbales
|
| 1538 |
-
i-SmzP7T_E8_000295 skiing
|
| 1539 |
-
i3nEgFq4yfo_000578 heart sounds, heartbeat
|
| 1540 |
-
i4IsKRvCLi0_000036 lions roaring
|
| 1541 |
-
i5dz5NV4Vpc_000007 crow cawing
|
| 1542 |
-
i6BBre7xV-c_000937 dinosaurs bellowing
|
| 1543 |
-
i9PvGS9Xr9k_000332 lions roaring
|
| 1544 |
-
iAqJ9lPCU4w_000024 chicken crowing
|
| 1545 |
-
iD8gRmmiiqU_000130 driving motorcycle
|
| 1546 |
-
iDODqIflQ1Q_000061 parrot talking
|
| 1547 |
-
iFU48OcnO7k_000000 woodpecker pecking tree
|
| 1548 |
-
iLaLf95DcQk_000040 duck quacking
|
| 1549 |
-
iQMIGLrKlTI_000260 playing accordion
|
| 1550 |
-
iSjvZiygjCQ_000510 playing cello
|
| 1551 |
-
iTd7hOI27BE_000048 playing harp
|
| 1552 |
-
iXslVMHwkTU_000212 people eating noodle
|
| 1553 |
-
iZR9dpO64NA_000011 cap gun shooting
|
| 1554 |
-
i_-LCRDriig_000030 ocean burbling
|
| 1555 |
-
i_hhSKWxzeU_000038 frog croaking
|
| 1556 |
-
ibd7CKcSiTI_000122 playing bass drum
|
| 1557 |
-
icK4IQb2KsE_000000 hail
|
| 1558 |
-
ieRU5f5P4B8_000350 cap gun shooting
|
| 1559 |
-
ieXdQlIBgLk_000030 playing marimba, xylophone
|
| 1560 |
-
iiUvfvkeo0c_000237 disc scratching
|
| 1561 |
-
ijirbb9m05k_000285 swimming
|
| 1562 |
-
ipo5U5Grsno_000020 people cheering
|
| 1563 |
-
irUkV1DP7Cs_000030 playing cello
|
| 1564 |
-
irhsdhRIUwI_000010 fireworks banging
|
| 1565 |
-
itH-fbb9Ook_000250 ambulance siren
|
| 1566 |
-
ixv1jovJe3c_000151 playing timpani
|
| 1567 |
-
j-GF_0RxUlg_000176 playing bass guitar
|
| 1568 |
-
j-hyPaKjCAU_000030 playing accordion
|
| 1569 |
-
j0NNSluEaS0_000150 heart sounds, heartbeat
|
| 1570 |
-
j15Ldqb_XVw_000020 fireworks banging
|
| 1571 |
-
j2OhKQ6sm0o_000077 people eating noodle
|
| 1572 |
-
j3A_ekLNu1Y_000008 car passing by
|
| 1573 |
-
j4GHwj1Yqz8_000076 ice cream truck, ice cream van
|
| 1574 |
-
j5oZYOBOppQ_000003 mouse squeaking
|
| 1575 |
-
j6f4pheXNDE_000108 tractor digging
|
| 1576 |
-
jB-OcexH1n0_000033 cat caterwauling
|
| 1577 |
-
jBCKFPXuFOw_000086 strike lighter
|
| 1578 |
-
jBZ1C1ihCIY_000005 playing bongo
|
| 1579 |
-
jL4h1-_LECU_000022 church bell ringing
|
| 1580 |
-
jQRurvUk2xs_000051 writing on blackboard with chalk
|
| 1581 |
-
jVG2LQ2kA1Q_000067 playing glockenspiel
|
| 1582 |
-
j_WKRbDVZhs_000071 barn swallow calling
|
| 1583 |
-
j_vtU1U9rg0_000042 playing volleyball
|
| 1584 |
-
jb92NmGYNbU_000279 police radio chatter
|
| 1585 |
-
jd2ENRtbxRQ_000010 people coughing
|
| 1586 |
-
ji-27X81tIs_000133 playing bassoon
|
| 1587 |
-
ji4T1ArqCz0_000017 fire truck siren
|
| 1588 |
-
ji8HeUiTfoU_000030 orchestra
|
| 1589 |
-
jld-wHLRUWM_000020 playing accordion
|
| 1590 |
-
jmLX2yQ4eKk_000007 hail
|
| 1591 |
-
jsw5soBYfsc_000020 people farting
|
| 1592 |
-
jt7w_UY4yUI_000040 lions growling
|
| 1593 |
-
jxnPU7Okb5U_000043 playing snare drum
|
| 1594 |
-
jzw_Wa_TXVo_000018 playing squash
|
| 1595 |
-
k-AKVEheu4g_000096 alligators, crocodiles hissing
|
| 1596 |
-
k-jDS1jp_AA_000014 firing cannon
|
| 1597 |
-
k4h2VtrPwus_000161 rapping
|
| 1598 |
-
kAWAs_7SaKw_000000 bird chirping, tweeting
|
| 1599 |
-
kBmcp8nL6Kg_000195 playing didgeridoo
|
| 1600 |
-
kCsmvK06SCA_000254 playing sitar
|
| 1601 |
-
kDwFyUvAi4w_000077 playing bongo
|
| 1602 |
-
kEQJJyYkYTY_000200 child speech, kid speaking
|
| 1603 |
-
kL6xemyurI8_000140 people eating apple
|
| 1604 |
-
kPp7CwFBl1c_000030 playing violin, fiddle
|
| 1605 |
-
kPpaeW3DObU_000481 playing castanets
|
| 1606 |
-
kPus6xz6fN8_000030 car engine knocking
|
| 1607 |
-
kSdqIpAMz_M_000175 playing snare drum
|
| 1608 |
-
kSwrdM7UD98_000057 owl hooting
|
| 1609 |
-
kTyaqJIhX6Q_000020 playing accordion
|
| 1610 |
-
kVMXMaTyEbE_000116 playing theremin
|
| 1611 |
-
kVtj0bAYAF8_000000 people sobbing
|
| 1612 |
-
kW23iJgtyfk_000002 raining
|
| 1613 |
-
k_NIUqHoNz4_000037 playing bassoon
|
| 1614 |
-
khZPuH00RNc_000332 yodelling
|
| 1615 |
-
kjtZNsHp_a0_000330 lawn mowing
|
| 1616 |
-
kkgjiCKHvoY_000449 firing cannon
|
| 1617 |
-
kmPmQ6aylRc_000012 reversing beeps
|
| 1618 |
-
koTbsmbqyxo_000103 people booing
|
| 1619 |
-
kp_7Sd6s0h8_000306 people eating apple
|
| 1620 |
-
kqvpyaIls0c_000090 playing cello
|
| 1621 |
-
ksaiDSSJeOg_000030 playing cymbal
|
| 1622 |
-
ktBzLsiL6l0_000157 playing steelpan
|
| 1623 |
-
kxl_ZU3j99A_000415 missile launch
|
| 1624 |
-
ky92PHpUpEA_000050 playing accordion
|
| 1625 |
-
kyEDPVvDQt4_000040 bird wings flapping
|
| 1626 |
-
kz849EPouys_000318 magpie calling
|
| 1627 |
-
kzntbWmyWBg_000074 playing squash
|
| 1628 |
-
l0DQpxoSr2Q_000040 playing banjo
|
| 1629 |
-
l3i-cKkVL-o_000007 car engine knocking
|
| 1630 |
-
l3rzkrm98J0_000001 alligators, crocodiles hissing
|
| 1631 |
-
l3uGoel_Ats_000000 people crowd
|
| 1632 |
-
l4XYVX79H58_000400 people babbling
|
| 1633 |
-
l5LnwNRK7Bw_000030 playing cello
|
| 1634 |
-
l6uZDuUsdpc_000010 people burping
|
| 1635 |
-
l7ELBtiVtQ8_000190 striking pool
|
| 1636 |
-
l8bdmlXL-Lk_000197 playing didgeridoo
|
| 1637 |
-
l9ple4xWo3w_000193 chopping food
|
| 1638 |
-
lAF2dHM7Tyc_000170 playing electric guitar
|
| 1639 |
-
lEzMz9odWXM_000058 lathe spinning
|
| 1640 |
-
lGsxnfOPaUw_000022 baby crying
|
| 1641 |
-
lGtRJjnC4PI_000210 airplane
|
| 1642 |
-
lKhe8BxkRnU_000025 wind rustling leaves
|
| 1643 |
-
lLme6yedI6w_000040 cricket chirping
|
| 1644 |
-
lN2kwc34bo0_000050 train horning
|
| 1645 |
-
lP5znTMLevo_000030 playing bagpipes
|
| 1646 |
-
lQG8CRumj3g_000560 playing cello
|
| 1647 |
-
lUWrhn9z9FI_000096 pumping water
|
| 1648 |
-
lXIaZksDY38_000030 people shuffling
|
| 1649 |
-
lXwEV2S1rt4_000150 using sewing machines
|
| 1650 |
-
lc1QTC0R_CQ_000018 people shuffling
|
| 1651 |
-
ld9b7tfnqTE_000109 playing erhu
|
| 1652 |
-
ldF2EJCVY3g_000147 playing theremin
|
| 1653 |
-
ldvcH7bOy_o_000184 playing french horn
|
| 1654 |
-
levuF973w8s_000250 playing french horn
|
| 1655 |
-
lg6X9iqcqXI_000233 playing table tennis
|
| 1656 |
-
lg7DqdnmkmE_000130 skateboarding
|
| 1657 |
-
lj-PczKzEaw_000040 using sewing machines
|
| 1658 |
-
ljXTXoBG9rg_000077 pheasant crowing
|
| 1659 |
-
lnatlhCU5kI_000420 singing choir
|
| 1660 |
-
loMPOYNM66g_000123 playing timpani
|
| 1661 |
-
lqVp4OJ4hbY_000044 lions roaring
|
| 1662 |
-
lr1RLADQXNg_000110 helicopter
|
| 1663 |
-
lrFFGvB03Fw_000071 golf driving
|
| 1664 |
-
lsBttXzhPHw_000144 playing sitar
|
| 1665 |
-
lt5H2iH9Ln8_000120 chicken crowing
|
| 1666 |
-
lwgKXn21ymc_000774 people whispering
|
| 1667 |
-
lxFVAc2dHVM_000152 fire crackling
|
| 1668 |
-
lzLgjt8VRmU_000000 skateboarding
|
| 1669 |
-
m-4-BAv8cCQ_000380 lawn mowing
|
| 1670 |
-
m-NpPmAkncw_000030 male singing
|
| 1671 |
-
m0g-zWJJClA_000150 playing banjo
|
| 1672 |
-
m1lFSuSixy8_000350 people marching
|
| 1673 |
-
m1lFSuSixy8_000613 people marching
|
| 1674 |
-
m2E4i-EzHIE_000085 people finger snapping
|
| 1675 |
-
m4j5XY09HlE_000021 car engine idling
|
| 1676 |
-
mCyvq9TF5Ms_000052 typing on typewriter
|
| 1677 |
-
mInTDyk6c2A_000012 writing on blackboard with chalk
|
| 1678 |
-
mPnRdL1sC48_000240 people eating crisps
|
| 1679 |
-
mQ60N4HdDyI_000102 machine gun shooting
|
| 1680 |
-
mRCzIaqRG_c_000000 using sewing machines
|
| 1681 |
-
mWGLXbNhuB4_000096 hammering nails
|
| 1682 |
-
m_7BjYa44lo_000030 child speech, kid speaking
|
| 1683 |
-
ma0P7XOsBgE_000030 people running
|
| 1684 |
-
ma2RuCUufcI_000036 fox barking
|
| 1685 |
-
maUlA8WWTEQ_000004 hail
|
| 1686 |
-
maVHGHl01Yc_000034 lathe spinning
|
| 1687 |
-
mcVY3xsxgcU_000060 playing bagpipes
|
| 1688 |
-
mi9AokZ8m5s_000849 shot football
|
| 1689 |
-
mjK1vNF3lKE_000023 playing theremin
|
| 1690 |
-
mlihNhHFGTM_000030 playing harpsichord
|
| 1691 |
-
mt13n4XleGY_000030 orchestra
|
| 1692 |
-
mwu46g-jnac_000170 bird chirping, tweeting
|
| 1693 |
-
n-PjT4mDn9Y_000173 playing bagpipes
|
| 1694 |
-
n0PnM0u47m4_000042 mynah bird singing
|
| 1695 |
-
n0gO6pPICi4_000065 playing mandolin
|
| 1696 |
-
n21m6N5UmNk_000002 firing cannon
|
| 1697 |
-
n2CgftHGLJ0_000030 driving buses
|
| 1698 |
-
n3bX64Z_Yds_000000 playing clarinet
|
| 1699 |
-
n4wpVSIu7c0_000087 beat boxing
|
| 1700 |
-
n6PQq584nWA_000010 playing trumpet
|
| 1701 |
-
n8vhraccEnc_000009 dog howling
|
| 1702 |
-
nAtvzIyRwnU_000100 playing saxophone
|
| 1703 |
-
nEBUuVsMtGE_000000 church bell ringing
|
| 1704 |
-
nGIVQLeZ76E_000103 bowling impact
|
| 1705 |
-
nHDsu69zzSA_000000 skidding
|
| 1706 |
-
nIHYEEVzuzE_000095 canary calling
|
| 1707 |
-
nJ7TBigS5bY_000018 people booing
|
| 1708 |
-
nLOOmtvC9Hc_000066 playing steel guitar, slide guitar
|
| 1709 |
-
nLVmclZYZMY_000200 people screaming
|
| 1710 |
-
nP0vO3Xv10M_000010 dog barking
|
| 1711 |
-
nPCYkMhaLYs_000024 roller coaster running
|
| 1712 |
-
nTo6W-50CDg_000018 whale calling
|
| 1713 |
-
nXc-dHK2A2A_000016 playing theremin
|
| 1714 |
-
n_F_tRGGoEA_000107 frog croaking
|
| 1715 |
-
ngJ_Us2C19g_000040 police car (siren)
|
| 1716 |
-
niYH8Dpt4uE_000140 cattle mooing
|
| 1717 |
-
nnyll58-lrA_000009 wind chime
|
| 1718 |
-
nowY2-6reIk_000030 pigeon, dove cooing
|
| 1719 |
-
nz0qYNbFGD4_000030 people coughing
|
| 1720 |
-
o2-6TSqWPCY_000170 people clapping
|
| 1721 |
-
o2qd4hsquvE_000056 bird squawking
|
| 1722 |
-
o4F5dtUXivA_000034 playing steelpan
|
| 1723 |
-
o6kY64rTk2k_000291 singing bowl
|
| 1724 |
-
o7mBR043UCs_000014 pig oinking
|
| 1725 |
-
o8iHgGRzcTE_000020 people clapping
|
| 1726 |
-
o8oMY-WgW9Y_000030 wind rustling leaves
|
| 1727 |
-
o9uGfNn4JyU_000062 lions roaring
|
| 1728 |
-
oBrRQ5SiJTQ_000210 driving motorcycle
|
| 1729 |
-
oCZ3WCK5BZU_000000 driving motorcycle
|
| 1730 |
-
oDAI33ybJlo_000029 playing theremin
|
| 1731 |
-
oDuiwpaep1k_000035 sliding door
|
| 1732 |
-
oEEOscuru6s_000280 playing flute
|
| 1733 |
-
oEXqWoSZ9Ww_000024 playing erhu
|
| 1734 |
-
oG6EUnQjeF8_000077 swimming
|
| 1735 |
-
oIUi8gFI_XY_000178 cat purring
|
| 1736 |
-
oIXRSpjo7vk_000170 wood thrush calling
|
| 1737 |
-
oJ4m2OvhA8Q_000100 playing cello
|
| 1738 |
-
oSsLQCIJjyE_000030 singing bowl
|
| 1739 |
-
oVxKyGnz-IA_000230 chainsawing trees
|
| 1740 |
-
oXXHkjFLN3E_000237 electric shaver, electric razor shaving
|
| 1741 |
-
oX_XdxqTE9Y_000110 bird chirping, tweeting
|
| 1742 |
-
oYe46obCJhc_000039 alarm clock ringing
|
| 1743 |
-
oZ6l0EStee4_000011 police car (siren)
|
| 1744 |
-
oZKVPzRyn50_000432 playing electronic organ
|
| 1745 |
-
oad_agP1oJU_000287 playing harpsichord
|
| 1746 |
-
od2HXuT_NuI_000100 playing cello
|
| 1747 |
-
oePtbOc8Hqs_000000 foghorn
|
| 1748 |
-
oeSxlmkPj78_000030 ocean burbling
|
| 1749 |
-
ofFtXFnfebQ_000684 cat purring
|
| 1750 |
-
ohh7mWALd_k_000473 pheasant crowing
|
| 1751 |
-
olZa2vOpbD4_000110 male speech, man speaking
|
| 1752 |
-
omiGYobPra4_000100 toilet flushing
|
| 1753 |
-
onqGNrWQ7us_000587 machine gun shooting
|
| 1754 |
-
or7ikBeUhBg_000020 driving buses
|
| 1755 |
-
osA1JXFL2Gk_000021 parrot talking
|
| 1756 |
-
otp3r8SfygA_000102 people shuffling
|
| 1757 |
-
p-DcPCo7Swo_000086 playing double bass
|
| 1758 |
-
p4RWTSRg6Bg_000290 people crowd
|
| 1759 |
-
p5LsBog-XRk_000130 playing saxophone
|
| 1760 |
-
p5j91ecL43Y_000030 people whispering
|
| 1761 |
-
p8HTTAhm5ic_000100 waterfall burbling
|
| 1762 |
-
pAe8kcpjZII_000010 playing theremin
|
| 1763 |
-
pKUzj3ckXvI_000010 toilet flushing
|
| 1764 |
-
pNiB5w3JBVI_000003 spraying water
|
| 1765 |
-
pQrnDC-kPHk_000106 sharpen knife
|
| 1766 |
-
pRdi3oChUR4_000020 baltimore oriole calling
|
| 1767 |
-
pUMZEzdKmPM_000136 owl hooting
|
| 1768 |
-
pVJY1Q137cw_000681 cat purring
|
| 1769 |
-
pX_Sg3xDAUg_000000 people burping
|
| 1770 |
-
p_KsZsJwH0w_000555 sharpen knife
|
| 1771 |
-
pdzAs6Be2sY_000139 people gargling
|
| 1772 |
-
piYKrS14dxA_000113 mynah bird singing
|
| 1773 |
-
pnFtPlslgGw_000019 plastic bottle crushing
|
| 1774 |
-
ppDvhlGr5nI_000003 golf driving
|
| 1775 |
-
ppLjxFk8C4M_000023 heart sounds, heartbeat
|
| 1776 |
-
pqDHX5R4sdg_000220 female singing
|
| 1777 |
-
pqElMm80SX8_000025 airplane flyby
|
| 1778 |
-
prq7EqBGWaY_000035 playing harpsichord
|
| 1779 |
-
psz3LAhSi9U_000001 yodelling
|
| 1780 |
-
pu9pO-rCzy4_000153 people farting
|
| 1781 |
-
pugRM2Nsnyo_000283 church bell ringing
|
| 1782 |
-
pukny4fvbOQ_000040 playing clarinet
|
| 1783 |
-
pxpIsajKD-Y_000042 reversing beeps
|
| 1784 |
-
pxpIsajKD-Y_000065 reversing beeps
|
| 1785 |
-
pyHJrlNMYwo_000350 sheep bleating
|
| 1786 |
-
pzixqhh0xG4_000175 golf driving
|
| 1787 |
-
q0Hz09My-_E_000018 lions roaring
|
| 1788 |
-
q0R8KXxZOZM_000070 people farting
|
| 1789 |
-
q0lahEg486Y_000295 tractor digging
|
| 1790 |
-
q1oBXqEFXy4_000070 sloshing water
|
| 1791 |
-
q5fUdJoUrAE_000257 beat boxing
|
| 1792 |
-
q7cvNFoT9nQ_000027 lighting firecrackers
|
| 1793 |
-
qA-yeGwsVn4_000018 pheasant crowing
|
| 1794 |
-
qBDrrE6LnUo_000103 bird chirping, tweeting
|
| 1795 |
-
qBmsSZQ7HNg_000360 railroad car, train wagon
|
| 1796 |
-
qCcC7n2mOC0_000074 playing harpsichord
|
| 1797 |
-
qIcEYC46zmI_000087 playing cornet
|
| 1798 |
-
qJJEBEajF1M_000017 air conditioning noise
|
| 1799 |
-
qL-4fJyDGXc_000893 people eating noodle
|
| 1800 |
-
qNi5Xlf2ZVY_000510 people clapping
|
| 1801 |
-
qORUGCczq74_000042 swimming
|
| 1802 |
-
qRm5Yh3JPSg_000016 playing tambourine
|
| 1803 |
-
qRwun6pFuNA_000010 playing banjo
|
| 1804 |
-
qTRrHj-DNYc_000137 dinosaurs bellowing
|
| 1805 |
-
qW9b8qu_KrU_000180 lions growling
|
| 1806 |
-
qXFgtkhWLgM_000134 child singing
|
| 1807 |
-
q_ZMlkVS740_000222 playing congas
|
| 1808 |
-
qbmNcYH52eo_000516 striking pool
|
| 1809 |
-
qdl6t1bDb-8_000400 eating with cutlery
|
| 1810 |
-
qgv0riPveBQ_000030 bird chirping, tweeting
|
| 1811 |
-
qiw2I1oQIVQ_000057 playing snare drum
|
| 1812 |
-
qjBkiP7mBNI_000597 ripping paper
|
| 1813 |
-
qmjK_Wi0IK8_000080 people cheering
|
| 1814 |
-
qoPAdSFZ4f0_000370 chopping wood
|
| 1815 |
-
qpjOCvQEHdo_000080 people cheering
|
| 1816 |
-
qrNCI310T9Y_000018 chicken clucking
|
| 1817 |
-
qsj_OgZZDvQ_000080 tap dancing
|
| 1818 |
-
qsrNWdcjwwY_000320 female speech, woman speaking
|
| 1819 |
-
quF2HA3u2JY_000101 cupboard opening or closing
|
| 1820 |
-
quZSWDeSywg_000040 toilet flushing
|
| 1821 |
-
qv51EqZA8eE_000291 train horning
|
| 1822 |
-
qxeCxC_zpvU_000202 playing french horn
|
| 1823 |
-
r24KMnV5Rrk_000030 people running
|
| 1824 |
-
r42dJt0hxro_000010 gibbon howling
|
| 1825 |
-
r47N9mdOeXc_000030 playing violin, fiddle
|
| 1826 |
-
r4Zm5lEsI-M_000110 vehicle horn, car horn, honking
|
| 1827 |
-
r7e4wJy4NP8_000090 motorboat, speedboat acceleration
|
| 1828 |
-
r96LZqBtlwg_000050 dog whimpering
|
| 1829 |
-
r9uN-AltjDQ_000130 lawn mowing
|
| 1830 |
-
rAXnOxWHaLs_000030 playing french horn
|
| 1831 |
-
rAth9ueRqM4_000040 whale calling
|
| 1832 |
-
rD4zq3CvJSo_000130 people slapping
|
| 1833 |
-
rEdr-j9oAN0_000074 playing french horn
|
| 1834 |
-
rFA1GBcIGN4_000067 playing ukulele
|
| 1835 |
-
rFgrOflwKPg_000290 playing trombone
|
| 1836 |
-
rLuNw3Cm7rs_000024 lighting firecrackers
|
| 1837 |
-
rMDnGZU7jzE_000001 dog baying
|
| 1838 |
-
rQthEYYXM-k_000030 people sniggering
|
| 1839 |
-
rRP810El--s_000958 fire truck siren
|
| 1840 |
-
rSHvW5dGanw_000150 fireworks banging
|
| 1841 |
-
rSWPVWkAbec_000000 bee, wasp, etc. buzzing
|
| 1842 |
-
rTNSzUXd3wk_000180 playing double bass
|
| 1843 |
-
rVnkDOvLWm8_000180 cap gun shooting
|
| 1844 |
-
raz3OUu768k_000068 playing clarinet
|
| 1845 |
-
rfqqBv3eriU_000160 stream burbling
|
| 1846 |
-
rgdMDo5TBic_000355 playing squash
|
| 1847 |
-
rn381TUMxyE_000298 arc welding
|
| 1848 |
-
rs2FL8HJfGE_000030 people sniggering
|
| 1849 |
-
rwVhTlLcBO0_000099 playing erhu
|
| 1850 |
-
rx2lqMvj2Wo_000052 squishing water
|
| 1851 |
-
rz9PZZA04z8_000183 playing badminton
|
| 1852 |
-
s2QrQdxzLwQ_000074 playing glockenspiel
|
| 1853 |
-
s8zSSYQM0Tc_000127 footsteps on snow
|
| 1854 |
-
s9gzcUg_nlM_000030 playing drum kit
|
| 1855 |
-
sFTyeq295xU_000041 people humming
|
| 1856 |
-
sIHApNhq2Ik_000002 bird squawking
|
| 1857 |
-
sLEEurjCsAY_000051 typing on typewriter
|
| 1858 |
-
sLOjC8EWrHA_000070 driving buses
|
| 1859 |
-
sOg4MNTWx_0_000000 skateboarding
|
| 1860 |
-
sUHlRRyS2YM_000009 pigeon, dove cooing
|
| 1861 |
-
sUs8O9toO4M_000311 dinosaurs bellowing
|
| 1862 |
-
sXDJvBEzqjs_000000 dog bow-wow
|
| 1863 |
-
sYy0lPjLEXQ_000100 playing cymbal
|
| 1864 |
-
s_FLZ-ekB2A_000088 telephone bell ringing
|
| 1865 |
-
sa6B5XyFYIg_000040 playing bagpipes
|
| 1866 |
-
scm7r0uBepU_000467 mouse clicking
|
| 1867 |
-
smBHJiEPCRI_000030 duck quacking
|
| 1868 |
-
snbtH1P3MVA_000119 playing timbales
|
| 1869 |
-
snyzyJlTBbg_000003 dog baying
|
| 1870 |
-
surXSGAnpM0_000000 playing harmonica
|
| 1871 |
-
sxiVIGK5AEc_000010 people crowd
|
| 1872 |
-
syysO74ja30_000007 playing gong
|
| 1873 |
-
szQ-4VQQQsI_000020 railroad car, train wagon
|
| 1874 |
-
t0XoS_8YVP4_000728 magpie calling
|
| 1875 |
-
t2xJjZp1D1E_000030 dog growling
|
| 1876 |
-
t3YfjKEmei4_000080 race car, auto racing
|
| 1877 |
-
t3u3ykowlvs_000030 raining
|
| 1878 |
-
tD9rMw8YPBI_000030 child speech, kid speaking
|
| 1879 |
-
tDayTL0ivzU_000014 playing timbales
|
| 1880 |
-
tJChPvDD-hI_000035 parrot talking
|
| 1881 |
-
tLFNgY5NBMk_000001 playing bassoon
|
| 1882 |
-
tRw0KL6PMFU_000060 skateboarding
|
| 1883 |
-
tTePTFQV52M_000030 pig oinking
|
| 1884 |
-
tV0sIqEryIY_000037 wind chime
|
| 1885 |
-
tWDG6UsiG3s_000090 people babbling
|
| 1886 |
-
tYBxgXg8yxw_000046 woodpecker pecking tree
|
| 1887 |
-
tYzH5rkbuBQ_000000 frog croaking
|
| 1888 |
-
tm9rnG0455k_000010 skidding
|
| 1889 |
-
tuqcWxh_mdc_000012 baby crying
|
| 1890 |
-
twWBQjLyuxw_000014 bull bellowing
|
| 1891 |
-
u1nAQ6GgJ7Y_000154 playing volleyball
|
| 1892 |
-
u6AV24u4OMQ_000052 rope skipping
|
| 1893 |
-
u6c5tvrkqVA_000187 playing timbales
|
| 1894 |
-
u88CrTGAqbo_000000 lawn mowing
|
| 1895 |
-
uEPueBOV06U_000109 yodelling
|
| 1896 |
-
uGQ0TW02gBo_000004 frog croaking
|
| 1897 |
-
uI5eona1hc4_000000 elk bugling
|
| 1898 |
-
uIHnphQWVRA_000169 opening or closing drawers
|
| 1899 |
-
uIg0I7pAjvM_000030 race car, auto racing
|
| 1900 |
-
uJSDmIF4dhE_000260 driving buses
|
| 1901 |
-
uK0jcVxT-Pg_000030 driving buses
|
| 1902 |
-
uLm5oUt3XG4_000031 playing tabla
|
| 1903 |
-
uSmduC6gJxg_000050 rowboat, canoe, kayak rowing
|
| 1904 |
-
uWdgdlJqI2Y_000019 basketball bounce
|
| 1905 |
-
uWq8Q_cIEwE_000086 playing ukulele
|
| 1906 |
-
uZghS49MC1k_000180 skidding
|
| 1907 |
-
u_85N9h_cGs_000050 car passing by
|
| 1908 |
-
udVSYrFacsc_000072 playing cornet
|
| 1909 |
-
ugUyp_keJO4_000022 mouse clicking
|
| 1910 |
-
uiPC88KDlW4_000022 engine accelerating, revving, vroom
|
| 1911 |
-
unF6DdqG4l8_000050 people whistling
|
| 1912 |
-
upZ0sKmaZrI_000167 playing lacrosse
|
| 1913 |
-
uvUEfRqpEQU_000145 singing choir
|
| 1914 |
-
uyNyWLJIci8_000000 fire truck siren
|
| 1915 |
-
v5OdaMw5hhk_000030 playing snare drum
|
| 1916 |
-
vADdI9YTMRs_000243 playing timbales
|
| 1917 |
-
vJk_Jzr2YIs_000080 playing hammond organ
|
| 1918 |
-
vLLiaCDHSPY_000010 dog barking
|
| 1919 |
-
vUORRJqXp7A_000036 playing table tennis
|
| 1920 |
-
vXupVqDfK34_000116 cricket chirping
|
| 1921 |
-
v_cxwPhwaBQ_000000 people farting
|
| 1922 |
-
varD0b9CTgs_000020 people belly laughing
|
| 1923 |
-
vcwXIa-QB8A_000025 sailing
|
| 1924 |
-
vdXavSaj8-M_000070 playing accordion
|
| 1925 |
-
vgIgTWqXtms_000023 child singing
|
| 1926 |
-
vhqkCDgsuh4_000255 people booing
|
| 1927 |
-
vkA-v4DSriM_000229 playing tabla
|
| 1928 |
-
vktUwc0Cs7w_000170 playing clarinet
|
| 1929 |
-
vpAGr_NrM_w_000050 fireworks banging
|
| 1930 |
-
vzoQdjPITKw_000030 pigeon, dove cooing
|
| 1931 |
-
w-9xoB74oF0_000004 opening or closing car electric windows
|
| 1932 |
-
w-JaJ11OqQY_000345 people slurping
|
| 1933 |
-
w3kMt-zQ9t4_000215 playing table tennis
|
| 1934 |
-
w5T582MCzlY_000011 running electric fan
|
| 1935 |
-
w5vaBVSxgKg_000030 lawn mowing
|
| 1936 |
-
w8puug1pEUA_000170 stream burbling
|
| 1937 |
-
w9K_AmeWhlo_000071 fire crackling
|
| 1938 |
-
wAnqT37UgYY_000034 dog growling
|
| 1939 |
-
wEbJ-9cmSaE_000003 playing cornet
|
| 1940 |
-
wHdgExbL6dA_000034 playing badminton
|
| 1941 |
-
wOYLWY6UCu8_000262 playing ukulele
|
| 1942 |
-
wP-96GP6bsU_000000 vehicle horn, car horn, honking
|
| 1943 |
-
wT6-Isia2PQ_000149 child singing
|
| 1944 |
-
wTQ-1cd8owI_000181 dog bow-wow
|
| 1945 |
-
wUNpHu61l7Q_000190 male singing
|
| 1946 |
-
wVJ-S2zYxug_000040 playing drum kit
|
| 1947 |
-
wX4Ya3D20H8_000039 scuba diving
|
| 1948 |
-
wXsrff4No40_000237 playing hockey
|
| 1949 |
-
wYZc2-3ViXs_000155 civil defense siren
|
| 1950 |
-
wZj294W4RVU_000094 fire crackling
|
| 1951 |
-
w_yGhgrow38_000091 eletric blender running
|
| 1952 |
-
wdk-RmsGdyw_000310 driving buses
|
| 1953 |
-
wdlfOAR03iY_000000 playing glockenspiel
|
| 1954 |
-
we-ONoZIkWE_000018 dog howling
|
| 1955 |
-
wegIxELjtz4_000334 people eating noodle
|
| 1956 |
-
whIS2UodgLI_000002 gibbon howling
|
| 1957 |
-
wkwjx0oMAjw_000021 beat boxing
|
| 1958 |
-
wnW4qgQQg3g_000050 playing cello
|
| 1959 |
-
wrFyu2T1XOo_000000 hail
|
| 1960 |
-
wsHPe19Y9Nc_000081 electric shaver, electric razor shaving
|
| 1961 |
-
wtyuiWygNTc_000000 zebra braying
|
| 1962 |
-
wuAcPWyHMXo_000008 lions roaring
|
| 1963 |
-
wwQPX3zjV4s_000028 elk bugling
|
| 1964 |
-
x0_AiAhfeV0_000068 eagle screaming
|
| 1965 |
-
x0bbH2Tao_0_000000 dog howling
|
| 1966 |
-
x1Rt2zN-oXo_000000 dog growling
|
| 1967 |
-
x1bXQS9dUAc_000140 playing violin, fiddle
|
| 1968 |
-
x2uCcPNM6Nw_000030 pigeon, dove cooing
|
| 1969 |
-
x3cLaiaaF0M_000032 skiing
|
| 1970 |
-
x68R1rmvKgc_000060 female singing
|
| 1971 |
-
x6d8ytnWNDI_000045 barn swallow calling
|
| 1972 |
-
x8yymm3DtVA_000022 playing cello
|
| 1973 |
-
xK1vy_6H2VM_000010 scuba diving
|
| 1974 |
-
xMa1vAUhTfM_000429 ice cream truck, ice cream van
|
| 1975 |
-
xN_CePbfjVg_000004 playing bass drum
|
| 1976 |
-
xPIhTw0fbzI_000010 train horning
|
| 1977 |
-
xQaYumd1O48_000004 lions growling
|
| 1978 |
-
xS4brO1qu0g_000591 playing hockey
|
| 1979 |
-
xUCKcoE3K6Q_000313 lip smacking
|
| 1980 |
-
xVDGIF1pFvQ_000030 driving buses
|
| 1981 |
-
xVEXWvj0iWo_000060 rowboat, canoe, kayak rowing
|
| 1982 |
-
xWBMt4fI95M_000063 scuba diving
|
| 1983 |
-
xWgd4OMcKbs_000263 people nose blowing
|
| 1984 |
-
xY9mlbn2IhY_000000 people burping
|
| 1985 |
-
xYAHwbhWEgM_000030 playing violin, fiddle
|
| 1986 |
-
xbNNxwGRG20_000062 cattle, bovinae cowbell
|
| 1987 |
-
xdUbCcEbipM_000290 people crowd
|
| 1988 |
-
xeS25F6uHic_000162 airplane flyby
|
| 1989 |
-
xetF74UUCGk_000001 ice cream truck, ice cream van
|
| 1990 |
-
xf0cheS5wFM_000090 playing piano
|
| 1991 |
-
xfT0HF1Pbxk_000003 playing sitar
|
| 1992 |
-
xg_3Uas3z40_000240 skateboarding
|
| 1993 |
-
xibFeibkfWM_000036 alligators, crocodiles hissing
|
| 1994 |
-
xj0Xi47RC88_000200 lawn mowing
|
| 1995 |
-
xkUzsvSImy4_000306 people eating crisps
|
| 1996 |
-
xm0N3HXnSWc_000361 rope skipping
|
| 1997 |
-
xoViga6dJa4_000141 playing steelpan
|
| 1998 |
-
xocKilOzrb4_000065 reversing beeps
|
| 1999 |
-
xq5kMmAFYx8_000030 playing double bass
|
| 2000 |
-
xqv96EPg7so_000200 railroad car, train wagon
|
| 2001 |
-
xtvQjd6cwC4_000040 playing bagpipes
|
| 2002 |
-
y3TRiYwDbHo_000287 playing oboe
|
| 2003 |
-
y6wsRU2aNx4_000040 railroad car, train wagon
|
| 2004 |
-
y95ml0IYGr4_000440 chainsawing trees
|
| 2005 |
-
yA_63YfQ034_000022 dog growling
|
| 2006 |
-
yBwMu2NueR0_000284 rapping
|
| 2007 |
-
yE_SP127xy8_000010 people crowd
|
| 2008 |
-
yEfhYsMd1yc_000006 playing double bass
|
| 2009 |
-
yH3PJfYi_gs_000109 car engine starting
|
| 2010 |
-
yJGtoH8INnA_000084 tapping guitar
|
| 2011 |
-
yJN5_1tfqXo_000075 magpie calling
|
| 2012 |
-
yMMmjb3BRi0_000030 dog bow-wow
|
| 2013 |
-
yOhdod2Kg40_000210 playing bassoon
|
| 2014 |
-
yPJiPWkeT3U_000254 playing gong
|
| 2015 |
-
yPUYU6t3rwo_000370 bee, wasp, etc. buzzing
|
| 2016 |
-
yQzzdP-4iBU_000002 planing timber
|
| 2017 |
-
yUL9UefoANU_000128 tractor digging
|
| 2018 |
-
yVzIaZzLH38_000130 bee, wasp, etc. buzzing
|
| 2019 |
-
yYPNrg-s-NI_000060 child singing
|
| 2020 |
-
ybnXdQfSNZs_000001 police radio chatter
|
| 2021 |
-
ycN30BUfzeo_000070 playing clarinet
|
| 2022 |
-
ygOHZ_55jME_000174 electric shaver, electric razor shaving
|
| 2023 |
-
yjyZgzYuuSQ_000089 cat purring
|
| 2024 |
-
yo5I2MTqv9E_000030 playing marimba, xylophone
|
| 2025 |
-
ywD_am3uZh8_000020 splashing water
|
| 2026 |
-
ywYLMe6y-S0_000040 playing piano
|
| 2027 |
-
z9CCSNKepA8_000537 striking pool
|
| 2028 |
-
z9crgUIWcmA_000000 dog barking
|
| 2029 |
-
zBgR_gj8NGg_000083 striking pool
|
| 2030 |
-
zGbJAz-3Ao8_000070 playing banjo
|
| 2031 |
-
zGn9k6j8kVo_000049 rope skipping
|
| 2032 |
-
zILE3kr9nIU_000030 mouse pattering
|
| 2033 |
-
zJPgE79wkE4_000000 playing tennis
|
| 2034 |
-
zMKJFnBr1Gw_000013 reversing beeps
|
| 2035 |
-
zPlyG_ryFpg_000006 sliding door
|
| 2036 |
-
zRU8A0m9Op8_000145 driving snowmobile
|
| 2037 |
-
zVCqTRlc7NU_000020 fire truck siren
|
| 2038 |
-
zYPY3Fh1Xjo_000000 skidding
|
| 2039 |
-
zcZ0WVQ8t8s_000210 splashing water
|
| 2040 |
-
zhPLdAMVAuo_000257 church bell ringing
|
| 2041 |
-
zl6hP51zURM_000075 playing oboe
|
| 2042 |
-
zlt2EGxum58_000174 bouncing on trampoline
|
| 2043 |
-
zmSPCArJHB0_000190 bird squawking
|
| 2044 |
-
zpqGedo-jm4_000043 cell phone buzzing
|
| 2045 |
-
zrKMC4fAKp0_000202 playing cello
|
| 2046 |
-
zsnU7rt_Qq0_000005 baby laughter
|
| 2047 |
-
zw7dTh-Lx3o_000074 canary calling
|
| 2048 |
-
zzP5qr-ZxHY_000199 people marching
|
| 2049 |
-
zzftU8z4aOI_000230 skateboarding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MMAudio/train.py
DELETED
|
@@ -1,209 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import math
|
| 3 |
-
import random
|
| 4 |
-
from datetime import timedelta
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
|
| 7 |
-
import hydra
|
| 8 |
-
import numpy as np
|
| 9 |
-
import torch
|
| 10 |
-
import torch.distributed as distributed
|
| 11 |
-
from hydra import compose
|
| 12 |
-
from hydra.core.hydra_config import HydraConfig
|
| 13 |
-
from omegaconf import DictConfig, open_dict
|
| 14 |
-
from torch.distributed.elastic.multiprocessing.errors import record
|
| 15 |
-
|
| 16 |
-
from mmaudio.data.data_setup import setup_training_datasets, setup_val_datasets
|
| 17 |
-
from mmaudio.model.sequence_config import CONFIG_16K, CONFIG_44K
|
| 18 |
-
from mmaudio.runner import Runner
|
| 19 |
-
from mmaudio.sample import sample
|
| 20 |
-
from mmaudio.utils.dist_utils import info_if_rank_zero, local_rank, world_size
|
| 21 |
-
from mmaudio.utils.logger import TensorboardLogger
|
| 22 |
-
from mmaudio.utils.synthesize_ema import synthesize_ema
|
| 23 |
-
|
| 24 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
| 25 |
-
torch.backends.cudnn.allow_tf32 = True
|
| 26 |
-
|
| 27 |
-
log = logging.getLogger()
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def distributed_setup():
|
| 31 |
-
distributed.init_process_group(backend="nccl", timeout=timedelta(hours=2))
|
| 32 |
-
log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
|
| 33 |
-
return local_rank, world_size
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
@record
|
| 37 |
-
@hydra.main(version_base='1.3.2', config_path='config', config_name='train_config.yaml')
|
| 38 |
-
def train(cfg: DictConfig):
|
| 39 |
-
# initial setup
|
| 40 |
-
torch.cuda.set_device(local_rank)
|
| 41 |
-
torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
|
| 42 |
-
distributed_setup()
|
| 43 |
-
num_gpus = world_size
|
| 44 |
-
run_dir = HydraConfig.get().run.dir
|
| 45 |
-
|
| 46 |
-
# compose early such that it does not rely on future hard disk reading
|
| 47 |
-
eval_cfg = compose('eval_config', overrides=[f'exp_id={cfg.exp_id}'])
|
| 48 |
-
|
| 49 |
-
# patch data dim
|
| 50 |
-
if cfg.model.endswith('16k'):
|
| 51 |
-
seq_cfg = CONFIG_16K
|
| 52 |
-
elif cfg.model.endswith('44k'):
|
| 53 |
-
seq_cfg = CONFIG_44K
|
| 54 |
-
else:
|
| 55 |
-
raise ValueError(f'Unknown model: {cfg.model}')
|
| 56 |
-
with open_dict(cfg):
|
| 57 |
-
cfg.data_dim.latent_seq_len = seq_cfg.latent_seq_len
|
| 58 |
-
cfg.data_dim.clip_seq_len = seq_cfg.clip_seq_len
|
| 59 |
-
cfg.data_dim.sync_seq_len = seq_cfg.sync_seq_len
|
| 60 |
-
|
| 61 |
-
# wrap python logger with a tensorboard logger
|
| 62 |
-
log = TensorboardLogger(cfg.exp_id,
|
| 63 |
-
run_dir,
|
| 64 |
-
logging.getLogger(),
|
| 65 |
-
is_rank0=(local_rank == 0),
|
| 66 |
-
enable_email=cfg.enable_email and not cfg.debug)
|
| 67 |
-
|
| 68 |
-
info_if_rank_zero(log, f'All configuration: {cfg}')
|
| 69 |
-
info_if_rank_zero(log, f'Number of GPUs detected: {num_gpus}')
|
| 70 |
-
|
| 71 |
-
# number of dataloader workers
|
| 72 |
-
info_if_rank_zero(log, f'Number of dataloader workers (per GPU): {cfg.num_workers}')
|
| 73 |
-
|
| 74 |
-
# Set seeds to ensure the same initialization
|
| 75 |
-
torch.manual_seed(cfg.seed)
|
| 76 |
-
np.random.seed(cfg.seed)
|
| 77 |
-
random.seed(cfg.seed)
|
| 78 |
-
|
| 79 |
-
# setting up configurations
|
| 80 |
-
info_if_rank_zero(log, f'Training configuration: {cfg}')
|
| 81 |
-
cfg.batch_size //= num_gpus
|
| 82 |
-
info_if_rank_zero(log, f'Batch size (per GPU): {cfg.batch_size}')
|
| 83 |
-
|
| 84 |
-
# determine time to change max skip
|
| 85 |
-
total_iterations = cfg['num_iterations']
|
| 86 |
-
|
| 87 |
-
# setup datasets
|
| 88 |
-
dataset, sampler, loader = setup_training_datasets(cfg)
|
| 89 |
-
info_if_rank_zero(log, f'Number of training samples: {len(dataset)}')
|
| 90 |
-
info_if_rank_zero(log, f'Number of training batches: {len(loader)}')
|
| 91 |
-
|
| 92 |
-
val_dataset, val_loader, eval_loader = setup_val_datasets(cfg)
|
| 93 |
-
info_if_rank_zero(log, f'Number of val samples: {len(val_dataset)}')
|
| 94 |
-
val_cfg = cfg.data.ExtractedVGG_val
|
| 95 |
-
|
| 96 |
-
# compute and set mean and std
|
| 97 |
-
latent_mean, latent_std = dataset.compute_latent_stats()
|
| 98 |
-
|
| 99 |
-
# construct the trainer
|
| 100 |
-
trainer = Runner(cfg,
|
| 101 |
-
log=log,
|
| 102 |
-
run_path=run_dir,
|
| 103 |
-
for_training=True,
|
| 104 |
-
latent_mean=latent_mean,
|
| 105 |
-
latent_std=latent_std).enter_train()
|
| 106 |
-
eval_rng_clone = trainer.rng.graphsafe_get_state()
|
| 107 |
-
|
| 108 |
-
# load previous checkpoint if needed
|
| 109 |
-
if cfg['checkpoint'] is not None:
|
| 110 |
-
curr_iter = trainer.load_checkpoint(cfg['checkpoint'])
|
| 111 |
-
cfg['checkpoint'] = None
|
| 112 |
-
info_if_rank_zero(log, 'Model checkpoint loaded!')
|
| 113 |
-
else:
|
| 114 |
-
# if run_dir exists, load the latest checkpoint
|
| 115 |
-
checkpoint = trainer.get_latest_checkpoint_path()
|
| 116 |
-
if checkpoint is not None:
|
| 117 |
-
curr_iter = trainer.load_checkpoint(checkpoint)
|
| 118 |
-
info_if_rank_zero(log, 'Latest checkpoint loaded!')
|
| 119 |
-
else:
|
| 120 |
-
# load previous network weights if needed
|
| 121 |
-
curr_iter = 0
|
| 122 |
-
if cfg['weights'] is not None:
|
| 123 |
-
info_if_rank_zero(log, 'Loading weights from the disk')
|
| 124 |
-
trainer.load_weights(cfg['weights'])
|
| 125 |
-
cfg['weights'] = None
|
| 126 |
-
|
| 127 |
-
# determine max epoch
|
| 128 |
-
total_epoch = math.ceil(total_iterations / len(loader))
|
| 129 |
-
current_epoch = curr_iter // len(loader)
|
| 130 |
-
info_if_rank_zero(log, f'We will approximately use {total_epoch} epochs.')
|
| 131 |
-
|
| 132 |
-
# training loop
|
| 133 |
-
try:
|
| 134 |
-
# Need this to select random bases in different workers
|
| 135 |
-
np.random.seed(np.random.randint(2**30 - 1) + local_rank * 1000)
|
| 136 |
-
while curr_iter < total_iterations:
|
| 137 |
-
# Crucial for randomness!
|
| 138 |
-
sampler.set_epoch(current_epoch)
|
| 139 |
-
current_epoch += 1
|
| 140 |
-
log.debug(f'Current epoch: {current_epoch}')
|
| 141 |
-
|
| 142 |
-
trainer.enter_train()
|
| 143 |
-
trainer.log.data_timer.start()
|
| 144 |
-
for data in loader:
|
| 145 |
-
trainer.train_pass(data, curr_iter)
|
| 146 |
-
|
| 147 |
-
if (curr_iter + 1) % cfg.val_interval == 0:
|
| 148 |
-
# swap into a eval rng state, i.e., use the same seed for every validation pass
|
| 149 |
-
train_rng_snapshot = trainer.rng.graphsafe_get_state()
|
| 150 |
-
trainer.rng.graphsafe_set_state(eval_rng_clone)
|
| 151 |
-
info_if_rank_zero(log, f'Iteration {curr_iter}: validating')
|
| 152 |
-
for data in val_loader:
|
| 153 |
-
trainer.validation_pass(data, curr_iter)
|
| 154 |
-
distributed.barrier()
|
| 155 |
-
trainer.val_integrator.finalize('val', curr_iter, ignore_timer=True)
|
| 156 |
-
trainer.rng.graphsafe_set_state(train_rng_snapshot)
|
| 157 |
-
|
| 158 |
-
if (curr_iter + 1) % cfg.eval_interval == 0:
|
| 159 |
-
save_eval = (curr_iter + 1) % cfg.save_eval_interval == 0
|
| 160 |
-
train_rng_snapshot = trainer.rng.graphsafe_get_state()
|
| 161 |
-
trainer.rng.graphsafe_set_state(eval_rng_clone)
|
| 162 |
-
info_if_rank_zero(log, f'Iteration {curr_iter}: validating')
|
| 163 |
-
for data in eval_loader:
|
| 164 |
-
audio_path = trainer.inference_pass(data,
|
| 165 |
-
curr_iter,
|
| 166 |
-
val_cfg,
|
| 167 |
-
save_eval=save_eval)
|
| 168 |
-
distributed.barrier()
|
| 169 |
-
trainer.rng.graphsafe_set_state(train_rng_snapshot)
|
| 170 |
-
trainer.eval(audio_path, curr_iter, val_cfg)
|
| 171 |
-
|
| 172 |
-
curr_iter += 1
|
| 173 |
-
|
| 174 |
-
if curr_iter >= total_iterations:
|
| 175 |
-
break
|
| 176 |
-
except Exception as e:
|
| 177 |
-
log.error(f'Error occurred at iteration {curr_iter}!')
|
| 178 |
-
log.critical(e.message if hasattr(e, 'message') else str(e))
|
| 179 |
-
raise
|
| 180 |
-
finally:
|
| 181 |
-
if not cfg.debug:
|
| 182 |
-
trainer.save_checkpoint(curr_iter)
|
| 183 |
-
trainer.save_weights(curr_iter)
|
| 184 |
-
|
| 185 |
-
# Inference pass
|
| 186 |
-
del trainer
|
| 187 |
-
torch.cuda.empty_cache()
|
| 188 |
-
|
| 189 |
-
# Synthesize EMA
|
| 190 |
-
if local_rank == 0:
|
| 191 |
-
log.info(f'Synthesizing EMA with sigma={cfg.ema.default_output_sigma}')
|
| 192 |
-
ema_sigma = cfg.ema.default_output_sigma
|
| 193 |
-
state_dict = synthesize_ema(cfg, ema_sigma, step=None)
|
| 194 |
-
save_dir = Path(run_dir) / f'{cfg.exp_id}_ema_final.pth'
|
| 195 |
-
torch.save(state_dict, save_dir)
|
| 196 |
-
log.info(f'Synthesized EMA saved to {save_dir}!')
|
| 197 |
-
distributed.barrier()
|
| 198 |
-
|
| 199 |
-
log.info(f'Evaluation: {eval_cfg}')
|
| 200 |
-
sample(eval_cfg)
|
| 201 |
-
|
| 202 |
-
# clean-up
|
| 203 |
-
log.complete()
|
| 204 |
-
distributed.barrier()
|
| 205 |
-
distributed.destroy_process_group()
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
if __name__ == '__main__':
|
| 209 |
-
train()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|