Rex Cheng committed · Commit 9ac63db · 1 parent: c8ca0bd
zeroGPU

Files changed:
- app.py (+15 -16)
- mmaudio/eval_utils.py (+18 -1)
- mmaudio/utils/download_utils.py (+2 -1)
- requirements.txt (+4 -3)
app.py
CHANGED

@@ -83,14 +83,15 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
     audio = audios.float().cpu()[0]

     # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
-    video_save_path = tempfile.
+    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
     # output_dir.mkdir(exist_ok=True, parents=True)
     # video_save_path = output_dir / f'{current_time_string}.mp4'
-
-
-
-
-
+    make_video(video,
+               video_save_path,
+               audio,
+               sampling_rate=seq_cfg.sampling_rate,
+               duration_sec=seq_cfg.duration)
+    log.info(f'Saved video to {video_save_path}')
     return video_save_path


@@ -116,11 +117,9 @@ def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int,
                            cfg_strength=cfg_strength)
     audio = audios.float().cpu()[0]

-
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # audio_save_path = output_dir / f'{current_time_string}.flac'
-    audio_save_path = tempfile.mktemp(suffix='.flac')
+    audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
+    log.info(f'Saved audio to {audio_save_path}')
     return audio_save_path


@@ -140,8 +139,8 @@ video_to_audio_tab = gr.Interface(
     title='MMAudio — Video-to-Audio Synthesis',
     examples=[
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/
-            '',
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
+            'waves, seagulls',
             '',
             0,
             25,
@@ -185,8 +184,8 @@ video_to_audio_tab = gr.Interface(
             10,
         ],
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/
-            'waves,
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
+            'waves, storm',
             '',
             0,
             25,
@@ -194,8 +193,8 @@ video_to_audio_tab = gr.Interface(
             10,
         ],
         [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/
-            '
+            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
+            '',
             '',
             0,
             25,
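As a side note, the pattern app.py switches to is tempfile.NamedTemporaryFile(delete=False), which actually creates the file on disk, instead of tempfile.mktemp(), which only returns a name and is deprecated because of the race between naming and creating the file. A minimal standalone sketch of that pattern; the sine-wave audio here is placeholder data, not MMAudio output:

import tempfile

import torch
import torchaudio

sampling_rate = 44100
# One second of 440 Hz placeholder audio, shaped (channels, frames) as torchaudio expects.
audio = torch.sin(2 * torch.pi * 440 * torch.arange(sampling_rate) / sampling_rate).unsqueeze(0)

# delete=False keeps the file around after the handle is closed, so its path can be
# handed back to Gradio as the component output.
audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
torchaudio.save(audio_save_path, audio, sampling_rate)
print(f'Saved audio to {audio_save_path}')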
mmaudio/eval_utils.py
CHANGED

@@ -7,6 +7,7 @@ import torch
 from colorlog import ColoredFormatter
 from torchvision.transforms import v2
 from torio.io import StreamingMediaDecoder, StreamingMediaEncoder
+import av

 from mmaudio.model.flow_matching import FlowMatching
 from mmaudio.model.networks import MMAudio
@@ -169,11 +170,13 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=int(_CLIP_FPS * duration_sec),
+        buffer_chunk_size=1,
         frame_rate=_CLIP_FPS,
         format='rgb24',
     )
     reader.add_basic_video_stream(
         frames_per_chunk=int(_SYNC_FPS * duration_sec),
+        buffer_chunk_size=1,
         frame_rate=_SYNC_FPS,
         format='rgb24',
     )
@@ -182,9 +185,14 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
     data_chunk = reader.pop_chunks()
     clip_chunk = data_chunk[0]
     sync_chunk = data_chunk[1]
+    print('clip', clip_chunk.shape, clip_chunk.dtype, clip_chunk.max())
+    print('sync', sync_chunk.shape, sync_chunk.dtype, sync_chunk.max())
     assert clip_chunk is not None
     assert sync_chunk is not None

+    for i in range(reader.num_out_streams):
+        print(reader.get_out_stream_info(i))
+
     clip_frames = clip_transform(clip_chunk)
     sync_frames = sync_transform(sync_chunk)

@@ -210,17 +218,26 @@ def load_video(video_path: Path, duration_sec: float) -> tuple[torch.Tensor, tor
 def make_video(video_path: Path, output_path: Path, audio: torch.Tensor, sampling_rate: int,
                duration_sec: float):

+    av_video = av.open(video_path)
+    frame_rate = av_video.streams.video[0].guessed_rate
+    print('av frame rate', frame_rate)
+
     approx_max_length = int(duration_sec * 60)
     reader = StreamingMediaDecoder(video_path)
     reader.add_basic_video_stream(
         frames_per_chunk=approx_max_length,
+        buffer_chunk_size=1,
         format='rgb24',
     )
     reader.fill_buffer()
     video_chunk = reader.pop_chunks()[0]
+    print(video_chunk.shape, video_chunk.dtype, video_chunk.max())
     assert video_chunk is not None

-    fps = int(reader.get_out_stream_info(0).frame_rate)
+    # fps = int(reader.get_out_stream_info(0).frame_rate)
+    fps = frame_rate
+    for i in range(reader.num_out_streams):
+        print(reader.get_out_stream_info(i))
     if fps > 60:
         log.warning(f'This code supports only up to 60 fps, but the video has {fps} fps')
         log.warning(f'Just change the *60 above me')
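For context, the frame-rate probe added to make_video relies on PyAV's guessed_rate, which wraps FFmpeg's av_guess_frame_rate. A standalone sketch, assuming PyAV is installed; 'input.mp4' is an illustrative path, not a file from the repo:

import av

with av.open('input.mp4') as container:
    stream = container.streams.video[0]
    # guessed_rate asks FFmpeg for its best estimate of the stream's frame rate,
    # returned as a Fraction (e.g. 30000/1001 for 29.97 fps).
    fps = float(stream.guessed_rate)

print(f'Estimated frame rate: {fps:.3f} fps')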
mmaudio/utils/download_utils.py
CHANGED

@@ -30,7 +30,8 @@ links = [
     },
     {
         'name': 'mmaudio_large_44k_v2.pth',
-        'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
+        # 'url': 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/weights/mmaudio_large_44k_v2.pth',
+        'url': 'https://databank.illinois.edu/datafiles/i1pd9/download',
         'md5': '01ad4464f049b2d7efdaa4c1a59b8dfe'
     },
     {
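The only change here points mmaudio_large_44k_v2.pth at an Illinois Data Bank mirror while keeping the original md5 entry. A small sketch of verifying a downloaded copy against that checksum; the local weights path is an assumption, only the hash comes from the diff:

import hashlib
from pathlib import Path

EXPECTED_MD5 = '01ad4464f049b2d7efdaa4c1a59b8dfe'

def md5_of(path: Path, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks so large checkpoints need not fit in memory.
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

weight_path = Path('weights/mmaudio_large_44k_v2.pth')  # assumed download location
assert md5_of(weight_path) == EXPECTED_MD5, 'checksum mismatch, re-download the file'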
requirements.txt
CHANGED

@@ -1,6 +1,6 @@
-torch
-torchaudio
+torch == 2.4.0
 torchvision
+torchaudio
 python-dotenv
 cython
 gitpython >= 3.1
@@ -23,4 +23,5 @@ hydra_colorlog
 tensordict
 colorlog
 open_clip_torch
-soundfile
+soundfile
+av