Alexandre Défossez
committed
Commit 23fe483 • 1 Parent(s): 8e10a53

Improve demo (#51)

* allowing sharing directly, changelog, reduce volume.
* activate
* plop
- CHANGELOG.md +11 -2
- README.md +1 -1
- app.py +11 -9
- app_batched.py +3 -1
- audiocraft/__init__.py +1 -1
- audiocraft/data/audio.py +3 -1
- audiocraft/data/audio_utils.py +9 -4
- audiocraft/models/musicgen.py +2 -0
- audiocraft/modules/conditioners.py +6 -2
CHANGELOG.md
CHANGED

@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [0.0.1] - TBD
+## [0.0.2a] - TBD
 
-Initial release, with model evaluation only.
+Improved demo, fixed top p (thanks @jnordberg).
+
+Compressor tanh on output to avoid clipping with some style (especially piano).
+Now repeating the conditioning periodically if it is too short.
+
+More options when launching Gradio app locally (thanks @ashleykleynhans).
+
+## [0.0.1] - 2023-06-09
+
+Initial release, with model evaluation only.
README.md
CHANGED

@@ -80,7 +80,7 @@ wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), s
 
 for idx, one_wav in enumerate(wav):
     # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
-    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness")
+    audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
 ```
 
 
app.py
CHANGED

@@ -13,7 +13,6 @@ import gradio as gr
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
 
-
 MODEL = None
 
 
@@ -56,7 +55,9 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
 
     output = output.detach().cpu().float()[0]
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-        audio_write(…)
+        audio_write(
+            file.name, output, MODEL.sample_rate, strategy="loudness",
+            loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
         waveform_video = gr.make_waveform(file.name)
     return waveform_video
 
@@ -66,7 +67,7 @@ def ui(**kwargs):
         gr.Markdown(
             """
             # MusicGen
-
+
             This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
             <br/>
@@ -129,19 +130,19 @@ def ui(**kwargs):
         gr.Markdown(
             """
             ### More details
-
+
             The model will generate a short music extract based on the description you provided.
             You can generate up to 30 seconds of audio.
-
+
             We present 4 model variations:
             1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
             2. Small -- a 300M transformer decoder conditioned on text only.
             3. Medium -- a 1.5B transformer decoder conditioned on text only.
             4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
-
+
             When using `melody`, ou can optionaly provide a reference audio from
             which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
-
+
             You can also use your own GPU or a Google Colab by following the instructions on our repo.
             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
             for more details.
@@ -168,7 +169,8 @@ def ui(**kwargs):
     if share:
         launch_kwargs['share'] = share
 
-    interface.launch(**launch_kwargs)
+    interface.queue().launch(**launch_kwargs, max_threads=1)
+
 
 if __name__ == "__main__":
     # torch.cuda.set_per_process_memory_fraction(0.48)
@@ -207,4 +209,4 @@ if __name__ == "__main__":
         server_port=args.server_port,
         share=args.share,
         listen=args.listen
-    )
+    )
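The launch change above is what allows sharing the demo directly: requests go through a Gradio queue (so the single model instance handles them one at a time by default) and the `share` flag from the CLI is forwarded to `launch`. A minimal sketch of that pattern, with a stand-in demo function and illustrative flag values (not the app's real ones):

import gradio as gr

def fake_generate(text):
    # stand-in for the real predict() function in app.py
    return text

demo = gr.Interface(fn=fake_generate, inputs="text", outputs="text")
launch_kwargs = {"share": True}  # in app.py this dict is filled from argparse flags
# queue() serializes incoming requests; max_threads=1 keeps a single worker thread,
# and share=True exposes a temporary public gradio.live URL.
demo.queue().launch(**launch_kwargs, max_threads=1)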
app_batched.py
CHANGED

@@ -57,7 +57,9 @@ def predict(texts, melodies):
     out_files = []
     for output in outputs:
         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-            audio_write(…)
+            audio_write(
+                file.name, output, MODEL.sample_rate, strategy="loudness",
+                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
             waveform_video = gr.make_waveform(file.name)
             out_files.append(waveform_video)
     return [out_files]
audiocraft/__init__.py
CHANGED

@@ -7,4 +7,4 @@
 # flake8: noqa
 from . import data, modules, models
 
-__version__ = '0.0.…'
+__version__ = '0.0.2a1'
audiocraft/data/audio.py
CHANGED

@@ -155,6 +155,7 @@ def audio_write(stem_name: tp.Union[str, Path],
                 format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
                 strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
                 add_suffix: bool = True) -> Path:
     """Convenience function for saving audio to disk. Returns the filename the audio was written to.
@@ -173,7 +174,8 @@ def audio_write(stem_name: tp.Union[str, Path],
         rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
+        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
+        when strategy is 'loudness'log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         make_parent_dir (bool): Make parent directory if it doesn't exist.
     Returns:
audiocraft/data/audio_utils.py
CHANGED

@@ -54,8 +54,8 @@ def convert_audio(wav: torch.Tensor, from_rate: float,
     return wav
 
 
-def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = …
-                       energy_floor: float = 2e-3):
+def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
+                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
     """Normalize an input signal to a user loudness in dB LKFS.
     Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
 
@@ -63,6 +63,7 @@ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db
         wav (torch.Tensor): Input multichannel audio data.
         sample_rate (int): Sample rate.
         loudness_headroom_db (float): Target loudness of the output in dB LUFS.
+        loudness_compressor (bool): Uses tanh for soft clipping.
         energy_floor (float): anything below that RMS level will not be rescaled.
     Returns:
         output (torch.Tensor): Loudness normalized output data.
@@ -76,6 +77,8 @@ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db
     delta_loudness = -loudness_headroom_db - input_loudness_db
     gain = 10.0 ** (delta_loudness / 20.0)
     output = gain * wav
+    if loudness_compressor:
+        output = torch.tanh(output)
     assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
     return output
 
@@ -93,7 +96,8 @@ def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optio
 def normalize_audio(wav: torch.Tensor, normalize: bool = True,
                     strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                     rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                    log_clipping: bool = False, sample_rate: tp.Optional[int] = None,
+                    loudness_compressor: bool = False, log_clipping: bool = False,
+                    sample_rate: tp.Optional[int] = None,
                     stem_name: tp.Optional[str] = None) -> torch.Tensor:
     """Normalize the audio according to the prescribed strategy (see after).
 
@@ -109,6 +113,7 @@ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
         rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
             than the `peak_clip` one to avoid further clipping.
         loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): If True, uses tanh based soft clipping.
         log_clipping (bool): If True, basic logging on stderr when clipping still
             occurs despite strategy (only for 'rms').
         sample_rate (int): Sample rate for the audio data (required for loudness).
@@ -132,7 +137,7 @@ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
         _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
     elif strategy == 'loudness':
         assert sample_rate is not None, "Loudness normalization requires sample rate."
-        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db)
+        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
         _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
     else:
         assert wav.abs().max() < 1
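To see what the new `loudness_compressor` flag does end to end, here is a small self-contained sketch of the same gain-plus-tanh logic (the measured input loudness is a made-up value for illustration; the real code measures it per ITU-R BS.1770 as documented above):

import torch

wav = 0.5 * torch.sin(2 * torch.pi * 440 * torch.linspace(0, 1, 32000))  # 1 s of a 440 Hz tone
input_loudness_db = -30.0        # pretend the loudness meter reported -30 dB LUFS
loudness_headroom_db = 14.0      # target of -14 dB LUFS, as in audio_write above
delta_loudness = -loudness_headroom_db - input_loudness_db   # +16 dB of gain
gain = 10.0 ** (delta_loudness / 20.0)
output = gain * wav              # peaks now exceed 1.0 and would hard-clip on save
output = torch.tanh(output)      # loudness_compressor=True: soft clipping instead
print(output.abs().max())        # strictly below 1.0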
audiocraft/models/musicgen.py
CHANGED

@@ -88,6 +88,8 @@ class MusicGen:
         cache_dir = os.environ.get('MUSICGEN_ROOT', None)
         compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
         lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+        if name == 'melody' and True:
+            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
 
         return MusicGen(name, compression_model, lm)
 
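The effect of this flag can be checked from the user side. A rough sketch, assuming the usual `MusicGen.get_pretrained` entry point, a downloaded `melody` checkpoint, and that the language model is exposed as `model.lm` (assumptions about the surrounding API, not shown in this diff):

from audiocraft.models import MusicGen

# After this commit, loading the melody model enables match_len_on_eval on its
# waveform (chroma) conditioner, so a short reference melody is repeated
# periodically instead of zero-padded (see conditioners.py below).
model = MusicGen.get_pretrained('melody')
conditioner = model.lm.condition_provider.conditioners['self_wav']
print(conditioner.match_len_on_eval)  # expected: True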
audiocraft/modules/conditioners.py
CHANGED

@@ -9,6 +9,7 @@ from copy import deepcopy
 from dataclasses import dataclass, field
 from itertools import chain
 import logging
+import math
 import random
 import re
 import typing as tp
@@ -484,7 +485,7 @@ class ChromaStemConditioner(WaveformConditioner):
         **kwargs: Additional parameters for the chroma extractor.
     """
     def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
-                 duration: float, match_len_on_eval: bool = …
+                 duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
                  n_eval_wavs: int = 0, device: tp.Union[torch.device, str] = "cpu", **kwargs):
         from demucs import pretrained
         super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
@@ -535,7 +536,10 @@ class ChromaStemConditioner(WaveformConditioner):
             chroma = chroma[:, :self.chroma_len]
             logger.debug(f'chroma was truncated! ({t} -> {chroma.shape[1]})')
         elif t < self.chroma_len:
-            chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+            # chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+            n_repeat = int(math.ceil(self.chroma_len / t))
+            chroma = chroma.repeat(1, n_repeat, 1)
+            chroma = chroma[:, :self.chroma_len]
             logger.debug(f'chroma was zero-padded! ({t} -> {chroma.shape[1]})')
         return chroma
 
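The repeat-and-trim logic added above is easy to sanity-check in isolation. A minimal sketch with a dummy chroma tensor (the shapes are illustrative, not the model's actual frame counts):

import math
import torch

chroma = torch.randn(1, 50, 12)   # (batch, time, n_chroma): a short 50-frame melody
chroma_len = 235                  # hypothetical target length in frames
t = chroma.shape[1]
if t < chroma_len:
    n_repeat = int(math.ceil(chroma_len / t))
    chroma = chroma.repeat(1, n_repeat, 1)   # tile the melody periodically along time
    chroma = chroma[:, :chroma_len]          # then trim to the exact target length
print(chroma.shape)  # torch.Size([1, 235, 12])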