Interrupt Button Update
Stereo wav file
Improved Melody guided, partial
- app.py +32 -11
- audiocraft/data/audio.py +4 -2
- audiocraft/utils/extend.py +9 -1
app.py
CHANGED
```diff
@@ -15,17 +15,20 @@ import time
 import warnings
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-from audiocraft.utils.extend import generate_music_segments, add_settings_to_image
+from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
 import numpy as np
 import random
 
 MODEL = None
 MODELS = None
-IS_SHARED_SPACE = "
+IS_SHARED_SPACE = "Surn/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
 INTERRUPTED = False
 UNLOAD_MODEL = False
 MOVE_TO_CPU = False
 
+def interrupt_callback():
+    return INTERRUPTED
+
 def interrupt():
     global INTERRUPTING
     INTERRUPTING = True
```
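One Python subtlety worth flagging in this hunk: `from audiocraft.utils.extend import INTERRUPTING` copies the current binding into app.py's namespace, so later rebinding a global named `INTERRUPTING` in app.py does not change the flag that extend.py's own loops poll. A minimal runnable sketch of the difference (the toy module stands in for `audiocraft.utils.extend`):

```python
import types

# Toy stand-in for audiocraft.utils.extend, just to show the binding rules.
extend = types.ModuleType("extend")
extend.INTERRUPTING = False

INTERRUPTING = extend.INTERRUPTING  # what `from extend import INTERRUPTING` does

INTERRUPTING = True                 # rebinds only the local name...
print(extend.INTERRUPTING)          # ...False: the module's flag is unchanged

extend.INTERRUPTING = True          # attribute assignment is what the
print(extend.INTERRUPTING)          # module's own loops would observe: True
```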
```diff
@@ -63,9 +66,18 @@ def load_model(version):
 
 
 def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap=1):
-    global MODEL, INTERRUPTED
+    global MODEL, INTERRUPTED, INTERRUPTING
     output_segments = None
-
+
+    INTERRUPTED = False
+    INTERRUPTING = False
+    if temperature < 0:
+        raise gr.Error("Temperature must be >= 0.")
+    if topk < 0:
+        raise gr.Error("Topk must be non-negative.")
+    if topp < 0:
+        raise gr.Error("Topp must be non-negative.")
+
     if MODEL is None or MODEL.name != model:
         MODEL = load_model(model)
     else:
```
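The new guards rely on Gradio's error mechanism: raising `gr.Error` inside an event handler aborts the callback and surfaces the message as an error toast in the UI, so these checks double as user-facing validation. A minimal self-contained sketch (the handler and wiring are illustrative, not from this repo):

```python
import gradio as gr

def validated(temperature: float) -> str:
    # Gradio catches gr.Error, cancels the event, and shows the message.
    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    return f"generating at temperature {temperature}"

demo = gr.Interface(fn=validated, inputs=gr.Number(value=0.75), outputs="text")
# demo.launch()
```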
```diff
@@ -92,6 +104,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         seed = random.randint(0, 0xffff_ffff_ffff)
     torch.manual_seed(seed)
 
+
     print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
     MODEL.set_generation_params(
         use_sampling=True,
```
```diff
@@ -134,6 +147,12 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
             duration -= segment_duration - overlap
             output_segments.append(next_segment)
 
+            if INTERRUPTING:
+                INTERRUPTED = True
+                INTERRUPTING = False
+                print("Function execution interrupted!")
+                raise gr.Error("Interrupted.")
+
     if output_segments:
         try:
             # Combine the output segments into one long audio file or stack tracks
```
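This hunk is the consumer side of the Interrupt button: between segments the loop polls `INTERRUPTING`, records the stop in `INTERRUPTED` (which `interrupt_callback` exposes), and aborts with `gr.Error`. The general cooperative-cancellation shape, reduced to a runnable sketch (the work function is a stand-in, not from the repo):

```python
import time

INTERRUPTING = False
INTERRUPTED = False

def render_segment(i: int) -> str:
    time.sleep(0.1)  # stand-in for an expensive generation step
    return f"segment-{i}"

def generate_segments(n_segments: int) -> list:
    global INTERRUPTED, INTERRUPTING
    results = []
    for i in range(n_segments):
        results.append(render_segment(i))
        if INTERRUPTING:            # polled between segments, so the segment
            INTERRUPTED = True      # currently in flight always completes
            INTERRUPTING = False
            raise RuntimeError("Interrupted.")
    return results
```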
```diff
@@ -143,21 +162,22 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
             output = output_segments[0]
             for i in range(1, len(output_segments)):
                 overlap_samples = overlap * MODEL.sample_rate
-                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]
+                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]], dim=dimension)
             output = output.detach().cpu().float()[0]
         except Exception as e:
             print(f"Error combining segments: {e}. Using the first segment only.")
             output = output_segments[0].detach().cpu().float()[0]
     else:
         output = output.detach().cpu().float()[0]
+
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         if include_settings:
-            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Melody File:#todo"
+            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody File:#todo"
             background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
         audio_write(
             file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=
-        waveform_video = make_waveform(file.name,bg_image=background, bar_count=
+            loudness_headroom_db=19, loudness_compressor=True, add_suffix=False, channels=2)
+        waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
         if MOVE_TO_CPU:
             MODEL.to('cpu')
         if UNLOAD_MODEL:
```
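The completed `torch.cat` line shows how segments are merged: the last `overlap * sample_rate` samples of the running output are dropped before the next segment is appended along `dim` (2 extends time, 1 stacks tracks, per the Dimension slider's info text). A worked example of the bookkeeping, assuming MusicGen's 32 kHz output rate:

```python
import torch

sample_rate, overlap = 32000, 5
a = torch.zeros(1, 1, 30 * sample_rate)   # [batch, channels, time] segments
b = torch.zeros(1, 1, 30 * sample_rate)

overlap_samples = overlap * sample_rate   # 160_000 samples trimmed from `a`
merged = torch.cat([a[:, :, :-overlap_samples], b], dim=2)
print(merged.shape[-1] / sample_rate)     # 55.0: two 30 s segments overlap into 55 s
```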
```diff
@@ -177,6 +197,7 @@ def ui(**kwargs):
             # UnlimitedMusicGen
             This is your private demo for [UnlimitedMusicGen](https://github.com/Oncorporation/audiocraft), a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+            Todo: Working on improved Melody Conditioned Music Generation transitions.
 
             Disclaimer: This won't run on CPU only. Clone this App and run on GPU instance!
             """
```
```diff
@@ -208,12 +229,12 @@
                     with gr.Row():
                         model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                     with gr.Row():
-                        duration = gr.Slider(minimum=1, maximum=
+                        duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
                         overlap = gr.Slider(minimum=1, maximum=29, value=5, step=1, label="Overlap", interactive=True)
                         dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
                     with gr.Row():
-                        topk = gr.Number(label="Top-k", value=250, interactive=True)
-                        topp = gr.Number(label="Top-p", value=0, interactive=True)
+                        topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
+                        topp = gr.Number(label="Top-p", value=0, precision=0, interactive=True)
                         temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
                         cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.5, precision=None, interactive=True)
                     with gr.Row():
```
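Two UI details worth spelling out: `precision=0` makes `gr.Number` deliver whole numbers (top-k must be an int for the sampler; top-p stays at its disabled default of 0 here), and the Duration ceiling of 720 seconds works because generation proceeds in windows of at most 30 seconds (MusicGen's limit), each later window contributing `segment_duration - overlap` seconds of new audio. A rough segment-count estimate at those defaults (the helper is illustrative, not from the repo):

```python
import math

def estimated_segments(duration: float, segment_duration: float = 30.0,
                       overlap: float = 5.0) -> int:
    # First window covers segment_duration seconds; each later window adds
    # (segment_duration - overlap) seconds of new audio.
    if duration <= segment_duration:
        return 1
    return 1 + math.ceil((duration - segment_duration) / (segment_duration - overlap))

print(estimated_segments(720))  # 29 windows at the defaults above
```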
audiocraft/data/audio.py
CHANGED
```diff
@@ -22,7 +22,7 @@ import torchaudio as ta
 
 import av
 
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio
+from .audio_utils import f32_pcm, i16_pcm, normalize_audio, convert_audio
 
 
 _av_initialized = False
```
```diff
@@ -157,7 +157,7 @@ def audio_write(stem_name: tp.Union[str, Path],
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                 loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True) -> Path:
+                add_suffix: bool = True, channels:int = 1) -> Path:
     """Convenience function for saving audio to disk. Returns the filename the audio was written to.
 
     Args:
```
```diff
@@ -190,6 +190,8 @@ def audio_write(stem_name: tp.Union[str, Path],
     wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                           rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
                           sample_rate=sample_rate, stem_name=str(stem_name))
+    if channels > 1:
+        wav = convert_audio(wav,sample_rate, sample_rate, channels)
     kwargs: dict = {}
     if format == 'mp3':
         suffix = '.mp3'
```
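With the new `channels` argument, `audio_write` can up-mix before encoding: when the input and output rates match, audiocraft's `convert_audio` changes only the channel count (here, duplicating mono to stereo for the wav the Space serves). A quick sketch, assuming the `audio_utils` helper's `(wav, from_rate, to_rate, to_channels)` signature:

```python
import torch
from audiocraft.data.audio_utils import convert_audio

sr = 32000
mono = torch.randn(1, sr)                # [channels=1, time]
stereo = convert_audio(mono, sr, sr, 2)  # same rate in and out, so only the
print(stereo.shape)                      # channel count changes: (2, 32000)
```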
audiocraft/utils/extend.py
CHANGED
```diff
@@ -11,6 +11,9 @@ import requests
 from io import BytesIO
 from huggingface_hub import hf_hub_download
 
+
+INTERRUPTING = False
+
 def separate_audio_segments(audio, segment_duration=30, overlap=1):
     sr, audio_data = audio[0], audio[1]
 
```
```diff
@@ -65,6 +68,8 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     # Iterate over the segments to create list of Meldoy tensors
     for segment_idx in range(total_segments):
+        if INTERRUPTING:
+            return [], duration
         print(f"segment {segment_idx + 1} of {total_segments} \r")
         sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
 
```
```diff
@@ -77,6 +82,9 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     torch.manual_seed(seed)
     for idx, verse in enumerate(melodys):
+        if INTERRUPTING:
+            return output_segments, duration - (segment_duration * len(output_segments))
+
         print(f"Generating New Melody Segment {idx + 1}: {text}\r")
         if output_segments:
             # If this isn't the first segment, use the last chunk of the previous segment as the input
```
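Both early returns preserve the function's contract of returning (segments, remaining duration): the first hands back nothing done, the second whatever segments finished plus an estimate of the unrendered time. The arithmetic behind that estimate, with illustrative numbers (note that, as written, it does not account for overlap):

```python
# Bookkeeping behind the second early return, with illustrative numbers.
segment_duration = 30   # seconds per generated window
requested = 90          # duration passed in by the caller
finished_segments = 2   # windows completed before the interrupt hit

remaining = requested - segment_duration * finished_segments
print(remaining)        # 30 seconds still unrendered
```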
```diff
@@ -166,7 +174,7 @@ def load_font(font_name, font_size=16):
 
     if font is None:
         try:
-            font_path = ImageFont.truetype(hf_hub_download(repo_id=
+            font_path = ImageFont.truetype(hf_hub_download(repo_id=os.environ.get('SPACE_ID', ''), filename="assets/" + font_name, repo_type="space"), encoding="UTF-8")
             font = ImageFont.truetype(font_path, font_size)
         except (FileNotFoundError, OSError):
             print("Font not found. Trying to download from local assets folder...\n")
```
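The completed call pulls the font file from the Space's own repo: `hf_hub_download(repo_id=..., filename=..., repo_type="space")` downloads one file and returns its local cache path. Worth noting that `ImageFont.truetype` returns a font object rather than a path, so the `font_path` name in this hunk is a little misleading. A minimal sketch of just the download step (repo id and filename are illustrative):

```python
import os
from huggingface_hub import hf_hub_download

# Fetch an asset from the current Space's repo; returns a local cached path.
path = hf_hub_download(
    repo_id=os.environ.get("SPACE_ID", ""),  # e.g. "user/my-space"
    filename="assets/arial.ttf",             # hypothetical asset
    repo_type="space",
)
print(path)  # pass this path to PIL.ImageFont.truetype(path, font_size)
```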