Interrupt Button Update
Stereo wav file
Improved Melody guided, partial
- app.py +32 -11
- audiocraft/data/audio.py +4 -2
- audiocraft/utils/extend.py +9 -1
app.py
CHANGED
```diff
@@ -15,17 +15,20 @@ import time
 import warnings
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-from audiocraft.utils.extend import generate_music_segments, add_settings_to_image
+from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
 import numpy as np
 import random
 
 MODEL = None
 MODELS = None
-IS_SHARED_SPACE = "
+IS_SHARED_SPACE = "Surn/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
 INTERRUPTED = False
 UNLOAD_MODEL = False
 MOVE_TO_CPU = False
 
+def interrupt_callback():
+    return INTERRUPTED
+
 def interrupt():
     global INTERRUPTING
     INTERRUPTING = True
```
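One Python subtlety worth flagging in this hunk: `from audiocraft.utils.extend import INTERRUPTING` copies the current binding into app.py's namespace, so later rebinding a global named `INTERRUPTING` in app.py does not change the flag that extend.py's own loops poll. A minimal runnable sketch of the difference (the toy module stands in for `audiocraft.utils.extend`):

```python
import types

# Toy stand-in for audiocraft.utils.extend, just to show the binding rules.
extend = types.ModuleType("extend")
extend.INTERRUPTING = False

INTERRUPTING = extend.INTERRUPTING  # what `from extend import INTERRUPTING` does

INTERRUPTING = True                 # rebinds only the local name...
print(extend.INTERRUPTING)          # ...False: the module's flag is unchanged

extend.INTERRUPTING = True          # attribute assignment is what the
print(extend.INTERRUPTING)          # module's own loops would observe: True
```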
```diff
@@ -63,9 +66,18 @@ def load_model(version):
 
 
 def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap=1):
-    global MODEL, INTERRUPTED
+    global MODEL, INTERRUPTED, INTERRUPTING
     output_segments = None
-
+
+    INTERRUPTED = False
+    INTERRUPTING = False
+    if temperature < 0:
+        raise gr.Error("Temperature must be >= 0.")
+    if topk < 0:
+        raise gr.Error("Topk must be non-negative.")
+    if topp < 0:
+        raise gr.Error("Topp must be non-negative.")
+
     if MODEL is None or MODEL.name != model:
         MODEL = load_model(model)
     else:
```
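The new guards rely on Gradio's error mechanism: raising `gr.Error` inside an event handler aborts the callback and surfaces the message as an error toast in the UI, so these checks double as user-facing validation. A minimal self-contained sketch (the handler and wiring are illustrative, not from this repo):

```python
import gradio as gr

def validated(temperature: float) -> str:
    # Gradio catches gr.Error, cancels the event, and shows the message.
    if temperature < 0:
        raise gr.Error("Temperature must be >= 0.")
    return f"generating at temperature {temperature}"

demo = gr.Interface(fn=validated, inputs=gr.Number(value=0.75), outputs="text")
# demo.launch()
```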
```diff
@@ -92,6 +104,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
         seed = random.randint(0, 0xffff_ffff_ffff)
     torch.manual_seed(seed)
 
+
     print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
     MODEL.set_generation_params(
         use_sampling=True,
```
```diff
@@ -134,6 +147,12 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
             duration -= segment_duration - overlap
             output_segments.append(next_segment)
 
+            if INTERRUPTING:
+                INTERRUPTED = True
+                INTERRUPTING = False
+                print("Function execution interrupted!")
+                raise gr.Error("Interrupted.")
+
     if output_segments:
         try:
             # Combine the output segments into one long audio file or stack tracks
```
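This hunk is the consumer side of the Interrupt button: between segments the loop polls `INTERRUPTING`, records the stop in `INTERRUPTED` (which `interrupt_callback` exposes), and aborts with `gr.Error`. The general cooperative-cancellation shape, reduced to a runnable sketch (the work function is a stand-in, not from the repo):

```python
import time

INTERRUPTING = False
INTERRUPTED = False

def render_segment(i: int) -> str:
    time.sleep(0.1)  # stand-in for an expensive generation step
    return f"segment-{i}"

def generate_segments(n_segments: int) -> list:
    global INTERRUPTED, INTERRUPTING
    results = []
    for i in range(n_segments):
        results.append(render_segment(i))
        if INTERRUPTING:            # polled between segments, so the segment
            INTERRUPTED = True      # currently in flight always completes
            INTERRUPTING = False
            raise RuntimeError("Interrupted.")
    return results
```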
```diff
@@ -143,21 +162,22 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
             output = output_segments[0]
             for i in range(1, len(output_segments)):
                 overlap_samples = overlap * MODEL.sample_rate
-                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]
+                output = torch.cat([output[:, :, :-overlap_samples], output_segments[i]], dim=dimension)
             output = output.detach().cpu().float()[0]
         except Exception as e:
             print(f"Error combining segments: {e}. Using the first segment only.")
             output = output_segments[0].detach().cpu().float()[0]
     else:
         output = output.detach().cpu().float()[0]
+
     with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
         if include_settings:
-            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Melody File:#todo"
+            video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody File:#todo"
             background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
         audio_write(
             file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=
-        waveform_video = make_waveform(file.name,bg_image=background, bar_count=
+            loudness_headroom_db=19, loudness_compressor=True, add_suffix=False, channels=2)
+        waveform_video = make_waveform(file.name,bg_image=background, bar_count=45)
         if MOVE_TO_CPU:
             MODEL.to('cpu')
         if UNLOAD_MODEL:
```
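The completed `torch.cat` line shows how segments are merged: the last `overlap * sample_rate` samples of the running output are dropped before the next segment is appended along `dim` (2 extends time, 1 stacks tracks, per the Dimension slider's info text). A worked example of the bookkeeping, assuming MusicGen's 32 kHz output rate:

```python
import torch

sample_rate, overlap = 32000, 5
a = torch.zeros(1, 1, 30 * sample_rate)   # [batch, channels, time] segments
b = torch.zeros(1, 1, 30 * sample_rate)

overlap_samples = overlap * sample_rate   # 160_000 samples trimmed from `a`
merged = torch.cat([a[:, :, :-overlap_samples], b], dim=2)
print(merged.shape[-1] / sample_rate)     # 55.0: two 30 s segments overlap into 55 s
```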
```diff
@@ -177,6 +197,7 @@ def ui(**kwargs):
             # UnlimitedMusicGen
             This is your private demo for [UnlimitedMusicGen](https://github.com/Oncorporation/audiocraft), a simple and controllable model for music generation
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+            Todo: Working on improved Melody Conditioned Music Generation transitions.
 
             Disclaimer: This won't run on CPU only. Clone this App and run on GPU instance!
             """
```
```diff
@@ -208,12 +229,12 @@
                     with gr.Row():
                         model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
                     with gr.Row():
-                        duration = gr.Slider(minimum=1, maximum=
+                        duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
                         overlap = gr.Slider(minimum=1, maximum=29, value=5, step=1, label="Overlap", interactive=True)
                         dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
                     with gr.Row():
-                        topk = gr.Number(label="Top-k", value=250, interactive=True)
-                        topp = gr.Number(label="Top-p", value=0, interactive=True)
+                        topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
+                        topp = gr.Number(label="Top-p", value=0, precision=0, interactive=True)
                         temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
                         cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.5, precision=None, interactive=True)
                     with gr.Row():
```
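Two UI details worth spelling out: `precision=0` makes `gr.Number` deliver whole numbers (top-k must be an int for the sampler; top-p stays at its disabled default of 0 here), and the Duration ceiling of 720 seconds works because generation proceeds in windows of at most 30 seconds (MusicGen's limit), each later window contributing `segment_duration - overlap` seconds of new audio. A rough segment-count estimate at those defaults (the helper is illustrative, not from the repo):

```python
import math

def estimated_segments(duration: float, segment_duration: float = 30.0,
                       overlap: float = 5.0) -> int:
    # First window covers segment_duration seconds; each later window adds
    # (segment_duration - overlap) seconds of new audio.
    if duration <= segment_duration:
        return 1
    return 1 + math.ceil((duration - segment_duration) / (segment_duration - overlap))

print(estimated_segments(720))  # 29 windows at the defaults above
```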
audiocraft/data/audio.py
CHANGED
```diff
@@ -22,7 +22,7 @@ import torchaudio as ta
 
 import av
 
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio
+from .audio_utils import f32_pcm, i16_pcm, normalize_audio, convert_audio
 
 
 _av_initialized = False
```
```diff
@@ -157,7 +157,7 @@ def audio_write(stem_name: tp.Union[str, Path],
                 rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                 loudness_compressor: bool = False,
                 log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True) -> Path:
+                add_suffix: bool = True, channels:int = 1) -> Path:
     """Convenience function for saving audio to disk. Returns the filename the audio was written to.
 
     Args:
```
```diff
@@ -190,6 +190,8 @@ def audio_write(stem_name: tp.Union[str, Path],
     wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                           rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
                           sample_rate=sample_rate, stem_name=str(stem_name))
+    if channels > 1:
+        wav = convert_audio(wav,sample_rate, sample_rate, channels)
     kwargs: dict = {}
     if format == 'mp3':
         suffix = '.mp3'
```
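With the new `channels` argument, `audio_write` can up-mix before encoding: when the input and output rates match, audiocraft's `convert_audio` changes only the channel count (here, duplicating mono to stereo for the wav the Space serves). A quick sketch, assuming the `audio_utils` helper's `(wav, from_rate, to_rate, to_channels)` signature:

```python
import torch
from audiocraft.data.audio_utils import convert_audio

sr = 32000
mono = torch.randn(1, sr)                # [channels=1, time]
stereo = convert_audio(mono, sr, sr, 2)  # same rate in and out, so only the
print(stereo.shape)                      # channel count changes: (2, 32000)
```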
audiocraft/utils/extend.py
CHANGED
```diff
@@ -11,6 +11,9 @@ import requests
 from io import BytesIO
 from huggingface_hub import hf_hub_download
 
+
+INTERRUPTING = False
+
 def separate_audio_segments(audio, segment_duration=30, overlap=1):
     sr, audio_data = audio[0], audio[1]
 
```
```diff
@@ -65,6 +68,8 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     # Iterate over the segments to create list of Meldoy tensors
     for segment_idx in range(total_segments):
+        if INTERRUPTING:
+            return [], duration
         print(f"segment {segment_idx + 1} of {total_segments} \r")
         sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
 
```
```diff
@@ -77,6 +82,9 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
 
     torch.manual_seed(seed)
     for idx, verse in enumerate(melodys):
+        if INTERRUPTING:
+            return output_segments, duration - (segment_duration * len(output_segments))
+
         print(f"Generating New Melody Segment {idx + 1}: {text}\r")
         if output_segments:
             # If this isn't the first segment, use the last chunk of the previous segment as the input
```
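Both early returns preserve the function's contract of returning (segments, remaining duration): the first hands back nothing done, the second whatever segments finished plus an estimate of the unrendered time. The arithmetic behind that estimate, with illustrative numbers (note that, as written, it does not account for overlap):

```python
# Bookkeeping behind the second early return, with illustrative numbers.
segment_duration = 30   # seconds per generated window
requested = 90          # duration passed in by the caller
finished_segments = 2   # windows completed before the interrupt hit

remaining = requested - segment_duration * finished_segments
print(remaining)        # 30 seconds still unrendered
```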
```diff
@@ -166,7 +174,7 @@ def load_font(font_name, font_size=16):
 
     if font is None:
         try:
-            font_path = ImageFont.truetype(hf_hub_download(repo_id=
+            font_path = ImageFont.truetype(hf_hub_download(repo_id=os.environ.get('SPACE_ID', ''), filename="assets/" + font_name, repo_type="space"), encoding="UTF-8")
             font = ImageFont.truetype(font_path, font_size)
         except (FileNotFoundError, OSError):
             print("Font not found. Trying to download from local assets folder...\n")
```
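The completed call pulls the font file from the Space's own repo: `hf_hub_download(repo_id=..., filename=..., repo_type="space")` downloads one file and returns its local cache path. Worth noting that `ImageFont.truetype` returns a font object rather than a path, so the `font_path` name in this hunk is a little misleading. A minimal sketch of just the download step (repo id and filename are illustrative):

```python
import os
from huggingface_hub import hf_hub_download

# Fetch an asset from the current Space's repo; returns a local cached path.
path = hf_hub_download(
    repo_id=os.environ.get("SPACE_ID", ""),  # e.g. "user/my-space"
    filename="assets/arial.ttf",             # hypothetical asset
    repo_type="space",
)
print(path)  # pass this path to PIL.ImageFont.truetype(path, font_size)
```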