Surn committed on
Commit
1dda6b6
1 Parent(s): aef5578

Update Overlap Action in Melody

Browse files
app.py CHANGED
@@ -100,6 +100,8 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
100
  temperature=temperature,
101
  cfg_coef=cfg_coef,
102
  duration=segment_duration,
 
 
103
  )
104
 
105
  if melody:
@@ -201,7 +203,7 @@ def ui(**kwargs):
201
  include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
202
  with gr.Row():
203
  title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
204
- settings_font = gr.Text(label="Settings Font", value="arial.ttf", interactive=True)
205
  settings_font_color = gr.ColorPicker(label="Settings Font Color", value="#ffffff", interactive=True)
206
  with gr.Row():
207
  model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
@@ -212,8 +214,8 @@ def ui(**kwargs):
212
  with gr.Row():
213
  topk = gr.Number(label="Top-k", value=250, interactive=True)
214
  topp = gr.Number(label="Top-p", value=0, interactive=True)
215
- temperature = gr.Number(label="Randomness Temperature", value=1.0, precision=2, interactive=True)
216
- cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.0, precision=2, interactive=True)
217
  with gr.Row():
218
  seed = gr.Number(label="Seed", value=-1, precision=0, interactive=True)
219
  gr.Button('\U0001f3b2\ufe0f').style(full_width=False).click(fn=lambda: -1, outputs=[seed], queue=False)
 
100
  temperature=temperature,
101
  cfg_coef=cfg_coef,
102
  duration=segment_duration,
103
+ two_step_cfg=False,
104
+ rep_penalty=0.5
105
  )
106
 
107
  if melody:
 
203
  include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
204
  with gr.Row():
205
  title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
206
+ settings_font = gr.Text(label="Settings Font", value="./assets/arial.ttf", interactive=True)
207
  settings_font_color = gr.ColorPicker(label="Settings Font Color", value="#ffffff", interactive=True)
208
  with gr.Row():
209
  model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
 
214
  with gr.Row():
215
  topk = gr.Number(label="Top-k", value=250, interactive=True)
216
  topp = gr.Number(label="Top-p", value=0, interactive=True)
217
+ temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
218
+ cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.5, precision=None, interactive=True)
219
  with gr.Row():
220
  seed = gr.Number(label="Seed", value=-1, precision=0, interactive=True)
221
  gr.Button('\U0001f3b2\ufe0f').style(full_width=False).click(fn=lambda: -1, outputs=[seed], queue=False)
audiocraft/models/musicgen.py CHANGED
@@ -97,7 +97,7 @@ class MusicGen:
97
  def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
98
  top_p: float = 0.0, temperature: float = 1.0,
99
  duration: float = 30.0, cfg_coef: float = 3.0,
100
- two_step_cfg: bool = False):
101
  """Set the generation parameters for MusicGen.
102
 
103
  Args:
@@ -110,6 +110,7 @@ class MusicGen:
110
  two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
111
  instead of batching together the two. This has some impact on how things
112
  are padded but seems to have little impact in practice.
 
113
  """
114
  assert duration <= 30, "The MusicGen cannot generate more than 30 seconds"
115
  self.generation_params = {
@@ -119,7 +120,7 @@ class MusicGen:
119
  'top_k': top_k,
120
  'top_p': top_p,
121
  'cfg_coef': cfg_coef,
122
- 'two_step_cfg': two_step_cfg,
123
  }
124
 
125
  def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
@@ -177,6 +178,58 @@ class MusicGen:
177
  assert prompt_tokens is None
178
  return self._generate_tokens(attributes, prompt_tokens, progress)
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
181
  descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
182
  progress: bool = False) -> torch.Tensor:
 
97
  def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
98
  top_p: float = 0.0, temperature: float = 1.0,
99
  duration: float = 30.0, cfg_coef: float = 3.0,
100
+ two_step_cfg: bool = False, rep_penalty: float = None):
101
  """Set the generation parameters for MusicGen.
102
 
103
  Args:
 
110
  two_step_cfg (bool, optional): If True, performs 2 forward for Classifier Free Guidance,
111
  instead of batching together the two. This has some impact on how things
112
  are padded but seems to have little impact in practice.
113
+ rep_penalty (float, optional): If set, use repetition penalty during generation. Not Implemented.
114
  """
115
  assert duration <= 30, "The MusicGen cannot generate more than 30 seconds"
116
  self.generation_params = {
 
120
  'top_k': top_k,
121
  'top_p': top_p,
122
  'cfg_coef': cfg_coef,
123
+ 'two_step_cfg': two_step_cfg,
124
  }
125
 
126
  def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
 
178
  assert prompt_tokens is None
179
  return self._generate_tokens(attributes, prompt_tokens, progress)
180
 
181
+ def generate_with_all(self, descriptions: tp.List[str], melody_wavs: MelodyType,
182
+ sample_rate: int, progress: bool = False, prompt: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
183
+ """Generate samples conditioned on text and melody and audio prompts.
184
+ Args:
185
+ descriptions (tp.List[str]): A list of strings used as text conditioning.
186
+ melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
187
+ melody conditioning. Should have shape [B, C, T] with B matching the description length,
188
+ C=1 or 2. It can be [C, T] if there is a single description. It can also be
189
+ a list of [C, T] tensors.
190
+ sample_rate: (int): Sample rate of the melody waveforms.
191
+ progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
192
+ prompt (torch.Tensor): A batch of waveforms used for continuation.
193
+ Prompt should be [B, C, T], or [C, T] if only one sample is generated.
194
+ """
195
+ if isinstance(melody_wavs, torch.Tensor):
196
+ if melody_wavs.dim() == 2:
197
+ melody_wavs = melody_wavs[None]
198
+ if melody_wavs.dim() != 3:
199
+ raise ValueError("Melody wavs should have a shape [B, C, T].")
200
+ melody_wavs = list(melody_wavs)
201
+ else:
202
+ for melody in melody_wavs:
203
+ if melody is not None:
204
+ assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
205
+
206
+ melody_wavs = [
207
+ convert_audio(wav, sample_rate, self.sample_rate, self.audio_channels)
208
+ if wav is not None else None
209
+ for wav in melody_wavs]
210
+ attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
211
+ melody_wavs=melody_wavs)
212
+
213
+ if prompt is not None:
214
+ if prompt.dim() == 2:
215
+ prompt = prompt[None]
216
+ if prompt.dim() != 3:
217
+ raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
218
+ prompt = convert_audio(prompt, sample_rate, self.sample_rate, self.audio_channels)
219
+ if descriptions is None:
220
+ descriptions = [None] * len(prompt)
221
+
222
+ if prompt is not None:
223
+ attributes_gen, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
224
+
225
+ #attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=prompt,
226
+ # melody_wavs=melody_wavs)
227
+ if prompt is not None:
228
+ assert prompt_tokens is not None
229
+ else:
230
+ assert prompt_tokens is None
231
+ return self._generate_tokens(attributes, prompt_tokens, progress)
232
+
233
  def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
234
  descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
235
  progress: bool = False) -> torch.Tensor:
audiocraft/utils/extend.py CHANGED
@@ -22,12 +22,15 @@ def separate_audio_segments(audio, segment_duration=30, overlap=1):
22
  start_sample = 0
23
 
24
  while total_samples >= segment_samples:
 
 
 
25
  end_sample = start_sample + segment_samples
26
  segment = audio_data[start_sample:end_sample]
27
  segments.append((sr, segment))
28
 
29
  start_sample += segment_samples - overlap_samples
30
- total_samples -= segment_samples - overlap_samples
31
 
32
  # Collect the final segment
33
  if total_samples > 0:
@@ -38,17 +41,16 @@ def separate_audio_segments(audio, segment_duration=30, overlap=1):
38
 
39
  def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:int=1, segment_duration:int=30):
40
  # generate audio segments
41
- melody_segments = separate_audio_segments(melody, segment_duration, overlap)
42
 
43
  # Create a list to store the melody tensors for each segment
44
  melodys = []
45
  output_segments = []
 
 
46
 
47
  # Calculate the total number of segments
48
  total_segments = max(math.ceil(duration / segment_duration),1)
49
- # account for overlap
50
- duration = duration + (max((total_segments - 1),0) * overlap)
51
- total_segments = max(math.ceil(duration / segment_duration),1)
52
  #calc excess duration
53
  excess_duration = segment_duration - (total_segments * segment_duration - duration)
54
  print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
@@ -76,11 +78,15 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
76
  torch.manual_seed(seed)
77
  for idx, verse in enumerate(melodys):
78
  print(f"Generating New Melody Segment {idx + 1}: {text}\r")
79
- output = MODEL.generate_with_chroma(
 
 
 
80
  descriptions=[text],
81
  melody_wavs=verse,
82
- melody_sample_rate=sr,
83
- progress=True
 
84
  )
85
 
86
  # Append the generated output to the list of segments
@@ -151,24 +157,31 @@ def load_font(font_name, font_size=16):
151
  Example:
152
  font = load_font("Arial.ttf", font_size=20)
153
  """
154
-
155
- try:
156
- font = ImageFont.truetype(font_name, font_size)
157
- except (FileNotFoundError, OSError):
158
  try:
159
  font = ImageFont.truetype(font_name, font_size)
160
- print("Font not found. Downloading from Hugging Face model hub...\n")
161
- except:
 
162
  try:
163
- req = requests.get(font_name)
164
- font = ImageFont.truetype(BytesIO(req.content), font_size)
165
- print("Font not found. Downloading from URL...\n")
166
- except:
167
- try:
168
- font = ImageFont.truetype(hf_hub_download("/assets", font_name), encoding="UTF-8")
169
- print(f"Font not found: {font_name} Using default font\n")
170
- except:
171
- font = ImageFont.load_default()
 
 
 
 
 
 
 
 
172
  return font
173
 
174
 
 
22
  start_sample = 0
23
 
24
  while total_samples >= segment_samples:
25
+ # Collect the segment
26
+ # the end sample is the start sample plus the segment samples,
27
+ # the start sample, after 0, is minus the overlap samples to account for the overlap
28
  end_sample = start_sample + segment_samples
29
  segment = audio_data[start_sample:end_sample]
30
  segments.append((sr, segment))
31
 
32
  start_sample += segment_samples - overlap_samples
33
+ total_samples -= segment_samples
34
 
35
  # Collect the final segment
36
  if total_samples > 0:
 
41
 
42
  def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:int=1, segment_duration:int=30):
43
  # generate audio segments
44
+ melody_segments = separate_audio_segments(melody, segment_duration, 0)
45
 
46
  # Create a list to store the melody tensors for each segment
47
  melodys = []
48
  output_segments = []
49
+ last_chunk = []
50
+ text += ", seed=" + str(seed)
51
 
52
  # Calculate the total number of segments
53
  total_segments = max(math.ceil(duration / segment_duration),1)
 
 
 
54
  #calc excess duration
55
  excess_duration = segment_duration - (total_segments * segment_duration - duration)
56
  print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
 
78
  torch.manual_seed(seed)
79
  for idx, verse in enumerate(melodys):
80
  print(f"Generating New Melody Segment {idx + 1}: {text}\r")
81
+ if output_segments:
82
+ # If this isn't the first segment, use the last chunk of the previous segment as the input
83
+ last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
84
+ output = MODEL.generate_with_all(
85
  descriptions=[text],
86
  melody_wavs=verse,
87
+ sample_rate=sr,
88
+ progress=True,
89
+ prompt=last_chunk if len(last_chunk) > 0 else None,
90
  )
91
 
92
  # Append the generated output to the list of segments
 
157
  Example:
158
  font = load_font("Arial.ttf", font_size=20)
159
  """
160
+ font = None
161
+ if not "http" in font_name:
 
 
162
  try:
163
  font = ImageFont.truetype(font_name, font_size)
164
+ except (FileNotFoundError, OSError):
165
+ print("Font not found. Trying to download from local assets folder...\n")
166
+ if font is None:
167
  try:
168
+ font = ImageFont.truetype("assets/" + font_name, font_size)
169
+ except (FileNotFoundError, OSError):
170
+ print("Font not found. Trying to download from URL...\n")
171
+
172
+ if font is None:
173
+ try:
174
+ req = requests.get(font_name)
175
+ font = ImageFont.truetype(BytesIO(req.content), font_size)
176
+ except (FileNotFoundError, OSError):
177
+ print(f"Font found: {font_name} Using Hugging Face download font\n")
178
+
179
+ if font is None:
180
+ try:
181
+ font = ImageFont.truetype(hf_hub_download("assets", font_name), encoding="UTF-8")
182
+ except (FileNotFoundError, OSError):
183
+ font = ImageFont.load_default()
184
+ print(f"Font not found: {font_name} Using default font\n")
185
  return font
186
 
187