Surn committed on
Commit
a8a94b6
·
1 Parent(s): 95ec64d

Prompt conditioning sample segments (-1 conditions each verse)

Browse files

Update title to Melody Conditioning file name upon load
Separate title and settings inclusions to background
Fixed a bug in my 6/19 code... stupid logical mistake

Files changed (3) hide show
  1. app.py +70 -16
  2. assets/favicon.ico +0 -0
  3. audiocraft/utils/extend.py +46 -33
app.py CHANGED
@@ -19,6 +19,8 @@ from audiocraft.data.audio_utils import apply_fade, apply_tafade
19
  from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
20
  import numpy as np
21
  import random
 
 
22
 
23
  MODEL = None
24
  MODELS = None
@@ -26,6 +28,7 @@ IS_SHARED_SPACE = "Surn/UnlimitedMusicGen" in os.environ.get('SPACE_ID', '')
26
  INTERRUPTED = False
27
  UNLOAD_MODEL = False
28
  MOVE_TO_CPU = False
 
29
 
30
  def interrupt_callback():
31
  return INTERRUPTED
@@ -65,11 +68,53 @@ def load_model(version):
65
  print("Cached model loaded in %.2fs" % (time.monotonic() - t1))
66
  return result
67
 
68
-
69
- def predict(model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap=1):
70
- global MODEL, INTERRUPTED, INTERRUPTING
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  output_segments = None
72
-
73
  INTERRUPTED = False
74
  INTERRUPTING = False
75
  if temperature < 0:
@@ -126,7 +171,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
126
  if melody:
127
  # todo return excess duration, load next model and continue in loop structure building up output_segments
128
  if duration > MODEL.lm.cfg.dataset.segment_duration:
129
- output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration)
130
  else:
131
  # pure original code
132
  sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
@@ -191,10 +236,10 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
191
  else:
192
  output = output.detach().cpu().float()[0]
193
 
194
- with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
195
- if include_settings:
196
- video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody File:#todo"
197
- background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
198
  audio_write(
199
  file.name, output, MODEL.sample_rate, strategy="loudness",
200
  loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
@@ -210,6 +255,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
210
  def ui(**kwargs):
211
  css="""
212
  #col-container {max-width: 910px; margin-left: auto; margin-right: auto;}
 
213
  a {text-decoration-line: underline; font-weight: 600;}
214
  """
215
  with gr.Blocks(title="UnlimitedMusicGen", css=css) as demo:
@@ -235,15 +281,20 @@ def ui(**kwargs):
235
  with gr.Row():
236
  with gr.Column():
237
  with gr.Row():
238
- text = gr.Text(label="Input Text", interactive=True, value="4/4 100bpm 320kbps 48khz, Industrial/Electronic Soundtrack, Dark, Intense, Sci-Fi")
239
- melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
 
 
 
240
  with gr.Row():
241
  submit = gr.Button("Submit")
242
  # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
243
  _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
244
  with gr.Row():
245
  background= gr.Image(value="./assets/background.png", source="upload", label="Background", shape=(768,512), type="filepath", interactive=True)
246
- include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
 
 
247
  with gr.Row():
248
  title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
249
  settings_font = gr.Text(label="Settings Font", value="./assets/arial.ttf", interactive=True)
@@ -252,7 +303,7 @@ def ui(**kwargs):
252
  model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
253
  with gr.Row():
254
  duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
255
- overlap = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Overlap", interactive=True)
256
  dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
257
  with gr.Row():
258
  topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
@@ -267,8 +318,10 @@ def ui(**kwargs):
267
  output = gr.Video(label="Generated Music")
268
  seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
269
 
270
- reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False)
271
- submit.click(predict, inputs=[model, text, melody, duration, dimension, topk, topp, temperature, cfg_coef, background, title, include_settings, settings_font, settings_font_color, seed, overlap], outputs=[output, seed_used])
 
 
272
  gr.Examples(
273
  fn=predict,
274
  examples=[
@@ -307,10 +360,11 @@ def ui(**kwargs):
307
  share = kwargs.get('share', False)
308
  if share:
309
  launch_kwargs['share'] = share
 
310
 
311
 
312
 
313
- demo.queue(max_size=15).launch(**launch_kwargs )
314
 
315
  if __name__ == "__main__":
316
  parser = argparse.ArgumentParser()
 
19
  from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
20
  import numpy as np
21
  import random
22
+ from pathlib import Path
23
+ from typing import List, Union
24
 
25
  MODEL = None
26
  MODELS = None
 
28
  INTERRUPTED = False
29
  UNLOAD_MODEL = False
30
  MOVE_TO_CPU = False
31
+ MAX_PROMPT_INDEX = 0
32
 
33
  def interrupt_callback():
34
  return INTERRUPTED
 
68
  print("Cached model loaded in %.2fs" % (time.monotonic() - t1))
69
  return result
70
 
71
def get_filename(file):
    """Return the ``name`` attribute of a file object, or ``None``.

    Args:
        file: Any object exposing a ``name`` attribute, or ``None``.

    Returns:
        ``file.name`` when ``file`` is not ``None``; otherwise ``None``.
    """
    return None if file is None else file.name
77
+
78
def get_filename_from_filepath(filepath):
    """Split a path into its base file name and extension.

    Args:
        filepath: Path to a file (anything ``os.path.basename`` accepts), or
            ``None`` when no file was supplied.

    Returns:
        A ``(file_base, file_extension)`` tuple, e.g. ``("song", ".wav")`` for
        ``"/tmp/song.wav"``. Returns ``("", "")`` when ``filepath`` is ``None``.
    """
    # The melody upload is optional, so callers may pass None here;
    # os.path.basename(None) would raise TypeError.
    if filepath is None:
        return "", ""
    file_name = os.path.basename(filepath)
    file_base, file_extension = os.path.splitext(file_name)
    return file_base, file_extension
82
+
83
def load_melody_filepath(melody_filepath, title):
    """Push an uploaded melody path to the audio widget and pick a title.

    Args:
        melody_filepath: Path of the uploaded melody file, or ``None``.
        title: Current title text; may be ``None`` or empty.

    Returns:
        ``(None, title)`` unchanged when ``melody_filepath`` is ``None``.
        Otherwise a pair of Gradio updates: the audio component set to
        ``melody_filepath`` and the textbox set to the chosen title.
    """
    if melody_filepath is None:
        return None, title

    # A title is kept only when it is non-empty and does not contain
    # "MusicGen"; otherwise one is derived from the melody file name.
    keep_title = (title is not None) and ("MusicGen" not in title) and (title != "")
    if keep_title:
        display_name = title
    else:
        display_name, _extension = get_filename_from_filepath(melody_filepath)
        # Turn separator characters into spaces and title-case the result.
        for separator in ('_', '.', '-'):
            display_name = display_name.replace(separator, ' ').title()

    print(f"Melody name: {display_name}, Melody Filepath: {melody_filepath}\n")

    audio_update = gr.Audio.update(value=melody_filepath)
    title_update = gr.Textbox.update(value=display_name)
    return audio_update, title_update
101
+
102
def load_melody(melody, prompt_index):
    """Resize the prompt-index slider to match the loaded melody's length.

    Args:
        melody: ``None`` or an indexable whose item 0 is the sample rate and
            item 1 is the sample data (presumably the Gradio numpy audio
            tuple -- confirm against the caller).
        prompt_index: Current slider value; returned unchanged when no melody
            is loaded.

    Returns:
        ``prompt_index`` when ``melody`` is ``None``; otherwise a Gradio
        slider update whose maximum is the computed segment count and whose
        value is reset to 0.
    """
    # Without this declaration the assignment below only bound a local and the
    # module-level MAX_PROMPT_INDEX was never updated.
    global MAX_PROMPT_INDEX

    if melody is None:
        return prompt_index

    sr, melody_data = melody[0], melody[1]
    # Segments are counted in 30-second units.
    segment_samples = sr * 30
    # Highest usable segment index, clamped to the range 0..25.
    total_melodys = max(min((len(melody_data) // segment_samples) - 1, 25), 0)
    print(f"Melody length: {len(melody_data)}, Melody segments: {total_melodys}\n")
    MAX_PROMPT_INDEX = total_melodys
    return gr.Slider.update(maximum=MAX_PROMPT_INDEX, value=0, visible=True)
112
+
113
+
114
+ def predict(model, text, melody, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index = 0, include_title = True, include_settings = True):
115
+ global MODEL, INTERRUPTED, INTERRUPTING, MOVE_TO_CPU
116
  output_segments = None
117
+ melody_name, melody_extension = get_filename_from_filepath(melody_filepath)
118
  INTERRUPTED = False
119
  INTERRUPTING = False
120
  if temperature < 0:
 
171
  if melody:
172
  # todo return excess duration, load next model and continue in loop structure building up output_segments
173
  if duration > MODEL.lm.cfg.dataset.segment_duration:
174
+ output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration, prompt_index)
175
  else:
176
  # pure original code
177
  sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
 
236
  else:
237
  output = output.detach().cpu().float()[0]
238
 
239
+ with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
240
+ video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}\n Model: {model}\n Melody Condition:{melody_name}\n Prompt index: {prompt_index}"
241
+ if include_settings or include_title:
242
+ background = add_settings_to_image(title if include_title else "", video_description if include_settings else "", background_path=background, font=settings_font, font_color=settings_font_color)
243
  audio_write(
244
  file.name, output, MODEL.sample_rate, strategy="loudness",
245
  loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
 
255
  def ui(**kwargs):
256
  css="""
257
  #col-container {max-width: 910px; margin-left: auto; margin-right: auto;}
258
+ #aud-melody {height: 0; width:0; visibility: hidden;}
259
  a {text-decoration-line: underline; font-weight: 600;}
260
  """
261
  with gr.Blocks(title="UnlimitedMusicGen", css=css) as demo:
 
281
  with gr.Row():
282
  with gr.Column():
283
  with gr.Row():
284
+ text = gr.Text(label="Prompt Text", interactive=True, value="4/4 100bpm 320kbps 48khz, Industrial/Electronic Soundtrack, Dark, Intense, Sci-Fi")
285
+ with gr.Column():
286
+ melody_filepath = gr.Audio(source="upload", type="filepath", label="Melody Condition (optional)", interactive=True)
287
+ melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True, visible=True, elem_id="aud-melody")#.style("display: none;height: 0; width:0;")
288
+ prompt_index = gr.Slider(label="Melody Condition Sample Segment", minimum=-1, maximum=MAX_PROMPT_INDEX, step=1, value=0, interactive=True, info="Which 30 second segment to condition with, - 1 condition each segment independantly")
289
  with gr.Row():
290
  submit = gr.Button("Submit")
291
  # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
292
  _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
293
  with gr.Row():
294
  background= gr.Image(value="./assets/background.png", source="upload", label="Background", shape=(768,512), type="filepath", interactive=True)
295
+ with gr.Column():
296
+ include_title = gr.Checkbox(label="Add Title", value=True, interactive=True)
297
+ include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
298
  with gr.Row():
299
  title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
300
  settings_font = gr.Text(label="Settings Font", value="./assets/arial.ttf", interactive=True)
 
303
  model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
304
  with gr.Row():
305
  duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
306
+ overlap = gr.Slider(minimum=1, maximum=15, value=3, step=1, label="Overlap", interactive=True)
307
  dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segements of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
308
  with gr.Row():
309
  topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
 
318
  output = gr.Video(label="Generated Music")
319
  seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
320
 
321
+ melody_filepath.change(load_melody_filepath, inputs=[melody_filepath, title], outputs=[melody, title], api_name="melody_filepath_change").success(load_melody, inputs=[melody, prompt_index], outputs=[prompt_index])
322
+ melody.change(load_melody, inputs=[melody, prompt_index], outputs=[prompt_index], api_name="melody_change")
323
+ reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False, api_name="reuse_seed")
324
+ submit.click(predict, inputs=[model, text, melody, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap, prompt_index, include_title, include_settings], outputs=[output, seed_used], api_name="submit")
325
  gr.Examples(
326
  fn=predict,
327
  examples=[
 
360
  share = kwargs.get('share', False)
361
  if share:
362
  launch_kwargs['share'] = share
363
+ launch_kwargs['favicon_path']= "./assets/favicon.ico"
364
 
365
 
366
 
367
+ demo.queue(max_size=12).launch(**launch_kwargs)
368
 
369
  if __name__ == "__main__":
370
  parser = argparse.ArgumentParser()
assets/favicon.ico ADDED
audiocraft/utils/extend.py CHANGED
@@ -18,7 +18,7 @@ INTERRUPTING = False
18
  def separate_audio_segments(audio, segment_duration=30, overlap=1):
19
  sr, audio_data = audio[0], audio[1]
20
 
21
- total_samples = min(len(audio_data), 25)
22
  segment_samples = sr * segment_duration
23
  overlap_samples = sr * overlap
24
 
@@ -43,15 +43,16 @@ def separate_audio_segments(audio, segment_duration=30, overlap=1):
43
  print(f"separate_audio_segments: {len(segments)} segments")
44
  return segments
45
 
46
- def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30):
47
  # generate audio segments
48
  melody_segments = separate_audio_segments(melody, segment_duration, 0)
49
 
50
- # Create a list to store the melody tensors for each segment
51
  melodys = []
52
  output_segments = []
53
  last_chunk = []
54
  text += ", seed=" + str(seed)
 
55
 
56
  # Calculate the total number of segments
57
  total_segments = max(math.ceil(duration / segment_duration),1)
@@ -94,55 +95,63 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
94
  melodys.append(verse)
95
 
96
  torch.manual_seed(seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  for idx, verse in enumerate(melodys):
98
  if INTERRUPTING:
99
  return output_segments, duration
100
 
101
  print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
102
  # Compensate for the length of final segment
103
- if (idx + 1) == len(melodys):
104
- print(f'Modify Last verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
 
105
  MODEL.set_generation_params(
106
  use_sampling=True,
107
  top_k=MODEL.generation_params["top_k"],
108
  top_p=MODEL.generation_params["top_p"],
109
  temperature=MODEL.generation_params["temp"],
110
  cfg_coef=MODEL.generation_params["cfg_coef"],
111
- duration=duration,
112
  two_step_cfg=False,
113
  rep_penalty=0.5
114
  )
115
  try:
116
  # get last chunk
117
- verse = verse[:, :, -duration*MODEL.sample_rate:]
118
- prompt_segment = prompt_segment[:, :, -duration*MODEL.sample_rate:]
119
  except:
120
  # get first chunk
121
- verse = verse[:, :, :duration*MODEL.sample_rate]
122
- prompt_segment = prompt_segment[:, :, :duration*MODEL.sample_rate]
123
-
124
- else:
125
- MODEL.set_generation_params(
126
- use_sampling=True,
127
- top_k=MODEL.generation_params["top_k"],
128
- top_p=MODEL.generation_params["top_p"],
129
- temperature=MODEL.generation_params["temp"],
130
- cfg_coef=MODEL.generation_params["cfg_coef"],
131
- duration=segment_duration,
132
- two_step_cfg=False,
133
- rep_penalty=0.5
134
- )
135
-
136
- # Generate a new prompt segment based on the first verse. This will be applied to all segments for consistency
137
- if idx == 0:
138
- print(f"Generating New Prompt Segment: {text}\r")
139
- prompt_segment = MODEL.generate_with_all(
140
- descriptions=[text],
141
- melody_wavs=verse,
142
- sample_rate=sr,
143
- progress=False,
144
- prompt=None,
145
- )
146
 
147
  print(f"Generating New Melody Segment {idx + 1}: {text}\r")
148
  output = MODEL.generate_with_all(
@@ -152,6 +161,10 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
152
  progress=False,
153
  prompt=prompt_segment,
154
  )
 
 
 
 
155
 
156
  # Append the generated output to the list of segments
157
  #output_segments.append(output[:, :segment_duration])
 
18
  def separate_audio_segments(audio, segment_duration=30, overlap=1):
19
  sr, audio_data = audio[0], audio[1]
20
 
21
+ total_samples = len(audio_data)
22
  segment_samples = sr * segment_duration
23
  overlap_samples = sr * overlap
24
 
 
43
  print(f"separate_audio_segments: {len(segments)} segments")
44
  return segments
45
 
46
+ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0):
47
  # generate audio segments
48
  melody_segments = separate_audio_segments(melody, segment_duration, 0)
49
 
50
+ # Create lists to store the melody tensors for each segment
51
  melodys = []
52
  output_segments = []
53
  last_chunk = []
54
  text += ", seed=" + str(seed)
55
+ prompt_segment = None
56
 
57
  # Calculate the total number of segments
58
  total_segments = max(math.ceil(duration / segment_duration),1)
 
95
  melodys.append(verse)
96
 
97
  torch.manual_seed(seed)
98
+
99
+ # If user selects a prompt segment, generate a new prompt segment to use on all segments
100
+ #default to the first segment for prompt conditioning
101
+ prompt_verse = melodys[0]
102
+ if prompt_index > 0:
103
+ # Get a prompt segment from the selected verse, normally the first verse
104
+ prompt_verse = melodys[prompt_index if prompt_index <= (total_segments - 1) else (total_segments -1)]
105
+
106
+ # set the prompt segment MODEL generation params
107
+ MODEL.set_generation_params(
108
+ use_sampling=True,
109
+ top_k=MODEL.generation_params["top_k"],
110
+ top_p=MODEL.generation_params["top_p"],
111
+ temperature=MODEL.generation_params["temp"],
112
+ cfg_coef=MODEL.generation_params["cfg_coef"],
113
+ duration=segment_duration,
114
+ two_step_cfg=False,
115
+ rep_penalty=0.5
116
+ )
117
+ # Generate a new prompt segment. This will be applied to all segments for consistency
118
+ print(f"Generating New Prompt Segment: {text} from verse {prompt_index}\r")
119
+ prompt_segment = MODEL.generate_with_all(
120
+ descriptions=[text],
121
+ melody_wavs=prompt_verse,
122
+ sample_rate=sr,
123
+ progress=False,
124
+ prompt=None,
125
+ )
126
+
127
  for idx, verse in enumerate(melodys):
128
  if INTERRUPTING:
129
  return output_segments, duration
130
 
131
  print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss}')
132
  # Compensate for the length of final segment
133
+ if ((idx + 1) == len(melodys)) or (duration < segment_duration):
134
+ mod_duration = max(min(duration, segment_duration),1)
135
+ print(f'Modify verse length, duration: {duration}, overlap: {overlap} Overlap Loss: {duration_loss} to mod duration: {mod_duration}')
136
  MODEL.set_generation_params(
137
  use_sampling=True,
138
  top_k=MODEL.generation_params["top_k"],
139
  top_p=MODEL.generation_params["top_p"],
140
  temperature=MODEL.generation_params["temp"],
141
  cfg_coef=MODEL.generation_params["cfg_coef"],
142
+ duration=mod_duration,
143
  two_step_cfg=False,
144
  rep_penalty=0.5
145
  )
146
  try:
147
  # get last chunk
148
+ verse = verse[:, :, -mod_duration*MODEL.sample_rate:]
149
+ prompt_segment = prompt_segment[:, :, -mod_duration*MODEL.sample_rate:]
150
  except:
151
  # get first chunk
152
+ verse = verse[:, :, :mod_duration*MODEL.sample_rate]
153
+ prompt_segment = prompt_segment[:, :, :mod_duration*MODEL.sample_rate]
154
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  print(f"Generating New Melody Segment {idx + 1}: {text}\r")
157
  output = MODEL.generate_with_all(
 
161
  progress=False,
162
  prompt=prompt_segment,
163
  )
164
+ # If user selects a prompt segment, use the prompt segment for all segments
165
+ # Otherwise, use the previous segment as the prompt
166
+ if prompt_index < 0:
167
+ prompt_segment = output
168
 
169
  # Append the generated output to the list of segments
170
  #output_segments.append(output[:, :segment_duration])