Surn committed
Commit 1028cad
Parent: 74894bc

Add Harmony / Drum separation

app.py CHANGED
@@ -19,8 +19,9 @@ from audiocraft.data.audio_utils import apply_fade, apply_tafade
  from audiocraft.utils.extend import generate_music_segments, add_settings_to_image, INTERRUPTING
  import numpy as np
  import random
- from pathlib import Path
- from typing import List, Union
+ #from pathlib import Path
+ #from typing import List, Union
+ import librosa
 
  MODEL = None
  MODELS = None
@@ -80,12 +81,18 @@ def get_filename_from_filepath(filepath):
      file_base, file_extension = os.path.splitext(file_name)
      return file_base, file_extension
 
+ def get_melody(melody_filepath):
+     audio_data = list(librosa.load(melody_filepath, sr=None))
+     audio_data[0], audio_data[1] = audio_data[1], audio_data[0]
+     melody = tuple(audio_data)
+     return melody
+
  def load_melody_filepath(melody_filepath, title):
      # get melody filename
      #$Union[str, os.PathLike]
      symbols = ['_', '.', '-']
      if melody_filepath is None:
-         return None, title
+         return title, gr.Slider.update(maximum=0, value=0), gr.Radio.update(value="melody", interactive=True)
 
      if (title is None) or ("MusicGen" in title) or (title == ""):
          melody_name, melody_extension = get_filename_from_filepath(melody_filepath)
@@ -97,26 +104,25 @@ def load_melody_filepath(melody_filepath, title):
 
      print(f"Melody name: {melody_name}, Melody Filepath: {melody_filepath}\n")
 
-     return gr.Audio.update(value=melody_filepath), gr.Textbox.update(value=melody_name)
-
- def load_melody(melody, prompt_index):
      # get melody length in number of segments and modify the UI
-     if melody is None:
-         return gr.Slider.update(maximum=0, value=0), gr.Radio.update(value="melody", interactive=True)
+     melody = get_melody(melody_filepath)
      sr, melody_data = melody[0], melody[1]
      segment_samples = sr * 30
-     total_melodys = max(min((len(melody_data) // segment_samples) - 1, 25), 0)
+     total_melodys = max(min((len(melody_data) // segment_samples), 25), 0)
      print(f"Melody length: {len(melody_data)}, Melody segments: {total_melodys}\n")
-     MAX_PROMPT_INDEX = total_melodys
-     return gr.Slider.update(maximum=MAX_PROMPT_INDEX, value=0), gr.Radio.update(value="melody", interactive=False)
-
+     MAX_PROMPT_INDEX = total_melodys
+
+     return gr.Textbox.update(value=melody_name), gr.Slider.update(maximum=MAX_PROMPT_INDEX, value=0), gr.Radio.update(value="melody", interactive=False)
 
- def predict(model, text, melody, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index=0, include_title=True, include_settings=True):
+ def predict(model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap=1, prompt_index=0, include_title=True, include_settings=True, harmony_only=False):
      global MODEL, INTERRUPTED, INTERRUPTING, MOVE_TO_CPU
      output_segments = None
      melody_name = "Not Used"
+     melody = None
      if melody_filepath:
          melody_name, melody_extension = get_filename_from_filepath(melody_filepath)
+         melody = get_melody(melody_filepath)
+
      INTERRUPTED = False
      INTERRUPTING = False
      if temperature < 0:
@@ -173,7 +179,7 @@ def predict(model, text, melody, melody_filepath, duration, dimension, topk, top
      if melody:
          # todo return excess duration, load next model and continue in loop structure building up output_segments
          if duration > MODEL.lm.cfg.dataset.segment_duration:
-             output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration, prompt_index)
+             output_segments, duration = generate_music_segments(text, melody, seed, MODEL, duration, overlap, MODEL.lm.cfg.dataset.segment_duration, prompt_index, harmony_only=(harmony_only == "Yes"))
          else:
              # pure original code
              sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
@@ -217,12 +223,12 @@ def predict(model, text, melody, melody_filepath, duration, dimension, topk, top
              overlap_samples = overlap * MODEL.sample_rate
              #stack tracks and fade out/in
              overlapping_output_fadeout = output[:, :, -overlap_samples:]
-             overlapping_output_fadeout = apply_fade(overlapping_output_fadeout, sample_rate=MODEL.sample_rate, duration=overlap, out=True, start=True, curve_end=0.0, current_device=MODEL.device)
-             #overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout, sample_rate=MODEL.sample_rate, duration=overlap, out=True, start=True, shape="exponential")
+             #overlapping_output_fadeout = apply_fade(overlapping_output_fadeout, sample_rate=MODEL.sample_rate, duration=overlap, out=True, start=True, curve_end=0.0, current_device=MODEL.device)
+             overlapping_output_fadeout = apply_tafade(overlapping_output_fadeout, sample_rate=MODEL.sample_rate, duration=overlap, out=True, start=True, shape="linear")
 
              overlapping_output_fadein = output_segments[i][:, :, :overlap_samples]
-             overlapping_output_fadein = apply_fade(overlapping_output_fadein, sample_rate=MODEL.sample_rate, duration=overlap, out=False, start=False, curve_start=0.0, current_device=MODEL.device)
-             #overlapping_output_fadein = apply_tafade(overlapping_output_fadein, sample_rate=MODEL.sample_rate, duration=overlap, out=False, start=False, shape="linear")
+             #overlapping_output_fadein = apply_fade(overlapping_output_fadein, sample_rate=MODEL.sample_rate, duration=overlap, out=False, start=False, curve_start=0.0, current_device=MODEL.device)
+             overlapping_output_fadein = apply_tafade(overlapping_output_fadein, sample_rate=MODEL.sample_rate, duration=overlap, out=False, start=False, shape="linear")
 
              overlapping_output = torch.cat([overlapping_output_fadeout[:, :, :-(overlap_samples // 2)], overlapping_output_fadein], dim=2)
              print(f" overlap size Fade:{overlapping_output.size()}\n output: {output.size()}\n segment: {output_segments[i].size()}")
@@ -244,7 +250,7 @@ def predict(model, text, melody, melody_filepath, duration, dimension, topk, top
          background = add_settings_to_image(title if include_title else "", video_description if include_settings else "", background_path=background, font=settings_font, font_color=settings_font_color)
          audio_write(
              file.name, output, MODEL.sample_rate, strategy="loudness",
-             loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
+             loudness_headroom_db=18, loudness_compressor=True, add_suffix=False, channels=2)
          waveform_video = make_waveform(file.name, bg_image=background, bar_count=45)
          if MOVE_TO_CPU:
              MODEL.to('cpu')
@@ -252,12 +258,11 @@ def predict(model, text, melody, melody_filepath, duration, dimension, topk, top
          MODEL = None
      torch.cuda.empty_cache()
      torch.cuda.ipc_collect()
-     return waveform_video, seed
+     return waveform_video, file.name, seed
 
  def ui(**kwargs):
      css="""
-     #col-container {max-width: 910px; margin-left: auto; margin-right: auto;}
-     #aud-melody {height: 0; width:0; visibility: hidden;}
+     #col-container {max-width: 910px; margin-left: auto; margin-right: auto;}
      a {text-decoration-line: underline; font-weight: 600;}
      """
      with gr.Blocks(title="UnlimitedMusicGen", css=css) as demo:
@@ -283,47 +288,49 @@ def ui(**kwargs):
          with gr.Row():
              with gr.Column():
                  with gr.Row():
-                     text = gr.Text(label="Prompt Text", interactive=True, value="4/4 100bpm 320kbps 48khz, Industrial/Electronic Soundtrack, Dark, Intense, Sci-Fi")
                      with gr.Column():
-                         melody_filepath = gr.Audio(source="upload", type="filepath", label="Melody Condition (optional)", interactive=True)
-                         melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True, visible=True, elem_id="aud-melody")#.style("display: none;height: 0; width:0;")
+                         text = gr.Text(label="Prompt Text", interactive=True, value="4/4 100bpm 320kbps 48khz, Industrial/Electronic Soundtrack, Dark, Intense, Sci-Fi")
+                         duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
+                         model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
+                     with gr.Column():
+                         melody_filepath = gr.Audio(source="upload", type="filepath", label="Melody Condition (optional)", interactive=True)
                          prompt_index = gr.Slider(label="Melody Condition Sample Segment", minimum=-1, maximum=MAX_PROMPT_INDEX, step=1, value=0, interactive=True, info="Which 30-second segment to condition with; -1 conditions each segment independently")
+                         harmony_only = gr.Radio(label="Harmony Only", choices=["No", "Yes"], value="No", interactive=True, info="Remove Drums?")
                  with gr.Row():
                      submit = gr.Button("Submit")
                      # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                      _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
-                 with gr.Row():
-                     background = gr.Image(value="./assets/background.png", source="upload", label="Background", shape=(768,512), type="filepath", interactive=True)
-                     with gr.Column():
-                         include_title = gr.Checkbox(label="Add Title", value=True, interactive=True)
-                         include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
-                 with gr.Row():
-                     title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
-                     settings_font = gr.Text(label="Settings Font", value="./assets/arial.ttf", interactive=True)
-                     settings_font_color = gr.ColorPicker(label="Settings Font Color", value="#c87f05", interactive=True)
-                 with gr.Row():
-                     model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
-                 with gr.Row():
-                     duration = gr.Slider(minimum=1, maximum=720, value=10, label="Duration", interactive=True)
-                     overlap = gr.Slider(minimum=1, maximum=15, value=3, step=1, label="Overlap", interactive=True)
-                     dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segments of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
-                 with gr.Row():
-                     topk = gr.Number(label="Top-k", value=250, precision=0, interactive=True)
-                     topp = gr.Number(label="Top-p", value=0, precision=0, interactive=True)
-                     temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
-                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=5.5, precision=None, interactive=True)
-                 with gr.Row():
-                     seed = gr.Number(label="Seed", value=-1, precision=0, interactive=True)
-                     gr.Button('\U0001f3b2\ufe0f').style(full_width=False).click(fn=lambda: -1, outputs=[seed], queue=False)
-                     reuse_seed = gr.Button('\u267b\ufe0f').style(full_width=False)
+                 with gr.Accordion("Video", open=False):
+                     with gr.Row():
+                         background = gr.Image(value="./assets/background.png", source="upload", label="Background", shape=(768,512), type="filepath", interactive=True)
+                         with gr.Column():
+                             include_title = gr.Checkbox(label="Add Title", value=True, interactive=True)
+                             include_settings = gr.Checkbox(label="Add Settings to background", value=True, interactive=True)
+                     with gr.Row():
+                         title = gr.Textbox(label="Title", value="UnlimitedMusicGen", interactive=True)
+                         settings_font = gr.Text(label="Settings Font", value="./assets/arial.ttf", interactive=True)
+                         settings_font_color = gr.ColorPicker(label="Settings Font Color", value="#c87f05", interactive=True)
+                 with gr.Accordion("Expert", open=False):
+                     with gr.Row():
+                         overlap = gr.Slider(minimum=1, maximum=15, value=2, step=1, label="Verse Overlap", interactive=True)
+                         dimension = gr.Slider(minimum=-2, maximum=2, value=2, step=1, label="Dimension", info="determines which direction to add new segments of audio. (1 = stack tracks, 2 = lengthen, -2..0 = ?)", interactive=True)
+                     with gr.Row():
+                         topk = gr.Number(label="Top-k", value=280, precision=0, interactive=True)
+                         topp = gr.Number(label="Top-p", value=1450, precision=0, interactive=True)
+                         temperature = gr.Number(label="Randomness Temperature", value=0.75, precision=None, interactive=True)
+                         cfg_coef = gr.Number(label="Classifier Free Guidance", value=8.5, precision=None, interactive=True)
+                     with gr.Row():
+                         seed = gr.Number(label="Seed", value=-1, precision=0, interactive=True)
+                         gr.Button('\U0001f3b2\ufe0f').style(full_width=False).click(fn=lambda: -1, outputs=[seed], queue=False)
+                         reuse_seed = gr.Button('\u267b\ufe0f').style(full_width=False)
              with gr.Column() as c:
                  output = gr.Video(label="Generated Music")
+                 wave_file = gr.File(label=".wav file", elem_id="output_wavefile", interactive=True)
                  seed_used = gr.Number(label='Seed used', value=-1, interactive=False)
 
-         melody_filepath.change(load_melody_filepath, inputs=[melody_filepath, title], outputs=[melody, title], api_name="melody_filepath_change").success(load_melody, inputs=[melody, prompt_index], outputs=[prompt_index, model])
-         melody.change(load_melody, inputs=[melody, prompt_index], outputs=[prompt_index], api_name="melody_change")
+         melody_filepath.change(load_melody_filepath, inputs=[melody_filepath, title], outputs=[title, prompt_index, model], api_name="melody_filepath_change")
          reuse_seed.click(fn=lambda x: x, inputs=[seed_used], outputs=[seed], queue=False, api_name="reuse_seed")
-         submit.click(predict, inputs=[model, text, melody, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap, prompt_index, include_title, include_settings], outputs=[output, seed_used], api_name="submit")
+         submit.click(predict, inputs=[model, text, melody_filepath, duration, dimension, topk, topp, temperature, cfg_coef, background, title, settings_font, settings_font_color, seed, overlap, prompt_index, include_title, include_settings, harmony_only], outputs=[output, wave_file, seed_used], api_name="submit")
          gr.Examples(
              fn=predict,
              examples=[
@@ -353,7 +360,7 @@ def ui(**kwargs):
                      "medium",
                  ],
              ],
-             inputs=[text, melody, model],
+             inputs=[text, melody_filepath, model],
              outputs=[output]
          )
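The new get_melody helper exists because librosa.load returns (samples, sample_rate) while the rest of the app, like Gradio's type="numpy" audio, expects (sample_rate, samples); hence the element swap. A minimal standalone sketch of the same convention, using a hypothetical example.wav path, also shows how load_melody_filepath derives the 30-second segment count:

    import librosa

    # "example.wav" is a placeholder path for illustration only.
    samples, sr = librosa.load("example.wav", sr=None)  # librosa returns (data, sample_rate)
    melody = (sr, samples)                              # app code expects (sample_rate, data)

    # Segment count as in load_melody_filepath: 30-second windows, capped at 25.
    segment_samples = sr * 30
    total_segments = max(min(len(samples) // segment_samples, 25), 0)
    print(f"{total_segments} conditionable 30s segments")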
audiocraft/data/audio_utils.py CHANGED
@@ -173,7 +173,7 @@ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
      assert wav.dtype == torch.int16
      return wav
 
- def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear") -> torch.Tensor:
+ def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, shape: str = "linear", stem_name: tp.Optional[str] = None) -> torch.Tensor:
      """
      Apply fade-in and/or fade-out effects to the audio tensor.
 
@@ -192,11 +192,12 @@ def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start
      fade_samples = int(sample_rate * duration)  # Number of samples for the fade duration
 
      # Create the fade transform
-     fade_transform = torchaudio.transforms.Fade(fade_in_len=fade_samples, fade_out_len=fade_samples, fade_shape=shape)
+     fade_transform = torchaudio.transforms.Fade(fade_in_len=0, fade_out_len=0, fade_shape=shape)
 
      if out:
          fade_transform.fade_out_len = fade_samples
-         fade_transform.fade_out_shape = shape
+     else:
+         fade_transform.fade_in_len = fade_samples
 
      # Select the portion of the audio to apply the fade
      if start:
@@ -213,9 +214,12 @@ def apply_tafade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start
      else:
          audio_faded[:, -fade_samples:] = audio_fade_section
 
-     return audio_faded
+     wav = normalize_loudness(audio_faded, sample_rate, loudness_headroom_db=18, loudness_compressor=True)
+     _clip_wav(wav, log_clipping=False, stem_name=stem_name)
+     return wav
+
 
- def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu") -> torch.Tensor:
+ def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=True, curve_start:float=0.0, curve_end:float=1.0, current_device:str="cpu", stem_name: tp.Optional[str] = None) -> torch.Tensor:
      """
      Apply fade-in and/or fade-out effects to the audio tensor.
 
@@ -256,4 +260,6 @@ def apply_fade(audio: torch.Tensor, sample_rate, duration=3.0, out=True, start=T
      else:
          audio_faded[:, -fade_samples:] = audio_fade_section
 
-     return audio_faded
+     wav = normalize_loudness(audio_faded, sample_rate, loudness_headroom_db=18, loudness_compressor=True)
+     _clip_wav(wav, log_clipping=False, stem_name=stem_name)
+     return wav
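The apply_tafade change matters because torchaudio.transforms.Fade applies both fades whenever both lengths are non-zero; constructing it with zero lengths and then enabling only the requested side leaves the other edge of the segment untouched. A minimal sketch of the one-sided fade-out, on a synthetic tensor with an assumed 32 kHz rate:

    import torch
    import torchaudio

    sr = 32000                     # assumed sample rate for this sketch
    audio = torch.ones(1, sr * 2)  # two seconds of full-scale "audio"
    fade_samples = sr              # one-second fade

    # Zero-length construction, then enable only the fade-out (mirrors the fixed apply_tafade).
    fade = torchaudio.transforms.Fade(fade_in_len=0, fade_out_len=0, fade_shape="linear")
    fade.fade_out_len = fade_samples

    faded = fade(audio)
    print(faded[0, 0].item(), faded[0, -1].item())  # start unchanged (1.0), end faded toward 0.0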
audiocraft/utils/extend.py CHANGED
@@ -11,6 +11,7 @@ import textwrap
  import requests
  from io import BytesIO
  from huggingface_hub import hf_hub_download
+ import librosa
 
 
  INTERRUPTING = False
@@ -43,7 +44,7 @@ def separate_audio_segments(audio, segment_duration=30, overlap=1):
      print(f"separate_audio_segments: {len(segments)} segments")
      return segments
 
- def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0):
+ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:int=1, segment_duration:int=30, prompt_index:int=0, harmony_only:bool=False):
      # generate audio segments
      melody_segments = separate_audio_segments(melody, segment_duration, 0)
 
@@ -85,12 +86,23 @@ def generate_music_segments(text, melody, seed, MODEL, duration:int=10, overlap:
          if INTERRUPTING:
              return [], duration
          print(f"segment {segment_idx + 1} of {total_segments} \r")
-         sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
+
+         if harmony_only:
+             # REMOVE PERCUSSION FROM MELODY
+             # Apply HPSS using librosa
+             verse_harmonic, verse_percussive = librosa.effects.hpss(melody_segments[segment_idx][1])
+             # Convert the separated components back to torch.Tensor
+             #harmonic_tensor = torch.from_numpy(verse_harmonic)
+             #percussive_tensor = torch.from_numpy(verse_percussive)
+             sr, verse = melody_segments[segment_idx][0], torch.from_numpy(verse_harmonic).to(MODEL.device).float().t().unsqueeze(0)
+         else:
+             sr, verse = melody_segments[segment_idx][0], torch.from_numpy(melody_segments[segment_idx][1]).to(MODEL.device).float().t().unsqueeze(0)
 
          print(f"shape:{verse.shape} dim:{verse.dim()}")
          if verse.dim() == 2:
              verse = verse[None]
          verse = verse[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
+
          # Append the segment to the melodys list
          melodys.append(verse)
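For reference, the harmony_only path relies on librosa's harmonic/percussive source separation (HPSS), which median-filters the spectrogram to split sustained, pitched energy from transients such as drums. A minimal standalone sketch, with a hypothetical input file and assuming the soundfile package is available for writing the result:

    import librosa
    import soundfile as sf  # assumption: soundfile is installed for writing audio

    # "melody.wav" is a placeholder; sr=None keeps the original sample rate.
    y, sr = librosa.load("melody.wav", sr=None)

    # Harmonic component keeps sustained content; percussive component keeps the drums.
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    sf.write("melody_harmony_only.wav", y_harmonic, sr)  # drums removed, as with harmony_only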