Surn committed on
Commit
e3f64dd
1 Parent(s): c228235

Integrate unlimited non-melody-guided music (no 30-second limit)

Browse files
Files changed (2) hide show
  1. app.py +67 -37
  2. audiocraft/utils/extend.py +6 -2
app.py CHANGED
@@ -33,47 +33,77 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
33
  if MODEL is None or MODEL.name != model:
34
  MODEL = load_model(model)
35
 
36
- if duration > MODEL.lm.cfg.dataset.segment_duration:
37
- segment_duration = MODEL.lm.cfg.dataset.segment_duration
38
- else:
39
- segment_duration = duration
40
- # implement seed
41
- if seed < 0:
42
- seed = random.randint(0, 0xffff_ffff_ffff)
43
- torch.manual_seed(seed)
44
- MODEL.set_generation_params(
45
- use_sampling=True,
46
- top_k=topk,
47
- top_p=topp,
48
- temperature=temperature,
49
- cfg_coef=cfg_coef,
50
- duration=segment_duration,
51
- )
 
 
 
52
 
53
- if melody:
54
- if duration > MODEL.lm.cfg.dataset.segment_duration:
55
- output_segments = generate_music_segments(text, melody, MODEL, seed, duration, overlap, MODEL.lm.cfg.dataset.segment_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  else:
57
- # pure original code
58
- sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
59
- print(melody.shape)
60
- if melody.dim() == 2:
61
- melody = melody[None]
62
- melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
63
- output = MODEL.generate_with_chroma(
64
- descriptions=[text],
65
- melody_wavs=melody,
66
- melody_sample_rate=sr,
67
- progress=True
68
- )
69
- else:
70
- output = MODEL.generate(descriptions=[text], progress=False)
71
 
72
  if output_segments:
73
  try:
74
- # Combine the output segments into one long audio file
75
- output_segments = [segment.detach().cpu().float()[0] for segment in output_segments]
76
- output = torch.cat(output_segments, dim=dimension)
 
 
 
 
 
 
77
  except Exception as e:
78
  print(f"Error combining segments: {e}. Using the first segment only.")
79
  output = output_segments[0].detach().cpu().float()[0]
@@ -81,7 +111,7 @@ def predict(model, text, melody, duration, dimension, topk, topp, temperature, c
81
  output = output.detach().cpu().float()[0]
82
  with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
83
  if include_settings:
84
- video_description = f"{text}\n Duration: {str(duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}"
85
  background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
86
  audio_write(
87
  file.name, output, MODEL.sample_rate, strategy="loudness",
 
33
  if MODEL is None or MODEL.name != model:
34
  MODEL = load_model(model)
35
 
36
+ output = None
37
+ segment_duration = duration
38
+ initial_duration = duration
39
+ output_segments = []
40
+ while duration > 0:
41
+ if not output_segments: # first pass of long or short song
42
+ if segment_duration > MODEL.lm.cfg.dataset.segment_duration:
43
+ segment_duration = MODEL.lm.cfg.dataset.segment_duration
44
+ else:
45
+ segment_duration = duration
46
+ else: # next pass of long song
47
+ if duration + overlap < MODEL.lm.cfg.dataset.segment_duration:
48
+ segment_duration = duration + overlap
49
+ else:
50
+ segment_duration = MODEL.lm.cfg.dataset.segment_duration
51
+ # implement seed
52
+ if seed < 0:
53
+ seed = random.randint(0, 0xffff_ffff_ffff)
54
+ torch.manual_seed(seed)
55
 
56
+ print(f'Segment duration: {segment_duration}, duration: {duration}, overlap: {overlap}')
57
+ MODEL.set_generation_params(
58
+ use_sampling=True,
59
+ top_k=topk,
60
+ top_p=topp,
61
+ temperature=temperature,
62
+ cfg_coef=cfg_coef,
63
+ duration=segment_duration,
64
+ )
65
+
66
+ if melody:
67
+ # todo return excess duration, load next model and continue in loop structure building up output_segments
68
+ if duration > MODEL.lm.cfg.dataset.segment_duration:
69
+ output_segments, duration = generate_music_segments(text, melody, MODEL, seed, duration, overlap, MODEL.lm.cfg.dataset.segment_duration)
70
+ else:
71
+ # pure original code
72
+ sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
73
+ print(melody.shape)
74
+ if melody.dim() == 2:
75
+ melody = melody[None]
76
+ melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
77
+ output = MODEL.generate_with_chroma(
78
+ descriptions=[text],
79
+ melody_wavs=melody,
80
+ melody_sample_rate=sr,
81
+ progress=True
82
+ )
83
+ # All output_segments are populated, so we can break the loop or set duration to 0
84
+ break
85
  else:
86
+ #output = MODEL.generate(descriptions=[text], progress=False)
87
+ if not output_segments:
88
+ next_segment = MODEL.generate(descriptions=[text], progress=True)
89
+ duration -= segment_duration
90
+ else:
91
+ last_chunk = output_segments[-1][:, :, -overlap*MODEL.sample_rate:]
92
+ next_segment = MODEL.generate_continuation(last_chunk, MODEL.sample_rate, descriptions=[text], progress=True)
93
+ duration -= segment_duration - overlap
94
+ output_segments.append(next_segment)
 
 
 
 
 
95
 
96
  if output_segments:
97
  try:
98
+ # Combine the output segments into one long audio file or stack tracks
99
+ #output_segments = [segment.detach().cpu().float()[0] for segment in output_segments]
100
+ #output = torch.cat(output_segments, dim=dimension)
101
+
102
+ output = output_segments[0]
103
+ for i in range(1, len(output_segments)):
104
+ overlap_samples = overlap * MODEL.sample_rate
105
+ output = torch.cat([output[:, :, :-overlap_samples], output_segments[i][:, :, overlap_samples:]], dim=2)
106
+ output = output.detach().cpu().float()[0]
107
  except Exception as e:
108
  print(f"Error combining segments: {e}. Using the first segment only.")
109
  output = output_segments[0].detach().cpu().float()[0]
 
111
  output = output.detach().cpu().float()[0]
112
  with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
113
  if include_settings:
114
+ video_description = f"{text}\n Duration: {str(initial_duration)} Dimension: {dimension}\n Top-k:{topk} Top-p:{topp}\n Randomness:{temperature}\n cfg:{cfg_coef} overlap: {overlap}\n Seed: {seed}"
115
  background = add_settings_to_image(title, video_description, background_path=background, font=settings_font, font_color=settings_font_color)
116
  audio_write(
117
  file.name, output, MODEL.sample_rate, strategy="loudness",
audiocraft/utils/extend.py CHANGED
@@ -43,10 +43,14 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
43
 
44
  # Calculate the total number of segments
45
  total_segments = max(math.ceil(duration / segment_duration),1)
46
- print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds")
 
 
 
47
 
48
  # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
49
  if len(melody_segments) < total_segments:
 
50
  for i in range(total_segments - len(melody_segments)):
51
  segment = melody_segments[i]
52
  melody_segments.append(segment)
@@ -78,7 +82,7 @@ def generate_music_segments(text, melody, MODEL, seed, duration:int=10, overlap:
78
  #output_segments.append(output[:, :segment_duration])
79
  output_segments.append(output)
80
  print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
81
- return output_segments
82
 
83
  def save_image(image):
84
  """
 
43
 
44
  # Calculate the total number of segments
45
  total_segments = max(math.ceil(duration / segment_duration),1)
46
+
47
+ #calc excess duration
48
+ excess_duration = total_segments * segment_duration - duration
49
+ print(f"total Segments to Generate: {total_segments} for {duration} seconds. Each segment is {segment_duration} seconds. Excess {excess_duration}")
50
 
51
  # If melody_segments is shorter than total_segments, repeat the segments until the total_segments is reached
52
  if len(melody_segments) < total_segments:
53
+ #fix melody_segments
54
  for i in range(total_segments - len(melody_segments)):
55
  segment = melody_segments[i]
56
  melody_segments.append(segment)
 
82
  #output_segments.append(output[:, :segment_duration])
83
  output_segments.append(output)
84
  print(f"output_segments: {len(output_segments)}: shape: {output.shape} dim {output.dim()}")
85
+ return output_segments, excess_duration
86
 
87
  def save_image(image):
88
  """