nateraw committed on
Commit
3856316
1 Parent(s): e60b850

Synced repo using 'sync_with_huggingface' Github Action

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ nate_is_humming.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,362 @@
###########################################
# For fast downloads from Hugging Face Hub
# **Requires the hf_transfer package**
###########################################
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
###########################################

import json
import random
import typing as tp
from datetime import datetime
from pathlib import Path
from functools import partial

import gradio as gr
import torch
import torchaudio
import numpy as np

from audiocraft.models import musicgen
from audiocraft.data.audio import audio_write
from audiocraft.utils.notebook import display_audio

from pitch_correction_utils import autotune, closest_pitch, aclosest_pitch_from_scale


def ta_to_librosa_format(waveform):
    """
    Convert an audio tensor from torchaudio format to librosa format.

    Args:
        waveform (torch.Tensor): Audio tensor from torchaudio with shape (n_channels, n_samples).

    Returns:
        np.ndarray: Audio array in librosa format with shape (n_samples,) or (2, n_samples).
    """
    # Ensure waveform is in CPU and convert to numpy
    waveform_np = waveform.numpy()

    # Check if audio is mono or stereo and transpose if necessary
    if waveform_np.shape[0] == 1:
        # Remove the channel dimension for mono
        waveform_np = waveform_np.squeeze(0)
    else:
        # Transpose to switch from (n_channels, n_samples) to (n_samples, n_channels)
        waveform_np = waveform_np.transpose()

    # Normalize to [-1, 1] if not already
    if waveform_np.dtype in [np.int16, np.int32]:
        waveform_np = waveform_np / np.iinfo(waveform_np.dtype).max

    return waveform_np


def librosa_to_ta_format(waveform_np):
    """
    Convert an audio array from librosa format to torchaudio format.

    Args:
        waveform_np (np.ndarray): Audio array from librosa with shape (n_samples,) or (2, n_samples).

    Returns:
        torch.Tensor: Audio tensor in torchaudio format with shape (n_channels, n_samples).
    """
    # Ensure it is a float32 array normalized to [-1, 1]
    waveform_np = np.array(waveform_np, dtype=np.float32)

    if waveform_np.ndim == 1:
        # Add a channel dimension for mono
        waveform_np = waveform_np[np.newaxis, :]
    else:
        # Transpose to switch from (n_samples, n_channels) to (n_channels, n_samples)
        waveform_np = waveform_np.transpose()

    # Convert numpy array to PyTorch tensor
    waveform = torch.from_numpy(waveform_np)
    return waveform


def run_autotune(y, sr, correction_method="closest", scale=None):
    # Only mono files are handled. If stereo files are supplied, only the first channel is used.
    if y.ndim > 1:
        y = y[0, :]

    # Pick the pitch adjustment strategy according to the arguments.
    correction_function = closest_pitch if correction_method == 'closest' else \
        partial(aclosest_pitch_from_scale, scale=scale)

    # Torchaudio -> librosa
    y = ta_to_librosa_format(y)
    # Autotune
    pitch_corrected_y = autotune(y, sr, correction_function, plot=False)
    # Librosa -> torchaudio
    pitch_corrected_y = librosa_to_ta_format(pitch_corrected_y)

    return pitch_corrected_y


def set_all_seeds(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def _preprocess_audio(
    audio_path, model: musicgen.MusicGen, duration: tp.Optional[int] = None
):
    wav, sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)
    wav = wav.mean(dim=0, keepdim=True)

    # Calculate duration in seconds if not provided
    if duration is None:
        duration = wav.shape[1] / model.sample_rate

    # Check if duration is more than 30 seconds
    if duration > 30:
        raise ValueError("Duration cannot be more than 30 seconds")

    end_sample = int(model.sample_rate * duration)
    wav = wav[:, :end_sample]

    assert wav.shape[0] == 1
    assert wav.shape[1] == model.sample_rate * duration

    wav = wav.cuda()
    wav = wav.unsqueeze(1)

    with torch.no_grad():
        gen_audio = model.compression_model.encode(wav)

    codes, scale = gen_audio

    assert scale is None

    return codes


def _get_stemmed_wav_patched(wav, sample_rate):
    print("Skipping stem separation!")
    return wav


class Pipeline:
    def __init__(self, model_id, max_batch_size=4, do_skip_demucs=True):
        self.model = musicgen.MusicGen.get_pretrained(model_id)
        self.max_batch_size = max_batch_size
        self.do_skip_demucs = do_skip_demucs

        if self.do_skip_demucs:
            self.model.lm.condition_provider.conditioners.self_wav._get_stemmed_wav = _get_stemmed_wav_patched

    def __call__(
        self,
        prompt,
        input_audio=None,
        scale=None,
        continuation=False,
        batch_size=1,
        duration=15,
        use_sampling=True,
        temperature=1.0,
        top_k=250,
        top_p=0.0,
        cfg_coef=3.0,
        output_dir="./samples",  # change to google drive if you'd like
        normalization_strategy="loudness",
        seed=-1,
        continuation_start=0,
        continuation_end=None,
    ):
        print("Prompt:", prompt)
        if scale == "closest":
            scale = None

        set_generation_params = lambda duration: self.model.set_generation_params(
            duration=duration,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            cfg_coef=cfg_coef,
        )

        if not seed or seed == -1:
            seed = torch.seed() % 2 ** 32 - 1
            set_all_seeds(seed)
        set_all_seeds(seed)
        print(f"Using seed {seed}")
        if not input_audio:
            set_generation_params(duration)
            wav, tokens = self.model.generate([prompt] * batch_size, progress=True, return_tokens=True)
        else:
            input_audio, sr = torchaudio.load(input_audio)
            # Save a copy of the original input audio
            original_input_audio = input_audio.clone()
            print("Input audio shape:", input_audio.shape)
            if scale is None:
                print("Running pitch correction for 'closest' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="closest")
            else:
                print("Running pitch correction for 'scale' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="scale", scale=scale)
            print(f"...Done running pitch correction. Shape after is {input_audio.shape}.\n")
            input_audio = input_audio[None] if input_audio.dim() == 2 else input_audio

            continuation_start = 0 if not continuation_start else continuation_start
            if continuation_end is None or continuation_end == -1:
                continuation_end = input_audio.shape[2] / sr

            if continuation_start > continuation_end:
                raise ValueError(
                    "`continuation_start` must be less than or equal to `continuation_end`"
                )

            input_audio_wavform = input_audio[
                ..., int(sr * continuation_start) : int(sr * continuation_end)
            ]
            input_audio_wavform = input_audio_wavform.repeat(batch_size, 1, 1)
            # TODO - not using this - is that wrong??
            input_audio_duration = input_audio_wavform.shape[-1] / sr

            if continuation:
                set_generation_params(duration)  # + input_audio_duration)  # SEE TODO above
                print("Continuation wavform shape!", input_audio_wavform.shape)
                wav, tokens = self.model.generate_continuation(
                    prompt=input_audio_wavform,
                    prompt_sample_rate=sr,
                    descriptions=[prompt] * batch_size,
                    progress=True,
                    return_tokens=True
                )
            else:
                print("Melody wavform shape!", input_audio_wavform.shape)
                set_generation_params(duration)
                wav, tokens = self.model.generate_with_chroma(
                    [prompt] * batch_size, input_audio_wavform, sr, progress=True, return_tokens=True
                )
        wav, tokens = wav.cpu(), tokens.cpu()
        # Write to files
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if input_audio is not None:
            outfile_path = output_dir / f"{dt_str}_input_raw"
            audio_write(
                outfile_path,
                original_input_audio,
                sr,
                strategy=normalization_strategy,
            )
            outfile_path = output_dir / f"{dt_str}_input_pitch_corrected"
            audio_write(
                outfile_path,
                input_audio_wavform[0],
                sr,
                strategy=normalization_strategy,
            )

        for i in range(batch_size):
            outfile_path = output_dir / f"{dt_str}_{i:02d}"
            audio_write(
                outfile_path,
                wav[i],
                self.model.sample_rate,
                strategy=normalization_strategy,
            )
        json_out_path = output_dir / f"{dt_str}.json"
        json_out_path.write_text(json.dumps(dict(
            prompt=prompt,
            batch_size=batch_size,
            duration=duration,
            use_sampling=use_sampling,
            temperature=temperature,
            top_k=top_k,
            cfg_coef=cfg_coef,
        )))

        to_return = [None] * (self.max_batch_size + 1)
        if input_audio is not None:
            print(f"trying to return input audio wavform of shape: {input_audio_wavform.shape}")
            to_return[0] = (sr, input_audio_wavform[0].T.numpy())

        for i in range(batch_size):
            to_return[i + 1] = (self.model.sample_rate, wav[i].T.numpy())
            print(wav[i].shape)
        return to_return


def main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=False, debug=False):
    pipeline = Pipeline(model_id, max_batch_size)
    interface = gr.Interface(
        fn=pipeline.__call__,
        inputs=[
            gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
            gr.Audio(
                sources=["microphone"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
                type="filepath",
            ),
            gr.Dropdown(["closest", "A:maj", "A:min", "Bb:maj", "Bb:min", "B:maj", "B:min", "C:maj", "C:min", "Db:maj", "Db:min", "D:maj", "D:min", "Eb:maj", "Eb:min", "E:maj", "E:min", "F:maj", "F:min", "Gb:maj", "Gb:min", "G:maj", "G:min", "Ab:maj", "Ab:min"], label="Scale for pitch correction.", value="closest"),
            gr.Checkbox(label="Is Continuation", value=False),
            gr.Slider(label="Batch Size", value=1, minimum=1, maximum=pipeline.max_batch_size, step=1),
            gr.Slider(label="Duration", value=15, minimum=4, maximum=30),
            gr.Checkbox(label="Use Sampling", value=True),
            gr.Slider(label="Temperature", value=1.0, minimum=0.0, maximum=2.0),
            gr.Slider(label="Top K", value=250, minimum=0, maximum=1000),
            gr.Slider(label="Top P", value=0.0, minimum=0.0, maximum=1.0),
            gr.Slider(label="CFG Coef", value=3.0, minimum=0.0, maximum=10.0),
            gr.Textbox(label="Output Dir", value="./samples"),
            gr.Dropdown(["loudness", "clip", "peak", "rms"], value="loudness", label="Strategy for normalizing audio."),
            gr.Slider(label="random seed", minimum=-1, maximum=9e8),
        ],
        outputs=[gr.Audio(label=("Input " if i == 0 else "") + f"Audio {i}") for i in range(pipeline.max_batch_size + 1)],
        title="🎶 Generate song ideas with musicgen-songstarter-v0.2 🎶",
        description="Check out the repo [here](https://huggingface.co/nateraw/musicgen-songstarter-v0.2)",
        examples=[
            ["hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["acoustic, guitar, melody, rnb, trap, E minor, 85 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["synth, dark, hip hop, melody, trap, Gb minor, 140 bpm", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["drill, layered, melody, songstarters, trap, C# minor, 130 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["hip hop, soul, rnb, neo soul, songstarters, B minor, 140 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["music, mallets, bells, melody, dancehall, african, afropop & afrobeats", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 4.5, "./samples", "loudness", -1],
        ]
    )
    interface.launch(share=share, debug=debug)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)

# For testing

# pipe = Pipeline("nateraw/musicgen-songstarter-v0.2", max_batch_size=4)
# example_input = (
#     "hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm",
#     "nate_is_humming.wav",
#     "closest",
#     False,
#     1,
#     8,
#     True,
#     1.0,
#     250,
#     0.0,
#     3.0,
#     "./samples",
#     "loudness",
#     -1,
#     0,
#     None
# )
# out = pipe(*example_input)
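
A lighter-weight check than the commented Pipeline test above is to exercise only the pitch-correction path by calling `run_autotune` on a waveform loaded with torchaudio; this skips downloading the MusicGen weights entirely. The following is a minimal sketch, not part of the commit; it assumes the repo's requirements are installed and that `nate_is_humming.wav` (added in this commit) is in the working directory.

# Minimal sketch (not part of the commit): run the torchaudio -> librosa ->
# autotune -> torchaudio round trip from run_autotune() on a local file.
import torchaudio
from app import run_autotune  # importing app does not launch the Gradio UI

wav, sr = torchaudio.load("nate_is_humming.wav")
corrected = run_autotune(wav, sr, correction_method="closest")
print(corrected.shape)  # (1, n_samples) tensor in torchaudio channel-first layout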
nate_is_humming.wav ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a62520e3026bc71b06fa75a8120c3b46524a0a34dcac9661e3e27632e294b11f
size 1196036
nate_is_singing_Gb_minor.wav ADDED
Binary file (619 kB).
pitch_correction_utils.py ADDED
@@ -0,0 +1,161 @@
from functools import partial
from pathlib import Path
import argparse
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import scipy.signal as sig
import psola


SEMITONES_IN_OCTAVE = 12


def degrees_from(scale: str):
    """Return the pitch classes (degrees) that correspond to the given scale"""
    degrees = librosa.key_to_degrees(scale)
    # To properly perform pitch rounding to the nearest degree from the scale, we need to repeat
    # the first degree raised by an octave. Otherwise, pitches slightly lower than the base degree
    # would be incorrectly assigned.
    degrees = np.concatenate((degrees, [degrees[0] + SEMITONES_IN_OCTAVE]))
    return degrees


def closest_pitch(f0):
    """Round the given pitch values to the nearest MIDI note numbers"""
    midi_note = np.around(librosa.hz_to_midi(f0))
    # To preserve the nan values.
    nan_indices = np.isnan(f0)
    midi_note[nan_indices] = np.nan
    # Convert back to Hz.
    return librosa.midi_to_hz(midi_note)


def closest_pitch_from_scale(f0, scale):
    """Return the pitch closest to f0 that belongs to the given scale"""
    # Preserve nan.
    if np.isnan(f0):
        return np.nan
    degrees = degrees_from(scale)
    midi_note = librosa.hz_to_midi(f0)
    # Subtract the multiplicities of 12 so that we have the real-valued pitch class of the
    # input pitch.
    degree = midi_note % SEMITONES_IN_OCTAVE
    # Find the closest pitch class from the scale.
    degree_id = np.argmin(np.abs(degrees - degree))
    # Calculate the difference between the input pitch class and the desired pitch class.
    degree_difference = degree - degrees[degree_id]
    # Shift the input MIDI note number by the calculated difference.
    midi_note -= degree_difference
    # Convert to Hz.
    return librosa.midi_to_hz(midi_note)


def aclosest_pitch_from_scale(f0, scale):
    """Map each pitch in the f0 array to the closest pitch belonging to the given scale."""
    sanitized_pitch = np.zeros_like(f0)
    for i in np.arange(f0.shape[0]):
        sanitized_pitch[i] = closest_pitch_from_scale(f0[i], scale)
    # Perform median filtering to additionally smooth the corrected pitch.
    smoothed_sanitized_pitch = sig.medfilt(sanitized_pitch, kernel_size=11)
    # Remove the additional NaN values after median filtering.
    smoothed_sanitized_pitch[np.isnan(smoothed_sanitized_pitch)] = \
        sanitized_pitch[np.isnan(smoothed_sanitized_pitch)]
    return smoothed_sanitized_pitch


def autotune(audio, sr, correction_function, plot=False):
    # Set some basis parameters.
    frame_length = 2048
    hop_length = frame_length // 4
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C7')

    # Pitch tracking using the PYIN algorithm.
    f0, voiced_flag, voiced_probabilities = librosa.pyin(audio,
                                                         frame_length=frame_length,
                                                         hop_length=hop_length,
                                                         sr=sr,
                                                         fmin=fmin,
                                                         fmax=fmax)

    # Apply the chosen adjustment strategy to the pitch.
    corrected_f0 = correction_function(f0)

    if plot:
        # Plot the spectrogram, overlaid with the original pitch trajectory and the adjusted
        # pitch trajectory.
        stft = librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)
        time_points = librosa.times_like(stft, sr=sr, hop_length=hop_length)
        log_stft = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
        fig, ax = plt.subplots()
        img = librosa.display.specshow(log_stft, x_axis='time', y_axis='log', ax=ax, sr=sr, hop_length=hop_length, fmin=fmin, fmax=fmax)
        fig.colorbar(img, ax=ax, format="%+2.f dB")
        ax.plot(time_points, f0, label='original pitch', color='cyan', linewidth=2)
        ax.plot(time_points, corrected_f0, label='corrected pitch', color='orange', linewidth=1)
        ax.legend(loc='upper right')
        plt.ylabel('Frequency [Hz]')
        plt.xlabel('Time [M:SS]')
        plt.savefig('pitch_correction.png', dpi=300, bbox_inches='tight')

    # Pitch-shifting using the PSOLA algorithm.
    return psola.vocode(audio, sample_rate=int(sr), target_pitch=corrected_f0, fmin=fmin, fmax=fmax)


def main(
    vocals_file,
    plot=False,
    correction_method="closest",
    scale=None
):
    """Run autotune-like pitch correction on the given audio file.

    Args:
        vocals_file (str): Filepath to the audio file to be pitch-corrected.
        plot (bool, optional): Whether to plot the results. Defaults to False.
        correction_method (str, optional): The pitch correction method to use. Defaults to `"closest"`. If set to "closest", the pitch will be rounded to the nearest MIDI note.
            If set to "scale", the pitch will be rounded to the nearest note in the given `scale`.
        scale (str, optional): The scale to use for pitch correction. ex. `"C:min"` / `"A:maj"`. Defaults to None.
    """

    # Parse the command line arguments.
    # ap = argparse.ArgumentParser()
    # ap.add_argument('vocals_file')
    # ap.add_argument('--plot', '-p', action='store_true', default=False,
    #                 help='if set, will produce a plot of the results')
    # ap.add_argument('--correction-method', '-c', choices=['closest', 'scale'], default='closest')
    # ap.add_argument('--scale', '-s', type=str, help='see librosa.key_to_degrees;'
    #                 ' used only for the \"scale\" correction'
    #                 ' method')
    # args = ap.parse_args(args=args)

    filepath = Path(vocals_file)

    # Load the audio file.
    y, sr = librosa.load(str(filepath), sr=None, mono=False)

    # Only mono files are handled. If stereo files are supplied, only the first channel is used.
    if y.ndim > 1:
        y = y[0, :]

    # Pick the pitch adjustment strategy according to the arguments.
    correction_function = closest_pitch if correction_method == 'closest' else \
        partial(aclosest_pitch_from_scale, scale=scale)

    # Perform the auto-tuning.
    pitch_corrected_y = autotune(y, sr, correction_function, plot)

    # Write the corrected audio to an output file.
    filepath = filepath.parent / (filepath.stem + '_pitch_corrected' + filepath.suffix)
    sf.write(str(filepath), pitch_corrected_y, sr)
    return pitch_corrected_y


if __name__ == '__main__':
    # main("./singing_music_idea.wav --plot -c closest".split())
    # python pitch_correction_utils.py --vocals_file "./nate_is_humming.wav" --plot -c closest
    from fire import Fire
    Fire(main)
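
The octave wrap-around added in `degrees_from` and the pitch-class shifting in `closest_pitch_from_scale` are easiest to see with a single frequency. Here is a minimal sketch, not part of the commit, assuming the module above and its dependencies (librosa, psola, soundfile, matplotlib) are installed:

# Minimal sketch (not part of the commit): snap one slightly flat pitch to A minor.
from pitch_correction_utils import closest_pitch_from_scale

f0 = 430.0  # a bit flat of A4 (440 Hz)
snapped = closest_pitch_from_scale(f0, "A:min")
print(float(snapped))  # ~440.0 -> shifted up to the nearest scale degree (A)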
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft
hf_transfer
gradio
psola
torchvision==0.16.0
fire
singing_songstarter_demo.ipynb ADDED
@@ -0,0 +1,78 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100",
      "authorship_tag": "ABX9TyMm+2HEY3Dh8UBT+NJ/CIoa",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/nateraw/singing-songstarter/blob/main/singing_songstarter_demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Singing Songstarter Demo\n",
        "\n",
        "This is a demo of using [`musicgen-songstarter-v0.2`](https://hf.co/nateraw/musicgen-songstarter-v0.2), a large stereo MusicGen model trained to be useful for music producers, for the task of voice-to-music.\n",
        "\n",
        "**Hum an idea, get a music sample!** 🚀\n",
        "\n",
        "### Usage\n",
        "\n",
        "1. Run the cell below.\n",
        "2. You can ignore the \"restart this runtime\" message when it pops up.\n",
        "3. Click the public share link. It should look like: `\"Running on public URL: https://<your-link-here>\"`\n",
        "4. Enjoy 🔥\n",
        "\n",
        "\n",
        "### If you think this notebook is cool, consider supporting me by:\n",
        " - giving [the model](https://hf.co/nateraw/musicgen-songstarter-v0.2) a heart on Hugging Face ❤️\n",
        " - following me on [GitHub](https://github.com/nateraw) 👨‍💻\n",
        " - following me on [X/twitter](https://twitter.com/nateraw)\n",
        " - giving [the demo repo](https://github.com/nateraw/singing-songstarter) a star ⭐️\n",
        "\n",
        "If you have any questions/concerns about this demo, please [file an issue on GitHub](https://github.com/nateraw/singing-songstarter)."
      ],
      "metadata": {
        "id": "hBsE8AuVsgG8"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-fw0bpXysAUG"
      },
      "outputs": [],
      "source": [
        "%cd /content\n",
        "! git clone https://github.com/nateraw/singing-songstarter\n",
        "%cd /content/singing-songstarter\n",
        "! pip install -r requirements.txt\n",
        "! python app.py --share --debug"
      ]
    }
  ]
}