File size: 14,138 Bytes
3856316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258f222
3856316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258f222
 
 
 
 
 
 
 
3856316
258f222
3856316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258f222
 
 
 
 
 
 
 
3856316
 
 
 
 
258f222
3856316
258f222
3856316
 
 
 
 
 
 
 
258f222
3856316
 
 
 
 
 
 
 
 
 
 
 
 
 
258f222
3856316
03372c9
3856316
61570c5
 
3856316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61570c5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
###########################################
# For fast downloads from Hugging Face Hub
# **Requires the hf_transfer package**
###########################################
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
###########################################

import json
import random
import typing as tp
from datetime import datetime
from pathlib import Path
from functools import partial

import gradio as gr
import torch
import torchaudio
import numpy as np

from audiocraft.models import musicgen
from audiocraft.data.audio import audio_write
from audiocraft.utils.notebook import display_audio

from pitch_correction_utils import autotune, closest_pitch, aclosest_pitch_from_scale


def ta_to_librosa_format(waveform):
    """
    Convert an audio tensor from torchaudio layout to a NumPy array.

    Args:
    waveform (torch.Tensor): Audio tensor, either 1-D with shape (n_samples,)
        or 2-D with shape (n_channels, n_samples). It may live on any device
        and may require grad.

    Returns:
    np.ndarray: Array with shape (n_samples,) for 1-D or single-channel input;
        otherwise the transposed input, i.e. (n_samples, n_channels), which is
        the layout `librosa_to_ta_format` transposes back. int16/int32 data is
        divided by the dtype's maximum value (yielding floats); every other
        dtype is returned unchanged.
    """
    # `Tensor.numpy()` only works on CPU tensors that do not require grad,
    # so detach and move to host memory before converting.
    waveform_np = waveform.detach().cpu().numpy()

    if waveform_np.ndim <= 1:
        # Already a flat vector of samples. Checking `shape[0] == 1` here would
        # mistake a one-sample signal for a channel axis and squeeze it away.
        pass
    elif waveform_np.shape[0] == 1:
        # Remove the channel dimension for mono
        waveform_np = waveform_np.squeeze(0)
    else:
        # Switch from (n_channels, n_samples) to (n_samples, n_channels)
        waveform_np = waveform_np.transpose()

    # Scale integer PCM to roughly [-1, 1]
    if waveform_np.dtype in (np.int16, np.int32):
        waveform_np = waveform_np / np.iinfo(waveform_np.dtype).max

    return waveform_np


def librosa_to_ta_format(waveform_np):
    """
    Turn a NumPy audio array into a channels-first torch tensor.

    Args:
    waveform_np (array-like): Samples with shape (n_samples,) for mono, or
        (n_samples, n_channels) for multi-channel audio (the layout produced
        by `ta_to_librosa_format`).

    Returns:
    torch.Tensor: float32 tensor with shape (1, n_samples) for 1-D input,
        otherwise the transposed input, i.e. (n_channels, n_samples).
    """
    # np.array (not asarray) always copies, so the returned tensor never
    # shares memory with the caller's array.
    samples = np.array(waveform_np, dtype=np.float32)

    is_mono = samples.ndim == 1
    # Mono gains a leading channel axis; anything else has its axes reversed.
    channels_first = samples[np.newaxis, :] if is_mono else np.transpose(samples)

    return torch.from_numpy(channels_first)


def run_autotune(y, sr, correction_method="closest", scale=None):
    """
    Pitch-correct a torchaudio waveform and return it as a torch tensor.

    Args:
    y (torch.Tensor): Input audio. If it has more than one dimension only the
        first channel (`y[0, :]`) is processed.
    sr (int): Sample rate of `y`, forwarded to `autotune`.
    correction_method (str): 'closest' selects `closest_pitch`; any other value
        selects `aclosest_pitch_from_scale` bound to `scale`.
    scale: Scale passed to `aclosest_pitch_from_scale`; unused for 'closest'.

    Returns:
    torch.Tensor: Output of `autotune`, converted back with `librosa_to_ta_format`.
    """
    # Only mono audio is handled: keep the first channel of multi-channel input.
    mono = y[0, :] if y.ndim > 1 else y

    # Choose the pitch adjustment strategy.
    if correction_method == 'closest':
        pitch_fn = closest_pitch
    else:
        pitch_fn = partial(aclosest_pitch_from_scale, scale=scale)

    # torchaudio -> numpy, autotune, then numpy -> torchaudio.
    corrected = autotune(ta_to_librosa_format(mono), sr, pitch_fn, plot=False)
    return librosa_to_ta_format(corrected)


def set_all_seeds(seed):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) RNGs with `seed`.

    Also exports `PYTHONHASHSEED` and switches cuDNN to deterministic mode.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seeders as before, applied in the same relative order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


def _preprocess_audio(
    audio_path, model: musicgen.MusicGen, duration: tp.Optional[int] = None
):
    """
    Load an audio file and encode it with the model's compression model.

    The audio is resampled to `model.sample_rate`, down-mixed to mono by
    averaging channels, trimmed to the first `duration` seconds and passed to
    `model.compression_model.encode`.

    Args:
    audio_path: Anything accepted by `torchaudio.load`.
    model (musicgen.MusicGen): Provides `sample_rate` and `compression_model`.
    duration (optional): Seconds of audio to keep from the start. When None the
        whole clip is used.

    Returns:
    The `codes` element of the `(codes, scale)` pair returned by the encoder.

    Raises:
    ValueError: If `duration` exceeds 30 seconds, or the clip is shorter than
        the requested `duration`.
    RuntimeError: If the encoder returns a non-None `scale`.
    """
    wav, sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)
    wav = wav.mean(dim=0, keepdim=True)  # -> (1, n_samples)

    n_available = wav.shape[1]
    if duration is None:
        # Use every sample. Recomputing the count as int(sr * (n / sr)) can
        # lose a sample to float rounding and trip the length check below.
        duration = n_available / model.sample_rate
        end_sample = n_available
    else:
        end_sample = int(model.sample_rate * duration)

    # Check if duration is more than 30 seconds
    if duration > 30:
        raise ValueError("Duration cannot be more than 30 seconds")

    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if end_sample > n_available:
        raise ValueError(
            f"Audio has {n_available} samples at {model.sample_rate} Hz, "
            f"fewer than the {end_sample} needed for {duration} seconds"
        )

    wav = wav[:, :end_sample]

    # Follow the model's device when it exposes one; otherwise fall back to
    # CUDA, which is what this function previously hard-coded.
    wav = wav.to(getattr(model, "device", "cuda"))
    wav = wav.unsqueeze(1)

    with torch.no_grad():
        gen_audio = model.compression_model.encode(wav)

    codes, scale = gen_audio

    if scale is not None:
        raise RuntimeError("Expected the compression model to return scale=None")

    return codes


def _get_stemmed_wav_patched(wav, sample_rate):
    """Drop-in replacement for a `_get_stemmed_wav` hook that skips stem separation.

    Logs a notice and returns `wav` unchanged; `sample_rate` is accepted only to
    keep the call signature and is ignored.
    """
    print("Skipping stem separation!")
    return wav


class Pipeline:
    """Callable wrapper around a pretrained MusicGen model.

    Calling an instance generates `batch_size` audio samples from a text prompt
    (optionally conditioned on, or continuing, an input audio file), writes the
    audio plus a JSON file of the generation settings to `output_dir`, and
    returns the audio in `(sample_rate, ndarray)` form.
    """

    def __init__(self, model_id, max_batch_size=4, do_skip_demucs=True):
        """Load the model and optionally disable stem separation.

        Args:
        model_id: Identifier passed to `musicgen.MusicGen.get_pretrained`.
        max_batch_size (int): Upper bound on `batch_size`; also fixes the length
            of the list returned by `__call__` (`max_batch_size + 1`).
        do_skip_demucs (bool): If True, replace the `self_wav` conditioner's
            `_get_stemmed_wav` with a passthrough.
        """
        self.model = musicgen.MusicGen.get_pretrained(model_id)
        self.max_batch_size = max_batch_size
        self.do_skip_demucs = do_skip_demucs

        if self.do_skip_demucs:
            self.model.lm.condition_provider.conditioners.self_wav._get_stemmed_wav = _get_stemmed_wav_patched

    @staticmethod
    def _resolve_seed(seed):
        """Return an int seed, drawing a fresh one when `seed` is falsy or -1.

        The drawn seed lies in [0, 2**32), so it is always accepted by
        `np.random.seed`.
        """
        if not seed or seed == -1:
            # Parenthesised on purpose: `x % 2 ** 32 - 1` parses as
            # `(x % 2**32) - 1` and can produce -1.
            return torch.seed() % (2 ** 32)
        return int(seed)

    def __call__(
        self,
        prompt,
        input_audio=None,
        scale="closest",
        continuation=False,
        batch_size=1,
        duration=15,
        use_sampling=True,
        temperature=1.0,
        top_k=250,
        top_p=0.0,
        cfg_coef=3.0,
        output_dir="./samples",  # change to google drive if you'd like
        normalization_strategy="loudness",
        seed=-1,
        continuation_start=0,
        continuation_end=None,
    ):
        """Generate audio for `prompt` and save the results.

        Args:
        prompt (str): Text description, repeated `batch_size` times.
        input_audio: Path to an audio file, or a falsy value for text-only
            generation.
        scale (str): "none" skips pitch correction of the input, "closest"
            corrects to the closest pitch, anything else is passed to
            `run_autotune` as the scale.
        continuation (bool): If True the input audio is continued
            (`generate_continuation`); otherwise it conditions generation
            (`generate_with_chroma`).
        batch_size (int): Number of samples, 1..`max_batch_size`.
        duration: Generation duration passed to `set_generation_params`.
        use_sampling, temperature, top_k, top_p, cfg_coef: Forwarded to
            `model.set_generation_params`.
        output_dir: Directory for the written files (created if missing).
        normalization_strategy (str): `strategy` argument for `audio_write`.
        seed: RNG seed; a falsy value or -1 draws a random one.
        continuation_start, continuation_end: Window of the input audio to use,
            in seconds. `continuation_end` of None or -1 means the end of the
            clip.

        Returns:
        list: Length `max_batch_size + 1`. Index 0 is `(sr, ndarray)` for the
            (possibly pitch-corrected) input window, or None without input
            audio. Indices 1..batch_size hold `(model.sample_rate, ndarray)`
            for each generated sample; remaining slots are None.

        Raises:
        ValueError: If `batch_size` is outside 1..`max_batch_size`, or
            `continuation_start` is greater than `continuation_end`.
        """
        print("Prompt:", prompt)

        # Numeric arguments may arrive as floats (e.g. from a UI slider), but
        # list repetition and top-k selection need ints.
        batch_size = int(batch_size)
        if not 1 <= batch_size <= self.max_batch_size:
            # Fail before generating; previously an oversized batch only
            # surfaced as an IndexError when filling the return list.
            raise ValueError(
                f"`batch_size` must be between 1 and {self.max_batch_size}, got {batch_size}"
            )
        top_k = int(top_k)

        def set_generation_params(duration):
            self.model.set_generation_params(
                use_sampling=use_sampling,
                duration=duration,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                cfg_coef=cfg_coef,
            )

        seed = self._resolve_seed(seed)
        set_all_seeds(seed)
        print(f"Using seed {seed}")

        # Remember whether an input file was supplied. `input_audio` is rebound
        # to a tensor below, and an empty path must not count as input.
        has_input_audio = bool(input_audio)

        if not has_input_audio:
            set_generation_params(duration)
            wav, tokens = self.model.generate([prompt] * batch_size, progress=True, return_tokens=True)
        else:
            input_audio, sr = torchaudio.load(input_audio)
            # Save a copy of the original input audio
            original_input_audio = input_audio.clone()
            print("Input audio shape:", input_audio.shape)
            if scale != "none":
                if scale == "closest":
                    print("Running pitch correction for 'closest' pitch")
                    input_audio = run_autotune(input_audio, sr, correction_method="closest")
                else:
                    print("Running pitch correction for 'scale' pitch")
                    input_audio = run_autotune(input_audio, sr, correction_method="scale", scale=scale)
                print(f"...Done running pitch correction. Shape after is {input_audio.shape}.\n")
            else:
                print("Skipping pitch correction, as 'scale' was set to none")
            # Add a leading batch axis to 2-D audio.
            input_audio = input_audio[None] if input_audio.dim() == 2 else input_audio

            continuation_start = continuation_start or 0
            if continuation_end is None or continuation_end == -1:
                continuation_end = input_audio.shape[2] / sr

            if continuation_start > continuation_end:
                raise ValueError(
                    "`continuation_start` must be less than or equal to `continuation_end`"
                )

            input_audio_wavform = input_audio[
                ..., int(sr * continuation_start) : int(sr * continuation_end)
            ]
            input_audio_wavform = input_audio_wavform.repeat(batch_size, 1, 1)

            if continuation:
                # TODO - the prompt length (input_audio_wavform.shape[-1] / sr)
                # is not added to `duration` here - is that wrong??
                set_generation_params(duration)
                print("Continuation wavform shape!", input_audio_wavform.shape)
                wav, tokens = self.model.generate_continuation(
                    prompt=input_audio_wavform,
                    prompt_sample_rate=sr,
                    descriptions=[prompt] * batch_size,
                    progress=True,
                    return_tokens=True
                )
            else:
                print("Melody wavform shape!", input_audio_wavform.shape)
                set_generation_params(duration)
                wav, tokens = self.model.generate_with_chroma(
                    [prompt] * batch_size, input_audio_wavform, sr, progress=True, return_tokens=True
                )
        wav, tokens = wav.cpu(), tokens.cpu()

        # Write to files
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if has_input_audio:
            audio_write(
                output_dir / f"{dt_str}_input_raw",
                original_input_audio,
                sr,
                strategy=normalization_strategy,
            )
            audio_write(
                output_dir / f"{dt_str}_input_pitch_corrected",
                input_audio_wavform[0],
                sr,
                strategy=normalization_strategy,
            )

        for i in range(batch_size):
            audio_write(
                output_dir / f"{dt_str}_{i:02d}",
                wav[i],
                self.model.sample_rate,
                strategy=normalization_strategy,
            )

        # Record the settings next to the audio; `seed` and `top_p` are included
        # so that a run can be reproduced from the metadata alone.
        json_out_path = output_dir / f"{dt_str}.json"
        json_out_path.write_text(json.dumps(dict(
            prompt=prompt,
            batch_size=batch_size,
            duration=duration,
            use_sampling=use_sampling,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            cfg_coef=cfg_coef,
            seed=seed,
        )))

        # Fixed-length result: slot 0 is the input, slots 1.. the generations.
        to_return = [None] * (self.max_batch_size + 1)
        if has_input_audio:
            print(f"trying to return input audio wavform of shape: {input_audio_wavform.shape}")
            to_return[0] = (sr, input_audio_wavform[0].T.numpy())

        for i in range(batch_size):
            to_return[i + 1] = (self.model.sample_rate, wav[i].T.numpy())
            print(wav[i].shape)
        return to_return


# Description text for the Gradio interface (passed as `description=` in `main`).
# A trailing backslash inside the literal joins the next line without a newline.
_description = """\
Hum an idea ➡️ get an AI generated music sample. Check out the model [here](https://huggingface.co/nateraw/musicgen-songstarter-v0.2) and the source code [here](https://github.com/nateraw/singing-songstarter).

The input audio will be pitch corrected unless you set `scale` to `"none"`. Set `scale` to `"closest"` to correct to nearest note (if unsure, use this). \
Ideally, you figure out what key you're singing in and set `scale` to that, so it corrects to only notes in that scale. \
It is incredibly important the audio passed to the model (which you'll get back as the first output) is clean in order to get good results. 🗑 in = 🗑 out.

Enjoy ❤️"""
def main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=False, debug=False):
    """Build the Gradio interface around a `Pipeline` and launch it.

    Args:
    model_id: Model identifier handed to `Pipeline`.
    max_batch_size (int): Maximum batch size; also sets the number of audio outputs.
    share (bool): Forwarded to `interface.launch`.
    debug (bool): Forwarded to `interface.launch`.
    """
    pipeline = Pipeline(model_id, max_batch_size)

    # "closest", "none", then major/minor for each of the twelve keys.
    keys = ["A", "Bb", "B", "C", "Db", "D", "Eb", "E", "F", "Gb", "G", "Ab"]
    scale_choices = ["closest", "none"] + [
        f"{key}:{mode}" for key in keys for mode in ("maj", "min")
    ]

    # Input widgets, in the positional order of `Pipeline.__call__`'s parameters.
    input_components = [
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", value="synth, hip hop, melody, dark"),
        gr.Audio(
            sources=["microphone", "upload"],
            waveform_options=gr.WaveformOptions(
                waveform_color="#01C6FF",
                waveform_progress_color="#0066B4",
                skip_length=2,
                show_controls=False,
            ),
            type="filepath",
        ),
        gr.Dropdown(
            scale_choices,
            label="Scale for pitch correction. Set to 'closest' if you don't know.",
            value="closest",
        ),
        gr.Checkbox(label="Is Continuation", value=False),
        gr.Slider(label="Batch Size", value=1, minimum=1, maximum=pipeline.max_batch_size, step=1),
        gr.Slider(label="Duration", value=15, minimum=4, maximum=30),
        gr.Checkbox(label="Use Sampling", value=True),
        gr.Slider(label="Temperature", value=1.0, minimum=0.0, maximum=2.0),
        gr.Slider(label="Top K", value=250, minimum=0, maximum=1000),
        gr.Slider(label="Top P", value=0.0, minimum=0.0, maximum=1.0),
        gr.Slider(label="CFG Coef", value=3.0, minimum=0.0, maximum=10.0),
        gr.Textbox(label="Output Dir", value="./samples"),
        gr.Dropdown(["loudness", "clip", "peak", "rms"], value="loudness", label="Strategy for normalizing audio."),
        gr.Slider(label="random seed", minimum=-1, maximum=9e8),
    ]

    # One slot for the (processed) input audio plus one per possible sample.
    output_components = []
    for i in range(pipeline.max_batch_size + 1):
        label = f"Input Audio {i}" if i == 0 else f"Audio {i}"
        output_components.append(gr.Audio(label=label))

    # The two examples differ only in prompt and CFG coefficient.
    example_audio = "./nate_is_singing_Gb_minor.wav"
    example_rows = [
        [example_prompt, example_audio, "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, example_cfg, "./samples", "loudness", -1]
        for example_prompt, example_cfg in (
            ("synth, dark, hip hop, melody, trap", 3.0),
            ("music, mallets, bells, melody, dancehall, african, afropop & afrobeats", 4.5),
        )
    ]

    interface = gr.Interface(
        fn=pipeline.__call__,
        inputs=input_components,
        outputs=output_components,
        title="🎶 Generate song ideas with musicgen-songstarter-v0.2 🎶",
        description=_description,
        examples=example_rows,
        cache_examples=False
    )
    interface.launch(share=share, debug=debug)


# Script entry point: hand `main` to python-fire so it can be driven from the command line.
if __name__ == '__main__':
    # Imported here so `fire` is only required when running as a script.
    from fire import Fire
    Fire(main)

    # For testing: uncomment to call the pipeline directly, bypassing the Gradio UI.
    # The tuple follows the positional order of `Pipeline.__call__`'s parameters.

    # pipe = Pipeline("nateraw/musicgen-songstarter-v0.2", max_batch_size=4)
    # example_input = (
    #     "hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm",
    #     "nate_is_humming.wav",
    #     "closest",
    #     False,
    #     1,
    #     8,
    #     True,
    #     1.0,
    #     250,
    #     0.0,
    #     3.0,
    #     "./samples",
    #     "loudness",
    #     -1,
    #     0,
    #     None
    # )
    # out = pipe(*example_input)