###########################################
# For fast downloads from Hugging Face Hub
# **Requires the hf_transfer package**
###########################################
import os
# Deliberately set ahead of the remaining imports (presumably so libraries
# that read this flag at import time pick it up).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
###########################################

import json
import random
import typing as tp
from datetime import datetime
from pathlib import Path
from functools import partial

import gradio as gr
import torch
import torchaudio
import numpy as np

from audiocraft.models import musicgen
from audiocraft.data.audio import audio_write
from audiocraft.utils.notebook import display_audio

from pitch_correction_utils import autotune, closest_pitch, aclosest_pitch_from_scale


def ta_to_librosa_format(waveform):
    """
    Convert an audio tensor from torchaudio layout to a numpy array.

    Args:
        waveform (torch.Tensor): Audio tensor, normally shaped
            (n_channels, n_samples). A 1-D tensor is passed through unchanged.

    Returns:
        np.ndarray: Array of shape (n_samples,) when the input has a single
            channel, otherwise the transposed input, i.e.
            (n_samples, n_channels). int16/int32 data is divided by the
            dtype's maximum value so it lands in roughly [-1, 1].
    """
    # Ensure the waveform lives on the CPU (and is detached) before handing
    # it to numpy; `.numpy()` raises for CUDA or grad-tracking tensors.
    waveform_np = waveform.detach().cpu().numpy()

    if waveform_np.ndim > 1:
        if waveform_np.shape[0] == 1:
            # Remove the channel dimension for mono
            waveform_np = waveform_np.squeeze(0)
        else:
            # Switch from (n_channels, n_samples) to (n_samples, n_channels)
            waveform_np = waveform_np.transpose()

    # Normalize integer PCM to [-1, 1]; float data is left untouched
    if waveform_np.dtype in (np.int16, np.int32):
        waveform_np = waveform_np / np.iinfo(waveform_np.dtype).max

    return waveform_np


def librosa_to_ta_format(waveform_np):
    """
    Convert a numpy audio array back to torchaudio layout.

    This is the inverse of `ta_to_librosa_format`.

    Args:
        waveform_np (np.ndarray): Audio array of shape (n_samples,) or
            (n_samples, n_channels).

    Returns:
        torch.Tensor: float32 tensor of shape (n_channels, n_samples).
    """
    # Ensure it is a float32 array
    waveform_np = np.array(waveform_np, dtype=np.float32)

    if waveform_np.ndim == 1:
        # Add a channel dimension for mono
        waveform_np = waveform_np[np.newaxis, :]
    else:
        # Switch from (n_samples, n_channels) to (n_channels, n_samples)
        waveform_np = waveform_np.transpose()

    # Convert numpy array to PyTorch tensor
    return torch.from_numpy(waveform_np)


def run_autotune(y, sr, correction_method="closest", scale=None):
    """
    Pitch-correct an audio tensor and return it in torchaudio layout.

    Args:
        y (torch.Tensor): Input audio. Only mono is handled; when the tensor
            has more than one dimension only the first channel is used.
        sr (int): Sample rate of `y`.
        correction_method (str): "closest" snaps to the closest pitch; any
            other value selects scale-based correction using `scale`.
        scale: Scale forwarded to `aclosest_pitch_from_scale` when
            `correction_method` is not "closest".

    Returns:
        torch.Tensor: Pitch-corrected audio as produced by
            `librosa_to_ta_format` (a leading channel dimension is added for
            1-D results).
    """
    # Only mono-files are handled. If stereo files are supplied, only the
    # first channel is used.
    if y.ndim > 1:
        y = y[0, :]

    # Pick the pitch adjustment strategy according to the arguments.
    if correction_method == "closest":
        correction_function = closest_pitch
    else:
        correction_function = partial(aclosest_pitch_from_scale, scale=scale)

    # Torchaudio -> numpy
    y = ta_to_librosa_format(y)
    # Autotune
    pitch_corrected_y = autotune(y, sr, correction_function, plot=False)
    # numpy -> torchaudio
    return librosa_to_ta_format(pitch_corrected_y)


def set_all_seeds(seed):
    """Seed Python, numpy and torch (CPU and CUDA) RNGs with `seed`."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def _preprocess_audio(
    audio_path, model: musicgen.MusicGen, duration: tp.Optional[int] = None
):
    """
    Load an audio file and encode it with the model's compression model.

    The audio is resampled to `model.sample_rate`, mixed down to mono,
    truncated to `duration` seconds, moved to CUDA and encoded.

    Args:
        audio_path: Path of the audio file to load.
        model: MusicGen model providing `sample_rate` and `compression_model`.
        duration: Number of seconds to keep. Defaults to the full length of
            the resampled audio.

    Returns:
        The codes returned by `model.compression_model.encode`.

    Raises:
        ValueError: If `duration` exceeds 30 seconds.
        AssertionError: If the truncated audio is not exactly
            `model.sample_rate * duration` samples long, or if the encoder
            returns a non-None scale.
    """
    wav, sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)
    wav = wav.mean(dim=0, keepdim=True)

    # Calculate duration in seconds if not provided
    if duration is None:
        duration = wav.shape[1] / model.sample_rate

    # Check if duration is more than 30 seconds
    if duration > 30:
        raise ValueError("Duration cannot be more than 30 seconds")

    end_sample = int(model.sample_rate * duration)
    wav = wav[:, :end_sample]

    assert wav.shape[0] == 1
    assert wav.shape[1] == model.sample_rate * duration

    wav = wav.cuda()
    wav = wav.unsqueeze(1)

    with torch.no_grad():
        gen_audio = model.compression_model.encode(wav)

    codes, scale = gen_audio
    assert scale is None

    return codes


def _get_stemmed_wav_patched(wav, sample_rate):
    """Replacement for the conditioner's stem separation: return `wav` as-is."""
    print("Skipping stem separation!")
    return wav


class Pipeline:
    """
    Callable wrapper around a pretrained MusicGen model.

    Generates audio from a text prompt, optionally conditioned on (or
    continuing) an input audio file, writes the results to disk and returns
    them in the fixed-length layout expected by the Gradio interface.
    """

    def __init__(self, model_id, max_batch_size=4, do_skip_demucs=True):
        """
        Args:
            model_id: Identifier passed to `MusicGen.get_pretrained`.
            max_batch_size: Upper bound on `batch_size`; also fixes the number
                of output slots returned by `__call__`.
            do_skip_demucs: When True, the `self_wav` conditioner's
                `_get_stemmed_wav` is replaced by a pass-through so no stem
                separation is run.
        """
        self.model = musicgen.MusicGen.get_pretrained(model_id)
        self.max_batch_size = max_batch_size
        self.do_skip_demucs = do_skip_demucs

        if self.do_skip_demucs:
            self.model.lm.condition_provider.conditioners.self_wav._get_stemmed_wav = _get_stemmed_wav_patched

    def __call__(
        self,
        prompt,
        input_audio=None,
        scale="closest",
        continuation=False,
        batch_size=1,
        duration=15,
        use_sampling=True,
        temperature=1.0,
        top_k=250,
        top_p=0.0,
        cfg_coef=3.0,
        output_dir="./samples",  # change to google drive if you'd like
        normalization_strategy="loudness",
        seed=-1,
        continuation_start=0,
        continuation_end=None,
    ):
        """
        Generate `batch_size` samples and write them to `output_dir`.

        Args:
            prompt: Text description used for every sample in the batch.
            input_audio: Optional path to an audio file. When falsy, pure
                text-to-music generation is used.
            scale: "none" skips pitch correction, "closest" snaps to the
                closest pitch, anything else is passed to `run_autotune` as
                the scale to correct to.
            continuation: If True the input audio is continued
                (`generate_continuation`); otherwise it is used as melody
                conditioning (`generate_with_chroma`).
            batch_size: Number of samples to generate; must not exceed
                `self.max_batch_size`.
            duration: Duration passed to `set_generation_params`.
            use_sampling, temperature, top_k, top_p, cfg_coef: Forwarded to
                `self.model.set_generation_params`.
            output_dir: Directory for audio and JSON output (created if
                missing).
            normalization_strategy: `strategy` argument for `audio_write`.
            seed: RNG seed. A falsy value or -1 picks a random seed.
            continuation_start: Start (seconds) of the input audio slice.
            continuation_end: End (seconds) of the input audio slice; None or
                -1 means "to the end of the input".

        Returns:
            list: `self.max_batch_size + 1` entries. Index 0 is
                `(sr, ndarray)` for the (possibly pitch-corrected) input audio
                or None; indices 1..batch_size are
                `(self.model.sample_rate, ndarray)` for the generated samples;
                remaining entries are None.

        Raises:
            ValueError: If `batch_size` exceeds `self.max_batch_size` or
                `continuation_start` is greater than `continuation_end`.
        """
        print("Prompt:", prompt)

        # UI sliders may deliver floats; list repetition and output indexing
        # below need a real int.
        batch_size = int(batch_size)
        if batch_size > self.max_batch_size:
            # Fail before the expensive generation instead of with an
            # IndexError while filling the fixed-size output list.
            raise ValueError(
                f"`batch_size` ({batch_size}) must not exceed `max_batch_size` ({self.max_batch_size})"
            )

        def set_generation_params(duration):
            # `use_sampling` was previously accepted but never forwarded.
            self.model.set_generation_params(
                use_sampling=use_sampling,
                duration=duration,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                cfg_coef=cfg_coef,
            )

        if not seed or seed == -1:
            # Parenthesised on purpose: `% 2 ** 32 - 1` parses as
            # `(x % 2**32) - 1`, which can produce -1 and is rejected by
            # `np.random.seed`.
            seed = torch.seed() % (2 ** 32 - 1)
        seed = int(seed)
        set_all_seeds(seed)
        print(f"Using seed {seed}")

        # Decide once whether an input file was supplied so every later
        # branch agrees (a falsy non-None value such as "" counts as absent).
        has_input_audio = bool(input_audio)

        if not has_input_audio:
            set_generation_params(duration)
            wav, tokens = self.model.generate([prompt] * batch_size, progress=True, return_tokens=True)
        else:
            input_audio, sr = torchaudio.load(input_audio)
            # Save a copy of the original input audio
            original_input_audio = input_audio.clone()
            print("Input audio shape:", input_audio.shape)

            if scale != "none":
                if scale == "closest":
                    print("Running pitch correction for 'closest' pitch")
                    input_audio = run_autotune(input_audio, sr, correction_method="closest")
                else:
                    print("Running pitch correction for 'scale' pitch")
                    input_audio = run_autotune(input_audio, sr, correction_method="scale", scale=scale)
                print(f"...Done running pitch correction. Shape after is {input_audio.shape}.\n")
            else:
                print("Skipping pitch correction, as 'scale' was set to none")

            # Add a leading batch dimension when the tensor is 2-D
            input_audio = input_audio[None] if input_audio.dim() == 2 else input_audio

            continuation_start = 0 if not continuation_start else continuation_start
            if continuation_end is None or continuation_end == -1:
                continuation_end = input_audio.shape[2] / sr

            if continuation_start > continuation_end:
                raise ValueError(
                    "`continuation_start` must be less than or equal to `continuation_end`"
                )

            input_audio_wavform = input_audio[
                ..., int(sr * continuation_start) : int(sr * continuation_end)
            ]
            input_audio_wavform = input_audio_wavform.repeat(batch_size, 1, 1)

            # TODO - the duration of the input slice
            # (input_audio_wavform.shape[-1] / sr) is not added to `duration`
            # for continuation - is that wrong??
            if continuation:
                set_generation_params(duration)  # + input slice duration, SEE TODO above
                print("Continuation wavform shape!", input_audio_wavform.shape)
                wav, tokens = self.model.generate_continuation(
                    prompt=input_audio_wavform,
                    prompt_sample_rate=sr,
                    descriptions=[prompt] * batch_size,
                    progress=True,
                    return_tokens=True
                )
            else:
                print("Melody wavform shape!", input_audio_wavform.shape)
                set_generation_params(duration)
                wav, tokens = self.model.generate_with_chroma(
                    [prompt] * batch_size, input_audio_wavform, sr, progress=True, return_tokens=True
                )

        wav, tokens = wav.cpu(), tokens.cpu()

        # Write to files
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        if has_input_audio:
            outfile_path = output_dir / f"{dt_str}_input_raw"
            audio_write(
                outfile_path,
                original_input_audio,
                sr,
                strategy=normalization_strategy,
            )
            outfile_path = output_dir / f"{dt_str}_input_pitch_corrected"
            audio_write(
                outfile_path,
                input_audio_wavform[0],
                sr,
                strategy=normalization_strategy,
            )

        for i in range(batch_size):
            outfile_path = output_dir / f"{dt_str}_{i:02d}"
            audio_write(
                outfile_path,
                wav[i],
                self.model.sample_rate,
                strategy=normalization_strategy,
            )

        # Record the generation settings next to the audio; `top_p` and the
        # seed actually used are included so a run can be reproduced.
        json_out_path = output_dir / f"{dt_str}.json"
        json_out_path.write_text(json.dumps(dict(
            prompt=prompt,
            batch_size=batch_size,
            duration=duration,
            use_sampling=use_sampling,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            cfg_coef=cfg_coef,
            seed=seed,
        )))

        # Fixed-length output: slot 0 is the input audio, slots 1.. are the
        # generated samples, unused slots stay None.
        to_return = [None] * (self.max_batch_size + 1)
        if has_input_audio:
            print(f"trying to return input audio wavform of shape: {input_audio_wavform.shape}")
            to_return[0] = (sr, input_audio_wavform[0].T.numpy())

        for i in range(batch_size):
            to_return[i + 1] = (self.model.sample_rate, wav[i].T.numpy())
            print(wav[i].shape)
        return to_return


_description = """\
Hum an idea ➡️ get an AI generated music sample. Check out the model [here](https://huggingface.co/nateraw/musicgen-songstarter-v0.2) and the source code [here](https://github.com/nateraw/singing-songstarter).

The input audio will be pitch corrected unless you set `scale` to `"none"`. Set `scale` to `"closest"` to correct to nearest note (if unsure, use this). \
Ideally, you figure out what key you're singing in and set `scale` to that, so it corrects to only notes in that scale. \
It is incredibly important the audio passed to the model (which you'll get back as the first output) is clean in order to get good results. 🗑 in = 🗑 out.

Enjoy ❤️"""


def main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=False, debug=False):
    """
    Build the Gradio interface around a `Pipeline` and launch it.

    Args:
        model_id: Model identifier handed to `Pipeline`.
        max_batch_size: Maximum batch size; also the number of generated-audio
            output widgets.
        share: Forwarded to `interface.launch`.
        debug: Forwarded to `interface.launch`.
    """
    pipeline = Pipeline(model_id, max_batch_size)
    # The order of `inputs` must match the positional parameters of
    # `Pipeline.__call__`.
    interface = gr.Interface(
        fn=pipeline.__call__,
        inputs=[
            gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", value="synth, hip hop, melody, dark"),
            gr.Audio(
                sources=["microphone", "upload"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
                type="filepath",
            ),
            gr.Dropdown(["closest", "none", "A:maj", "A:min", "Bb:maj", "Bb:min", "B:maj", "B:min", "C:maj", "C:min", "Db:maj", "Db:min", "D:maj", "D:min", "Eb:maj", "Eb:min", "E:maj", "E:min", "F:maj", "F:min", "Gb:maj", "Gb:min", "G:maj", "G:min", "Ab:maj", "Ab:min"], label="Scale for pitch correction. Set to 'closest' if you don't know.", value="closest"),
            gr.Checkbox(label="Is Continuation", value=False),
            gr.Slider(label="Batch Size", value=1, minimum=1, maximum=pipeline.max_batch_size, step=1),
            gr.Slider(label="Duration", value=15, minimum=4, maximum=30),
            gr.Checkbox(label="Use Sampling", value=True),
            gr.Slider(label="Temperature", value=1.0, minimum=0.0, maximum=2.0),
            gr.Slider(label="Top K", value=250, minimum=0, maximum=1000),
            gr.Slider(label="Top P", value=0.0, minimum=0.0, maximum=1.0),
            gr.Slider(label="CFG Coef", value=3.0, minimum=0.0, maximum=10.0),
            gr.Textbox(label="Output Dir", value="./samples"),
            gr.Dropdown(["loudness", "clip", "peak", "rms"], value="loudness", label="Strategy for normalizing audio."),
            gr.Slider(label="random seed", minimum=-1, maximum=9e8),
        ],
        outputs=[gr.Audio(label=("Input " if i == 0 else "") + f"Audio {i}") for i in range(pipeline.max_batch_size + 1)],
        title="🎶 Generate song ideas with musicgen-songstarter-v0.2 🎶",
        description=_description,
        examples=[
            ["synth, dark, hip hop, melody, trap", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["music, mallets, bells, melody, dancehall, african, afropop & afrobeats", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 4.5, "./samples", "loudness", -1],
        ],
        cache_examples=False
    )
    interface.launch(share=share, debug=debug)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)

    # For testing
    # pipe = Pipeline("nateraw/musicgen-songstarter-v0.2", max_batch_size=4)
    # example_input = (
    #     "hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm",
    #     "nate_is_humming.wav",
    #     "closest",
    #     False,
    #     1,
    #     8,
    #     True,
    #     1.0,
    #     250,
    #     0.0,
    #     3.0,
    #     "./samples",
    #     "loudness",
    #     -1,
    #     0,
    #     None
    # )
    # out = pipe(*example_input)