"""Inference endpoint handler for melody-conditioned MusicGen generation."""

import io
import math
from typing import Any, Dict, List, Optional, Union

import numpy as np
import soundfile as sf
import torch
import torchaudio
import yaml
from audiocraft.models import MusicGen
from audiocraft.utils.notebook import display_audio
from datasets import load_dataset
from transformers import AutoProcessor, MusicgenForConditionalGeneration


def get_bip_bip(
    bip_duration=0.125, frequency=440, duration=0.5, sample_rate=32000, device="cuda"
):
    """Generate a series of "bip bip" tones at the given frequency.

    Args:
        bip_duration: Length in seconds of each half of the on/off cycle.
        frequency: Frequency of the cosine tone, in Hz.
        duration: Total length of the signal, in seconds.
        sample_rate: Number of samples per second.
        device: Torch device on which the signal is created.

    Returns:
        A float tensor of shape ``(1, int(duration * sample_rate))`` holding
        the tone multiplied by a 0/1 square envelope.
    """
    # Time axis in seconds. Built on the requested device; the previous
    # version ignored ``device`` and always allocated on "cuda".
    t = (
        torch.arange(int(duration * sample_rate), device=device, dtype=torch.float)
        / sample_rate
    )
    # The previous version ignored ``frequency`` and always used 440 Hz.
    wav = torch.cos(2 * math.pi * frequency * t)[None]
    # Position inside the current on/off cycle, normalised to [0, 1).
    tp = (t % (2 * bip_duration)) / (2 * bip_duration)
    # The tone is audible during the second half of every cycle.
    envelope = (tp >= 0.5).float()
    return wav * envelope


def load_conf(conf):
    """Load a YAML configuration file.

    Args:
        conf: Path of the YAML file to read.

    Returns:
        The parsed YAML document (whatever ``yaml.safe_load`` returns).
    """
    with open(conf, "r", encoding="utf-8") as f:
        # ``yaml.safeload`` does not exist; the PyYAML function is ``safe_load``.
        return yaml.safe_load(f)


class generator:
    """Wrapper around an audiocraft ``MusicGen`` model configured from YAML."""

    def __init__(self, conf_file):
        """Load the configuration, the processor and the MusicGen model.

        The configuration is read with the following keys::

            model             name passed to ``AutoProcessor.from_pretrained``
                              and ``MusicGen.get_pretrained``
            duration          generation duration given to
                              ``set_generation_params``
            sampling_rate     default sample rate of melody prompts
            nth_slice_prompt  divisor used by ``preprocess``

        Args:
            conf_file: Path of the YAML configuration file.
        """
        self.conf = load_conf(conf_file)
        self.processor = AutoProcessor.from_pretrained(self.conf["model"])

        # The audiocraft MusicGen wrapper is not an ``nn.Module`` and has no
        # ``.to()``; the target device is chosen when the model is loaded.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = MusicGen.get_pretrained(self.conf["model"], device=device)
        self.model.set_generation_params(
            use_sampling=True,
            top_k=250,
            duration=self.conf["duration"],
        )
        # audiocraft exposes the output rate as ``sample_rate``;
        # ``config.audio_encoder.sampling_rate`` is the transformers API.
        self.sampling_rate = self.model.sample_rate

    def preprocess(self, text, audio):
        """Trim the audio prompt to its first ``1 / nth_slice_prompt`` part.

        Args:
            text: Text prompt(s); returned unchanged.
            audio: Sliceable audio prompt. It is cut along its first axis
                (assumed to be the sample axis -- TODO confirm with callers).

        Returns:
            A ``(text, audio)`` tuple with the trimmed audio. The previous
            version computed the slice but returned ``None``.
        """
        audio = audio[: int(len(audio) // self.conf["nth_slice_prompt"])]
        return text, audio

    def generate(
        self,
        text: Union[str, List[str]],
        audio: Union[np.ndarray, torch.Tensor],
        sampling_rate: Optional[int] = None,
        **kwargs,
    ):
        """Generate audio conditioned on text and a melody prompt.

        Args:
            text: One description or a list of descriptions, e.g.
                ``["modern melodic electronic dance music",
                "80s blues track with groovy saxophone"]``.
            audio: Melody prompt. A 1-D input is treated as a single mono
                channel; a 2-D input is expected as ``(channels, samples)``
                and is shared by every description; a 3-D input is passed on
                as a batch ``(batch, channels, samples)``.
            sampling_rate: Sample rate of ``audio``. Defaults to
                ``conf["sampling_rate"]`` when omitted.
            **kwargs: Accepted for backward compatibility; currently ignored.

        Returns:
            Whatever ``MusicGen.generate_with_chroma`` returns for the inputs.

        Raises:
            ValueError: If ``audio`` has more than three dimensions.
        """
        # A bare string is one description; a list is used as-is. The previous
        # version wrapped an already-list ``text`` in a second list.
        descriptions = [text] if isinstance(text, str) else list(text)

        # audiocraft requires torch tensors for the melody, not NumPy arrays.
        melody = torch.as_tensor(audio, dtype=torch.float32)
        if melody.dim() == 1:
            melody = melody[None]

        if melody.dim() == 2:
            # generate_with_chroma needs one melody per description.
            melody_wavs = [melody] * len(descriptions)
        elif melody.dim() == 3:
            melody_wavs = melody
        else:
            raise ValueError(
                f"audio must have 1, 2 or 3 dimensions, got {melody.dim()}"
            )

        if sampling_rate is None:
            sampling_rate = self.conf["sampling_rate"]

        return self.model.generate_with_chroma(
            descriptions=descriptions,
            melody_wavs=melody_wavs,
            melody_sample_rate=sampling_rate,
            progress=True,
        )


class EndpointHandler:
    """Callable handler turning a request payload into generated audio."""

    def __init__(self, path=""):
        """Load the processor, the transformers model and the generator.

        Args:
            path: Directory (or model id) passed to ``from_pretrained``.
        """
        # Fall back to CPU (and full precision) when no GPU is available
        # instead of failing on a hard-coded "cuda".
        device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if device == "cuda" else torch.float32

        # load model and processor from path
        # NOTE(review): __call__ only uses ``self.generator``; these two
        # attributes are kept because other code may rely on them.
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = MusicgenForConditionalGeneration.from_pretrained(
            path, torch_dtype=dtype
        ).to(device)
        self.generator = generator("conf.yaml")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate audio for one request.

        Args:
            data: The payload. ``"text"`` holds the prompt(s) and ``"audio"``
                the bytes of an encoded audio file readable by ``soundfile``.
                ``"parameters"`` is removed from the payload but not used.

        Returns:
            A one-element list ``[{"generated_audio": samples}]`` where
            ``samples`` is the squeezed model output as nested Python lists.

        Raises:
            ValueError: If ``"text"`` or ``"audio"`` is missing.
        """
        # process input
        text = data.pop("text", None)
        audio = data.pop("audio", None)
        # Accepted for API compatibility; generation parameters are not used.
        data.pop("parameters", None)
        if text is None or audio is None:
            raise ValueError('payload must contain both "text" and "audio"')

        # always_2d gives (frames, channels) even for mono files, so a single
        # transpose yields the (channels, frames) layout the generator expects.
        audio, sr = sf.read(io.BytesIO(audio), dtype="float32", always_2d=True)

        # The model lives on ``self.generator``; pass the file's real sample
        # rate so the melody is interpreted correctly.
        output = self.generator.generate(text, audio.T, sampling_rate=sr)

        # postprocess the prediction
        prediction = output.squeeze().cpu().numpy().tolist()
        return [{"generated_audio": prediction}]