import os
import subprocess

import torch
import gradio as gr
from tqdm import tqdm
from unidecode import unidecode
from transformers import AutoTokenizer

from utils import *
description = """
Bark is a universal text-to-audio model created by [Suno](https://www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark). \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
This demo should be used for research purposes only. Commercial use is strictly prohibited. \
The model output is not censored and the authors do not endorse the opinions in the generated content. \
Use at your own risk.
"""

article = """
## 🌎 Foreign Language

Bark supports various languages out-of-the-box and automatically determines the language from the input text. \
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.

Try the prompt:

```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```

## 🤭 Non-Speech Sounds

Below is a list of some known non-speech sounds, but we are finding more every day. \
Please let us know on Discord if you find patterns that work particularly well!

* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker

Try the prompt:

```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```

## 🎶 Music

Bark can generate all types of audio and, in principle, doesn't see a difference between speech and music. \
Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.

Try the prompt:

```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```

## 🧬 Voice Cloning

Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
The model also attempts to preserve music, ambient noise, etc. from the input audio. \
However, to mitigate misuse of this technology, we restrict the audio history prompts to a limited set of fully synthetic options provided by Suno.

## 👥 Speaker Prompts

You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
Please note that these are not always respected, especially if a conflicting audio history prompt is given.

Try the prompt:

```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```

## Details

Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
"""
""" CLAMP_MODEL_NAME = 'clamp-small-512' QUERY_MODAL = 'music' KEY_MODAL = 'text' TOP_N = 1 TEXT_MODEL_NAME = 'distilroberta-base' TEXT_LENGTH = 128 device = torch.device("cpu") # load CLaMP model model = CLaMP.from_pretrained(CLAMP_MODEL_NAME) music_length = model.config.max_length model = model.to(device) model.eval() # initialize patchilizer, tokenizer, and softmax patchilizer = MusicPatchilizer() tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME) softmax = torch.nn.Softmax(dim=1) def compute_values(Q_e, K_e, t=1): """ Compute the values for the attention matrix Args: Q_e (torch.Tensor): Query embeddings K_e (torch.Tensor): Key embeddings t (float): Temperature for the softmax Returns: values (torch.Tensor): Values for the attention matrix """ # Normalize the feature representations Q_e = torch.nn.functional.normalize(Q_e, dim=1) K_e = torch.nn.functional.normalize(K_e, dim=1) # Scaled pairwise cosine similarities [1, n] logits = torch.mm(Q_e, K_e.T) * torch.exp(torch.tensor(t)) values = softmax(logits) return values.squeeze() def encoding_data(data, modal): """ Encode the data into ids Args: data (list): List of strings modal (str): "music" or "text" Returns: ids_list (list): List of ids """ ids_list = [] if modal=="music": for item in data: patches = patchilizer.encode(item, music_length=music_length, add_eos_patch=True) ids_list.append(torch.tensor(patches).reshape(-1)) else: for item in data: text_encodings = tokenizer(item, return_tensors='pt', truncation=True, max_length=TEXT_LENGTH) ids_list.append(text_encodings['input_ids'].squeeze(0)) return ids_list def abc_filter(lines): """ Filter out the metadata from the abc file Args: lines (list): List of lines in the abc file Returns: music (str): Music string """ music = "" for line in lines: if line[:2] in ['A:', 'B:', 'C:', 'D:', 'F:', 'G', 'H:', 'N:', 'O:', 'R:', 'r:', 'S:', 'T:', 'W:', 'w:', 'X:', 'Z:'] \ or line=='\n' \ or (line.startswith('%') and not line.startswith('%%score')): continue else: if "%" in line and not line.startswith('%%score'): line = "%".join(line.split('%')[:-1]) music += line[:-1] + '\n' else: music += line + '\n' return music def load_music(filename): """ Load the music from the xml file Args: file (Union[str, bytes, BinaryIO, TextIO]): Input file object containing the xml file Returns: music (str): Music string """ # Get absolute path of xml2abc.py script_dir = os.path.dirname(os.path.abspath(__file__)) xml2abc_path = os.path.join(script_dir, 'xml2abc.py') # Use absolute path in Popen() p = subprocess.Popen(['python', xml2abc_path, '-m', '2', '-c', '6', '-x', filename], stdout=subprocess.PIPE) result = p.communicate()[0] output = result.decode('utf-8').replace('\r', '') music = unidecode(output).split('\n') music = abc_filter(music) return music def get_features(ids_list, modal): """ Get the features from the CLaMP model Args: ids_list (list): List of ids modal (str): "music" or "text" Returns: features_list (torch.Tensor): Tensor of features with a shape of (batch_size, hidden_size) """ features_list = [] print("Extracting "+modal+" features...") with torch.no_grad(): for ids in tqdm(ids_list): ids = ids.unsqueeze(0) if modal=="text": masks = torch.tensor([1]*len(ids[0])).unsqueeze(0) features = model.text_enc(ids.to(device), attention_mask=masks.to(device))['last_hidden_state'] features = model.avg_pooling(features, masks) features = model.text_proj(features) else: masks = torch.tensor([1]*(int(len(ids[0])/PATCH_LENGTH))).unsqueeze(0) features = model.music_enc(ids, masks)['last_hidden_state'] 
def get_features(ids_list, modal):
    """
    Get the features from the CLaMP model

    Args:
        ids_list (list): List of ids
        modal (str): "music" or "text"

    Returns:
        features_list (torch.Tensor): Tensor of features with a shape of (batch_size, hidden_size)
    """
    features_list = []
    print("Extracting " + modal + " features...")
    with torch.no_grad():
        for ids in tqdm(ids_list):
            ids = ids.unsqueeze(0)
            if modal == "text":
                masks = torch.tensor([1] * len(ids[0])).unsqueeze(0)
                features = model.text_enc(ids.to(device), attention_mask=masks.to(device))['last_hidden_state']
                features = model.avg_pooling(features, masks)
                features = model.text_proj(features)
            else:
                masks = torch.tensor([1] * (int(len(ids[0]) / PATCH_LENGTH))).unsqueeze(0)
                features = model.music_enc(ids, masks)['last_hidden_state']
                features = model.avg_pooling(features, masks)
                features = model.music_proj(features)
            features_list.append(features[0])
    return torch.stack(features_list).to(device)


def zero_shot_music_classification(file, class1, class2, class3, class4, class5,
                                   class6, class7, class8, class9, class10):
    """
    Classify music based on the given classes

    Args:
        file (Union[str, bytes, BinaryIO, TextIO]): Input file object containing the MusicXML file
        class1..class10 (str): Text descriptions of the candidate classes (empty entries are ignored)

    Returns:
        results (dict): Mapping from class description to predicted probability
    """
    query = load_music(file.name)

    # encode query
    query_ids = encoding_data([query], QUERY_MODAL)
    query_feature = get_features(query_ids, QUERY_MODAL)

    keys = [class1, class2, class3, class4, class5, class6, class7, class8, class9, class10]
    keys = [key for key in keys if key != '']
    key_features = get_features(encoding_data(keys, KEY_MODAL), KEY_MODAL)

    # compute values
    values = compute_values(query_feature, key_features)
    idxs = torch.argsort(values, descending=True)

    results = {}
    for i in range(len(idxs)):
        results[keys[idxs[i]]] = values[idxs[i]].item()
    return results


input_file = gr.inputs.File(label="Upload MusicXML file")
input_class1 = gr.inputs.Textbox(label="Class 1", placeholder="Description of class 1")
input_class2 = gr.inputs.Textbox(label="Class 2", placeholder="Description of class 2")
input_class3 = gr.inputs.Textbox(label="Class 3", placeholder="Description of class 3")
input_class4 = gr.inputs.Textbox(label="Class 4", placeholder="Description of class 4")
input_class5 = gr.inputs.Textbox(label="Class 5", placeholder="Description of class 5")
input_class6 = gr.inputs.Textbox(label="Class 6", placeholder="Description of class 6")
input_class7 = gr.inputs.Textbox(label="Class 7", placeholder="Description of class 7")
input_class8 = gr.inputs.Textbox(label="Class 8", placeholder="Description of class 8")
input_class9 = gr.inputs.Textbox(label="Class 9", placeholder="Description of class 9")
input_class10 = gr.inputs.Textbox(label="Class 10", placeholder="Description of class 10")

# output labels with their probabilities
output_class = gr.outputs.Label(num_top_classes=10, label="Predicted Results")

gr.Interface(zero_shot_music_classification,
             inputs=[input_file, input_class1, input_class2, input_class3, input_class4,
                     input_class5, input_class6, input_class7, input_class8, input_class9,
                     input_class10],
             outputs=output_class,
             title="🗜️ CLaMP: Zero-Shot Music Classification",
             description="Upload a MusicXML file and get the class of the music",
             article=article).launch()
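
# Minimal sketch of querying the classifier programmatically, without the Gradio UI.
# The file path 'example.musicxml' and the label strings are hypothetical; the block is
# commented out so the demo's behaviour is unchanged.
#
# query_ids = encoding_data([load_music('example.musicxml')], QUERY_MODAL)
# query_feature = get_features(query_ids, QUERY_MODAL)
# labels = ['classical', 'jazz', 'folk']
# key_features = get_features(encoding_data(labels, KEY_MODAL), KEY_MODAL)
# probs = compute_values(query_feature, key_features)
# for label, p in sorted(zip(labels, probs.tolist()), key=lambda x: -x[1]):
#     print(f"{label}: {p:.3f}")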