import subprocess
import os
import json

import torch
import gradio as gr
from tqdm import tqdm
from unidecode import unidecode
from transformers import AutoTokenizer

from utils import *
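# NOTE: `utils` is assumed to provide CLaMP, MusicPatchilizer, and PATCH_LENGTH,
# which are referenced below but not defined in this file.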

description = """
Bark is a universal text-to-audio model created by [Suno](https://www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark).
Bark can generate highly realistic, multilingual speech as well as other audio, including music, background noise, and simple sound effects.
This demo should be used for research purposes only. Commercial use is strictly prohibited.
The model output is not censored and the authors do not endorse the opinions in the generated content. Use at your own risk.
"""

article = """
## 🌎 Foreign Language

Bark supports various languages out-of-the-box and automatically determines the language from the input text.
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.

Try the prompt:

```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```

## 🤭 Non-Speech Sounds

Below is a list of some known non-speech sounds, but we are finding more every day. Please let us know on Discord if you find patterns that work particularly well!

* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker

Try the prompt:

```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```

## 🎶 Music

Bark can generate all types of audio and, in principle, doesn't see a difference between speech and music. Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.

Try the prompt:

```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```

## 🧬 Voice Cloning

Bark has the capability to fully clone voices, including tone, pitch, emotion, and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we restrict the audio history prompts to a limited set of fully synthetic, Suno-provided options.

## 👥 Speaker Prompts

You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. Please note that these are not always respected, especially if a conflicting audio history prompt is given.

Try the prompt:

```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```

## Details

Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC; see details on [GitHub](https://github.com/suno-ai/bark).
"""
""" examples = [ "Jazz standard in Minor key with a swing feel.", "Jazz standard in Major key with a fast tempo.", "Jazz standard in Blues form with a soulfoul melody.", "a painting of a starry night with the moon in the sky", "a green field with a blue sky and clouds", "a beach with a castle on top of it" ] CLAMP_MODEL_NAME = 'clamp-small-512' QUERY_MODAL = 'text' KEY_MODAL = 'music' TOP_N = 1 TEXT_MODEL_NAME = 'distilroberta-base' TEXT_LENGTH = 128 device = torch.device("cpu") # load CLaMP model model = CLaMP.from_pretrained(CLAMP_MODEL_NAME) music_length = model.config.max_length model = model.to(device) model.eval() # initialize patchilizer, tokenizer, and softmax patchilizer = MusicPatchilizer() tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME) softmax = torch.nn.Softmax(dim=1) def compute_values(Q_e, K_e, t=1): """ Compute the values for the attention matrix Args: Q_e (torch.Tensor): Query embeddings K_e (torch.Tensor): Key embeddings t (float): Temperature for the softmax Returns: values (torch.Tensor): Values for the attention matrix """ # Normalize the feature representations Q_e = torch.nn.functional.normalize(Q_e, dim=1) K_e = torch.nn.functional.normalize(K_e, dim=1) # Scaled pairwise cosine similarities [1, n] logits = torch.mm(Q_e, K_e.T) * torch.exp(torch.tensor(t)) values = softmax(logits) return values.squeeze() def encoding_data(data, modal): """ Encode the data into ids Args: data (list): List of strings modal (str): "music" or "text" Returns: ids_list (list): List of ids """ ids_list = [] if modal=="music": for item in data: patches = patchilizer.encode(item, music_length=music_length, add_eos_patch=True) ids_list.append(torch.tensor(patches).reshape(-1)) else: for item in data: text_encodings = tokenizer(item, return_tensors='pt', truncation=True, max_length=TEXT_LENGTH) ids_list.append(text_encodings['input_ids'].squeeze(0)) return ids_list def get_features(ids_list, modal): """ Get the features from the CLaMP model Args: ids_list (list): List of ids modal (str): "music" or "text" Returns: features_list (torch.Tensor): Tensor of features with a shape of (batch_size, hidden_size) """ features_list = [] print("Extracting "+modal+" features...") with torch.no_grad(): for ids in tqdm(ids_list): ids = ids.unsqueeze(0) if modal=="text": masks = torch.tensor([1]*len(ids[0])).unsqueeze(0) features = model.text_enc(ids.to(device), attention_mask=masks.to(device))['last_hidden_state'] features = model.avg_pooling(features, masks) features = model.text_proj(features) else: masks = torch.tensor([1]*(int(len(ids[0])/PATCH_LENGTH))).unsqueeze(0) features = model.music_enc(ids, masks)['last_hidden_state'] features = model.avg_pooling(features, masks) features = model.music_proj(features) features_list.append(features[0]) return torch.stack(features_list).to(device) def semantic_music_search(query): """ Semantic music search Args: query (str): Query string Returns: output (str): Search result """ with open(KEY_MODAL+"_key_cache_"+str(music_length)+".pth", 'rb') as f: key_cache = torch.load(f) # encode query query_ids = encoding_data([query], QUERY_MODAL) query_feature = get_features(query_ids, QUERY_MODAL) key_filenames = key_cache["filenames"] key_features = key_cache["features"] # compute values values = compute_values(query_feature, key_features) idx = torch.argsort(values)[-1] filename = key_filenames[idx].split('/')[-1][:-4] with open("wikimusictext.json", 'r') as f: wikimusictext = json.load(f) for item in wikimusictext: if item['title']==filename: # output = 
"Title:\n" + item['title']+'\n\n' # output += "Artist:\n" + item['artist']+ '\n\n' # output += "Genre:\n" + item['genre']+ '\n\n' # output += "Description:\n" + item['text']+ '\n\n' # output += "ABC notation:\n" + item['music']+ '\n\n' return item["title"], item["artist"], item["genre"], item["text"], item["music"] output_title = gr.outputs.Textbox(label="Title") output_artist = gr.outputs.Textbox(label="Artist") output_genre = gr.outputs.Textbox(label="Genre") output_description = gr.outputs.Textbox(label="Description") output_abc = gr.outputs.Textbox(label="ABC notation") gr.Interface( fn=semantic_music_search, inputs=gr.Textbox(lines=2, placeholder="Describe the music you want to search...", label="Query"), outputs=[output_title, output_artist, output_genre, output_description, output_abc], title="🗜️ CLaMP: Semantic Music Search", description=description, article=article, examples=examples).launch()