"""Gradio demo that generates ABC-notation tunes with the TunesFormer model."""

import functools
import random

import gradio as gr
import torch
from transformers import GPT2LMHeadModel
from unidecode import unidecode

from samplings import top_p_sampling, temperature_sampling

device = torch.device("cpu")

# NOTE(review): `description` and `article` describe the Bark text-to-audio
# model, while this app is titled "TunesFormer" below -- the text looks like it
# was carried over from another demo. Confirm and replace with TunesFormer copy.
description = """
Bark is a universal text-to-audio model created by [Suno](www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark). \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
This demo should be used for research purposes only. Commercial use is strictly prohibited. \
The model output is not censored and the authors do not endorse the opinions in the generated content. \
Use at your own risk.
"""

article = """

## 🌎 Foreign Language

Bark supports various languages out-of-the-box and automatically determines language from input text. \
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.

Try the prompt:

```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```

## 🤭 Non-Speech Sounds

Below is a list of some known non-speech sounds, but we are finding more every day. \
Please let us know if you find patterns that work particularly well on Discord!

* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker

Try the prompt:

```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```

## 🎶 Music

Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.

Try the prompt:

```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```

## 🧬 Voice Cloning

Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
The model also attempts to preserve music, ambient noise, etc. from input audio. \
However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.

## 👥 Speaker Prompts

You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
Please note that these are not always respected, especially if a conflicting audio history prompt is given.

Try the prompt:

```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```

## Details

Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).

"""

# examples = [
#     "Jazz standard in Minor key with a swing feel.",
#     "Jazz standard in Major key with a fast tempo.",
#     "Jazz standard in Blues form with a soulfoul melody.",
#     "a painting of a starry night with the moon in the sky",
#     "a green field with a blue sky and clouds",
#     "a beach with a castle on top of it"
# ]


class ABCTokenizer():
    """Character-level tokenizer with extra single-id control-code tokens.

    Ids below 128 are the code points of individual characters (0, 2 and 3
    are reserved as pad/bos/eos). Ids from 128 upward index into
    ``merged_tokens``: ``[SECS_1]``..``[SECS_8]``, ``[BARS_1]``..``[BARS_32]``
    and ``[SIM_0]``..``[SIM_10]``.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3
        self.merged_tokens = (
            ['[SECS_' + str(i) + ']' for i in range(1, 9)]
            + ['[BARS_' + str(i) + ']' for i in range(1, 33)]
            + ['[SIM_' + str(i) + ']' for i in range(11)]
        )

    def __len__(self):
        """Return the vocabulary size (128 character ids + merged tokens)."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Tokenize ``text`` into ``input_ids`` and an all-ones ``attention_mask``.

        Returns:
            dict with two 1-D integer tensors of equal length; ``input_ids``
            starts with the bos id and ends with the eos id.
        """
        input_ids = torch.tensor(self.txt2ids(text, self.merged_tokens))
        return {
            'input_ids': input_ids,
            'attention_mask': torch.ones_like(input_ids),
        }

    def decode(self, ids, skip_special_tokens=False):
        """Turn a sequence of ids back into text.

        Args:
            ids: iterable of ints (or integer 0-d tensors).
            skip_special_tokens: when True, merged control-code tokens
                (ids >= 128) are omitted from the output.

        Returns:
            The decoded string. The bos and eos ids are always dropped.
        """
        pieces = []
        for raw_id in ids:
            # int() accepts both Python ints and 0-d integer tensors.
            token_id = int(raw_id)
            if token_id >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[token_id - 128])
            elif token_id not in (self.bos_token_id, self.eos_token_id):
                pieces.append(chr(token_id))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in bos/eos.

        Each character becomes its code point; every occurrence of a string in
        ``merged_tokens`` is collapsed into the single id ``128 + index``.

        Args:
            text: the string to tokenize (may be empty).
            merged_tokens: ordered list of multi-character tokens.

        Returns:
            ``[bos] + ids + [eos]`` as a list of ints.
        """
        if not text:
            # ''.split(' ') yields [''], which cannot be parsed as an id.
            return [self.bos_token_id, self.eos_token_id]

        # Work on a space-separated string of quoted code points so that a
        # plain str.replace can only match whole ids, never digit fragments.
        txt_ids = ' '.join("\"" + str(ord(c)) + "\"" for c in text)
        for t_idx, token in enumerate(merged_tokens):
            token_txt_ids = ' '.join("\"" + str(ord(c)) + "\"" for c in token)
            txt_ids = txt_ids.replace(token_txt_ids, "\"" + str(t_idx + 128) + "\"")

        # Strip the surrounding quotes from each id before converting.
        body = [int(quoted[1:-1]) for quoted in txt_ids.split(' ')]
        return [self.bos_token_id] + body + [self.eos_token_id]


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the 'sander-wood/tunesformer' checkpoint once and reuse it."""
    return GPT2LMHeadModel.from_pretrained('sander-wood/tunesformer').to(device)


def _parse_seed(seed):
    """Return ``seed`` as an int, or None when it is not an integer value."""
    try:
        return int(seed)
    except (TypeError, ValueError):
        return None


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample ``num_tunes`` tunes from the model, token by token.

    Args:
        control_codes: control-code string placed before ``prefix``.
        prefix: text the generation continues from (may be empty).
        num_tunes: number of tunes to generate.
        max_length: maximum number of tokens sampled per tune.
        top_p: forwarded to ``top_p_sampling``.
        temperature: forwarded to ``temperature_sampling``.
        seed: anything ``int()`` accepts for reproducible sampling; any other
            value (e.g. the string "None") means unseeded.

    Returns:
        All tunes concatenated, each as ``"X:<n>\\n<text>"`` followed by a
        blank line. A tune that reaches ``max_length`` before the
        end-of-sequence token is returned truncated rather than dropped.
    """
    seed = _parse_seed(seed)
    # UI sliders may deliver floats; range() needs ints.
    num_tunes = int(num_tunes)
    max_length = int(max_length)

    prefix = unidecode(control_codes + prefix)
    tokenizer = ABCTokenizer()
    model = _load_model()

    if prefix:
        # Drop the trailing eos so the model continues the prefix.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])

    random.seed(seed)
    tunes = []

    for c_idx in range(num_tunes):
        header = "X:" + str(c_idx + 1) + "\n"
        print("\n" + header, end="")
        print(tokenizer.decode(ids[1:], skip_special_tokens=True), end="")
        input_ids = ids.unsqueeze(0)

        for _ in range(max_length):
            if seed is not None:
                # Derive a fresh per-step seed from the seeded RNG stream.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                logits = model(input_ids=input_ids.to(device)).logits[0][-1]
                probs = torch.softmax(logits, dim=-1).cpu().numpy()

            sampled_id = temperature_sampling(
                probs=top_p_sampling(probs, top_p=top_p, seed=n_seed, return_probs=True),
                seed=n_seed,
                temperature=temperature,
            )
            input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)

            if sampled_id == tokenizer.eos_token_id:
                break
            print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

        # Record the tune whether it ended on eos or ran out of token budget.
        tune = header + tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)
        tunes.append(tune + "\n\n")
        print("\n")

    return "".join(tunes)


# NOTE(review): `gr.inputs` / `gr.outputs` and the `default=` keyword belong to
# an older Gradio API; left untouched because the pinned Gradio version is not
# visible from this file.
input_control_codes = gr.inputs.Textbox(lines=5, label="Control Codes", default="[SECS_2][BARS_9][SIM_3][BARS_9]")
input_prefix = gr.inputs.Textbox(lines=5, label="Prefix", default="L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | \"D\"")
input_num_tunes = gr.inputs.Slider(minimum=1, maximum=10, step=1, default=1, label="Number of Tunes")
input_max_length = gr.inputs.Slider(minimum=10, maximum=1000, step=10, default=500, label="Max Length")
input_top_p = gr.inputs.Slider(minimum=0.0, maximum=1.0, step=0.05, default=0.9, label="Top P")
# NOTE(review): a temperature of 0.0 is selectable here; confirm that
# temperature_sampling handles it.
input_temperature = gr.inputs.Slider(minimum=0.0, maximum=2.0, step=0.1, default=1.0, label="Temperature")
input_seed = gr.inputs.Textbox(lines=1, label="Seed (int)", default="None")
output_abc = gr.outputs.Textbox(label="Generated Tunes")

gr.Interface(
    generate_abc,
    [
        input_control_codes,
        input_prefix,
        input_num_tunes,
        input_max_length,
        input_top_p,
        input_temperature,
        input_seed,
    ],
    output_abc,
    title="TunesFormer: Forming Tunes with Control Codes",
    description=description,
    article=article,
).launch()