Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
import random | |
from unidecode import unidecode | |
from transformers import GPT2LMHeadModel | |
from samplings import top_p_sampling, temperature_sampling | |
# Torch device that the model and its input tensors are moved to (CPU).
device = torch.device("cpu")
# Markdown/HTML blurb passed as `description=` to gr.Interface.
# NOTE(review): the badges and text describe Suno's Bark text-to-audio model,
# while the Interface in this file is titled "TunesFormer" - this looks copied
# from another demo; confirm whether it should be rewritten for this Space.
description = """
<div>
<a style="display:inline-block" href='https://github.com/suno-ai/bark'><img src='https://img.shields.io/github/stars/suno-ai/bark?style=social' /></a>
<a style='display:inline-block' href='https://discord.gg/J2B2vsjKuE'><img src='https://dcbadge.vercel.app/api/server/J2B2vsjKuE?compact=true&style=flat' /></a>
<a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/suno/bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
</div>
Bark is a universal text-to-audio model created by [Suno](https://suno.ai/), with code publicly available [here](https://github.com/suno-ai/bark). \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
This demo should be used for research purposes only. Commercial use is strictly prohibited. \
The model output is not censored and the authors do not endorse the opinions in the generated content. \
Use at your own risk.
"""
# Markdown passed as `article=` to gr.Interface.
# NOTE(review): the text documents prompting tips for Bark (speech, non-speech
# sounds, voice cloning), not the ABC-notation tune generator implemented in
# this file - confirm it is intended for this Space.
article = """
## 🌎 Foreign Language
Bark supports various languages out-of-the-box and automatically determines language from input text. \
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
Try the prompt:
```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```
## 🤭 Non-Speech Sounds
Below is a list of some known non-speech sounds, but we are finding more every day. \
Please let us know if you find patterns that work particularly well on Discord!
* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker
Try the prompt:
```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```
## 🎶 Music
Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
Try the prompt:
```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```
## 🧬 Voice Cloning
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
The model also attempts to preserve music, ambient noise, etc. from input audio. \
However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
## 👥 Speaker Prompts
You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
Please note that these are not always respected, especially if a conflicting audio history prompt is given.
Try the prompt:
```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```
## Details
Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
"""
# examples = [ | |
# "Jazz standard in Minor key with a swing feel.", | |
# "Jazz standard in Major key with a fast tempo.", | |
# "Jazz standard in Blues form with a soulful melody.",
# "a painting of a starry night with the moon in the sky", | |
# "a green field with a blue sky and clouds", | |
# "a beach with a castle on top of it" | |
# ] | |
class ABCTokenizer():
    """Character-level tokenizer with merged control-code tokens.

    Every character of the text is mapped to its code point; ids 0, 2 and 3
    are used as pad, bos and eos.  Ids from 128 upward index
    ``merged_tokens``: ``[SECS_1]``..``[SECS_8]``, ``[BARS_1]``..``[BARS_32]``
    and ``[SIM_0]``..``[SIM_10]`` (in that order).

    NOTE: a character whose code point is >= 128 would be indistinguishable
    from a merged-token id when decoding, so input should be ASCII.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3
        # The list position of a token plus 128 is its id.
        self.merged_tokens = (
            ['[SECS_' + str(i + 1) + ']' for i in range(8)]
            + ['[BARS_' + str(i + 1) + ']' for i in range(32)]
            + ['[SIM_' + str(i) + ']' for i in range(11)]
        )

    def __len__(self):
        """Return the vocabulary size: 128 character ids plus merged tokens."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Tokenize ``text`` into tensors.

        Returns a dict with ``input_ids`` (bos + token ids + eos) and an
        all-ones ``attention_mask`` of the same length.
        """
        input_ids = torch.tensor(self.txt2ids(text, self.merged_tokens))
        return {
            'input_ids': input_ids,
            'attention_mask': torch.tensor([1] * len(input_ids)),
        }

    def decode(self, ids, skip_special_tokens=False):
        """Turn an iterable of ids back into text.

        bos/eos ids are always dropped.  Merged tokens (ids >= 128) are
        emitted verbatim unless ``skip_special_tokens`` is true, in which
        case they are dropped as well.
        """
        pieces = []
        for i in ids:
            if i >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[i - 128])
            elif i != self.bos_token_id and i != self.eos_token_id:
                pieces.append(chr(i))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in bos/eos.

        The text is first spelled out as space-separated quoted code points
        (e.g. ``"91" "83" ...``); each merged token's spelling is then
        replaced by its single quoted id.  The quotes keep a replacement
        from matching part of a longer number.
        """
        txt_ids = ' '.join('"' + str(ord(c)) + '"' for c in text)
        for t_idx, token in enumerate(merged_tokens):
            token_txt_ids = ' '.join('"' + str(ord(c)) + '"' for c in token)
            txt_ids = txt_ids.replace(token_txt_ids, '"' + str(t_idx + 128) + '"')
        # split() rather than split(' '): for empty text it yields [] instead
        # of [''], which previously crashed in int('').
        body = [int(piece[1:-1]) for piece in txt_ids.split()]
        return [self.bos_token_id] + body + [self.eos_token_id]
_MODEL_NAME = 'sander-wood/tunesformer'
_model_cache = {}


def _load_model():
    """Return the pretrained model, loading it only on the first call."""
    if _MODEL_NAME not in _model_cache:
        _model_cache[_MODEL_NAME] = GPT2LMHeadModel.from_pretrained(_MODEL_NAME).to(device)
    return _model_cache[_MODEL_NAME]


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample ``num_tunes`` tunes from the model, one token at a time.

    Args:
        control_codes: text prepended to ``prefix`` (e.g. ``[SECS_2][BARS_9]``).
        prefix: start of the tune that the model continues.
        num_tunes: number of tunes to generate (coerced to int).
        max_length: maximum number of tokens sampled per tune (coerced to int).
        top_p: forwarded to ``top_p_sampling``.
        temperature: forwarded to ``temperature_sampling``.
        seed: anything ``int()`` accepts makes sampling seeded; any other
            value (e.g. the string ``"None"``) leaves it unseeded.

    Returns:
        All tunes concatenated, each formatted as ``X:<n>\\n<text>\\n\\n``.
        Merged control-code tokens are omitted from the text.  A tune that
        reaches ``max_length`` without an end-of-sequence token is returned
        truncated rather than dropped.

    Progress is also echoed to stdout while sampling.
    """
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = None
    # UI sliders may deliver floats; range() needs ints.
    num_tunes = int(num_tunes)
    max_length = int(max_length)

    prefix = unidecode(control_codes + prefix)
    tokenizer = ABCTokenizer()
    model = _load_model()

    if prefix:
        # Drop the trailing eos so the model continues the prompt.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])

    random.seed(seed)
    tunes = []

    for c_idx in range(num_tunes):
        header = "X:" + str(c_idx + 1) + "\n"
        print("\n" + header, end="")
        print(tokenizer.decode(ids[1:], skip_special_tokens=True), end="")
        input_ids = ids.unsqueeze(0)

        for _ in range(max_length):
            if seed is not None:
                # Derive a fresh per-step seed from the seeded stream.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None

            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                outputs = model(input_ids=input_ids.to(device))
            probs = outputs.logits[0][-1]
            probs = torch.nn.Softmax(dim=-1)(probs).cpu().detach().numpy()
            sampled_id = temperature_sampling(
                probs=top_p_sampling(probs, top_p=top_p, seed=n_seed, return_probs=True),
                seed=n_seed,
                temperature=temperature,
            )
            input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)
            if sampled_id == tokenizer.eos_token_id:
                break
            print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

        # Reached on eos and on hitting max_length alike, so a truncated tune
        # is still returned.  input_ids[0] stays 1-D even for one token.
        tunes.append(header + tokenizer.decode(input_ids[0], skip_special_tokens=True) + "\n\n")
        print("\n")

    return "".join(tunes)
# Input widgets, declared in the same order as generate_abc's parameters.
input_control_codes = gr.inputs.Textbox(
    lines=5,
    label="Control Codes",
    default="[SECS_2][BARS_9][SIM_3][BARS_9]",
)
input_prefix = gr.inputs.Textbox(
    lines=5,
    label="Prefix",
    default="L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | \"D\"",
)
input_num_tunes = gr.inputs.Slider(
    minimum=1, maximum=10, step=1, default=1, label="Number of Tunes"
)
input_max_length = gr.inputs.Slider(
    minimum=10, maximum=1000, step=10, default=500, label="Max Length"
)
input_top_p = gr.inputs.Slider(
    minimum=0.0, maximum=1.0, step=0.05, default=0.9, label="Top P"
)
input_temperature = gr.inputs.Slider(
    minimum=0.0, maximum=2.0, step=0.1, default=1.0, label="Temperature"
)
# The seed is a free-text box; generate_abc decides whether it is an int.
input_seed = gr.inputs.Textbox(lines=1, label="Seed (int)", default="None")

output_abc = gr.outputs.Textbox(label="Generated Tunes")

# Build the web UI around generate_abc and start serving it.
gr.Interface(
    fn=generate_abc,
    inputs=[
        input_control_codes,
        input_prefix,
        input_num_tunes,
        input_max_length,
        input_top_p,
        input_temperature,
        input_seed,
    ],
    outputs=output_abc,
    title="TunesFormer: Forming Tunes with Control Codes",
    description=description,
    article=article,
).launch()