Spaces:
Runtime error
Runtime error
sander-wood
commited on
Commit
•
2c2a2e1
1
Parent(s):
60735e7
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import random
|
4 |
+
from unidecode import unidecode
|
5 |
+
from transformers import GPT2LMHeadModel
|
6 |
+
from samplings import top_p_sampling, temperature_sampling
|
7 |
+
|
8 |
+
# Torch device used for the model and its inputs in generate_abc().
device = torch.device("cpu")

# HTML/Markdown blurb passed to gr.Interface as `description`.
# NOTE(review): this text describes Suno's Bark text-to-audio demo, while the
# Interface at the bottom of this file is titled "TunesFormer: Forming Tunes
# with Control Codes" -- it looks like a leftover from a copied template.
# Confirm with the author and replace with TunesFormer-specific copy.
description = """
<div>
<a style="display:inline-block" href='https://github.com/suno-ai/bark'><img src='https://img.shields.io/github/stars/suno-ai/bark?style=social' /></a>
<a style='display:inline-block' href='https://discord.gg/J2B2vsjKuE'><img src='https://dcbadge.vercel.app/api/server/J2B2vsjKuE?compact=true&style=flat' /></a>
<a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/suno/bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
</div>
Bark is a universal text-to-audio model created by [Suno](https://suno.ai/), with code publicly available [here](https://github.com/suno-ai/bark). \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
This demo should be used for research purposes only. Commercial use is strictly prohibited. \
The model output is not censored and the authors do not endorse the opinions in the generated content. \
Use at your own risk.
"""
|
22 |
+
|
23 |
+
# Markdown passed to gr.Interface as `article`.
# NOTE(review): like `description`, this text documents Suno's Bark model
# (speech prompts, voice cloning, ...) rather than the TunesFormer model that
# generate_abc() actually loads -- presumably a template leftover; confirm
# and replace. The text itself is kept unchanged here.
article = """
## 🌎 Foreign Language
Bark supports various languages out-of-the-box and automatically determines language from input text. \
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
Try the prompt:
```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```
## 🤭 Non-Speech Sounds
Below is a list of some known non-speech sounds, but we are finding more every day. \
Please let us know if you find patterns that work particularly well on Discord!
* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker
Try the prompt:
```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```
## 🎶 Music
Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
Try the prompt:
```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```
## 🧬 Voice Cloning
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
The model also attempts to preserve music, ambient noise, etc. from input audio. \
However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
## 👥 Speaker Prompts
You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
Please note that these are not always respected, especially if a conflicting audio history prompt is given.
Try the prompt:
```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```
## Details
Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
"""
|
71 |
+
|
72 |
+
# examples = [
|
73 |
+
# "Jazz standard in Minor key with a swing feel.",
|
74 |
+
# "Jazz standard in Major key with a fast tempo.",
|
75 |
+
# "Jazz standard in Blues form with a soulful melody.",
|
76 |
+
# "a painting of a starry night with the moon in the sky",
|
77 |
+
# "a green field with a blue sky and clouds",
|
78 |
+
# "a beach with a castle on top of it"
|
79 |
+
# ]
|
80 |
+
|
81 |
+
class ABCTokenizer():
    """Character-level tokenizer with a few multi-character control tokens.

    Every ordinary character is encoded as its code point (``ord``). Each
    control-code string in ``merged_tokens`` (``[SECS_1]``..``[SECS_8]``,
    ``[BARS_1]``..``[BARS_32]``, ``[SIM_0]``..``[SIM_10]``) is encoded as a
    single id, numbered from 128 in list order. ``__len__`` therefore assumes
    plain characters stay below 128; callers are expected to pass ASCII text.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3

        # Order matters: a token's position in this list + 128 is its id.
        self.merged_tokens = (
            ['[SECS_' + str(i + 1) + ']' for i in range(8)]
            + ['[BARS_' + str(i + 1) + ']' for i in range(32)]
            + ['[SIM_' + str(i) + ']' for i in range(11)]
        )

    def __len__(self):
        """Return the vocabulary size: 128 character ids plus the merged tokens."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Encode ``text`` into a dict of 1-D tensors.

        Returns:
            ``{'input_ids': ..., 'attention_mask': ...}`` where ``input_ids``
            is wrapped in BOS/EOS and ``attention_mask`` is all ones with the
            same length.
        """
        encodings = {}
        encodings['input_ids'] = torch.tensor(self.txt2ids(text, self.merged_tokens))
        encodings['attention_mask'] = torch.tensor([1] * len(encodings['input_ids']))
        return encodings

    def decode(self, ids, skip_special_tokens=False):
        """Turn an iterable of ids (ints or 0-dim tensors) back into text.

        BOS and EOS ids are always dropped. Ids >= 128 are rendered as their
        control-code string unless ``skip_special_tokens`` is true, in which
        case they are dropped as well. Any other id becomes ``chr(id)``.
        """
        pieces = []
        for raw_id in ids:
            token_id = int(raw_id)  # accept tensor elements as well as ints
            if token_id >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[token_id - 128])
            elif token_id != self.bos_token_id and token_id != self.eos_token_id:
                pieces.append(chr(token_id))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in BOS/EOS.

        Characters become their code points; every occurrence of a string
        from ``merged_tokens`` is collapsed into the single id
        ``128 + index``. Empty ``text`` yields just ``[BOS, EOS]``.
        """
        if not text:
            # ''.split(' ') is [''], which the int() parse below cannot handle.
            return [self.bos_token_id, self.eos_token_id]

        # Work on a string of quoted code points ('"91" "83" ...') so that a
        # plain str.replace can only match on whole-id boundaries.
        quoted = ' '.join('"' + str(ord(c)) + '"' for c in text)
        for t_idx, token in enumerate(merged_tokens):
            pattern = ' '.join('"' + str(ord(c)) + '"' for c in token)
            quoted = quoted.replace(pattern, '"' + str(t_idx + 128) + '"')

        txt_ids = [int(item[1:-1]) for item in quoted.split(' ')]
        return [self.bos_token_id] + txt_ids + [self.eos_token_id]
|
125 |
+
|
126 |
+
# Lazily-loaded model shared by all requests; loading it per call is slow.
_TUNESFORMER_MODEL = None


def _get_model():
    """Return the TunesFormer model, loading it on first use."""
    global _TUNESFORMER_MODEL
    if _TUNESFORMER_MODEL is None:
        _TUNESFORMER_MODEL = GPT2LMHeadModel.from_pretrained('sander-wood/tunesformer').to(device)
    return _TUNESFORMER_MODEL


def _parse_seed(seed):
    """Normalise the seed coming from the UI.

    The seed Textbox delivers a string, so ``"None"`` (its default) and the
    empty string mean "unseeded" and digit strings become ints. Any other
    string is kept as-is, since ``random.seed`` accepts strings too.
    """
    if isinstance(seed, str):
        stripped = seed.strip()
        if not stripped or stripped.lower() == "none":
            return None
        try:
            return int(stripped)
        except ValueError:
            return stripped
    return seed


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample ``num_tunes`` tunes from the model and return them as one string.

    Args:
        control_codes: control-code string prepended to ``prefix``.
        prefix: text the tunes should start with.
        num_tunes: how many tunes to sample (coerced to int).
        max_length: maximum number of tokens sampled per tune (coerced to int).
        top_p: forwarded to ``top_p_sampling``.
        temperature: forwarded to ``temperature_sampling``.
        seed: ``None``/``"None"``/``""`` for unseeded sampling, otherwise a
            value used to seed ``random`` and the per-step sampling seeds.

    Returns:
        The tunes, each as ``"X:<n>\\n<text>"`` followed by a blank line.
        Control-code tokens are omitted from the text. A tune that reaches
        ``max_length`` without an EOS token is returned truncated.

    The tunes are also streamed to stdout while they are generated.
    """
    prefix = unidecode(control_codes + prefix)
    tokenizer = ABCTokenizer()
    model = _get_model()

    if prefix:
        # Drop the trailing EOS so the model continues the prefix.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])

    seed = _parse_seed(seed)
    random.seed(seed)
    tunes = ""

    # Inference only: do not record autograd history for each step.
    with torch.no_grad():
        for c_idx in range(int(num_tunes)):
            header = "X:" + str(c_idx + 1) + "\n"
            print("\n" + header, end="")
            print(tokenizer.decode(ids[1:], skip_special_tokens=True), end="")
            input_ids = ids.unsqueeze(0)

            for _ in range(int(max_length)):
                if seed is not None:
                    # Derive a fresh, reproducible seed for every sampling step.
                    n_seed = random.randint(0, 1000000)
                    random.seed(n_seed)
                else:
                    n_seed = None

                outputs = model(input_ids=input_ids.to(device))
                logits = outputs.logits[0][-1]
                probs = torch.nn.Softmax(dim=-1)(logits).cpu().detach().numpy()
                sampled_id = temperature_sampling(probs=top_p_sampling(probs,
                                                                       top_p=top_p,
                                                                       seed=n_seed,
                                                                       return_probs=True),
                                                  seed=n_seed,
                                                  temperature=temperature)
                input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)
                if sampled_id == tokenizer.eos_token_id:
                    break
                print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

            # Reached on EOS and on hitting max_length alike, so a tune that
            # never emitted EOS is still part of the returned text.
            tunes += header + tokenizer.decode(input_ids[0], skip_special_tokens=True) + "\n\n"
            print("\n")

    return tunes
|
171 |
+
|
172 |
+
# UI components, in the same order as generate_abc()'s parameters.
# Uses the top-level gr.Textbox / gr.Slider classes with `value=`; the older
# gr.inputs / gr.outputs namespaces (with `default=`) are deprecated.
input_control_codes = gr.Textbox(lines=5, label="Control Codes", value="[SECS_2][BARS_9][SIM_3][BARS_9]")
input_prefix = gr.Textbox(lines=5, label="Prefix", value="L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | \"D\"")
input_num_tunes = gr.Slider(minimum=1, maximum=10, step=1, value=3, label="Number of Tunes")
input_max_length = gr.Slider(minimum=10, maximum=1000, step=10, value=500, label="Max Length")
input_top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Top P")
input_temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.9, label="Temperature")
# Free-text seed; "None" is passed to generate_abc() as a string.
input_seed = gr.Textbox(lines=1, label="Seed", value="None")
output_abc = gr.Textbox(label="Generated Tunes")

# Build the demo and start serving it.
gr.Interface(generate_abc,
             [input_control_codes, input_prefix, input_num_tunes, input_max_length, input_top_p, input_temperature, input_seed],
             output_abc,
             title="TunesFormer: Forming Tunes with Control Codes",
             description=description,
             article=article).launch()
|