sander-wood committed on
Commit
2c2a2e1
1 Parent(s): 60735e7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -0
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import random
4
+ from unidecode import unidecode
5
+ from transformers import GPT2LMHeadModel
6
+ from samplings import top_p_sampling, temperature_sampling
7
+
8
# All inference in this app runs on the CPU.
device = torch.device("cpu")

# Markdown shown above the input widgets. It describes only what this app
# does: sampling ABC-notation tunes from the `sander-wood/tunesformer`
# checkpoint. (The text must not advertise an unrelated model.)
description = """
Generate tunes in ABC notation with the [TunesFormer](https://huggingface.co/sander-wood/tunesformer) model.
Enter control codes and an optional ABC prefix, choose the sampling settings, and the model continues the tune.
"""

# Markdown shown below the interface: a short reference for every input and
# for the layout of the generated text.
article = """
## Inputs
* **Control Codes**: bracketed tokens placed in front of the prefix. The tokenizer recognises `[SECS_1]` to `[SECS_8]`, `[BARS_1]` to `[BARS_32]` and `[SIM_0]` to `[SIM_10]`.
* **Prefix**: the beginning of a tune in ABC notation (for example the `L:`, `Q:`, `M:` and `K:` header fields followed by the first notes). Generation continues from it.
* **Number of Tunes**: how many tunes to sample from the same control codes and prefix.
* **Max Length**: the maximum number of tokens sampled for each tune.
* **Top P** and **Temperature**: parameters of the sampling step.
* **Seed**: value used to seed the random number generator.

Non-ASCII characters in the control codes and prefix are transliterated to ASCII before tokenization.
## Output
Each generated tune starts with an `X:` index line, and consecutive tunes are separated by a blank line.
## Details
Model weights: [sander-wood/tunesformer](https://huggingface.co/sander-wood/tunesformer).
"""
71
+
72
+ # examples = [
73
+ # "Jazz standard in Minor key with a swing feel.",
74
+ # "Jazz standard in Major key with a fast tempo.",
75
+ # "Jazz standard in Blues form with a soulful melody.",
76
+ # "a painting of a starry night with the moon in the sky",
77
+ # "a green field with a blue sky and clouds",
78
+ # "a beach with a castle on top of it"
79
+ # ]
80
+
81
class ABCTokenizer():
    """Character-level tokenizer with a small set of merged control-code tokens.

    Ids below 128 are the code points of individual characters. Ids from 128
    upward index ``merged_tokens``: id ``128 + k`` stands for
    ``merged_tokens[k]``. Ids 0, 2 and 3 are reserved as the pad, BOS and EOS
    markers.

    NOTE(review): characters whose code point is 128 or higher would collide
    with the merged-token id range; callers are presumably expected to pass
    ASCII-only text -- confirm against the call sites.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3

        # Order matters: a token's position in this list plus 128 is its id.
        self.merged_tokens = []
        self.merged_tokens.extend(f'[SECS_{n}]' for n in range(1, 9))
        self.merged_tokens.extend(f'[BARS_{n}]' for n in range(1, 33))
        self.merged_tokens.extend(f'[SIM_{n}]' for n in range(11))

    def __len__(self):
        """Return the vocabulary size: 128 character ids plus the merged tokens."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Tokenize ``text`` into tensors.

        Returns:
            A dict with ``'input_ids'`` (BOS + token ids + EOS) and an
            all-ones ``'attention_mask'`` of the same length.
        """
        input_ids = torch.tensor(self.txt2ids(text, self.merged_tokens))
        return {
            'input_ids': input_ids,
            'attention_mask': torch.tensor([1] * len(input_ids)),
        }

    def decode(self, ids, skip_special_tokens=False):
        """Convert a sequence of ids back to text.

        BOS and EOS ids are always dropped. Merged tokens (ids >= 128) are
        dropped as well when ``skip_special_tokens`` is true.

        Args:
            ids: Iterable of integer ids (plain ints or integer tensor
                elements).
            skip_special_tokens: Omit merged control-code tokens from the
                output.
        """
        pieces = []
        for raw_id in ids:
            # Normalise tensor/NumPy scalars so indexing and chr() get an int.
            token_id = int(raw_id)
            if token_id >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[token_id - 128])
            elif token_id != self.bos_token_id and token_id != self.eos_token_id:
                pieces.append(chr(token_id))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in BOS ... EOS.

        Every character is first written as its quoted code point (for
        example ``"91"``); each merged token's quoted sequence is then
        replaced by its single id ``128 + index``. Quoting keeps a replacement
        from matching inside a longer number.

        Args:
            text: The string to tokenize.
            merged_tokens: Multi-character tokens, in id order.

        Returns:
            ``[bos] + ids + [eos]``; just ``[bos, eos]`` for empty text.
        """
        if not text:
            # ''.split(' ') yields [''], which would make int('') raise below.
            return [self.bos_token_id, self.eos_token_id]

        quoted = ' '.join('"' + str(ord(c)) + '"' for c in text)
        for t_idx, token in enumerate(merged_tokens):
            pattern = ' '.join('"' + str(ord(c)) + '"' for c in token)
            quoted = quoted.replace(pattern, '"' + str(t_idx + 128) + '"')

        # Strip the surrounding quotes from every entry before converting.
        body = [int(item[1:-1]) for item in quoted.split(' ')]
        return [self.bos_token_id] + body + [self.eos_token_id]
125
+
126
# Process-wide cache so the checkpoint is loaded once instead of per request.
_TUNESFORMER_CACHE = {}


def _parse_seed(seed):
    """Normalise a seed coming from the UI: '' / 'None' -> None, digits -> int."""
    if seed is None:
        return None
    if isinstance(seed, str):
        text = seed.strip()
        if not text or text.lower() == "none":
            return None
        try:
            return int(text)
        except ValueError:
            # random.seed() accepts arbitrary strings, so keep it as a seed.
            return text
    return seed


def _load_tunesformer():
    """Return the TunesFormer model, loading it on first use only."""
    if "model" not in _TUNESFORMER_CACHE:
        _TUNESFORMER_CACHE["model"] = GPT2LMHeadModel.from_pretrained('sander-wood/tunesformer').to(device)
    return _TUNESFORMER_CACHE["model"]


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample tunes from TunesFormer and return them as one text block.

    The control codes and prefix are concatenated, transliterated to ASCII and
    used as the prompt. Every tune is sampled token by token until the EOS id
    appears or ``max_length`` tokens have been drawn. Progress is also echoed
    to stdout while sampling.

    Args:
        control_codes: Text placed in front of ``prefix`` (may be empty).
        prefix: Beginning of the tune (may be empty).
        num_tunes: Number of tunes to sample; coerced to ``int``.
        max_length: Maximum number of sampled tokens per tune; coerced to
            ``int``.
        top_p: Forwarded to ``top_p_sampling``.
        temperature: Forwarded to ``temperature_sampling``.
            NOTE(review): a value of 0 may not be supported by the sampling
            helper -- confirm before exposing it in the UI.
        seed: ``None``, an int, or a string. The strings ``""`` and ``"None"``
            (any case) mean "no seed"; numeric strings are converted to int.

    Returns:
        All tunes concatenated. Each starts with an ``X:<index>`` line and is
        followed by a blank line. A tune that reaches ``max_length`` without
        EOS is returned truncated rather than dropped.
    """
    prefix = unidecode((control_codes or "") + (prefix or ""))
    # Slider widgets may deliver floats; range() needs ints.
    num_tunes = int(num_tunes)
    max_length = int(max_length)
    # The UI passes the seed as text; without parsing, the literal "None"
    # would itself act as a fixed seed.
    seed = _parse_seed(seed)

    tokenizer = ABCTokenizer()
    model = _load_tunesformer()

    if prefix:
        # Drop the trailing EOS so the model continues the prompt.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])
    prompt_text = tokenizer.decode(ids[1:].tolist(), skip_special_tokens=True)

    random.seed(seed)
    tunes = []

    for c_idx in range(num_tunes):
        header = "X:" + str(c_idx + 1) + "\n"
        print("\n" + header, end="")
        print(prompt_text, end="")
        input_ids = ids.unsqueeze(0)

        for _ in range(max_length):
            if seed is not None:
                # Derive a fresh per-step seed so a seeded run is reproducible.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(input_ids=input_ids.to(device))
            probs = torch.softmax(outputs.logits[0][-1], dim=-1).cpu().detach().numpy()
            sampled_id = int(temperature_sampling(probs=top_p_sampling(probs,
                                                                       top_p=top_p,
                                                                       seed=n_seed,
                                                                       return_probs=True),
                                                  seed=n_seed,
                                                  temperature=temperature))
            input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)
            if sampled_id == tokenizer.eos_token_id:
                break
            print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

        # Collect the tune whether it ended with EOS or hit max_length.
        # squeeze(0) keeps a 1-D tensor even when only one id is present.
        tunes.append(header + tokenizer.decode(input_ids.squeeze(0).tolist(), skip_special_tokens=True))
        print("\n")

    return "".join(tune + "\n\n" for tune in tunes)
171
+
172
# Input widgets, declared in the positional order generate_abc expects.
input_control_codes = gr.inputs.Textbox(
    label="Control Codes",
    lines=5,
    default="[SECS_2][BARS_9][SIM_3][BARS_9]",
)
input_prefix = gr.inputs.Textbox(
    label="Prefix",
    lines=5,
    default='L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | "D"',
)
input_num_tunes = gr.inputs.Slider(
    label="Number of Tunes",
    minimum=1,
    maximum=10,
    step=1,
    default=3,
)
input_max_length = gr.inputs.Slider(
    label="Max Length",
    minimum=10,
    maximum=1000,
    step=10,
    default=500,
)
input_top_p = gr.inputs.Slider(
    label="Top P",
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    default=0.9,
)
# NOTE(review): the slider allows a temperature of 0.0 -- confirm the
# sampling helper accepts it.
input_temperature = gr.inputs.Slider(
    label="Temperature",
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    default=0.9,
)
# The seed is entered as free text; the default is the literal string "None".
input_seed = gr.inputs.Textbox(
    label="Seed",
    lines=1,
    default="None",
)

# Single text box that receives the string returned by generate_abc.
output_abc = gr.outputs.Textbox(label="Generated Tunes")

# Build the interface and start serving as soon as the module is executed.
demo = gr.Interface(
    fn=generate_abc,
    inputs=[
        input_control_codes,
        input_prefix,
        input_num_tunes,
        input_max_length,
        input_top_p,
        input_temperature,
        input_seed,
    ],
    outputs=output_abc,
    title="TunesFormer: Forming Tunes with Control Codes",
    description=description,
    article=article,
)
demo.launch()