File size: 9,125 Bytes
2c2a2e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1321bba
 
 
 
 
2c2a2e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e95fac
2c2a2e1
 
6e95fac
1321bba
2c2a2e1
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import gradio as gr
import torch
import random
from unidecode import unidecode
from transformers import GPT2LMHeadModel
from samplings import top_p_sampling, temperature_sampling

# Inference target: generate_abc moves the model and the running input ids to
# this device before every forward pass. Fixed to CPU here.
device = torch.device("cpu")

# HTML/Markdown blurb passed to gr.Interface as its `description` argument.
# NOTE(review): this text describes Suno's Bark text-to-audio model, whereas the
# interface is titled "TunesFormer: Forming Tunes with Control Codes" and the
# generator loads 'sander-wood/tunesformer' -- looks copied from another demo;
# confirm and replace with TunesFormer-specific copy.
description = """
<div>
<a style="display:inline-block" href='https://github.com/suno-ai/bark'><img src='https://img.shields.io/github/stars/suno-ai/bark?style=social' /></a>
<a style='display:inline-block' href='https://discord.gg/J2B2vsjKuE'><img src='https://dcbadge.vercel.app/api/server/J2B2vsjKuE?compact=true&style=flat' /></a>
<a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/suno/bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
</div>
Bark is a universal text-to-audio model created by [Suno](www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark). \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
This demo should be used for research purposes only. Commercial use is strictly prohibited. \
The model output is not censored and the authors do not endorse the opinions in the generated content. \
Use at your own risk.
"""

# Markdown shown with the interface; passed to gr.Interface as its `article`
# argument.
# NOTE(review): every section here (languages, non-speech sounds, voice cloning,
# speaker prompts, licence) documents Suno's Bark, not the TunesFormer ABC
# generator this app actually serves -- presumably left over from a template;
# confirm and replace.
article = """
## 🌎 Foreign Language
Bark supports various languages out-of-the-box and automatically determines language from input text. \
When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
Try the prompt:
```
Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
```
## 🤭 Non-Speech Sounds
Below is a list of some known non-speech sounds, but we are finding more every day. \
Please let us know if you find patterns that work particularly well on Discord!
* [laughter]
* [laughs]
* [sighs]
* [music]
* [gasps]
* [clears throat]
* — or ... for hesitations
* ♪ for song lyrics
* capitalization for emphasis of a word
* MAN/WOMAN: for bias towards speaker
Try the prompt:
```
" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
```
## 🎶 Music
Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
Try the prompt:
```
♪ In the jungle, the mighty jungle, the lion barks tonight ♪
```
## 🧬 Voice Cloning
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
The model also attempts to preserve music, ambient noise, etc. from input audio. \
However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
## 👥 Speaker Prompts
You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
Please note that these are not always respected, especially if a conflicting audio history prompt is given.
Try the prompt:
```
WOMAN: I would like an oatmilk latte please.
MAN: Wow, that's expensive!
```
## Details
Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
"""

# examples = [
#     "Jazz standard in Minor key with a swing feel.",
#     "Jazz standard in Major key with a fast tempo.",
#     "Jazz standard in Blues form with a soulful melody.",
#     "a painting of a starry night with the moon in the sky",
#     "a green field with a blue sky and clouds",
#     "a beach with a castle on top of it"
# ]

class ABCTokenizer():
    """Character-level tokenizer for ABC text with merged control-code tokens.

    Ids below 128 are the code points of single characters. Ids from 128
    upward index into ``merged_tokens``: ``[SECS_1]``..``[SECS_8]``, then
    ``[BARS_1]``..``[BARS_32]``, then ``[SIM_0]``..``[SIM_10]``.

    NOTE(review): characters with a code point >= 128 are not remapped, so
    they would collide with the merged-token id range; callers are expected
    to pass ASCII-only text.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3
        # Order matters: a token's position in this list plus 128 is its id.
        self.merged_tokens = (
            ['[SECS_' + str(i) + ']' for i in range(1, 9)]
            + ['[BARS_' + str(i) + ']' for i in range(1, 33)]
            + ['[SIM_' + str(i) + ']' for i in range(11)]
        )

    def __len__(self):
        """Return the vocabulary size (128 character ids + merged tokens)."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Encode ``text`` into a dict of ``input_ids`` and ``attention_mask``.

        Both values are 1-D tensors of equal length; the ids are wrapped in
        BOS/EOS and the mask is all ones.
        """
        input_ids = torch.tensor(self.txt2ids(text, self.merged_tokens))
        return {
            'input_ids': input_ids,
            'attention_mask': torch.tensor([1] * len(input_ids)),
        }

    def decode(self, ids, skip_special_tokens=False):
        """Turn an iterable of ids back into text.

        BOS and EOS are always dropped. Merged control-code tokens
        (ids >= 128) are dropped as well when ``skip_special_tokens`` is true.
        Elements may be Python ints or integer tensor/NumPy scalars.
        """
        pieces = []
        for raw_id in ids:
            token_id = int(raw_id)  # normalise tensor / NumPy scalars
            if token_id >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[token_id - 128])
            elif token_id not in (self.bos_token_id, self.eos_token_id):
                pieces.append(chr(token_id))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in BOS/EOS.

        Every character becomes a quoted code point (``"91" "83" ...``) so
        that each merged token can be swapped for its single id with a plain
        substring replace without matching inside a longer number.
        An empty ``text`` yields ``[bos_token_id, eos_token_id]``.
        """
        quoted = ' '.join('"' + str(ord(c)) + '"' for c in text)
        for t_idx, token in enumerate(merged_tokens):
            token_quoted = ' '.join('"' + str(ord(c)) + '"' for c in token)
            quoted = quoted.replace(token_quoted, '"' + str(t_idx + 128) + '"')

        # split() with no argument returns [] for an empty string, whereas
        # split(' ') would return [''] and make int('') raise ValueError.
        txt_ids = [int(item[1:-1]) for item in quoted.split()]
        return [self.bos_token_id] + txt_ids + [self.eos_token_id]

# Loaded models keyed by checkpoint name, so the weights are read once per
# process instead of once per request.
_MODEL_CACHE = {}


def _load_model(name='sander-wood/tunesformer'):
    """Return the language model for ``name``, loading it on first use only."""
    if name not in _MODEL_CACHE:
        _MODEL_CACHE[name] = GPT2LMHeadModel.from_pretrained(name).to(device)
    return _MODEL_CACHE[name]


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample ABC tunes from the TunesFormer checkpoint.

    Args:
        control_codes: Text placed in front of ``prefix`` (e.g. ``[SECS_2]``).
        prefix: ABC text every generated tune starts from.
        num_tunes: Number of tunes to sample; coerced to ``int``.
        max_length: Maximum number of sampled tokens per tune; coerced to
            ``int``.
        top_p: Forwarded to ``top_p_sampling``.
        temperature: Forwarded to ``temperature_sampling``.
        seed: Any value ``int()`` accepts enables seeded sampling; anything
            else (e.g. the string ``"None"``) leaves sampling unseeded.

    Returns:
        A single string with one ``X:<n>`` headed tune per sample, each
        followed by a blank line. A tune that reaches ``max_length`` without
        producing EOS is still included, truncated, rather than dropped.

    Progress is also streamed to stdout while sampling.
    """
    try:
        seed = int(seed)
    except (TypeError, ValueError, OverflowError):
        seed = None

    # UI sliders may deliver floats; range() needs real ints.
    num_tunes = int(num_tunes)
    max_length = int(max_length)

    prefix = unidecode(control_codes + prefix)
    tokenizer = ABCTokenizer()
    model = _load_model()

    if prefix:
        # Drop the trailing EOS so the model continues the prompt.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])

    random.seed(seed)
    tunes = []

    for c_idx in range(num_tunes):
        header = "X:" + str(c_idx + 1) + "\n"
        print("\n" + header, end="")
        print(tokenizer.decode(ids[1:], skip_special_tokens=True), end="")
        input_ids = ids.unsqueeze(0)

        for _ in range(max_length):
            if seed is not None:
                # Derive a fresh per-step seed from the seeded stream, then
                # re-seed with it so the whole run stays reproducible.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(input_ids=input_ids.to(device))
            logits = outputs.logits[0][-1]
            probs = torch.nn.Softmax(dim=-1)(logits).cpu().detach().numpy()
            filtered = top_p_sampling(probs,
                                      top_p=top_p,
                                      seed=n_seed,
                                      return_probs=True)
            sampled_id = temperature_sampling(probs=filtered,
                                              seed=n_seed,
                                              temperature=temperature)
            input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)
            if sampled_id == tokenizer.eos_token_id:
                break
            print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

        # Reached on EOS and on hitting max_length alike, so a long tune is
        # returned truncated instead of vanishing from the output.
        # squeeze(0) keeps a 1-D tensor even when only one id is present.
        tune = header + tokenizer.decode(input_ids.squeeze(0), skip_special_tokens=True)
        tunes.append(tune + "\n\n")
        print("\n")

    return "".join(tunes)

# --- Gradio UI wiring -------------------------------------------------------
# One input component per generate_abc parameter, in signature order.
# NOTE(review): gr.inputs / gr.outputs is the legacy component namespace --
# confirm it is still available in the Gradio version this app is pinned to.
input_control_codes = gr.inputs.Textbox(
    lines=5,
    label="Control Codes",
    default="[SECS_2][BARS_9][SIM_3][BARS_9]",
)
input_prefix = gr.inputs.Textbox(
    lines=5,
    label="Prefix",
    default="L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | \"D\"",
)
input_num_tunes = gr.inputs.Slider(
    minimum=1, maximum=10, step=1, default=1, label="Number of Tunes",
)
input_max_length = gr.inputs.Slider(
    minimum=10, maximum=1000, step=10, default=500, label="Max Length",
)
input_top_p = gr.inputs.Slider(
    minimum=0.0, maximum=1.0, step=0.05, default=0.9, label="Top P",
)
input_temperature = gr.inputs.Slider(
    minimum=0.0, maximum=2.0, step=0.1, default=1.0, label="Temperature",
)
# The seed is a free-text box; generate_abc treats non-integer text as "no seed".
input_seed = gr.inputs.Textbox(lines=1, label="Seed (int)", default="None")
output_abc = gr.outputs.Textbox(label="Generated Tunes")

demo = gr.Interface(
    fn=generate_abc,
    inputs=[
        input_control_codes,
        input_prefix,
        input_num_tunes,
        input_max_length,
        input_top_p,
        input_temperature,
        input_seed,
    ],
    outputs=output_abc,
    title="TunesFormer: Forming Tunes with Control Codes",
    description=description,
    article=article,
)
demo.launch()