Spaces:
Running
Running
expo
Browse files
- app.py +5 -8
- audiocraft.py +11 -5
- requirements.txt +1 -1
app.py
CHANGED
@@ -38,7 +38,7 @@ language_names = ['Ancient greek',
|
|
38 |
|
39 |
def audionar_tts(text=None,
|
40 |
lang='Romanian',
|
41 |
-
soundscape='',
|
42 |
cache_lim=24):
|
43 |
|
44 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
@@ -62,9 +62,7 @@ def audionar_tts(text=None,
|
|
62 |
|
63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
if lang not in language_names: # StyleTTS2
|
68 |
|
69 |
text = only_greek_or_only_latin(text, lang='eng')
|
70 |
|
@@ -77,7 +75,7 @@ def audionar_tts(text=None,
|
|
77 |
original_rate=24000,
|
78 |
target_rate=16000)[0, :] # 16 KHz
|
79 |
|
80 |
-
|
81 |
|
82 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
83 |
|
@@ -254,7 +252,6 @@ VOICES = [
|
|
254 |
|
255 |
_tts = StyleTTS2().to('cpu')
|
256 |
|
257 |
-
|
258 |
with gr.Blocks(theme='huggingface') as demo:
|
259 |
with gr.Row():
|
260 |
text_input = gr.Textbox(
|
@@ -264,9 +261,9 @@ with gr.Blocks(theme='huggingface') as demo:
|
|
264 |
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
265 |
)
|
266 |
choice_dropdown = gr.Dropdown(
|
267 |
-
choices=
|
268 |
label="Vox",
|
269 |
-
value=
|
270 |
)
|
271 |
soundscape_input = gr.Textbox(
|
272 |
lines=1,
|
|
|
38 |
|
39 |
def audionar_tts(text=None,
|
40 |
lang='Romanian',
|
41 |
+
soundscape='frogs',
|
42 |
cache_lim=24):
|
43 |
|
44 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
|
|
62 |
|
63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
64 |
|
65 |
+
elif lang not in language_names: # text exists / StyleTTS2
|
|
|
|
|
66 |
|
67 |
text = only_greek_or_only_latin(text, lang='eng')
|
68 |
|
|
|
75 |
original_rate=24000,
|
76 |
target_rate=16000)[0, :] # 16 KHz
|
77 |
|
78 |
+
else: # VITS
|
79 |
|
80 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
81 |
|
|
|
252 |
|
253 |
_tts = StyleTTS2().to('cpu')
|
254 |
|
|
|
255 |
with gr.Blocks(theme='huggingface') as demo:
|
256 |
with gr.Row():
|
257 |
text_input = gr.Textbox(
|
|
|
261 |
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
262 |
)
|
263 |
choice_dropdown = gr.Dropdown(
|
264 |
+
choices=VOICES + language_names,
|
265 |
label="Vox",
|
266 |
+
value=VOICES[0]
|
267 |
)
|
268 |
soundscape_input = gr.Textbox(
|
269 |
lines=1,
|
audiocraft.py
CHANGED
@@ -459,10 +459,16 @@ class LMModel(nn.Module):
|
|
459 |
logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
|
460 |
logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
|
461 |
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
|
467 |
@torch.no_grad()
|
468 |
def generate(self,
|
@@ -718,7 +724,7 @@ class StreamingTransformer(nn.Module):
|
|
718 |
|
719 |
if __name__ == '__main__':
|
720 |
|
721 |
-
import audiofile
|
722 |
model = AudioGen().to('cpu')
|
723 |
x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
|
724 |
audiofile.write('_sound_.wav', x, 16000)
|
|
|
459 |
logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
|
460 |
logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
|
461 |
|
462 |
+
k = 24
|
463 |
+
logits = torch.softmax(logits / 1.0, dim=3) # [bs, 4, 1, 2048]
|
464 |
+
p, ix = torch.topk(logits, k, dim=3) # p = [bs, 4, 1, 24], ix = [bs, 4, 1, 24] (values of ix are vocab ids in [0, 2048))
|
465 |
+
# Exponential Distribution
|
466 |
+
deflation = torch.empty_like(p).exponential_(lambd=1)
|
467 |
+
p = p / deflation
|
468 |
+
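# Dividing each top-k prob by independent Exponential(1) noise (support (0, Inf)) and taking the argmax below draws one token with probability proportional to its prob (exponential-race / Gumbel-max style sampling), without calling torch.multinomial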
|
469 |
+
p = p.argmax(dim=3, keepdim=True) # [bs, 4, n_draw, 1] position of the winner among the k=24 candidates
|
470 |
+
tok = ix.gather(dim=3, index=p).to(torch.int64) # [bs, 4, n_draw, 1]
|
471 |
+
return tok[:, :, :, 0].transpose(1, 2) # [bs, n_draw, 4]
|
472 |
|
473 |
@torch.no_grad()
|
474 |
def generate(self,
|
|
|
724 |
|
725 |
if __name__ == '__main__':
|
726 |
|
727 |
+
import audiofile # pip uninstall flash-attn
|
728 |
model = AudioGen().to('cpu')
|
729 |
x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
|
730 |
audiofile.write('_sound_.wav', x, 16000)
|
requirements.txt
CHANGED
@@ -9,7 +9,7 @@ numpy<2.0.0
|
|
9 |
gradio==5.27.0
|
10 |
Numbers2Words-Greek
|
11 |
einops
|
12 |
-
torch
|
13 |
pydantic==2.10.6
|
14 |
transformers==4.49.0
|
15 |
sentencepiece
|
|
|
9 |
gradio==5.27.0
|
10 |
Numbers2Words-Greek
|
11 |
einops
|
12 |
+
torch==2.1.0
|
13 |
pydantic==2.10.6
|
14 |
transformers==4.49.0
|
15 |
sentencepiece
|