Spaces: Running on L4
Fix OOM
Browse files
- app.py +2 -2
- descriptions.py +7 -1
- models/vallex.py +12 -0
- requirements.txt +1 -0
- utils/generation.py +2 -1
app.py
CHANGED
@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
|
|
44 |
text_collater = get_text_token_collater()
|
45 |
|
46 |
device = torch.device("cpu")
|
47 |
-
if torch.cuda.is_available():
|
48 |
-
|
49 |
|
50 |
# VALL-E-X model
|
51 |
model = VALLE(
|
|
|
44 |
text_collater = get_text_token_collater()
|
45 |
|
46 |
device = torch.device("cpu")
|
47 |
+
# if torch.cuda.is_available():
|
48 |
+
# device = torch.device("cuda", 0)
|
49 |
|
50 |
# VALL-E-X model
|
51 |
model = VALLE(
|
descriptions.py
CHANGED
@@ -1,6 +1,12 @@
|
|
1 |
top_md = """
|
2 |
# VALL-E X
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
|
5 |
an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
|
6 |
This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
|
|
|
1 |
top_md = """
|
2 |
# VALL-E X
|
3 |
+
<a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
|
4 |
+
style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
|
5 |
+
<img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
|
6 |
+
src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> or <a href="https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing"
|
7 |
+
style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
|
8 |
+
<img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
|
9 |
+
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>to skip the queue.</p>
|
10 |
VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
|
11 |
an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
|
12 |
This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
|
models/vallex.py
CHANGED
@@ -33,6 +33,15 @@ from modules.transformer import (
|
|
33 |
|
34 |
from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
class Transpose(nn.Identity):
|
38 |
"""(N, T, D) -> (N, D, T)"""
|
@@ -572,6 +581,9 @@ class VALLE(VALLF):
|
|
572 |
)
|
573 |
|
574 |
print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
|
|
|
|
|
|
|
575 |
break
|
576 |
|
577 |
y = torch.concat([y, samples], dim=1)
|
|
|
33 |
|
34 |
from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
|
35 |
|
36 |
+
import psutil
|
37 |
+
def get_memory_usage():
|
38 |
+
process = psutil.Process()
|
39 |
+
memory_info = process.memory_info()
|
40 |
+
|
41 |
+
memory_used = memory_info.rss
|
42 |
+
memory_used_mb = memory_used / (1024 * 1024)
|
43 |
+
|
44 |
+
return memory_used_mb
|
45 |
|
46 |
class Transpose(nn.Identity):
|
47 |
"""(N, T, D) -> (N, D, T)"""
|
|
|
581 |
)
|
582 |
|
583 |
print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
|
584 |
+
|
585 |
+
memory_used = get_memory_usage()
|
586 |
+
print(f"Current memory used: {memory_used:.2f} MB")
|
587 |
break
|
588 |
|
589 |
y = torch.concat([y, samples], dim=1)
|
requirements.txt
CHANGED
@@ -18,4 +18,5 @@ nltk
|
|
18 |
openai-whisper
|
19 |
phonemizer
|
20 |
matplotlib
|
|
|
21 |
gradio
|
|
|
18 |
openai-whisper
|
19 |
phonemizer
|
20 |
matplotlib
|
21 |
+
psutil
|
22 |
gradio
|
utils/generation.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
import torch
|
3 |
import gdown
|
4 |
import logging
|
|
|
5 |
import langid
|
6 |
langid.set_languages(['en', 'zh', 'ja'])
|
7 |
|
@@ -253,4 +254,4 @@ def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no
|
|
253 |
)
|
254 |
return samples[0][0].cpu().numpy()
|
255 |
else:
|
256 |
-
raise ValueError(f"No such mode {mode}")
|
|
|
2 |
import torch
|
3 |
import gdown
|
4 |
import logging
|
5 |
+
import psutil
|
6 |
import langid
|
7 |
langid.set_languages(['en', 'zh', 'ja'])
|
8 |
|
|
|
254 |
)
|
255 |
return samples[0][0].cpu().numpy()
|
256 |
else:
|
257 |
+
raise ValueError(f"No such mode {mode}")
|