Spaces:
Running
Running
John Langley
committed on
Commit
•
4bae818
1
Parent(s):
4979540
change to cpu
Browse files- app.py +15 -1
- requirements.txt +1 -1
- utils.py +32 -2
app.py
CHANGED
@@ -61,7 +61,21 @@ os.environ["COQUI_TOS_AGREED"] = "1"
|
|
61 |
##print(m)
|
62 |
#m = model_name
|
63 |
|
64 |
-
xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
|
67 |
|
|
|
61 |
##print(m)
|
62 |
#m = model_name
|
63 |
|
64 |
+
#xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
65 |
+
|
66 |
+
device = "cpu"
|
67 |
+
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
68 |
+
print("⏳Downloading model")
|
69 |
+
ModelManager().download_model(model_name)
|
70 |
+
model_path = os.path.join(
|
71 |
+
get_user_data_dir("tts"), model_name.replace("/", "--")
|
72 |
+
)
|
73 |
+
|
74 |
+
config = XttsConfig()
|
75 |
+
config.load_json(os.path.join(model_path, "config.json"))
|
76 |
+
xtts_model = Xtts.init_from_config(config)
|
77 |
+
xtts_model.load_checkpoint(config, checkpoint_dir=model_path, eval=True)
|
78 |
+
xtts_model.to(device)
|
79 |
|
80 |
|
81 |
|
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
# Preinstall requirements from TTS
|
2 |
-
TTS @ git+https://github.com/coqui-ai/TTS@
|
3 |
pydantic==1.10.13
|
4 |
python-multipart==0.0.6
|
5 |
typing-extensions>=4.8.0
|
|
|
1 |
# Preinstall requirements from TTS
|
2 |
+
TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
|
3 |
pydantic==1.10.13
|
4 |
python-multipart==0.0.6
|
5 |
typing-extensions>=4.8.0
|
utils.py
CHANGED
@@ -8,6 +8,7 @@ import textwrap
|
|
8 |
import time
|
9 |
import uuid
|
10 |
import wave
|
|
|
11 |
|
12 |
import emoji
|
13 |
import gradio as gr
|
@@ -81,8 +82,37 @@ def detect_language(prompt, xtts_supported_languages=None):
|
|
81 |
|
82 |
return language
|
83 |
|
84 |
-
def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
try:
|
87 |
t0 = time.time()
|
88 |
chunks = xtts_model.inference_stream(
|
|
|
8 |
import time
|
9 |
import uuid
|
10 |
import wave
|
11 |
+
import torch
|
12 |
|
13 |
import emoji
|
14 |
import gradio as gr
|
|
|
82 |
|
83 |
return language
|
84 |
|
85 |
+
def get_voice_streaming(self,prompt, language, chatbot_voice, xtts_model, suffix="0"):
|
86 |
+
|
87 |
+
speaker = {
|
88 |
+
"speaker_embedding": xtts_model.speaker_manager.speakers["Claribel Dervla"][
|
89 |
+
"speaker_embedding"
|
90 |
+
]
|
91 |
+
.cpu()
|
92 |
+
.squeeze()
|
93 |
+
.half()
|
94 |
+
.tolist(),
|
95 |
+
"gpt_cond_latent": xtts_model.speaker_manager.speakers["Claribel Dervla"][
|
96 |
+
"gpt_cond_latent"
|
97 |
+
]
|
98 |
+
.cpu()
|
99 |
+
.squeeze()
|
100 |
+
.half()
|
101 |
+
.tolist(),
|
102 |
+
}
|
103 |
+
|
104 |
+
speaker_embedding = (
|
105 |
+
torch.tensor(speaker.get("speaker_embedding"))
|
106 |
+
.unsqueeze(0)
|
107 |
+
.unsqueeze(-1)
|
108 |
+
)
|
109 |
+
|
110 |
+
gpt_cond_latent = (
|
111 |
+
torch.tensor(speaker.get("gpt_cond_latent"))
|
112 |
+
.reshape((-1, 1024))
|
113 |
+
.unsqueeze(0)
|
114 |
+
)
|
115 |
+
#gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
|
116 |
try:
|
117 |
t0 = time.time()
|
118 |
chunks = xtts_model.inference_stream(
|