Spaces:
Runtime error
Runtime error
JacobLinCool
committed on
Commit
•
bd8dcd1
1
Parent(s):
64f96e7
feat: use docker space
Browse files
- Dockerfile +28 -0
- README.md +1 -3
- app.py +17 -227
- build.py +17 -0
- requirements.txt +1 -1
Dockerfile
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11
|
2 |
+
|
3 |
+
# By using XTTS you agree to CPML license https://coqui.ai/cpml
|
4 |
+
ENV COQUI_TOS_AGREED=1
|
5 |
+
|
6 |
+
# Set up a new user named "user" with user ID 1000
|
7 |
+
RUN useradd -m -u 1000 user
|
8 |
+
|
9 |
+
# Switch to the "user" user
|
10 |
+
USER user
|
11 |
+
|
12 |
+
# Set home to the user's home directory
|
13 |
+
ENV HOME=/home/user \
|
14 |
+
PATH=/home/user/.local/bin:$PATH
|
15 |
+
|
16 |
+
# Set the working directory to the user's home directory
|
17 |
+
WORKDIR $HOME/app
|
18 |
+
|
19 |
+
# Install dependencies
|
20 |
+
COPY --chown=user:user requirements.txt .
|
21 |
+
RUN pip install -r requirements.txt
|
22 |
+
RUN python -m unidic download
|
23 |
+
|
24 |
+
# Install model weights
|
25 |
+
COPY --chown=user:user . .
|
26 |
+
RUN python build.py
|
27 |
+
|
28 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -3,10 +3,8 @@ title: XTTS
|
|
3 |
emoji: 🐸
|
4 |
colorFrom: green
|
5 |
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.48.0
|
8 |
-
app_file: app.py
|
9 |
pinned: false
|
|
|
10 |
models:
|
11 |
- coqui/XTTS-v2
|
12 |
---
|
|
|
3 |
emoji: 🐸
|
4 |
colorFrom: green
|
5 |
colorTo: red
|
|
|
|
|
|
|
6 |
pinned: false
|
7 |
+
sdk: docker
|
8 |
models:
|
9 |
- coqui/XTTS-v2
|
10 |
---
|
app.py
CHANGED
@@ -1,38 +1,26 @@
|
|
1 |
-
import
|
2 |
-
import io, os, stat
|
3 |
import subprocess
|
4 |
-
import random
|
5 |
-
from zipfile import ZipFile
|
6 |
import uuid
|
7 |
import time
|
8 |
import torch
|
9 |
import torchaudio
|
10 |
|
11 |
-
|
12 |
-
#download for mecab
|
13 |
-
os.system('python -m unidic download')
|
14 |
-
|
15 |
-
# By using XTTS you agree to CPML license https://coqui.ai/cpml
|
16 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
17 |
-
|
18 |
# langid is used to detect language for longer text
|
19 |
# Most users expect text to be their own language, there is checkbox to disable it
|
20 |
import langid
|
21 |
-
import base64
|
22 |
import csv
|
23 |
from io import StringIO
|
24 |
import datetime
|
25 |
import re
|
26 |
|
27 |
import gradio as gr
|
28 |
-
from scipy.io.wavfile import write
|
29 |
-
from pydub import AudioSegment
|
30 |
|
31 |
-
from TTS.api import TTS
|
32 |
from TTS.tts.configs.xtts_config import XttsConfig
|
33 |
from TTS.tts.models.xtts import Xtts
|
34 |
from TTS.utils.generic_utils import get_user_data_dir
|
35 |
|
|
|
|
|
36 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
37 |
|
38 |
from huggingface_hub import HfApi
|
@@ -41,21 +29,10 @@ from huggingface_hub import HfApi
|
|
41 |
api = HfApi(token=HF_TOKEN)
|
42 |
repo_id = "coqui/xtts"
|
43 |
|
44 |
-
|
45 |
-
print("Export newer ffmpeg binary for denoise filter")
|
46 |
-
ZipFile("ffmpeg.zip").extractall()
|
47 |
-
print("Make ffmpeg binary executable")
|
48 |
-
st = os.stat("ffmpeg")
|
49 |
-
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
|
50 |
-
|
51 |
-
# This will trigger downloading model
|
52 |
-
print("Downloading if not downloaded Coqui XTTS V2")
|
53 |
-
from TTS.utils.manage import ModelManager
|
54 |
|
55 |
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
56 |
-
ModelManager().download_model(model_name)
|
57 |
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
58 |
-
print("XTTS downloaded")
|
59 |
|
60 |
config = XttsConfig()
|
61 |
config.load_json(os.path.join(model_path, "config.json"))
|
@@ -66,9 +43,15 @@ model.load_checkpoint(
|
|
66 |
checkpoint_path=os.path.join(model_path, "model.pth"),
|
67 |
vocab_path=os.path.join(model_path, "vocab.json"),
|
68 |
eval=True,
|
69 |
-
use_deepspeed=
|
70 |
)
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
# This is for debugging purposes only
|
74 |
DEVICE_ASSERT_DETECTED = 0
|
@@ -81,8 +64,6 @@ def predict(
|
|
81 |
prompt,
|
82 |
language,
|
83 |
audio_file_pth,
|
84 |
-
mic_file_path,
|
85 |
-
use_mic,
|
86 |
voice_cleanup,
|
87 |
no_lang_auto_detect,
|
88 |
agree,
|
@@ -130,22 +111,7 @@ def predict(
|
|
130 |
None,
|
131 |
)
|
132 |
|
133 |
-
|
134 |
-
if mic_file_path is not None:
|
135 |
-
speaker_wav = mic_file_path
|
136 |
-
else:
|
137 |
-
gr.Warning(
|
138 |
-
"Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
|
139 |
-
)
|
140 |
-
return (
|
141 |
-
None,
|
142 |
-
None,
|
143 |
-
None,
|
144 |
-
None,
|
145 |
-
)
|
146 |
-
|
147 |
-
else:
|
148 |
-
speaker_wav = audio_file_pth
|
149 |
|
150 |
# Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
|
151 |
# This is fast filtering not perfect
|
@@ -328,8 +294,6 @@ def predict(
|
|
328 |
prompt,
|
329 |
language,
|
330 |
audio_file_pth,
|
331 |
-
mic_file_path,
|
332 |
-
use_mic,
|
333 |
voice_cleanup,
|
334 |
no_lang_auto_detect,
|
335 |
agree,
|
@@ -450,160 +414,6 @@ article = """
|
|
450 |
<p>We collect data only for error cases for improvement.</p>
|
451 |
</div>
|
452 |
"""
|
453 |
-
examples = [
|
454 |
-
[
|
455 |
-
"Once when I was six years old I saw a magnificent picture",
|
456 |
-
"en",
|
457 |
-
"examples/female.wav",
|
458 |
-
None,
|
459 |
-
False,
|
460 |
-
False,
|
461 |
-
False,
|
462 |
-
True,
|
463 |
-
],
|
464 |
-
[
|
465 |
-
"Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
|
466 |
-
"fr",
|
467 |
-
"examples/male.wav",
|
468 |
-
None,
|
469 |
-
False,
|
470 |
-
False,
|
471 |
-
False,
|
472 |
-
True,
|
473 |
-
],
|
474 |
-
[
|
475 |
-
"Als ich sechs war, sah ich einmal ein wunderbares Bild",
|
476 |
-
"de",
|
477 |
-
"examples/female.wav",
|
478 |
-
None,
|
479 |
-
False,
|
480 |
-
False,
|
481 |
-
False,
|
482 |
-
True,
|
483 |
-
],
|
484 |
-
[
|
485 |
-
"Cuando tenía seis años, vi una vez una imagen magnífica",
|
486 |
-
"es",
|
487 |
-
"examples/male.wav",
|
488 |
-
None,
|
489 |
-
False,
|
490 |
-
False,
|
491 |
-
False,
|
492 |
-
True,
|
493 |
-
],
|
494 |
-
[
|
495 |
-
"Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
|
496 |
-
"pt",
|
497 |
-
"examples/female.wav",
|
498 |
-
None,
|
499 |
-
False,
|
500 |
-
False,
|
501 |
-
False,
|
502 |
-
True,
|
503 |
-
],
|
504 |
-
[
|
505 |
-
"Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
|
506 |
-
"pl",
|
507 |
-
"examples/male.wav",
|
508 |
-
None,
|
509 |
-
False,
|
510 |
-
False,
|
511 |
-
False,
|
512 |
-
True,
|
513 |
-
],
|
514 |
-
[
|
515 |
-
"Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
|
516 |
-
"it",
|
517 |
-
"examples/female.wav",
|
518 |
-
None,
|
519 |
-
False,
|
520 |
-
False,
|
521 |
-
False,
|
522 |
-
True,
|
523 |
-
],
|
524 |
-
[
|
525 |
-
"Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
|
526 |
-
"tr",
|
527 |
-
"examples/female.wav",
|
528 |
-
None,
|
529 |
-
False,
|
530 |
-
False,
|
531 |
-
False,
|
532 |
-
True,
|
533 |
-
],
|
534 |
-
[
|
535 |
-
"Когда мне было шесть лет, я увидел однажды удивительную картинку",
|
536 |
-
"ru",
|
537 |
-
"examples/female.wav",
|
538 |
-
None,
|
539 |
-
False,
|
540 |
-
False,
|
541 |
-
False,
|
542 |
-
True,
|
543 |
-
],
|
544 |
-
[
|
545 |
-
"Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
|
546 |
-
"nl",
|
547 |
-
"examples/male.wav",
|
548 |
-
None,
|
549 |
-
False,
|
550 |
-
False,
|
551 |
-
False,
|
552 |
-
True,
|
553 |
-
],
|
554 |
-
[
|
555 |
-
"Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
|
556 |
-
"cs",
|
557 |
-
"examples/female.wav",
|
558 |
-
None,
|
559 |
-
False,
|
560 |
-
False,
|
561 |
-
False,
|
562 |
-
True,
|
563 |
-
],
|
564 |
-
[
|
565 |
-
"当我还只有六岁的时候, 看到了一副精彩的插画",
|
566 |
-
"zh-cn",
|
567 |
-
"examples/female.wav",
|
568 |
-
None,
|
569 |
-
False,
|
570 |
-
False,
|
571 |
-
False,
|
572 |
-
True,
|
573 |
-
],
|
574 |
-
[
|
575 |
-
"かつて 六歳のとき、素晴らしい絵を見ました",
|
576 |
-
"ja",
|
577 |
-
"examples/female.wav",
|
578 |
-
None,
|
579 |
-
False,
|
580 |
-
True,
|
581 |
-
False,
|
582 |
-
True,
|
583 |
-
],
|
584 |
-
[
|
585 |
-
"한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
|
586 |
-
"ko",
|
587 |
-
"examples/female.wav",
|
588 |
-
None,
|
589 |
-
False,
|
590 |
-
True,
|
591 |
-
False,
|
592 |
-
True,
|
593 |
-
],
|
594 |
-
[
|
595 |
-
"Egyszer hat éves koromban láttam egy csodálatos képet",
|
596 |
-
"hu",
|
597 |
-
"examples/male.wav",
|
598 |
-
None,
|
599 |
-
False,
|
600 |
-
True,
|
601 |
-
False,
|
602 |
-
True,
|
603 |
-
],
|
604 |
-
]
|
605 |
-
|
606 |
-
|
607 |
|
608 |
with gr.Blocks(analytics_enabled=False) as demo:
|
609 |
with gr.Row():
|
@@ -651,7 +461,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
651 |
"ko",
|
652 |
"hu"
|
653 |
],
|
654 |
-
max_choices=1,
|
655 |
value="en",
|
656 |
)
|
657 |
ref_gr = gr.Audio(
|
@@ -660,17 +469,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
660 |
type="filepath",
|
661 |
value="examples/female.wav",
|
662 |
)
|
663 |
-
mic_gr = gr.Audio(
|
664 |
-
source="microphone",
|
665 |
-
type="filepath",
|
666 |
-
info="Use your microphone to record audio",
|
667 |
-
label="Use Microphone for Reference",
|
668 |
-
)
|
669 |
-
use_mic_gr = gr.Checkbox(
|
670 |
-
label="Use Microphone",
|
671 |
-
value=False,
|
672 |
-
info="Notice: Microphone input may not work properly under traffic",
|
673 |
-
)
|
674 |
clean_ref_gr = gr.Checkbox(
|
675 |
label="Cleanup Reference Voice",
|
676 |
value=False,
|
@@ -696,15 +494,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
696 |
out_text_gr = gr.Text(label="Metrics")
|
697 |
ref_audio_gr = gr.Audio(label="Reference Audio Used")
|
698 |
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
|
704 |
-
fn=predict,
|
705 |
-
cache_examples=False,)
|
706 |
-
|
707 |
-
tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
|
708 |
-
|
709 |
-
demo.queue()
|
710 |
-
demo.launch(debug=True, show_api=True)
|
|
|
1 |
+
import os
|
|
|
2 |
import subprocess
|
|
|
|
|
3 |
import uuid
|
4 |
import time
|
5 |
import torch
|
6 |
import torchaudio
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# langid is used to detect language for longer text
|
9 |
# Most users expect text to be their own language, there is checkbox to disable it
|
10 |
import langid
|
|
|
11 |
import csv
|
12 |
from io import StringIO
|
13 |
import datetime
|
14 |
import re
|
15 |
|
16 |
import gradio as gr
|
|
|
|
|
17 |
|
|
|
18 |
from TTS.tts.configs.xtts_config import XttsConfig
|
19 |
from TTS.tts.models.xtts import Xtts
|
20 |
from TTS.utils.generic_utils import get_user_data_dir
|
21 |
|
22 |
+
print("application starting")
|
23 |
+
|
24 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
25 |
|
26 |
from huggingface_hub import HfApi
|
|
|
29 |
api = HfApi(token=HF_TOKEN)
|
30 |
repo_id = "coqui/xtts"
|
31 |
|
32 |
+
print("loading model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
|
|
35 |
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
|
|
36 |
|
37 |
config = XttsConfig()
|
38 |
config.load_json(os.path.join(model_path, "config.json"))
|
|
|
43 |
checkpoint_path=os.path.join(model_path, "model.pth"),
|
44 |
vocab_path=os.path.join(model_path, "vocab.json"),
|
45 |
eval=True,
|
46 |
+
use_deepspeed=False,
|
47 |
)
|
48 |
+
|
49 |
+
if torch.cuda.is_available():
|
50 |
+
model.cuda()
|
51 |
+
else:
|
52 |
+
model.cpu()
|
53 |
+
|
54 |
+
print("Model loaded")
|
55 |
|
56 |
# This is for debugging purposes only
|
57 |
DEVICE_ASSERT_DETECTED = 0
|
|
|
64 |
prompt,
|
65 |
language,
|
66 |
audio_file_pth,
|
|
|
|
|
67 |
voice_cleanup,
|
68 |
no_lang_auto_detect,
|
69 |
agree,
|
|
|
111 |
None,
|
112 |
)
|
113 |
|
114 |
+
speaker_wav = audio_file_pth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
# Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
|
117 |
# This is fast filtering not perfect
|
|
|
294 |
prompt,
|
295 |
language,
|
296 |
audio_file_pth,
|
|
|
|
|
297 |
voice_cleanup,
|
298 |
no_lang_auto_detect,
|
299 |
agree,
|
|
|
414 |
<p>We collect data only for error cases for improvement.</p>
|
415 |
</div>
|
416 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
417 |
|
418 |
with gr.Blocks(analytics_enabled=False) as demo:
|
419 |
with gr.Row():
|
|
|
461 |
"ko",
|
462 |
"hu"
|
463 |
],
|
|
|
464 |
value="en",
|
465 |
)
|
466 |
ref_gr = gr.Audio(
|
|
|
469 |
type="filepath",
|
470 |
value="examples/female.wav",
|
471 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
472 |
clean_ref_gr = gr.Checkbox(
|
473 |
label="Cleanup Reference Voice",
|
474 |
value=False,
|
|
|
494 |
out_text_gr = gr.Text(label="Metrics")
|
495 |
ref_audio_gr = gr.Audio(label="Reference Audio Used")
|
496 |
|
497 |
+
tts_button.click(predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
|
498 |
+
|
499 |
+
print("Starting server")
|
500 |
+
demo.queue().launch(debug=True, show_api=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
build.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, stat
|
2 |
+
from zipfile import ZipFile
|
3 |
+
|
4 |
+
# Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
|
5 |
+
print("Export newer ffmpeg binary for denoise filter")
|
6 |
+
ZipFile("ffmpeg.zip").extractall()
|
7 |
+
print("Make ffmpeg binary executable")
|
8 |
+
st = os.stat("ffmpeg")
|
9 |
+
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
|
10 |
+
|
11 |
+
# This will trigger downloading model
|
12 |
+
print("Downloading if not downloaded Coqui XTTS V2")
|
13 |
+
from TTS.utils.manage import ModelManager
|
14 |
+
|
15 |
+
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
16 |
+
ModelManager().download_model(model_name)
|
17 |
+
print("XTTS downloaded")
|
requirements.txt
CHANGED
@@ -8,5 +8,5 @@ mecab-python3==1.0.6
|
|
8 |
unidic-lite==1.0.8
|
9 |
unidic==1.1.0
|
10 |
langid
|
11 |
-
deepspeed
|
12 |
pydub
|
|
|
|
8 |
unidic-lite==1.0.8
|
9 |
unidic==1.1.0
|
10 |
langid
|
|
|
11 |
pydub
|
12 |
+
gradio
|