xtts-v2

Runtime error

App Files Files Community

JacobLinCool committed on Nov 23, 2023

Commit

bd8dcd1

•

1 Parent(s): 64f96e7

feat: use docker space

Browse files

Files changed (5) hide show

Dockerfile +28 -0
README.md +1 -3
app.py +17 -227
build.py +17 -0
requirements.txt +1 -1

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+FROM python:3.11
+# By using XTTS you agree to CPML license https://coqui.ai/cpml
+ENV COQUI_TOS_AGREED=1
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Install dependencies
+COPY --chown=user:user requirements.txt .
+RUN pip install -r requirements.txt
+RUN python -m unidic download
+# Install model weights
+COPY --chown=user:user . .
+RUN python build.py
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -3,10 +3,8 @@ title: XTTS
 emoji: 🐸
 colorFrom: green
 colorTo: red
-sdk: gradio
-sdk_version: 3.48.0
-app_file: app.py
 pinned: false
 models:
 - coqui/XTTS-v2
 ---

 emoji: 🐸
 colorFrom: green
 colorTo: red
 pinned: false
+sdk: docker
 models:
 - coqui/XTTS-v2
 ---

app.py CHANGED Viewed

@@ -1,38 +1,26 @@
-import sys
-import io, os, stat
 import subprocess
-import random
-from zipfile import ZipFile
 import uuid
 import time
 import torch
 import torchaudio
-#download for mecab
-os.system('python -m unidic download')
-# By using XTTS you agree to CPML license https://coqui.ai/cpml
-os.environ["COQUI_TOS_AGREED"] = "1"
 # langid is used to detect language for longer text
 # Most users expect text to be their own language, there is checkbox to disable it
 import langid
-import base64
 import csv
 from io import StringIO
 import datetime
 import re
 import gradio as gr
-from scipy.io.wavfile import write
-from pydub import AudioSegment
-from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 HF_TOKEN = os.environ.get("HF_TOKEN")
 from huggingface_hub import HfApi
@@ -41,21 +29,10 @@ from huggingface_hub import HfApi
 api = HfApi(token=HF_TOKEN)
 repo_id = "coqui/xtts"
-# Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
-print("Export newer ffmpeg binary for denoise filter")
-ZipFile("ffmpeg.zip").extractall()
-print("Make ffmpeg binary executable")
-st = os.stat("ffmpeg")
-os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
-# This will trigger downloading model
-print("Downloading if not downloaded Coqui XTTS V2")
-from TTS.utils.manage import ModelManager
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-print("XTTS downloaded")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
@@ -66,9 +43,15 @@ model.load_checkpoint(
     checkpoint_path=os.path.join(model_path, "model.pth"),
     vocab_path=os.path.join(model_path, "vocab.json"),
     eval=True,
-    use_deepspeed=True,
 )
-model.cuda()
 # This is for debugging purposes only
 DEVICE_ASSERT_DETECTED = 0
@@ -81,8 +64,6 @@ def predict(
     prompt,
     language,
     audio_file_pth,
-    mic_file_path,
-    use_mic,
     voice_cleanup,
     no_lang_auto_detect,
     agree,
@@ -130,22 +111,7 @@ def predict(
                     None,
                 )
-        if use_mic == True:
-            if mic_file_path is not None:
-                speaker_wav = mic_file_path
-            else:
-                gr.Warning(
-                    "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
-                )
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
-        else:
-            speaker_wav = audio_file_pth
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
@@ -328,8 +294,6 @@ def predict(
                     prompt,
                     language,
                     audio_file_pth,
-                    mic_file_path,
-                    use_mic,
                     voice_cleanup,
                     no_lang_auto_detect,
                     agree,
@@ -450,160 +414,6 @@ article = """
 <p>We collect data only for error cases for improvement.</p>
 </div>
 """
-examples = [
-    [
-        "Once when I was six years old I saw a magnificent picture",
-        "en",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
-        "fr",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
-        "de",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Cuando tenía seis años, vi una vez una imagen magnífica",
-        "es",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
-        "pt",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
-        "pl",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
-        "it",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
-        "tr",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
-        "ru",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
-        "nl",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
-        "cs",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "当我还只有六岁的时候， 看到了一副精彩的插画",
-        "zh-cn",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "かつて 六歳のとき、素晴らしい絵を見ました",
-        "ja",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-    [
-        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
-        "ko",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-        [
-        "Egyszer hat éves koromban láttam egy csodálatos képet",
-        "hu",
-        "examples/male.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-]
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
@@ -651,7 +461,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
                     "ko",
                     "hu"
                 ],
-                max_choices=1,
                 value="en",
             )
             ref_gr = gr.Audio(
@@ -660,17 +469,6 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 type="filepath",
                 value="examples/female.wav",
             )
-            mic_gr = gr.Audio(
-                source="microphone",
-                type="filepath",
-                info="Use your microphone to record audio",
-                label="Use Microphone for Reference",
-            )
-            use_mic_gr = gr.Checkbox(
-                label="Use Microphone",
-                value=False,
-                info="Notice: Microphone input may not work properly under traffic",
-            )
             clean_ref_gr = gr.Checkbox(
                 label="Cleanup Reference Voice",
                 value=False,
@@ -696,15 +494,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
             out_text_gr = gr.Text(label="Metrics")
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
-    with gr.Row():
-        gr.Examples(examples,
-                    label="Examples",
-                    inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
-                    outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
-                    fn=predict,
-                    cache_examples=False,)
-    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
-demo.queue()
-demo.launch(debug=True, show_api=True)

+import os
 import subprocess
 import uuid
 import time
 import torch
 import torchaudio
 # langid is used to detect language for longer text
 # Most users expect text to be their own language, there is checkbox to disable it
 import langid
 import csv
 from io import StringIO
 import datetime
 import re
 import gradio as gr
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
+print("application starting")
 HF_TOKEN = os.environ.get("HF_TOKEN")
 from huggingface_hub import HfApi
 api = HfApi(token=HF_TOKEN)
 repo_id = "coqui/xtts"
+print("loading model")
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
     checkpoint_path=os.path.join(model_path, "model.pth"),
     vocab_path=os.path.join(model_path, "vocab.json"),
     eval=True,
+    use_deepspeed=False,
 )
+if torch.cuda.is_available():
+    model.cuda()
+else:
+    model.cpu()
+print("Model loaded")
 # This is for debugging purposes only
 DEVICE_ASSERT_DETECTED = 0
     prompt,
     language,
     audio_file_pth,
     voice_cleanup,
     no_lang_auto_detect,
     agree,
                     None,
                 )
+        speaker_wav = audio_file_pth
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
                     prompt,
                     language,
                     audio_file_pth,
                     voice_cleanup,
                     no_lang_auto_detect,
                     agree,
 <p>We collect data only for error cases for improvement.</p>
 </div>
 """
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
                     "ko",
                     "hu"
                 ],
                 value="en",
             )
             ref_gr = gr.Audio(
                 type="filepath",
                 value="examples/female.wav",
             )
             clean_ref_gr = gr.Checkbox(
                 label="Cleanup Reference Voice",
                 value=False,
             out_text_gr = gr.Text(label="Metrics")
             ref_audio_gr = gr.Audio(label="Reference Audio Used")
+    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
+print("Starting server")
+demo.queue().launch(debug=True, show_api=True)

build.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import os, stat
+from zipfile import ZipFile
+# Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
+print("Export newer ffmpeg binary for denoise filter")
+ZipFile("ffmpeg.zip").extractall()
+print("Make ffmpeg binary executable")
+st = os.stat("ffmpeg")
+os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
+# This will trigger downloading model
+print("Downloading if not downloaded Coqui XTTS V2")
+from TTS.utils.manage import ModelManager
+model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ModelManager().download_model(model_name)
+print("XTTS downloaded")

requirements.txt CHANGED Viewed

@@ -8,5 +8,5 @@ mecab-python3==1.0.6
 unidic-lite==1.0.8
 unidic==1.1.0
 langid
-deepspeed
 pydub

 unidic-lite==1.0.8
 unidic==1.1.0
 langid
 pydub
+gradio