Spaces:

Hematej
/

conqui-tts2

Build error

App Files Community

Hematej committed on Jun 3

Commit

004e9ee

verified ·

1 Parent(s): ad3deb5

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -92

app.py CHANGED Viewed

@@ -1,93 +1,89 @@
-import gradio as gr
-from TTS.api import TTS
-css = """
-#warning {background-color: #FFCCCB !important}
-.feedback label textarea {height: auto !important;
-                    font-size: 22px !important;
-                    font-weight: 800 !important;
-                    text-align: center !important;
-                    color: #801313 !important;
-                    padding: 0px !important}
-#alert {background-color: #fff !important}
-"""
-# Init TTS
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
-zh_tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=False)
-de_tts = TTS(model_name="tts_models/de/thorsten/vits", gpu=False)
-es_tts = TTS(model_name="tts_models/es/mai/tacotron2-DDC", progress_bar=False, gpu=False)
-def text_to_speech(text: str, speaker_wav, speaker_wav_file):
-    if len(text) > 0:
-       return change_aud(text, speaker_wav, speaker_wav_file)
-    else:
-       return (None)
-def change_aud(text: str, speaker_wav, speaker_wav_file):
-    if speaker_wav_file and not speaker_wav:
-        speaker_wav = speaker_wav_file
-    file_path = "output.wav"
-    if speaker_wav is not None:
-        tts.tts_to_file(text, speaker_wav=speaker_wav, language="en", file_path=file_path)
-    else:
-        tts.tts_to_file(text, speaker=tts.speakers[0], language="en", file_path=file_path)
-    return file_path
-def show_error(text):
-    if text == "":
-        return gr.update(visible=True, elem_id="warning", elem_classes="feedback"), gr.update(visible=False)
-    else:
-        return gr.update(visible=False), gr.update(visible=True)
-# def download_file():
-#     return file_path
-title = "Voice-Cloning-Demo"
-def toggle(choice):
-    if choice == "mic":
-        return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
-    else:
-        return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
-def change_color(text_input):
-    if len(text_input) == 0:
-        return gr.update(elem_id="warning", autofocus=True)
-    else:
-        return gr.update(elem_id="alert", autofocus=False)
-def clear_color(text_input, radio,error_box):
-    return gr.update(elem_id="alert"), gr.update(value="mic"), gr.update(visible=False)
-with gr.Blocks(css="footer {visibility: hidden}") as demo:
-    with gr.Row():
-        with gr.Column():
-            text_input = gr.Textbox(label="Input the text", value="", max_lines=4, lines=4)
-            radio = gr.Radio(["mic", "file"], value="mic",
-                             label="How would you like to upload your audio?")
-            audio_input_mic = gr.Audio(label="Voice to clone", sources="microphone", type="filepath", visible=True)
-            audio_input_file = gr.Audio(label="Voice to clone", type="filepath", visible=False)
-            with gr.Row():
-                with gr.Column():
-                    btn_clear = gr.ClearButton([text_input, radio, audio_input_file])
-                with gr.Column():
-                    btn = gr.Button("Generate", variant="primary")
-        with gr.Column():
-            audio_output = gr.Audio(label="Output", visible=True, autoplay=True, show_share_button=False)
-            # download_button = gr.DownloadButton(label="Download Audio", value=None, visible=True)
-            error_box = gr.Textbox(label="WARNING", value="Input box cannot be blank!!", visible=False, container=True)
-    # download_button.click(download_file, outputs=download_button)
-    btn_clear.add(audio_output)
-    btn.click(text_to_speech, inputs=[text_input, audio_input_mic, audio_input_file], outputs=audio_output)
-    btn.click(show_error, text_input, [error_box, audio_output])
-    radio.change(toggle, radio, [audio_input_mic, audio_input_file])
-    btn_clear.click(clear_color, [text_input, radio, error_box], [text_input, radio, error_box])
-    btn.click(change_color, text_input, text_input)
 demo.launch()

+import gradio as gr
+from TTS.api import TTS
+import torch
+import os
+from pydub import AudioSegment
+# Environment setup. COQUI_TOS_AGREED=1 pre-accepts the Coqui model licence so
+# the XTTS download does not wait for interactive confirmation.
+os.environ["COQUI_TOS_AGREED"] = "1"
+# NOTE(review): Coqui TTS documents TTS_HOME for relocating the model cache;
+# confirm that TTS_MODELS_PATH is actually honoured by the installed version.
+os.environ["TTS_MODELS_PATH"] = "/home/user/app/coqui_models"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the XTTS v2 model once at import time so every request reuses the same
+# instance instead of reloading the weights.
+try:
+    tts = TTS(
+        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
+        progress_bar=True,  # only reports download progress; no effect on speed
+    )
+    # Device placement goes through .to(); the constructor's `gpu` flag is
+    # deprecated and was redundant with this call.
+    tts.to(device)
+    print(f"[INFO] XTTS model loaded successfully on {device}")
+except Exception as e:
+    print(f"[ERROR] Failed to load XTTS model: {e}")
+    # Bare raise re-raises the active exception unchanged; the app cannot run
+    # without the model, so start-up must fail loudly.
+    raise
+def convert_mp3_to_wav(mp3_path: str) -> "str | None":
+    """Return the path of a WAV rendition of *mp3_path*, converting on first use.
+
+    The WAV file is written next to the source with the same base name. If
+    that file already exists it is reused and no conversion is performed.
+
+    Args:
+        mp3_path: Path of the audio file to convert.
+
+    Returns:
+        The path of the WAV file, or ``None`` when the conversion failed
+        (the failure is printed, not raised).
+    """
+    # Swap only the final extension. str.replace(".mp3", ".wav") would also
+    # rewrite ".mp3" inside directory names and would miss an upper-case
+    # ".MP3" suffix entirely.
+    wav_path = os.path.splitext(mp3_path)[0] + ".wav"
+    if not os.path.exists(wav_path):
+        try:
+            audio = AudioSegment.from_file(mp3_path)
+            audio.export(wav_path, format="wav")
+        except Exception as e:
+            # Best effort: the caller turns None into a user-facing message.
+            print(f"[ERROR] MP3 conversion failed: {e}")
+            return None
+    return wav_path
+def text_to_speech(text: str, speaker_wav: str, speaker_wav_file: str):
+    """Synthesize *text* in the voice of the supplied speaker recording.
+
+    Args:
+        text: Text to speak. Newlines are flattened to spaces and the result
+            is truncated to 1500 characters.
+        speaker_wav: Path of the microphone recording, or ``None``.
+        speaker_wav_file: Path of the uploaded file, or ``None``. Takes
+            precedence over *speaker_wav* when both are set.
+
+    Returns:
+        A ``(audio_path, message)`` pair: ``("output.wav", "")`` on success,
+        ``(None, <error text>)`` otherwise. Errors are reported through the
+        message rather than raised.
+    """
+    # The textbox value may arrive as None (e.g. after the Clear button), so
+    # normalise before calling str methods on it.
+    text = (text or "").strip().replace("\n", " ")[:1500]
+    speaker_audio = speaker_wav_file or speaker_wav
+    if not text:
+        return None, "⚠️ Error: Text input is empty."
+    if not speaker_audio or not os.path.exists(speaker_audio):
+        return None, "⚠️ Error: No valid speaker audio provided."
+    # Case-insensitive so that "VOICE.MP3" is converted as well.
+    if speaker_audio.lower().endswith(".mp3"):
+        speaker_audio = convert_mp3_to_wav(speaker_audio)
+        if not speaker_audio:
+            return None, "⚠️ Error converting MP3 to WAV."
+    # NOTE(review): a fixed output name means concurrent requests overwrite
+    # each other's result; acceptable for a single-user demo.
+    output_path = "output.wav"
+    try:
+        # No attention_mask is passed: extra keyword arguments are forwarded
+        # towards the model's generation call, and a mask sized by whitespace
+        # word count does not match the tokenised input.
+        tts.tts_to_file(text=text, speaker_wav=speaker_audio, language="en", file_path=output_path)
+        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            return output_path, ""
+        else:
+            return None, "⚠️ Error: Audio was not generated."
+    except Exception as e:
+        return None, f"⚠️ Error during synthesis: {str(e)}"
+# Gradio UI setup
+def _toggle_source(choice):
+    """Show the audio input matching the selected source and reset both values.
+
+    Resetting prevents a recording left in the hidden widget from being used:
+    text_to_speech prefers the uploaded file over the microphone whenever
+    both are set. Anything other than "file" falls back to the microphone.
+    """
+    use_file = choice == "file"
+    return (
+        gr.update(visible=not use_file, value=None),
+        gr.update(visible=use_file, value=None),
+    )
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(label="Enter text to clone", max_lines=15, lines=15)
+            radio = gr.Radio(["mic", "file"], value="mic", label="Upload speaker audio")
+            audio_input_mic = gr.Audio(label="Use Microphone", sources="microphone", type="filepath", visible=True)
+            audio_input_file = gr.Audio(label="Upload File (.wav/.mp3)", type="filepath", visible=False)
+            with gr.Row():
+                with gr.Column():
+                    # The components are attached below, once the output
+                    # widgets exist. The radio is deliberately left out so the
+                    # selected source (and the matching visible widget) stays
+                    # consistent after a clear.
+                    btn_clear = gr.ClearButton([text_input, audio_input_mic, audio_input_file])
+                with gr.Column():
+                    btn = gr.Button("Generate Voice", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Voice", visible=True, autoplay=True)
+            # Visible and read-only: this is the only place the messages
+            # returned by text_to_speech are shown to the user.
+            error_box = gr.Textbox(label="Status", value="", visible=True, interactive=False)
+    btn_clear.add([audio_output, error_box])
+    # Without this handler the file-upload widget can never become visible.
+    radio.change(_toggle_source, radio, [audio_input_mic, audio_input_file])
+    btn.click(text_to_speech, inputs=[text_input, audio_input_mic, audio_input_file], outputs=[audio_output, error_box])
 demo.launch()