Hematej committed on
Commit
004e9ee
·
verified ·
1 Parent(s): ad3deb5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -92
app.py CHANGED
@@ -1,93 +1,89 @@
1
- import gradio as gr
2
- from TTS.api import TTS
3
-
4
- css = """
5
- #warning {background-color: #FFCCCB !important}
6
- .feedback label textarea {height: auto !important;
7
- font-size: 22px !important;
8
- font-weight: 800 !important;
9
- text-align: center !important;
10
- color: #801313 !important;
11
- padding: 0px !important}
12
- #alert {background-color: #fff !important}
13
- """
14
-
15
- # Init TTS
16
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
17
- zh_tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=False)
18
- de_tts = TTS(model_name="tts_models/de/thorsten/vits", gpu=False)
19
- es_tts = TTS(model_name="tts_models/es/mai/tacotron2-DDC", progress_bar=False, gpu=False)
20
-
21
- def text_to_speech(text: str, speaker_wav, speaker_wav_file):
22
- if len(text) > 0:
23
- return change_aud(text, speaker_wav, speaker_wav_file)
24
- else:
25
- return (None)
26
-
27
- def change_aud(text: str, speaker_wav, speaker_wav_file):
28
- if speaker_wav_file and not speaker_wav:
29
- speaker_wav = speaker_wav_file
30
- file_path = "output.wav"
31
- if speaker_wav is not None:
32
- tts.tts_to_file(text, speaker_wav=speaker_wav, language="en", file_path=file_path)
33
- else:
34
- tts.tts_to_file(text, speaker=tts.speakers[0], language="en", file_path=file_path)
35
- return file_path
36
-
37
- def show_error(text):
38
- if text == "":
39
- return gr.update(visible=True, elem_id="warning", elem_classes="feedback"), gr.update(visible=False)
40
- else:
41
- return gr.update(visible=False), gr.update(visible=True)
42
-
43
-
44
- # def download_file():
45
- # return file_path
46
-
47
-
48
- title = "Voice-Cloning-Demo"
49
-
50
- def toggle(choice):
51
- if choice == "mic":
52
- return gr.update(visible=True, value=None), gr.update(visible=False, value=None)
53
- else:
54
- return gr.update(visible=False, value=None), gr.update(visible=True, value=None)
55
- def change_color(text_input):
56
- if len(text_input) == 0:
57
- return gr.update(elem_id="warning", autofocus=True)
58
- else:
59
- return gr.update(elem_id="alert", autofocus=False)
60
-
61
- def clear_color(text_input, radio,error_box):
62
- return gr.update(elem_id="alert"), gr.update(value="mic"), gr.update(visible=False)
63
-
64
-
65
-
66
- with gr.Blocks(css="footer {visibility: hidden}") as demo:
67
- with gr.Row():
68
- with gr.Column():
69
- text_input = gr.Textbox(label="Input the text", value="", max_lines=4, lines=4)
70
- radio = gr.Radio(["mic", "file"], value="mic",
71
- label="How would you like to upload your audio?")
72
- audio_input_mic = gr.Audio(label="Voice to clone", sources="microphone", type="filepath", visible=True)
73
- audio_input_file = gr.Audio(label="Voice to clone", type="filepath", visible=False)
74
-
75
- with gr.Row():
76
- with gr.Column():
77
- btn_clear = gr.ClearButton([text_input, radio, audio_input_file])
78
- with gr.Column():
79
- btn = gr.Button("Generate", variant="primary")
80
- with gr.Column():
81
- audio_output = gr.Audio(label="Output", visible=True, autoplay=True, show_share_button=False)
82
- # download_button = gr.DownloadButton(label="Download Audio", value=None, visible=True)
83
- error_box = gr.Textbox(label="WARNING", value="Input box cannot be blank!!", visible=False, container=True)
84
-
85
- # download_button.click(download_file, outputs=download_button)
86
- btn_clear.add(audio_output)
87
- btn.click(text_to_speech, inputs=[text_input, audio_input_mic, audio_input_file], outputs=audio_output)
88
- btn.click(show_error, text_input, [error_box, audio_output])
89
- radio.change(toggle, radio, [audio_input_mic, audio_input_file])
90
- btn_clear.click(clear_color, [text_input, radio, error_box], [text_input, radio, error_box])
91
- btn.click(change_color, text_input, text_input)
92
-
93
  demo.launch()
 
1
+ import gradio as gr
2
+ from TTS.api import TTS
3
+ import torch
4
+ import os
5
+ from pydub import AudioSegment
6
+
7
+ # ✅ Environment fixes
8
+ os.environ["COQUI_TOS_AGREED"] = "1"
9
+ os.environ["TTS_MODELS_PATH"] = "/home/user/app/coqui_models"
10
+
11
+ device = "cuda" if torch.cuda.is_available() else "cpu"
12
+
13
+ # ✅ Load XTTS model efficiently
14
+ try:
15
+ tts = TTS(
16
+ model_name="tts_models/multilingual/multi-dataset/xtts_v2",
17
+ progress_bar=True, # Speeds up processing
18
+ gpu=(device == "cuda") # ✅ Remove `trust_remote_code=True`
19
+ )
20
+ tts.to(device)
21
+ print(f"[INFO] XTTS model loaded successfully on {device}")
22
+ except Exception as e:
23
+ print(f"[ERROR] Failed to load XTTS model: {e}")
24
+ raise e
25
+
26
+ # ✅ Optimize MP3 to WAV conversion
27
+ def convert_mp3_to_wav(mp3_path: str) -> str:
28
+ wav_path = mp3_path.replace(".mp3", ".wav")
29
+ if not os.path.exists(wav_path):
30
+ try:
31
+ audio = AudioSegment.from_file(mp3_path)
32
+ audio.export(wav_path, format="wav")
33
+ except Exception as e:
34
+ print(f"[ERROR] MP3 conversion failed: {e}")
35
+ return None
36
+ return wav_path
37
+
38
+ # ✅ Fix Speaker File Handling & Text Processing for Long Inputs
39
+ def text_to_speech(text: str, speaker_wav: str, speaker_wav_file: str):
40
+ text = text.strip().replace("\n", " ")[:1500] # ✅ Supports up to 10+ lines
41
+
42
+ speaker_audio = speaker_wav_file or speaker_wav
43
+ if not text:
44
+ return None, "⚠️ Error: Text input is empty."
45
+ if not speaker_audio or not os.path.exists(speaker_audio):
46
+ return None, "⚠️ Error: No valid speaker audio provided."
47
+
48
+ if speaker_audio.endswith(".mp3"):
49
+ speaker_audio = convert_mp3_to_wav(speaker_audio)
50
+ if not speaker_audio:
51
+ return None, "⚠️ Error converting MP3 to WAV."
52
+
53
+ output_path = "output.wav"
54
+
55
+ try:
56
+ # ✅ Ensure correct tensor shape for attention mask
57
+ attention_mask = torch.ones((1, len(text.split())), dtype=torch.float32).to(device)
58
+
59
+ tts.tts_to_file(text=text, speaker_wav=speaker_audio, language="en", file_path=output_path, attention_mask=attention_mask)
60
+
61
+ if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
62
+ return output_path, ""
63
+ else:
64
+ return None, "⚠️ Error: Audio was not generated."
65
+ except Exception as e:
66
+ return None, f"⚠️ Error during synthesis: {str(e)}"
67
+
68
+ # ✅ Gradio UI setup
69
+ with gr.Blocks() as demo:
70
+ with gr.Row():
71
+ with gr.Column():
72
+ text_input = gr.Textbox(label="Enter text to clone", max_lines=15, lines=15) # ✅ Supports long input
73
+ radio = gr.Radio(["mic", "file"], value="mic", label="Upload speaker audio")
74
+ audio_input_mic = gr.Audio(label="Use Microphone", sources="microphone", type="filepath", visible=True)
75
+ audio_input_file = gr.Audio(label="Upload File (.wav/.mp3)", type="filepath", visible=False)
76
+
77
+ with gr.Row():
78
+ with gr.Column():
79
+ btn_clear = gr.ClearButton([text_input, radio, audio_input_file])
80
+ with gr.Column():
81
+ btn = gr.Button("Generate Voice", variant="primary")
82
+
83
+ with gr.Column():
84
+ audio_output = gr.Audio(label="Generated Voice", visible=True, autoplay=True)
85
+ error_box = gr.Textbox(label="Status", value="", visible=False)
86
+
87
+ btn.click(text_to_speech, inputs=[text_input, audio_input_mic, audio_input_file], outputs=[audio_output, error_box])
88
+
 
 
 
 
89
  demo.launch()