tahirturk committed on
Commit
f12ab45
·
verified ·
1 Parent(s): 87119c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -33
app.py CHANGED
@@ -1,23 +1,24 @@
1
  import spaces
2
  import gradio as gr
3
  import torch
4
- from f5_tts.api import F5TTS
5
  import os
6
  from pydub import AudioSegment
7
  import re
8
 
9
  os.makedirs("audio", exist_ok=True)
10
 
 
 
 
11
  # Auto-detect device
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
-
14
- # Load ultra-realistic model (F5-TTS)
15
- tts = F5TTS(device=device)
16
 
17
  # Function for long text voice cloning
18
  @spaces.GPU(enable_queue=True)
19
  def clone(text, audio):
20
- # Split text into smaller chunks (sentences or short phrases)
21
  sentences = re.split(r'(?<=[.!?]) +', text)
22
  final_audio = AudioSegment.silent(duration=0)
23
 
@@ -25,28 +26,23 @@ def clone(text, audio):
25
  if not chunk.strip():
26
  continue
27
  temp_path = f"chunk_{i}.wav"
28
-
29
- # Generate speech from cloned voice using F5-TTS
30
- tts.infer(
31
- ref_audio_path=audio,
32
  text=chunk,
33
- output_path=temp_path,
34
- speaker_id=0,
35
  language="en",
36
- emotion="neutral"
37
  )
38
-
39
  final_audio += AudioSegment.from_wav(temp_path)
40
 
41
- # Merge chunks into one final audio file
42
  output_path = "./output.wav"
43
  final_audio.export(output_path, format="wav")
44
  return output_path
45
 
46
-
47
- # ==================== UI SECTION ====================
48
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="cyan", neutral_hue="slate")) as demo:
49
 
 
50
  gr.HTML("""
51
  <style>
52
  body {
@@ -106,33 +102,33 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="cyan", ne
106
  with gr.Column(scale=1):
107
  gr.Markdown(
108
  """
109
- # 🎙️ Ultra Realistic Voice Clone Studio (F5-TTS)
110
- Clone **any voice** with high emotional realism using **F5-TTS**.
111
- Upload a short reference audio and type what you want it to say.
112
- **Supports English and Multilingual Texts.**
113
  """
114
  )
115
 
116
  text_input = gr.Textbox(
117
  label="Enter your text",
118
- placeholder="Type what you'd like the cloned voice to say...",
119
  lines=6
120
  )
121
  audio_input = gr.Audio(
122
  type="filepath",
123
  label="Upload voice reference (WAV or MP3)"
124
  )
125
- submit_btn = gr.Button("✨ Generate Ultra-Realistic Voice", variant="primary")
126
 
127
  with gr.Column(scale=1):
128
  output_audio = gr.Audio(type="filepath", label="🔊 Generated Voice Output")
129
  gr.Markdown(
130
  """
131
  ---
132
- ⚡ **Pro Tips**
133
- - Use **clean 5–15 sec** reference audio for best results.
134
- - Long text is automatically chunked for natural flow.
135
- - You can now generate **multi-minute, realistic speech**.
136
  ---
137
  """
138
  )
@@ -140,16 +136,16 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="cyan", ne
140
  with gr.Row():
141
  gr.Examples(
142
  examples=[
143
- ["Hey! It's me Dorthy, from the Wizard of Oz. Type what you'd like me to say.", "./audio/Wizard-of-Oz-Dorthy.wav"],
144
- ["It's me Vito Corleone from The Godfather.", "./audio/Godfather.wav"],
145
- ["Hey, it's me Paris Hilton!", "./audio/Paris-Hilton.mp3"],
146
- ["Hey, it's me Megan Fox from Transformers.", "./audio/Megan-Fox.mp3"],
147
- ["Hey there, it's me Jeff Goldblum.", "./audio/Jeff-Goldblum.mp3"],
148
- ["Hey there, it's me Heath Ledger as the Joker.", "./audio/Heath-Ledger.mp3"],
149
  ],
150
  inputs=[text_input, audio_input],
151
  outputs=[output_audio],
152
- label="🎭 Try these sample voices"
153
  )
154
 
155
  submit_btn.click(fn=clone, inputs=[text_input, audio_input], outputs=output_audio)
 
1
  import spaces
2
  import gradio as gr
3
  import torch
4
+ from TTS.api import TTS
5
  import os
6
  from pydub import AudioSegment
7
  import re
8
 
9
  os.makedirs("audio", exist_ok=True)
10
 
11
+ # Agree to Coqui TTS license
12
+ os.environ["COQUI_TOS_AGREED"] = "1"
13
+
14
  # Auto-detect device
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 
 
17
 
18
  # Function for long text voice cloning
19
  @spaces.GPU(enable_queue=True)
20
  def clone(text, audio):
21
+ # Split input into sentences/phrases
22
  sentences = re.split(r'(?<=[.!?]) +', text)
23
  final_audio = AudioSegment.silent(duration=0)
24
 
 
26
  if not chunk.strip():
27
  continue
28
  temp_path = f"chunk_{i}.wav"
29
+ tts.tts_to_file(
 
 
 
30
  text=chunk,
31
+ speaker_wav=audio,
 
32
  language="en",
33
+ file_path=temp_path
34
  )
 
35
  final_audio += AudioSegment.from_wav(temp_path)
36
 
37
+ # Merge chunks into one file
38
  output_path = "./output.wav"
39
  final_audio.export(output_path, format="wav")
40
  return output_path
41
 
42
+ # UI
 
43
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="cyan", neutral_hue="slate")) as demo:
44
 
45
+ # Custom CSS
46
  gr.HTML("""
47
  <style>
48
  body {
 
102
  with gr.Column(scale=1):
103
  gr.Markdown(
104
  """
105
+ # 🎙️ Voice Clone Studio By Tahir Turk
106
+ Clone any voice by uploading a short reference audio file
107
+ and typing what you want it to say.
108
+ **Powered by XTTS v2 — multilingual voice cloning.**
109
  """
110
  )
111
 
112
  text_input = gr.Textbox(
113
  label="Enter your text",
114
+ placeholder="Type anything you'd like the cloned voice to say...",
115
  lines=6
116
  )
117
  audio_input = gr.Audio(
118
  type="filepath",
119
  label="Upload voice reference (WAV or MP3)"
120
  )
121
+ submit_btn = gr.Button("✨ Generate Voice", variant="primary")
122
 
123
  with gr.Column(scale=1):
124
  output_audio = gr.Audio(type="filepath", label="🔊 Generated Voice Output")
125
  gr.Markdown(
126
  """
127
  ---
128
+ ⚡ **Tips for Best Results**
129
+ - Use a **clean, clear** reference audio (5–15 seconds works best).
130
+ - Long text will be split automatically for natural speech.
131
+ - You can generate **minutes of audio** now without cutoff.
132
  ---
133
  """
134
  )
 
136
  with gr.Row():
137
  gr.Examples(
138
  examples=[
139
+ ["Hey! It's me Dorthy, from the Wizard of Oz. Type in whatever you'd like me to say.", "./audio/Wizard-of-Oz-Dorthy.wav"],
140
+ ["It's me Vito Corleone, from the Godfather. Type in whatever you'd like me to say.", "./audio/Godfather.wav"],
141
+ ["Hey, it's me Paris Hilton. Type in whatever you'd like me to say.", "./audio/Paris-Hilton.mp3"],
142
+ ["Hey, it's me Megan Fox from Transformers. Type in whatever you'd like me to say.", "./audio/Megan-Fox.mp3"],
143
+ ["Hey there, it's me Jeff Goldblum. Type in whatever you'd like me to say.", "./audio/Jeff-Goldblum.mp3"],
144
+ ["Hey there, it's me Heath Ledger as the Joker. Type in whatever you'd like me to say.", "./audio/Heath-Ledger.mp3"],
145
  ],
146
  inputs=[text_input, audio_input],
147
  outputs=[output_audio],
148
+ label="🎭 Try with these sample voices"
149
  )
150
 
151
  submit_btn.click(fn=clone, inputs=[text_input, audio_input], outputs=output_audio)