midhyaraj committed on
Commit
a446a91
1 Parent(s): 34e0ba3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Environment bootstrap.
#
# The original lines were IPython magics (``!cmd`` / ``%cd``), which are a
# SyntaxError when this file runs as a plain ``app.py``.  The same steps are
# reproduced here with the standard library so the script works in both
# environments.  Unlike notebook ``!`` lines, a failing command now aborts
# start-up (``check=True``) instead of surfacing later as an ImportError.
import os
import subprocess
import sys

_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

subprocess.run(_PIP_INSTALL + ["-U", "scipy"], check=True)

# ``git clone`` fails if the target directory already exists, so skip it on
# subsequent runs.
if not os.path.isdir("tortoise-tts"):
    subprocess.run(
        ["git", "clone", "https://github.com/neonbjb/tortoise-tts.git"],
        check=True,
    )

# Equivalent of ``%cd tortoise-tts``: the rest of the script uses paths
# relative to the checkout (e.g. "tortoise/voices").
os.chdir("tortoise-tts")

subprocess.run(_PIP_INSTALL + ["-r", "requirements.txt"], check=True)
subprocess.run([sys.executable, "setup.py", "install"], check=True)
subprocess.run(_PIP_INSTALL + ["gradio"], check=True)

# A package installed while the interpreter is already running is not
# guaranteed to be importable, so expose the checkout itself on sys.path for
# the ``tortoise`` imports that follow.
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())
7
+
8
+ import os
9
+ import gradio as gr
10
+ import torchaudio
11
+ import time
12
+ from datetime import datetime
13
+ from tortoise.api import TextToSpeech
14
+ from tortoise.utils.audio import load_audio, load_voice, load_voices
15
+ import os
16
+
17
# Intended to disable the Gradio queue.
# NOTE(review): nothing in this file reads COMMANDLINE_ARGS -- confirm that
# the installed Gradio version actually honours this variable.
os.environ.update({"COMMANDLINE_ARGS": "--no-gradio-queue"})

# Pseudo-voices appended to the on-disk voice list in every voice dropdown:
#   "random"       -> special option for random voice
#   "custom_voice" -> special option for custom voice
#   "disabled"     -> special option for disabled voice
VOICE_OPTIONS = ["random", "custom_voice", "disabled"]
24
+
25
def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Synthesize ``text`` with Tortoise and return audio for the Gradio outputs.

    Args:
        text: Text to speak.
        emotion: Emotion choice. Any value other than "None/Custom" is
            prepended to the text as ``[I am really <emotion>,]``.
        prompt: Free-form cue, prepended as ``[<prompt>,]`` only when
            ``emotion`` is "None/Custom" and the prompt is not blank.
        voice: Primary voice name, or "custom_voice" to use ``mic_audio``.
        mic_audio: File path of the recorded clip; required when ``voice`` is
            "custom_voice".
        voice_b: Optional second voice name; "disabled" skips it.
        voice_c: Optional third voice name; "disabled" skips it.
        preset: Preset name forwarded to ``tts.tts_with_preset``.
        seed: Value forwarded as ``use_deterministic_seed``.

    Returns:
        A 4-tuple for the four audio outputs: a preview of the first voice
        sample as ``(22050, ndarray)`` -- or ``None`` when no raw sample is
        available -- followed by three generated candidates as
        ``(24000, ndarray)``.

    Raises:
        gr.Error: If "custom_voice" is selected without a recording, or if the
            recording would have to be blended with voices that only provide
            conditioning latents.
    """
    use_custom = voice == "custom_voice"

    # Named voices to load from disk; the custom recording is handled apart.
    voices = [] if use_custom else [voice]
    voices.extend(name for name in (voice_b, voice_c) if name != "disabled")

    if emotion != "None/Custom":
        text = f"[I am really {emotion.lower()},] {text}"
    elif prompt.strip():
        text = f"[{prompt},] {text}"

    custom_clip = None
    if use_custom:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        custom_clip = load_audio(mic_audio, 22050)

    if use_custom and not voices:
        voice_samples, conditioning_latents = [custom_clip], None
    elif use_custom:
        # Previously a single extra voice was silently dropped when combined
        # with the custom recording; blend it in like any other extra voice.
        voice_samples, conditioning_latents = load_voices(voices)
        if conditioning_latents is not None:
            # NOTE(review): assumes the loader returns latents *instead of*
            # raw samples for latent-only voices -- verify against
            # tortoise.utils.audio.load_voices.
            raise gr.Error(
                "The recorded custom voice cannot be combined with voices "
                "that only provide conditioning latents"
            )
        voice_samples = list(voice_samples or []) + [custom_clip]
    elif len(voices) == 1:
        voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)

    # The loaders can apparently hand back no raw samples (e.g. for "random"),
    # so guard instead of calling len()/squeeze() on None.
    sample_voice = voice_samples[0] if voice_samples else None

    start_time = time.time()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )
    elapsed = time.time() - start_time

    # Explicit encoding: the logged text is user input and may be non-ASCII.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {elapsed} | Seed: {seed}\n"
        )

    preview = None
    if sample_voice is not None:
        preview = (22050, sample_voice.squeeze().cpu().numpy())

    return (
        preview,
        (24000, gen[0].squeeze().cpu().numpy()),
        (24000, gen[1].squeeze().cpu().numpy()),
        (24000, gen[2].squeeze().cpu().numpy()),
    )
81
+
82
def main():
    """Build the Gradio interface around ``inference`` and launch it.

    The voice dropdowns offer every entry found under ``tortoise/voices``
    (relative to the current working directory) followed by the special
    ``VOICE_OPTIONS``.  The directory is listed once and sorted so all three
    dropdowns show the same, deterministic ordering; ``os.listdir`` alone
    returns entries in arbitrary order.
    """
    # Custom HTML for the title
    title_html = "<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>"

    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS

    # Interface components
    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    # Each dropdown gets its own copy of the choices, as in the original where
    # every component received a separately built list.
    voice = gr.Dropdown(
        list(voice_choices),
        value="angie",  # Default voice
        label="Select voice:",
        type="value",
    )
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        type="filepath",
    )
    voice_b = gr.Dropdown(
        list(voice_choices),
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    voice_c = gr.Dropdown(
        list(voice_choices),
        value="disabled",
        label="(Optional) Select third voice:",
        type="value",
    )
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    # Create the Gradio interface; input order must match inference()'s
    # positional parameters.
    interface = gr.Interface(
        fn=inference,
        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
        title="RJ VOICE CLONING",
        description=title_html,
        css=".gradio-container { background-color: black; color: orange; }",
    )

    # Launch the interface
    interface.launch(share=True)
142
+
143
if __name__ == "__main__":
    # Module-level model handle; inference() looks up ``tts`` as a global.
    tts = TextToSpeech()

    # Append a banner marking the start of this session to the run log
    # before the interface starts serving requests.
    with open("Tortoise_TTS_Runs.log", "a") as log_file:
        session_banner = f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
        log_file.write(session_banner)

    main()