CodingBillionaire committed
Commit 2f468d9 · 1 Parent(s): 5b9a279

Create app.py

Files changed (1)
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
+import os
+import gradio as gr
+import torchaudio
+import time
+from datetime import datetime
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_audio, load_voice, load_voices
+
+VOICE_OPTIONS = [
+    "random",  # special option for random voice
+    "custom_voice",  # special option for custom voice
+    "disabled",  # special option for disabled voice
+]
+
+
+def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
+    if voice != "custom_voice":
+        voices = [voice]
+    else:
+        voices = []
+
+    if voice_b != "disabled":
+        voices.append(voice_b)
+    if voice_c != "disabled":
+        voices.append(voice_c)
+
+    if emotion != "None/Custom":
+        text = f"[I am really {emotion.lower()},] {text}"
+    elif prompt.strip() != "":
+        text = f"[{prompt},] {text}"
+
+    c = None
+    if voice == "custom_voice":
+        if mic_audio is None:
+            raise gr.Error("Please provide audio from mic when choosing custom voice")
+        c = load_audio(mic_audio, 22050)
+
+
+    if len(voices) == 1 or len(voices) == 0:
+        if voice == "custom_voice":
+            voice_samples, conditioning_latents = [c], None
+        else:
+            voice_samples, conditioning_latents = load_voice(voice)
+    else:
+        voice_samples, conditioning_latents = load_voices(voices)
+        if voice == "custom_voice":
+            voice_samples.extend([c])
+
+    sample_voice = voice_samples[0] if len(voice_samples) else None
+
+    start_time = time.time()
+    gen, _ = tts.tts_with_preset(
+        text,
+        voice_samples=voice_samples,
+        conditioning_latents=conditioning_latents,
+        preset=preset,
+        use_deterministic_seed=seed,
+        return_deterministic_state=True,
+        k=3,
+    )
+
+    with open("Tortoise_TTS_Runs.log", "a") as f:
+        f.write(
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
+        )
+
+    return (
+        (22050, sample_voice.squeeze().cpu().numpy()),
+        (24000, gen[0].squeeze().cpu().numpy()),
+        (24000, gen[1].squeeze().cpu().numpy()),
+        (24000, gen[2].squeeze().cpu().numpy()),
+    )
+
+
+def main():
+    text = gr.Textbox(lines=4, label="Text:")
+    emotion = gr.Radio(
+        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
+        value="None/Custom",
+        label="Select emotion:",
+        type="value",
+    )
+    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
+    preset = gr.Radio(
+        ["ultra_fast", "fast", "standard", "high_quality"],
+        value="fast",
+        label="Preset mode (determines quality with tradeoff over speed):",
+        type="value",
+    )
+    voice = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="angie",
+        label="Select voice:",
+        type="value",
+    )
+    mic_audio = gr.Audio(
+        label="Record voice (when selected custom_voice):",
+        source="microphone",
+        type="filepath",
+    )
+    voice_b = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="disabled",
+        label="(Optional) Select second voice:",
+        type="value",
+    )
+    voice_c = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="disabled",
+        label="(Optional) Select third voice:",
+        type="value",
+    )
+    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")
+
+    selected_voice = gr.Audio(label="Sample of selected voice (first):")
+    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
+    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
+    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")
+
+    interface = gr.Interface(
+        fn=inference,
+        inputs=[
+            text,
+            emotion,
+            prompt,
+            voice,
+            mic_audio,
+            voice_b,
+            voice_c,
+            preset,
+            seed,
+        ],
+        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
+    )
+    interface.launch(share=True)
+
+
+if __name__ == "__main__":
+    tts = TextToSpeech()
+
+    with open("Tortoise_TTS_Runs.log", "a") as f:
+        f.write(
+            f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
+        )
+
+    main()
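
For context, the Tortoise call that inference() wraps can be exercised without the Gradio UI. Below is a minimal sketch, assuming tortoise-tts and torchaudio are installed and the stock "angie" voice is available; it is illustrative only and not part of this commit:

import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()
# Reference clips (and cached conditioning latents, if any) for a stock voice
voice_samples, conditioning_latents = load_voice("angie")
# Single candidate with the "fast" preset; app.py above requests k=3 candidates
gen = tts.tts_with_preset(
    "Hello from Tortoise.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset="fast",
)
# Tortoise produces 24 kHz audio, matching the output sample rate used in inference()
torchaudio.save("generated.wav", gen.squeeze(0).cpu(), 24000)

Running app.py itself launches the Gradio interface with share=True, so it prints a temporary public link alongside the local URL.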