midhyaraj committed on
Commit
34e0ba3
1 Parent(s): 99f6290

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +151 -0
README.md CHANGED
@@ -3,3 +3,154 @@ license: apache-2.0
3
  base_model:
4
  - nvidia/NVLM-D-72B
5
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  base_model:
4
  - nvidia/NVLM-D-72B
5
  ---
6
+ !pip install -U scipy
7
+ !git clone https://github.com/neonbjb/tortoise-tts.git
8
+ %cd tortoise-tts
9
+ !pip install -r requirements.txt
10
+ !python setup.py install
11
+ !pip install gradio
12
+
13
+ import os
14
+ import gradio as gr
15
+ import torchaudio
16
+ import time
17
+ from datetime import datetime
18
+ from tortoise.api import TextToSpeech
19
+ from tortoise.utils.audio import load_audio, load_voice, load_voices
20
+ import os
21
+
22
# Export a "--no-gradio-queue" flag through the COMMANDLINE_ARGS environment
# variable.
# NOTE(review): nothing in this file reads COMMANDLINE_ARGS; confirm that some
# component actually consumes it, otherwise this assignment has no effect.
os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"

# Pseudo-voices offered in every voice dropdown in addition to the voice
# folders found on disk. inference() treats "custom_voice" and "disabled"
# specially.
VOICE_OPTIONS = [
    "random",  # special option for random voice
    "custom_voice",  # special option for custom voice
    "disabled",  # special option for disabled voice
]
29
+
30
def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
    """Synthesize three speech candidates for ``text`` with the selected voice(s).

    Relies on the module-level ``tts`` object created in the ``__main__``
    section of this script.

    Args:
        text: Text to speak.
        emotion: Emotion label; anything other than "None/Custom" is turned
            into an "[I am really <emotion>,]" prefix on the text.
        prompt: Free-form prefix used only when ``emotion`` is "None/Custom"
            and the prompt is not blank.
        voice: Primary voice name, or "custom_voice" to use ``mic_audio``.
        mic_audio: Path to the recorded audio file; required for
            "custom_voice".
        voice_b: Optional second voice name, or "disabled".
        voice_c: Optional third voice name, or "disabled".
        preset: Quality preset name forwarded to ``tts.tts_with_preset``.
        seed: Value forwarded as ``use_deterministic_seed``.

    Returns:
        A 4-tuple for the four audio outputs: the first reference sample as
        ``(22050, ndarray)`` (or ``None`` when no reference sample is
        available), followed by the three generated candidates as
        ``(24000, ndarray)``.

    Raises:
        gr.Error: If "custom_voice" is selected but no audio was provided.
    """
    # Named voices to load from disk; the custom recording is handled separately.
    voices = [] if voice == "custom_voice" else [voice]
    voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

    # Prefix the text with a bracketed emotion or custom prompt.
    if emotion != "None/Custom":
        text = f"[I am really {emotion.lower()},] {text}"
    elif prompt.strip() != "":
        text = f"[{prompt},] {text}"

    custom_clip = None
    if voice == "custom_voice":
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        custom_clip = load_audio(mic_audio, 22050)

    if len(voices) <= 1:
        if voice == "custom_voice":
            # NOTE(review): when exactly one extra voice (voice_b or voice_c)
            # is selected together with custom_voice, that extra voice is not
            # loaded here and only the recording is used. Confirm whether
            # this is intended.
            voice_samples, conditioning_latents = [custom_clip], None
        else:
            voice_samples, conditioning_latents = load_voice(voice)
    else:
        voice_samples, conditioning_latents = load_voices(voices)
        if voice == "custom_voice":
            voice_samples.append(custom_clip)

    # The loader may hand back no samples (presumably e.g. for the "random"
    # voice -- confirm against tortoise.utils.audio); a truthiness check
    # covers both None and an empty list without raising.
    sample_voice = voice_samples[0] if voice_samples else None

    start_time = time.time()
    gen, _ = tts.tts_with_preset(
        text,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset=preset,
        use_deterministic_seed=seed,
        return_deterministic_state=True,
        k=3,
    )

    # Explicit UTF-8 so non-ASCII input text cannot break logging on
    # platforms whose default encoding is narrower.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
        )

    # Return None for the sample slot instead of crashing when there is no
    # reference clip to preview.
    if sample_voice is None:
        sample_output = None
    else:
        sample_output = (22050, sample_voice.squeeze().cpu().numpy())

    return (
        sample_output,
        (24000, gen[0].squeeze().cpu().numpy()),
        (24000, gen[1].squeeze().cpu().numpy()),
        (24000, gen[2].squeeze().cpu().numpy()),
    )
86
+
87
def main():
    """Build the Gradio voice-cloning interface and launch it with sharing enabled.

    Voice choices are the entries of the relative ``tortoise/voices``
    directory plus the special ``VOICE_OPTIONS`` entries, so the function
    must be run from a working directory that contains that folder.
    """
    # Custom HTML for the title
    title_html = "<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>"

    # List the voices directory once and sort it: os.listdir returns entries
    # in arbitrary order, which made the dropdown ordering unpredictable.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS
    # Fall back to the always-present "random" entry if the usual default
    # voice folder is missing.
    default_voice = "angie" if "angie" in voice_choices else "random"

    # Interface components
    text = gr.Textbox(lines=4, label="Text:")
    emotion = gr.Radio(
        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
        value="None/Custom",
        label="Select emotion:",
        type="value",
    )
    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
    preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        value="fast",
        label="Preset mode (determines quality with tradeoff over speed):",
        type="value",
    )
    voice = gr.Dropdown(
        voice_choices,
        value=default_voice,  # Default voice
        label="Select voice:",
        type="value",
    )
    mic_audio = gr.Audio(
        label="Record voice (when selected custom_voice):",
        type="filepath",
    )
    voice_b = gr.Dropdown(
        voice_choices,
        value="disabled",
        label="(Optional) Select second voice:",
        type="value",
    )
    voice_c = gr.Dropdown(
        voice_choices,
        value="disabled",
        label="(Optional) Select third voice:",
        type="value",
    )
    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")

    selected_voice = gr.Audio(label="Sample of selected voice (first):")
    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")

    # Create the Gradio interface; the input order must match the parameter
    # order of inference().
    interface = gr.Interface(
        fn=inference,
        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
        title="RJ VOICE CLONING",
        description=title_html,
        css=".gradio-container { background-color: black; color: orange; }",
    )

    # Launch the interface
    interface.launch(share=True)
147
+
148
if __name__ == "__main__":
    # Module-level model handle; inference() looks this name up as a global.
    tts = TextToSpeech()

    # Write a session header to the run log. Explicit UTF-8 keeps the file
    # encoding independent of the platform default.
    with open("Tortoise_TTS_Runs.log", "a", encoding="utf-8") as f:
        f.write(
            f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
        )

    main()