midhyaraj
/

voiceclone

Model card Files Files and versions Community

midhyaraj committed 28 days ago

Commit

34e0ba3

•

1 Parent(s): 99f6290

Update README.md

Browse files

Files changed (1) hide show

README.md +151 -0

README.md CHANGED Viewed

@@ -3,3 +3,154 @@ license: apache-2.0
 base_model:
 - nvidia/NVLM-D-72B
 ---

 base_model:
 - nvidia/NVLM-D-72B
 ---
+!pip install -U scipy
+!git clone https://github.com/neonbjb/tortoise-tts.git
+%cd tortoise-tts
+!pip install -r requirements.txt
+!python setup.py install
+!pip install gradio
+import os
+import gradio as gr
+import torchaudio
+import time
+from datetime import datetime
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_audio, load_voice, load_voices
+import os
+# Set the Gradio queue flag to disabled
+# NOTE(review): nothing in the visible code reads COMMANDLINE_ARGS; confirm that the
+# hosting environment actually consumes this variable.
+os.environ["COMMANDLINE_ARGS"] = "--no-gradio-queue"
+# Pseudo-voice names that main() appends to the entries listed from tortoise/voices
+# when it builds the voice dropdowns.
+VOICE_OPTIONS = [
+    "random",  # special option for random voice
+    "custom_voice",  # special option for custom voice; inference() then requires mic audio
+    "disabled",  # special option for disabled voice; inference() skips voice_b/voice_c set to this
+]
+def inference(text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed):
+    if voice != "custom_voice":
+        voices = [voice]
+    else:
+        voices = []
+    if voice_b != "disabled":
+        voices.append(voice_b)
+    if voice_c != "disabled":
+        voices.append(voice_c)
+    if emotion != "None/Custom":
+        text = f"[I am really {emotion.lower()},] {text}"
+    elif prompt.strip() != "":
+        text = f"[{prompt},] {text}"
+    c = None
+    if voice == "custom_voice":
+        if mic_audio is None:
+            raise gr.Error("Please provide audio from mic when choosing custom voice")
+        c = load_audio(mic_audio, 22050)
+    if len(voices) == 1 or len(voices) == 0:
+        if voice == "custom_voice":
+            voice_samples, conditioning_latents = [c], None
+        else:
+            voice_samples, conditioning_latents = load_voice(voice)
+    else:
+        voice_samples, conditioning_latents = load_voices(voices)
+        if voice == "custom_voice":
+            voice_samples.extend([c])
+    sample_voice = voice_samples[0] if len(voice_samples) else None
+    start_time = time.time()
+    gen, _ = tts.tts_with_preset(
+        text,
+        voice_samples=voice_samples,
+        conditioning_latents=conditioning_latents,
+        preset=preset,
+        use_deterministic_seed=seed,
+        return_deterministic_state=True,
+        k=3,
+    )
+    with open("Tortoise_TTS_Runs.log", "a") as f:
+        f.write(
+            f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
+        )
+    return (
+        (22050, sample_voice.squeeze().cpu().numpy()),
+        (24000, gen[0].squeeze().cpu().numpy()),
+        (24000, gen[1].squeeze().cpu().numpy()),
+        (24000, gen[2].squeeze().cpu().numpy()),
+    )
+def main():
+    # Custom HTML for the title
+    title_html = "<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>"
+    # Interface components
+    text = gr.Textbox(lines=4, label="Text:")
+    emotion = gr.Radio(
+        ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
+        value="None/Custom",
+        label="Select emotion:",
+        type="value",
+    )
+    prompt = gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:")
+    preset = gr.Radio(
+        ["ultra_fast", "fast", "standard", "high_quality"],
+        value="fast",
+        label="Preset mode (determines quality with tradeoff over speed):",
+        type="value",
+    )
+    voice = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="angie",  # Default voice
+        label="Select voice:",
+        type="value",
+    )
+    mic_audio = gr.Audio(
+        label="Record voice (when selected custom_voice):",
+        type="filepath"
+    )
+    voice_b = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="disabled",
+        label="(Optional) Select second voice:",
+        type="value",
+    )
+    voice_c = gr.Dropdown(
+        os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS,
+        value="disabled",
+        label="(Optional) Select third voice:",
+        type="value",
+    )
+    seed = gr.Number(value=0, precision=0, label="Seed (for reproducibility):")
+    selected_voice = gr.Audio(label="Sample of selected voice (first):")
+    output_audio_1 = gr.Audio(label="Output [Candidate 1]:")
+    output_audio_2 = gr.Audio(label="Output [Candidate 2]:")
+    output_audio_3 = gr.Audio(label="Output [Candidate 3]:")
+    # Create the Gradio interface
+    interface = gr.Interface(
+        fn=inference,
+        inputs=[text, emotion, prompt, voice, mic_audio, voice_b, voice_c, preset, seed],
+        outputs=[selected_voice, output_audio_1, output_audio_2, output_audio_3],
+        title="RJ VOICE CLONING",
+        description=title_html,
+        css=".gradio-container { background-color: black; color: orange; }"
+    )
+    # Launch the interface
+    interface.launch(share=True)
+if __name__ == "__main__":
+    tts = TextToSpeech()
+    with open("Tortoise_TTS_Runs.log", "a") as f:
+        f.write(
+            f"\n\n-------------------------Tortoise TTS Logs, {datetime.now()}-------------------------\n"
+        )
+    main()