from TTS.api import TTS
import json
import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js
import os
import shutil
import re
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write, read
from pydub import AudioSegment

file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
MAX_NUMBER_SENTENCES = 10

# Character presets shipped with the demo (image, display title, speaker id).
with open("characters.json", "r") as file:
    data = json.load(file)
    characters = [
        {
            "image": item["image"],
            "title": item["title"],
            "speaker": item["speaker"]
        }
        for item in data
    ]

# Coqui TTS wrapper around the multilingual Bark model, running on CPU.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=False)


def load_hidden_mic(audio_in):
    # Remove folders left over from a previous recording so the new sample
    # does not get mixed with stale speaker files.
    print("USER RECORDED A NEW SAMPLE")

    library_path = 'bark_voices'
    folder_name = 'audio-0-100'
    second_folder_name = 'audio-0-100_cleaned'

    folder_path = os.path.join(library_path, folder_name)
    second_folder_path = os.path.join(library_path, second_folder_name)

    print("We need to clean previous util files, if needed:")
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(
                f"Successfully deleted the folder previously created from the last raw recorded sample: {folder_path}")
        except OSError as e:
            print(f"Error: {folder_path} - {e.strerror}")
    else:
        print(
            f"OK, the folder for a raw recorded sample does not exist: {folder_path}")

    if os.path.exists(second_folder_path):
        try:
            shutil.rmtree(second_folder_path)
            print(
                f"Successfully deleted the folder previously created from the last cleaned recorded sample: {second_folder_path}")
        except OSError as e:
            print(f"Error: {second_folder_path} - {e.strerror}")
    else:
        print(
            f"OK, the folder for a cleaned recorded sample does not exist: {second_folder_path}")

    return audio_in


def infer(hidden_numpy_audio):
    print(""" ————— NEW INFERENCE: ——————— """)

    # Write the recorded sample where Coqui TTS looks for the speaker:
    # bark_voices/<speaker>/<sample>.wav. The fixed folder name matches the
    # one cleaned up in load_hidden_mic(); Bark computes (and caches) its
    # speaker prompts from this folder on first use.
    file_name = "audio-0-100"
    speaker_dir = os.path.join("bark_voices", file_name)
    os.makedirs(speaker_dir, exist_ok=True)

    # Gradio's numpy audio is a (sample_rate, ndarray) tuple.
    sample_rate, audio_data = hidden_numpy_audio
    write(os.path.join(speaker_dir, f"{file_name}.wav"), sample_rate, audio_data)

    prompt = "Hi mom, I have a broken tire and need an e-transfer. Can you send me some money please?"
    gr.Info("Generating audio from prompt")

    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # Must match the click() wiring below: reveal the cloned audio player
    # and return the waveform video.
    return gr.update(value="output.wav", visible=True), tts_video


css = """
.mic-wrap > button {
    width: 100%;
    height: 60px;
    font-size: 1.4em!important;
}
.record-icon.svelte-1thnwz {
    display: flex;
    position: relative;
    margin-right: var(--size-2);
    width: unset;
    height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
    width: 20px!important;
    height: 20px!important;
}
"""

html_header = """

# Coqui + Bark Voice Cloning

Mimic any voice in less than 2 minutes with this Coqui TTS + Bark demo!

Record a clean 20-second voice sample using the microphone provided.

The hard-coded TTS prompt is: “Hi mom, I have a broken tire and need an e-transfer. Can you send me some money please?”
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(html_header)

    micro_in = gr.Audio(
        label="Record voice to clone",
        type="filepath",
        source="microphone",
        interactive=True
    )
    # Hidden numpy copy of the recording, filled by stop_recording() so that
    # infer() can write the sample into the bark_voices/ speaker folder.
    hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
    micro_submit_btn = gr.Button("Submit")

    micro_in.stop_recording(fn=load_hidden_mic,
                            inputs=[micro_in],
                            outputs=[hidden_audio_numpy],
                            queue=False)

    cloned_out = gr.Audio(
        label="Text to speech output",
        visible=False
    )
    video_out = gr.Video(
        label="Waveform video",
        elem_id="voice-video-out"
    )

    micro_submit_btn.click(
        fn=infer,
        inputs=[hidden_audio_numpy],
        outputs=[cloned_out, video_out]
    )

demo.queue(api_open=False, max_size=10).launch()