import os
import subprocess
import sys

def setup_environment():
    """Clone Tortoise-TTS if needed and install it together with Gradio.

    Side effects:
        * Clones the repository into ``./tortoise-tts`` when that path does
          not exist yet.
        * Changes the process working directory to ``tortoise-tts`` and
          leaves it there when the function returns.
        * Installs packages into the environment of the running interpreter
          (``sys.executable``).

    Raises:
        subprocess.CalledProcessError: If ``git`` or any ``pip`` invocation
            exits with a non-zero status (every call uses ``check=True``).
    """
    repo_dir = "tortoise-tts"

    # Clone the Tortoise-TTS repository if it doesn't exist
    if not os.path.exists(repo_dir):
        subprocess.run(["git", "clone", "https://github.com/neonbjb/tortoise-tts.git"], check=True)

    # Change directory to the cloned repository; the relative paths below
    # ("requirements.txt", ".") are resolved against it.
    os.chdir(repo_dir)

    # Always go through the current interpreter's pip so the packages land in
    # the same environment this script is running in.
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Install requirements from requirements.txt
    subprocess.run([*pip_install, "-r", "requirements.txt"], check=True)

    # Install the package itself. Invoking "setup.py install" directly is
    # deprecated by setuptools; "pip install ." is the supported replacement.
    subprocess.run([*pip_install, "."], check=True)

    # Install Gradio
    subprocess.run([*pip_install, "gradio"], check=True)

def main():
    """Set up Tortoise-TTS and launch the Gradio voice-cloning interface.

    ``setup_environment()`` is called first; it leaves the working directory
    inside the Tortoise-TTS checkout, which the relative ``tortoise/voices``
    lookup below relies on. Third-party packages are imported inside this
    function because they may only be installed by that setup step.

    Raises:
        ImportError: If the ``tortoise`` package cannot be imported after the
            setup step has run.
    """
    # Call the setup function to ensure everything is installed
    setup_environment()

    # Import Gradio only after setting up the environment
    import gradio as gr

    # Ensure the tortoise package is correctly imported. The voice/audio
    # loaders are module-level helpers in tortoise.utils.audio, not methods of
    # TextToSpeech.
    try:
        from tortoise.api import TextToSpeech
        from tortoise.utils.audio import load_audio, load_voice, load_voices
    except ImportError as e:
        raise ImportError("Tortoise TTS not found. Make sure it is correctly installed.") from e

    # Initialize the TextToSpeech instance
    tts = TextToSpeech()

    VOICE_OPTIONS = [
        "random",  # special option for random voice
        "custom_voice",  # special option for custom voice
        "disabled",  # special option for disabled voice
    ]

    # List the bundled voices once and share the result between the three
    # dropdowns; sorted so the menu order does not depend on the filesystem.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS

    def inference(text, emotion, prompt, preset, voice, mic_audio, voice_b, voice_c, seed):
        """Generate three speech candidates for ``text``.

        The parameter order mirrors the ``inputs`` list of the interface
        below, because Gradio passes component values positionally.

        Returns:
            A 4-tuple for the four audio outputs: the first reference clip of
            the selected voice as ``(22050, samples)`` (or ``None`` when no
            reference clip is available), followed by the three generated
            candidates as ``(24000, samples)``.

        Raises:
            gr.Error: If ``custom_voice`` is selected without recorded audio.
        """
        voices = [] if voice == "custom_voice" else [voice]
        voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

        # Prepend the emotion (or free-form prompt) in the bracketed form the
        # original script used.
        if emotion != "None/Custom":
            text = f"[I am really {emotion.lower()},] {text}"
        elif prompt.strip() != "":
            text = f"[{prompt},] {text}"

        custom_clip = None
        if voice == "custom_voice":
            if mic_audio is None:
                raise gr.Error("Please provide audio from mic when choosing custom voice")
            # Load at 22050 Hz so the clip matches the sample rate reported
            # for the reference-voice output below.
            custom_clip = load_audio(mic_audio, 22050)

        if len(voices) <= 1:
            if voice == "custom_voice":
                # NOTE(review): when exactly one extra voice is chosen next to
                # custom_voice it is ignored here (same as before this change)
                # — confirm whether it should be blended in instead.
                voice_samples, conditioning_latents = [custom_clip], None
            else:
                voice_samples, conditioning_latents = load_voice(voice)
        else:
            voice_samples, conditioning_latents = load_voices(voices)
            if voice == "custom_voice":
                voice_samples.append(custom_clip)

        # voice_samples may be None or empty when the loader supplies no
        # reference clips (presumably the case for "random" — verify against
        # tortoise.utils.audio); do not index or squeeze in that case.
        sample_voice = voice_samples[0] if voice_samples else None

        gen, _ = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            use_deterministic_seed=seed,
            return_deterministic_state=True,
            k=3,
        )

        if sample_voice is None:
            sample_output = None
        else:
            sample_output = (22050, sample_voice.squeeze().cpu().numpy())

        return (
            sample_output,
            (24000, gen[0].squeeze().cpu().numpy()),
            (24000, gen[1].squeeze().cpu().numpy()),
            (24000, gen[2].squeeze().cpu().numpy()),
        )

    # Create the Gradio interface. The order of `inputs` must match the
    # positional parameters of `inference`.
    interface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(lines=4, label="Text:"),
            gr.Radio(["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
                     value="None/Custom", label="Select emotion:"),
            gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:"),
            gr.Radio(["ultra_fast", "fast", "standard", "high_quality"],
                     value="fast", label="Preset mode:"),
            gr.Dropdown(
                choices=voice_choices,
                value="angie",  # Default voice
                label="Select voice:"
            ),
            gr.Audio(label="Record voice (when selected custom_voice):", type="filepath"),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select second voice:"
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select third voice:"
            ),
            gr.Number(value=0, precision=0, label="Seed (for reproducibility):"),
        ],
        outputs=[
            gr.Audio(label="Sample of selected voice (first):"),
            gr.Audio(label="Output [Candidate 1]:"),
            gr.Audio(label="Output [Candidate 2]:"),
            gr.Audio(label="Output [Candidate 3]:"),
        ],
        title="RJ VOICE CLONING",
        description="<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>",
        css=".gradio-container { background-color: black; color: orange; }"
    )

    # Launch the interface
    interface.launch(share=True)

# Run the app only when this file is executed as a script, so importing the
# module does not trigger main().
if __name__ == "__main__":
    main()