import os
import uuid
from pathlib import Path

import gradio as gr
import numpy as np
import torch
from pytube import YouTube
from pydub import AudioSegment
from transformers import AutoProcessor, BarkModel

from bark import SAMPLE_RATE, generate_audio, preload_models
from bark.generation import SUPPORTED_LANGS

from TTS.api import TTS
import scipy

test_audio="./shufflin.wav"

uid = uuid.uuid4()

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)
num_list = ["1","2","3","4","5","6","7","8","9","10"]
lang_list = ["en","de"]
#SAMPLE_RATE = 24_000
DEBUG_MODE=False

AVAILABLE_PROMPTS = ["Unconditional", "Announcer"]
PROMPT_LOOKUP = {}
for _, lang in SUPPORTED_LANGS:
    for n in range(10):
        label = f"Speaker {n} ({lang})"
        AVAILABLE_PROMPTS.append(label)
        PROMPT_LOOKUP[label] = f"{lang}_speaker_{n}"
PROMPT_LOOKUP["Unconditional"] = None
PROMPT_LOOKUP["Announcer"] = "announcer"

def gen_tts(text, history_prompt):  # , temp_semantic, temp_waveform):
    history_prompt = PROMPT_LOOKUP[history_prompt]
    if DEBUG_MODE:
        audio_arr = np.zeros(SAMPLE_RATE)
    else:
        # , text_temp=temp_semantic, waveform_temp=temp_waveform)
        audio_arr = generate_audio(text, history_prompt=history_prompt)
    audio_arr = (audio_arr * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_arr)


def run_bark(text, n='1', lang='en'):
    uid=uuid.uuid4()
    #history_prompt = []
    semantic_prompt=f"v2/{lang}_speaker_{int(n)-1}"

        #text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
    inputs = processor(text=text,
        voice_preset = semantic_prompt,
        return_tensors="pt",
    )
    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature = 0.8, temperature = 0.5, do_sample=True
    )
    #speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    #sampling_rate = 24_000
    print("writing")
    scipy.io.wavfile.write(f"bark_out-{uid}.wav", rate=sampling_rate, data=speech_values.cpu().numpy().squeeze())
    return (f"bark_out-{uid}.wav")


#tts = TTS(model_name=f"tts_models/en/ljspeech/fast_pitch", progress_bar=False).to("cpu")
tts = TTS(model_name=f"tts_models/en/ljspeech/glow-tts", progress_bar=False).to("cpu")
print(TTS().list_models())
def bark_ez(text, n='1', land='en'):
    uid=uuid.uuid4()
    tts.tts_to_file(text, file_path=f"{uid}-output.wav")
    return (f'{uid}-output.wav')


def custom_bark(inp, tog, speaker,in_aud=None, trim_aud=None, in_aud_mic=None):
    if tog=="Custom":
        if in_aud_mic != None:
            speaker_wav=in_aud_mic
        if in_aud !=None and trim_aud==None:
            speaker_wav=in_aud
            #speaker_wav=Path(f"{uid}-tmp_aud.mp4")
        if trim_aud != None:
            speaker_wav=Path(f"{uid}-trim.wav")
        tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
        tts.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path=f"{uid}-output.wav")
        return (f"{uid}-output.wav")
    if tog=="Preset":
        return (bark_ez(inp,speaker))
def load_video_yt(vid):
    yt = YouTube(vid)
    vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename=f"{uid}-tmp.mp4")
    vid_aud = yt.streams.filter(only_audio=True)[0].download(filename=f"{uid}-tmp_aud.mp4")
    print (f'Video Length: {yt.length}')
    return vid, vid_aud, f"{uid}-tmp_aud.mp4"

def trim_clip(clip, start_t, end_t):
    clip = Path(f"{clip}")
    song = AudioSegment.from_file(f"{clip}", format="mp4")
    #song = AudioSegment.from_file(Path(f"{clip}"), format="mp4")
    start_min = int(start_t.split(":",1)[0])
    start_sec = int(start_t.split(":",1)[1])
    end_min = int(end_t.split(":",1)[0])
    end_sec = int(end_t.split(":",1)[1])
    start = ((start_min*60)+start_sec)*1000
    end = ((end_min*60)+end_sec)*1000
    song_clip = song[start: end]
    song_clip.export(f"{uid}-trim.wav", format="wav")
    print("New Audio file is created and saved")

    return f"{uid}-trim.wav"
def pre_aud(inp):
    print(inp)
    song = AudioSegment.from_file(Path(f'{inp}'), format="mp4")
    song.export(f"{uid}-tmp_aud.mp4", format="mp4")
    print(f'pre_aud:: {f"{uid}-tmp_aud.mp4"}')
    return inp
def tog_in(tog):
    if tog=="Preset":
        return (gr.update(visible=True),gr.update(visible=False))
    if tog=="Custom":
        return (gr.update(visible=False),gr.update(visible=True))   


with gr.Blocks() as app:
    with gr.Group():
        with gr.Row():
            in_text = gr.Textbox(lines = 6, max_lines = 20)
            with gr.Column():
                alt_go_btn = gr.Button()
                out_audio = gr.Audio(interactive=False)

        with gr.Row():
            gr.Markdown('''<H1> Audio Source:''')
        with gr.Row():
            tog = gr.Radio(label="Input Type", choices=["Preset","Custom"], value="Preset")    
    with gr.Group(visible=True) as group_1:
        speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
        options = gr.Dropdown(AVAILABLE_PROMPTS, value="Speaker 1 (en)", label="Acoustic Prompt", elem_id="speaker_option") 
        semantic_btn=gr.Button("Run Semantic")
    with gr.Group(visible=False) as group_2:      
        with gr.Row():
            with gr.Column():

                #in_aud_mic = gr.Audio(source='microphone')
                in_aud_file = gr.Audio(label = 'Audio Source', sources=['microphone','upload'], interactive = True,type='filepath', value=test_audio)
                aud_file = gr.File(interactive=False,visible=True)
                with gr.Row():
                    start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
                    end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(label = 'Trimmed Audio Source', sources=['upload'], interactive = False)              
            with gr.Column():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")    
                yt_vid = gr.Video(interactive=False)
                
    semantic_btn.click(gen_tts,[in_text,options],out_audio)
    tog.change(tog_in,tog,[group_1,group_2])
    #in_aud_file.change(pre_aud,in_aud_file,aud_file)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
    trim_clip_btn.click(trim_clip,[in_aud_file, start_time, end_time],trim_aud)
    alt_go_btn.click(custom_bark, [in_text,tog,speaker_num,in_aud_file,trim_aud], out_audio)

app.launch()