import gradio as gr
import torch
from pathlib import Path
from transformers import AutoProcessor, BarkModel
import scipy
from pytube import YouTube
from pydub import AudioSegment
from TTS.api import TTS
#import ffmpeg


# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
# model.enable_cpu_offload()

# All inference in this app runs on the CPU.
device = "cpu"

# Bark text processor and generation model, loaded once at import time.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
model = model.to(device)

# Choices offered by the UI dropdowns: speaker numbers "1".."10" and
# the language codes used to build the Bark voice preset name.
num_list = [str(number) for number in range(1, 11)]
lang_list = ["en", "de"]
#SAMPLE_RATE = 24_000
def run_bark(text, n, lang):
    """Synthesize ``text`` with Bark and return the path of the WAV file.

    ``n`` is the speaker number picked in the UI and ``lang`` the language
    code; the dropdown value is shifted down by one to form the preset name
    ``v2/{lang}_speaker_{n-1}``. The audio is written to ``bark_out.wav``
    in the current working directory.
    """
    voice_preset = f"v2/{lang}_speaker_{int(n)-1}"
    model_inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )

    print("generating")
    generation_options = {
        "coarse_temperature": 0.8,
        "temperature": 0.5,
        "do_sample": True,
    }
    waveform = model.generate(**model_inputs, **generation_options)

    # The sample rate comes from the model's own generation config.
    sample_rate = model.generation_config.sample_rate

    print("writing")
    samples = waveform.cpu().numpy().squeeze()
    scipy.io.wavfile.write("bark_out.wav", rate=sample_rate, data=samples)
    return "bark_out.wav"

_YOUR_TTS_MODEL = "tts_models/multilingual/multi-dataset/your_tts"
_tts_cache = {}


def _get_your_tts():
    """Return the shared YourTTS model, loading it on the first call only."""
    if "model" not in _tts_cache:
        _tts_cache["model"] = TTS(model_name=_YOUR_TTS_MODEL, progress_bar=False).to(device)
    return _tts_cache["model"]


def custom_bark(inp):
    """Speak ``inp`` using the voice of the reference clip ``Mid.mp3``.

    The reference clip is the file ``trim_clip`` exports. The synthesized
    English speech is written to ``output.wav`` in the working directory.

    Args:
        inp: Text to synthesize.

    Returns:
        The string ``"output.wav"``.

    Raises:
        FileNotFoundError: if ``Mid.mp3`` does not exist yet.
    """
    speaker_wav = Path("Mid.mp3")
    # Fail fast with a readable message instead of erroring deep inside TTS
    # after the (slow) model load.
    if not speaker_wav.is_file():
        raise FileNotFoundError(
            "Reference clip 'Mid.mp3' not found; load a YouTube URL and trim a clip first."
        )

    # Reuse one model instance across requests rather than rebuilding it per call.
    tts = _get_your_tts()
    tts.tts_to_file(inp, speaker_wav=str(speaker_wav), language="en", file_path="output.wav")
    return "output.wav"
    
def load_video_yt(vid):
    """Download a YouTube video and its audio track to fixed local files.

    The highest-resolution progressive MP4 stream is saved as ``tmp.mp4`` and
    an MP4 audio-only stream as ``tmp_aud.mp4``. The video length reported by
    pytube is printed.

    Args:
        vid: URL of the YouTube video.

    Returns:
        A 3-tuple ``(video_path, audio_path, "tmp_aud.mp4")`` where the first
        two items are the paths returned by pytube's ``download``.

    Raises:
        ValueError: if ``vid`` is empty, or the video offers no progressive
            MP4 stream or no MP4 audio-only stream.
    """
    if not vid or not vid.strip():
        raise ValueError("Please provide a YouTube URL.")

    yt = YouTube(vid.strip())
    streams = yt.streams

    video_stream = (
        streams.filter(progressive=True, file_extension="mp4")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError("No progressive MP4 video stream is available for this URL.")

    # The audio is saved (and later decoded) as MP4, so only accept an MP4
    # audio stream rather than whichever audio-only stream happens to be first.
    audio_stream = streams.filter(only_audio=True, file_extension="mp4").first()
    if audio_stream is None:
        raise ValueError("No MP4 audio-only stream is available for this URL.")

    video_path = video_stream.download(filename="tmp.mp4")
    audio_path = audio_stream.download(filename="tmp_aud.mp4")
    print(yt.length)
    return video_path, audio_path, "tmp_aud.mp4"

def _parse_timestamp(value):
    """Convert ``"SS"``, ``"MM:SS"`` or ``"HH:MM:SS"`` text into whole seconds."""
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"Invalid timestamp {value!r}; expected a value like '1:23'.")

    fields = value.strip().split(":")
    if len(fields) > 3:
        raise ValueError(f"Invalid timestamp {value!r}; expected a value like '1:23'.")

    try:
        numbers = [int(field) for field in fields]
    except ValueError:
        raise ValueError(f"Invalid timestamp {value!r}; expected a value like '1:23'.") from None
    if any(number < 0 for number in numbers):
        raise ValueError(f"Invalid timestamp {value!r}; components must not be negative.")

    # Fold the fields left to right: each step promotes the running total by
    # one unit (hours -> minutes -> seconds).
    seconds = 0
    for number in numbers:
        seconds = seconds * 60 + number
    return seconds


def trim_clip(clip, start_t, end_t):
    """Cut the downloaded audio between two timestamps and save it as ``Mid.mp3``.

    Args:
        clip: Accepted so the signature matches the Gradio event wiring, but
            not used; the audio is always read from ``tmp_aud.mp4``.
        start_t: Start of the excerpt as ``"SS"``, ``"MM:SS"`` or ``"HH:MM:SS"``.
        end_t: End of the excerpt, in the same format as ``start_t``.

    Returns:
        The string ``"Mid.mp3"``.

    Raises:
        ValueError: if a timestamp cannot be parsed or ``end_t`` is not after
            ``start_t``.
    """
    # Validate the timestamps before decoding any audio so bad input fails fast.
    # pydub slices in milliseconds, hence the conversion.
    start_ms = _parse_timestamp(start_t) * 1000
    end_ms = _parse_timestamp(end_t) * 1000
    if end_ms <= start_ms:
        raise ValueError("End time must be after start time.")

    song = AudioSegment.from_file("tmp_aud.mp4", format="mp4")
    excerpt = song[start_ms:end_ms]

    # export() hands back the output file object; close it so the handle is
    # not left open.
    exported = excerpt.export("Mid.mp3", format="mp3")
    exported.close()
    print("New Audio file is created and saved")

    return "Mid.mp3"
    
with gr.Blocks() as app:
    # Input side: the prompt text plus one tab per voice source.
    with gr.Column():
        in_text = gr.Textbox()

        # Tab 1: pick one of the Bark speaker presets.
        with gr.Tab("Default"):
            with gr.Row():
                speaker_num = gr.Dropdown(
                    label="Speaker Voice",
                    choices=num_list,
                    value="1",
                )
                speaker_lang = gr.Dropdown(
                    label="Speaker Language",
                    choices=lang_list,
                    value="en",
                )
            go_btn = gr.Button()

        # Tab 2: build a reference clip (microphone, upload or YouTube) and
        # synthesize with it.
        with gr.Tab("Upload"):
            with gr.Row():
                with gr.Column():
                    in_aud_mic = gr.Audio(source="microphone")
                    in_aud_file = gr.Audio(source="upload", interactive=True)
                    aud_file = gr.File()
                with gr.Column():
                    in_aud_yt = gr.Textbox(label="YouTube URL")
                    load_yt_btn = gr.Button("Load URL")
                with gr.Column():
                    with gr.Row():
                        start_time = gr.Textbox(
                            label="Start",
                            value="0:00",
                            placeholder="0:23",
                        )
                        end_time = gr.Textbox(
                            label="End",
                            value="0:01",
                            placeholder="1:12",
                        )
                    trim_clip_btn = gr.Button("Trim Clip")
                    trim_aud = gr.Audio(source="upload", interactive=False)
            alt_go_btn = gr.Button()
            yt_vid = gr.Video(type="filepath")

    # Output side: the synthesized audio.
    with gr.Column():
        out_audio = gr.Audio()

    # Event wiring.
    go_btn.click(
        fn=run_bark,
        inputs=[in_text, speaker_num, speaker_lang],
        outputs=out_audio,
    )
    load_yt_btn.click(
        fn=load_video_yt,
        inputs=in_aud_yt,
        outputs=[yt_vid, in_aud_file, aud_file],
    )
    trim_clip_btn.click(
        fn=trim_clip,
        inputs=[aud_file, start_time, end_time],
        outputs=trim_aud,
    )
    alt_go_btn.click(fn=custom_bark, inputs=in_text, outputs=out_audio)

app.launch()