import whisper
import gradio as gr
import ffmpeg
from yt_dlp import YoutubeDL
import os

# Numeric YouTube format ids, compared against the "format_id" values that
# yt-dlp reports for a video.
# NOTE(review): presumably the ids YouTube uses for live (HLS) streams -
# confirm against the yt-dlp format listing of a live video.
youtube_livestream_codes = [*range(91, 97), 300, 301]

# Format ids accepted as "recorded video" formats by parse_metadata().
# NOTE(review): the order below is kept from the original list; the visible
# consumer sorts the matches, so it does not appear to be significant.
youtube_mp4_codes = [298, 18, 22, 140, 133, 134]

import sys

def get_video_metadata(video_url: str = "https://www.youtube.com/watch?v=21X5lGlDOfg&ab_channel=NASA")-> dict:
    """Look up the yt-dlp info dict for ``video_url`` without downloading it.

    The video title and uploader id are printed as a one-line progress
    message, and the complete info dict is returned to the caller.
    """
    ydl_options = {'outtmpl': '%(id)s.%(ext)s'}
    with YoutubeDL(ydl_options) as downloader:
        # download=False: only the metadata is fetched, no media file is written.
        metadata = downloader.extract_info(video_url, download=False)
        title = metadata.get('title')
        uploader = metadata.get('uploader_id')
        print(f"[youtube] {title}: {uploader}")
    return metadata


def parse_metadata(metadata) -> dict:
    """
    Pick the format id to download from a yt-dlp info dict.

    Only formats whose "ext" is "mp4" are considered. A recorded-video id
    (one listed in ``youtube_mp4_codes``) is preferred; when none is present
    the function falls back to a livestream id (``youtube_livestream_codes``).
    After a video is done recording, it will have both the livestream format
    and the mp4 format, which is why the mp4 id wins when both exist.

    Returns a dict with:
        "selected_id": the smallest matching numeric format id, or None when
            neither code table matches any available format.
        "is_livestream": False when a recorded-video id was selected,
            True otherwise.
    """
    numeric_ids = set()
    for fmt in metadata.get("formats", []):
        if fmt.get("ext", "") != "mp4":
            continue
        try:
            numeric_ids.add(int(fmt.get("format_id", 0)))
        except (TypeError, ValueError):
            # Format ids are not guaranteed to be numeric; an id that is not
            # a plain number can never match the numeric code tables anyway.
            continue

    video_entries = sorted(numeric_ids.intersection(youtube_mp4_codes))
    if video_entries:
        # use video format id over livestream id if available
        return {
            "selected_id": video_entries[0],
            "is_livestream": False,
        }

    # No recorded-video format: fall back to a livestream format. Previously
    # selected_id was left unassigned here and the return raised
    # UnboundLocalError.
    live_entries = sorted(numeric_ids.intersection(youtube_livestream_codes))
    return {
        "selected_id": live_entries[0] if live_entries else None,
        "is_livestream": True,
    }

def get_video(url: str, config: dict):
    """
    Copy the beginning of the media at ``url`` into a local file with ffmpeg.

    Args:
        url: Media URL (or path) that ffmpeg is able to read.
        config: Options dict. "filename" is the output path (default
            "livestream01.mp4"); "end" is passed to ffmpeg as the ``-t``
            input option, i.e. how much of the input to read
            (default "00:15:00").

    An existing output file is overwritten.
    """
    filename = config.get("filename", "livestream01.mp4")
    end = config.get("end", "00:15:00")
    (
        ffmpeg
        .input(url, t=end)
        .output(filename)
        # Without overwrite_output ffmpeg asks for confirmation (or aborts when
        # it has no terminal) if the file already exists, which breaks every
        # call after the first one that reuses the same filename.
        .run(overwrite_output=True)
    )

def get_all_files(url: str, end: str = "00:15:00", filename: str = "temp.mp4"):
    """
    Download the first ``end`` of the video at ``url`` and return the file path.

    The video metadata is fetched, a format id is chosen by parse_metadata(),
    and the direct URL of that format is handed to get_video().

    Args:
        url: Page URL of the video.
        end: Duration to download, forwarded to get_video() as "end".
        filename: Local output path (defaults to the previously hard-coded
            "temp.mp4").

    Returns:
        ``filename``.

    Raises:
        IndexError: if the metadata contains no format with the selected id.
    """
    metadata = get_video_metadata(url)
    selected_id = parse_metadata(metadata).get("selected_id", 0)
    wanted_id = str(selected_id)  # format ids are compared as strings
    selected_format = next(
        (f for f in metadata.get("formats", []) if f.get("format_id", "") == wanted_id),
        None,
    )
    if selected_format is None:
        # Same exception type the bare ``[...][0]`` lookup used to raise, but
        # with a message that says what went wrong.
        raise IndexError(f"no format with id {wanted_id!r} is available for {url}")
    get_video(selected_format.get("url", ""), {"filename": filename, "end": end})
    return filename

def _second_to_timecode(seconds) -> str:
    """Format an offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    total_ms = max(0, int(round(float(seconds or 0) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def get_text_from_mp3_whisper(inputType:str, mp3_file: str, url_path: str, taskName: str, srcLanguage: str)->tuple:
    """
    Transcribe or translate a media file with Whisper and burn in subtitles.

    Args:
        inputType: "url" to download the media from ``url_path``; any other
            value uses the local file ``mp3_file``.
        mp3_file: Path of a local media file (used when inputType is not "url").
        url_path: Video page URL (used when inputType is "url").
        taskName: Whisper task, passed as ``task`` to ``model.transcribe``.
        srcLanguage: Source language, passed as ``language`` to
            ``model.transcribe``.

    Returns:
        A 3-tuple ``(segments, srt_text, video_path)``. When a required input
        is missing, nothing is processed and the tuple is
        ``([], "<name> is not set", None)`` so the shape always matches the
        three outputs of the interface.

    Side effects:
        Writes "transcript.srt" and "subtitled.mp4" in the working directory,
        overwriting earlier results.
    """
    # Validate before doing anything expensive (model load, download).
    if inputType == "url":
        if not url_path:
            return [], "url_path is not set", None
    elif not mp3_file:
        return [], "mp3_file is not set", None
    if not taskName:
        return [], "taskName is not set", None
    if not srcLanguage:
        return [], "srcLanguage is not set", None

    model = whisper.load_model("medium")
    input_file = get_all_files(url_path) if inputType == "url" else mp3_file
    result = model.transcribe(input_file, task=taskName, language=srcLanguage)
    segments = result.get("segments") or []

    # Build the SRT document: counter, time range, text, blank separator.
    lines = []
    for count, segment in enumerate(segments, start=1):  # SRT counters start at 1
        start = segment.get("start")
        end = segment.get("end")
        lines.append(f"{count}")
        lines.append(f"{_second_to_timecode(start)} --> {_second_to_timecode(end)}")
        lines.append(segment.get("text", "").strip())
        lines.append('')
    words = '\n'.join(lines)

    # The subtitles filter reads the cues from disk, so they must be written
    # out before ffmpeg runs.
    with open('transcript.srt', 'w', encoding='utf-8') as srt_file:
        srt_file.write(words)

    # Equivalent to:
    #   ffmpeg -i <input_file> -vf subtitles=transcript.srt -vcodec libx264 subtitled.mp4
    (
        ffmpeg
        .input(input_file)
        .output('subtitled.mp4', vcodec='libx264', vf='subtitles=transcript.srt')
        .run(overwrite_output=True)
    )

    return segments, words, "subtitled.mp4"
 
# Build the Gradio UI around get_text_from_mp3_whisper and start serving it.
# The input components are listed in the order of the function's parameters:
# inputType, mp3_file, url_path, taskName, srcLanguage.
gr.Interface(
    fn=get_text_from_mp3_whisper,
    title='Download Video From url and extract text from audio',
    inputs=[
        gr.Dropdown(["url", "file"], value="url"),  # inputType
        gr.inputs.Audio(type="filepath"),  # mp3_file
        gr.inputs.Textbox(),  # url_path
        gr.Dropdown(["translate", "transcribe"], value="translate"),  # taskName
        gr.Dropdown(["Japanese", "English"], value="Japanese"),  # srcLanguage
    ],
    # One output component per element of the function's result tuple.
    outputs=["json", "text", gr.outputs.Video(type="file")],
    button_text="Go!",
    button_color="#333333",
    live=True,
).launch()