"""Streamlit app: transcribe audio with Whisper and export timestamps.

Users upload one or more audio files, choose sentence- or word-level
timestamps, a language, and a model size; each file is transcribed with
whisper-timestamped and the combined results are shown and offered as a
CSV download.
"""

import json  # noqa: F401 -- kept: present in original file
import os
import tempfile
from io import StringIO  # noqa: F401 -- kept: present in original file

import pandas as pd
import streamlit as st
import whisper_timestamped as whisper

# UI label -> internal identifier mappings used by the select boxes below.
STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}


def save_temp_file(file) -> str:
    """Persist an uploaded file to the system temp dir and return its path.

    Args:
        file: a Streamlit ``UploadedFile`` (has ``.name`` and ``.getvalue()``).

    Returns:
        Absolute path of the written temporary file.
    """
    temp_file_path = os.path.join(tempfile.gettempdir(), file.name)
    with open(temp_file_path, "wb") as temp_file:
        temp_file.write(file.getvalue())
    return temp_file_path


@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Load (and cache across reruns) a CPU Whisper model.

    Args:
        model_size: UI label, a key of ``MODEL_SIZES``.

    Returns:
        The loaded whisper model object.
    """
    print(f"model size : {MODEL_SIZES[model_size]}")
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )


def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a sentence-level timestamp table from a whisper result dict.

    Fix vs. original: rows are collected in a list and turned into a
    DataFrame once, instead of ``pd.concat`` inside the loop (quadratic);
    the original also shadowed its loop variable with the row frame.

    Args:
        filename: name of the audio file the segments came from.
        timestamp_dict: whisper-timestamped transcription result with a
            ``"segments"`` list of dicts holding ``text``/``start``/``end``.

    Returns:
        DataFrame with columns Audio file, Sentence, Start, End, Duration.
    """
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )


def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a word-level timestamp table from a whisper result dict.

    Same single-construction fix as :func:`get_sentence_data`.

    Args:
        filename: name of the audio file the segments came from.
        timestamp_dict: whisper-timestamped transcription result; each
            segment carries a ``"words"`` list with ``text``/``start``/``end``.

    Returns:
        DataFrame with columns Audio file, Word, Start, End, Duration.
    """
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )


st.set_page_config(layout="wide")
st.title("⏱️💬 Timestamping with Whisper")

# Audio load -- accept_multiple_files=True means audio_file is a list.
audio_file = st.file_uploader(
    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
)

stamp_type, lang, size = st.columns(3)
with stamp_type:
    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
with lang:
    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
with size:
    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))

if st.button("Generate Timestamp", use_container_width=True):
    with st.spinner("Loading model..."):
        model = load_model(model_size)

    timestamps_df = pd.DataFrame()
    for audio_i in audio_file:
        with st.spinner(f"Processing audio: {audio_i.name}"):
            tmp_audio = save_temp_file(audio_i)
            tmp_audio_file = whisper.load_audio(tmp_audio)
            timestamp_result = whisper.transcribe(
                model, tmp_audio_file, language=LANGUAGES[language]
            )
            # Fix vs. original: if/elif dispatch so audio_i_df can never be
            # left unbound (the original used two independent ifs).
            if timestamp_type == "Sentence-level":
                audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
            elif timestamp_type == "Word-level":
                audio_i_df = get_word_data(audio_i.name, timestamp_result)
            else:
                audio_i_df = pd.DataFrame()
            timestamps_df = pd.concat([timestamps_df, audio_i_df], ignore_index=True)

    st.dataframe(timestamps_df)
    st.download_button(
        "Save timestamps",
        timestamps_df.to_csv(index=False),
        file_name="timestamps.csv",
        mime="text/csv",
        use_container_width=True,
    )