"""Streamlit app: transcribe audio with Whisper and export timestamps.

Users upload one or more audio files, choose sentence- or word-level
timestamps, a language, and a model size; each file is transcribed with
whisper-timestamped and the combined results are shown and offered as a
CSV download.
"""

import json  # noqa: F401 -- kept: present in original file
import os
import tempfile
from io import StringIO  # noqa: F401 -- kept: present in original file

import pandas as pd
import streamlit as st
import whisper_timestamped as whisper

# UI label -> internal identifier mappings used by the select boxes below.
STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}


def save_temp_file(file) -> str:
    """Persist an uploaded file to the system temp dir and return its path.

    Args:
        file: a Streamlit ``UploadedFile`` (has ``.name`` and ``.getvalue()``).

    Returns:
        Absolute path of the written temporary file.
    """
    temp_file_path = os.path.join(tempfile.gettempdir(), file.name)
    with open(temp_file_path, "wb") as temp_file:
        temp_file.write(file.getvalue())
    return temp_file_path


@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Load (and cache across reruns) a CPU Whisper model.

    Args:
        model_size: UI label, a key of ``MODEL_SIZES``.

    Returns:
        The loaded whisper model object.
    """
    print(f"model size : {MODEL_SIZES[model_size]}")
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )


def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a sentence-level timestamp table from a whisper result dict.

    Fix vs. original: rows are collected in a list and turned into a
    DataFrame once, instead of ``pd.concat`` inside the loop (quadratic);
    the original also shadowed its loop variable with the row frame.

    Args:
        filename: name of the audio file the segments came from.
        timestamp_dict: whisper-timestamped transcription result with a
            ``"segments"`` list of dicts holding ``text``/``start``/``end``.

    Returns:
        DataFrame with columns Audio file, Sentence, Start, End, Duration.
    """
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )


def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a word-level timestamp table from a whisper result dict.

    Same single-construction fix as :func:`get_sentence_data`.

    Args:
        filename: name of the audio file the segments came from.
        timestamp_dict: whisper-timestamped transcription result; each
            segment carries a ``"words"`` list with ``text``/``start``/``end``.

    Returns:
        DataFrame with columns Audio file, Word, Start, End, Duration.
    """
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )


st.set_page_config(layout="wide")
st.title("⏱️💬 Timestamping with Whisper")

# Audio load -- accept_multiple_files=True means audio_file is a list.
audio_file = st.file_uploader(
    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
)

stamp_type, lang, size = st.columns(3)
with stamp_type:
    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
with lang:
    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
with size:
    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))

if st.button("Generate Timestamp", use_container_width=True):
    with st.spinner("Loading model..."):
        model = load_model(model_size)

    timestamps_df = pd.DataFrame()
    for audio_i in audio_file:
        with st.spinner(f"Processing audio: {audio_i.name}"):
            tmp_audio = save_temp_file(audio_i)
            tmp_audio_file = whisper.load_audio(tmp_audio)
            timestamp_result = whisper.transcribe(
                model, tmp_audio_file, language=LANGUAGES[language]
            )
            # Fix vs. original: if/elif dispatch so audio_i_df can never be
            # left unbound (the original used two independent ifs).
            if timestamp_type == "Sentence-level":
                audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
            elif timestamp_type == "Word-level":
                audio_i_df = get_word_data(audio_i.name, timestamp_result)
            else:
                audio_i_df = pd.DataFrame()
            timestamps_df = pd.concat([timestamps_df, audio_i_df], ignore_index=True)

    st.dataframe(timestamps_df)
    st.download_button(
        "Save timestamps",
        timestamps_df.to_csv(index=False),
        file_name="timestamps.csv",
        mime="text/csv",
        use_container_width=True,
    )