from io import StringIO
import json
import os
import tempfile

import pandas as pd
import streamlit as st
import whisper_timestamped as whisper

# UI label -> internal identifier mappings used by the select boxes below.
STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}


def save_temp_file(file):
    """Persist an uploaded file to the system temp dir.

    Args:
        file: A Streamlit ``UploadedFile`` (must expose ``name`` and
            ``getvalue()``).

    Returns:
        str: Absolute path of the written temporary file.
    """
    temp_file_path = os.path.join(tempfile.gettempdir(), file.name)
    with open(temp_file_path, "wb") as temp_file:
        temp_file.write(file.getvalue())
    return temp_file_path


@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Load (and cache across reruns) the Whisper model for a UI size label.

    Args:
        model_size: A key of ``MODEL_SIZES`` (e.g. ``"Medium"``).

    Returns:
        The loaded whisper model, on CPU, downloaded into ``models/``.
    """
    print(f"model size : {MODEL_SIZES[model_size]}")
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )


def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a sentence-level timestamp table from a transcription result.

    Args:
        filename: Name of the source audio file (repeated on every row).
        timestamp_dict: ``whisper_timestamped`` transcription result; each
            entry of ``timestamp_dict["segments"]`` carries ``text``,
            ``start`` and ``end``.

    Returns:
        DataFrame with columns Audio file, Sentence, Start, End, Duration.
    """
    # Collect rows first, then build the frame once — concatenating inside
    # the loop (as the original did) is quadratic in the number of segments.
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )


def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a word-level timestamp table from a transcription result.

    The original file contained two empty, duplicated stubs for this
    function, so the "Word-level" UI option silently did nothing.

    Args:
        filename: Name of the source audio file (repeated on every row).
        timestamp_dict: ``whisper_timestamped`` result; each segment is
            expected to carry a ``words`` list of dicts with ``text``,
            ``start`` and ``end`` (per the whisper_timestamped output
            format — confirm against the installed version).

    Returns:
        DataFrame with columns Audio file, Sentence, Word, Start, End,
        Duration.
    """
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        # .get guards against segments lacking word alignments.
        for word in segment.get("words", [])
    ]
    return pd.DataFrame(
        rows,
        columns=["Audio file", "Sentence", "Word", "Start", "End", "Duration"],
    )


st.title("⏱️🧾 Timestamp generator")

# Audio load
audio_file = st.file_uploader(
    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
)

stamp_type, lang, size = st.columns(3)
with stamp_type:
    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
with lang:
    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
with size:
    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))

# Button to generate the timestamps
if st.button("Generate Timestamp", use_container_width=True):
    with st.spinner("Loading model..."):
        model = load_model(model_size)
    sentences_df = pd.DataFrame()
    for audio_i in audio_file:
        with st.spinner(f"Processing audio: {audio_i.name}"):
            tmp_audio = save_temp_file(audio_i)
            tmp_audio_file = whisper.load_audio(tmp_audio)
            timestamp_result = whisper.transcribe(
                model, tmp_audio_file, language=LANGUAGES[language]
            )
            # Honor the selected granularity — the original always produced
            # sentence-level rows regardless of the "Timestamp type" choice.
            if STAMP_TYPES[timestamp_type] == "word":
                audio_i_df = get_word_data(audio_i.name, timestamp_result)
            else:
                audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
            sentences_df = pd.concat([sentences_df, audio_i_df], ignore_index=True)

    st.dataframe(sentences_df)
    st.download_button(
        "Save timestamps",
        sentences_df.to_csv(index=False),
        file_name="timestamps.csv",
        mime="text/csv",
        use_container_width=True,
    )