# Source: nahue-passano — "update: initial commit" (cd79d05)
from io import StringIO
import os
import tempfile
import streamlit as st
import json
import whisper_timestamped as whisper
import pandas as pd
# UI label -> whisper-timestamped option value, used by the selectboxes below.
STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}
def save_temp_file(file):
    """Persist an uploaded file to the system temp directory.

    Args:
        file: Streamlit ``UploadedFile``-like object exposing ``.name``
            and ``.getvalue()``.

    Returns:
        str: Path of the written temporary file.
    """
    temp_dir = tempfile.gettempdir()
    # basename() guards against a crafted upload name ("../x", "sub/x")
    # escaping the temp directory or targeting a missing subdirectory.
    temp_file_path = os.path.join(temp_dir, os.path.basename(file.name))
    with open(temp_file_path, "wb") as temp_file:
        temp_file.write(file.getvalue())
    return temp_file_path
# Cached across Streamlit reruns: the model is loaded at most once per
# model_size value (show_spinner=False — the caller renders its own spinner).
@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Load a whisper-timestamped model on CPU.

    Args:
        model_size: Display name — a key of MODEL_SIZES ("Medium"/"Large").

    Returns:
        The loaded whisper model object (cached by Streamlit).
    """
    # Debug trace of the resolved model identifier.
    print(f"model size : {MODEL_SIZES[model_size]}")
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Build a sentence-level timestamp table from a transcription result.

    Args:
        filename: Name of the source audio file, repeated on every row.
        timestamp_dict: whisper.transcribe() result; each entry of its
            "segments" list must provide "text", "start" and "end".

    Returns:
        pd.DataFrame with columns Audio file, Sentence, Start, End, Duration
        (empty but with the same columns when there are no segments).
    """
    columns = ["Audio file", "Sentence", "Start", "End", "Duration"]
    # Collect plain dict rows and build the DataFrame once: the original
    # pd.concat-per-segment pattern is quadratic in the number of segments.
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(rows, columns=columns)
def get_word_data(filename: str, timestamp_dict: dict):
    """Placeholder for word-level timestamp extraction (not implemented).

    NOTE(review): the original file defined ``get_word_data`` twice; the
    second zero-argument stub shadowed this two-argument signature, so any
    call passing arguments raised TypeError. The duplicate is removed here.

    Args:
        filename: Name of the source audio file.
        timestamp_dict: whisper.transcribe() result.

    Returns:
        None. TODO: build a word-level DataFrame analogous to
        get_sentence_data (segments presumably carry a "words" list —
        confirm against whisper_timestamped output).
    """
    pass
# --- Streamlit page -------------------------------------------------------
st.title("⏱️🧾 Timestamp generator")
# Audio load
audio_file = st.file_uploader(
    "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
)
# Three side-by-side option selectors (labels map to values via the
# module-level dicts).
stamp_type, lang, size = st.columns(3)
with stamp_type:
    timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
with lang:
    language = st.selectbox("Language", options=list(LANGUAGES.keys()))
with size:
    model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
# Button to generate the timestamps
if st.button("Generate Timestamp", use_container_width=True):
    with st.spinner("Loading model..."):
        model = load_model(model_size)
    sentences_df = pd.DataFrame()
    # Transcribe each uploaded audio and append its sentence rows.
    # NOTE(review): timestamp_type is collected but never used below —
    # word-level output (get_word_data) is not wired in yet.
    for audio_i in audio_file:
        with st.spinner(f"Processing audio: {audio_i.name}"):
            tmp_audio = save_temp_file(audio_i)
            tmp_audio_file = whisper.load_audio(tmp_audio)
            timestamp_result = whisper.transcribe(
                model, tmp_audio_file, language=LANGUAGES[language]
            )
            audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
            sentences_df = pd.concat([sentences_df, audio_i_df], ignore_index=True)
    st.dataframe(sentences_df)
    # Offer the combined table as a CSV download.
    st.download_button(
        "Save timestamps",
        sentences_df.to_csv(index=False),
        file_name="timestamps.csv",
        mime="text/csv",
        use_container_width=True,
    )