"""Streamlit app: transcribe uploaded audio files with whisper-timestamped
and export the timestamps plus a LibriSpeech-like utterance dataset."""
import streamlit as st | |
import whisper_timestamped as whisper | |
import pandas as pd | |
from utils.files import ( | |
create_temp_directory, | |
save_temp_file, | |
compress_utterances_folder, | |
) | |
from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits | |
from utils.audio import generate_audio_splits | |
# UI label -> internal value maps for the three processing options.
STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}
def load_model(model_size: str):
    """Load a Whisper model on CPU, caching weights locally.

    Parameters
    ----------
    model_size : str
        Human-readable size key; must be one of ``MODEL_SIZES``' keys
        ("Medium" or "Large").

    Returns
    -------
    The Whisper model returned by ``whisper.load_model`` (loaded on CPU,
    with weights downloaded to / read from the local "models" folder).

    Raises
    ------
    KeyError
        If ``model_size`` is not a key of ``MODEL_SIZES``.
    """
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )
def main_app():
    """Render the Streamlit UI.

    Flow: upload audios -> pick division level / language / model size ->
    transcribe each file with Whisper -> show the timestamps dataframe and
    offer a .csv plus a zipped LibriSpeech-like dataset for download.
    """
    st.title("🗣️💬 LibriSpeech Corpus Generator")
    st.divider()
    # Audio load
    audio_file = st.file_uploader(
        "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
    )
    st.divider()
    # One column per processing option.
    stamp_type, lang, size = st.columns(3)
    with stamp_type:
        timestamp_type = st.selectbox(
            "Division level", options=list(STAMP_TYPES.keys())
        )
    with lang:
        language = st.selectbox("Language", options=list(LANGUAGES.keys()))
    with size:
        model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
    st.divider()
    if st.button("Process audios", use_container_width=True):
        # Guard: don't load the (large) model when nothing was uploaded.
        if not audio_file:
            st.warning("Please upload at least one audio file before processing.")
            st.stop()
        with st.spinner("Loading model..."):
            model = load_model(model_size)
        timestamps_df = pd.DataFrame()
        temp_dir = create_temp_directory()
        utterances_folder = temp_dir / "utterances_segments"
        utterances_folder.mkdir(exist_ok=True)
        for audio_i in audio_file:
            with st.spinner(f"Processing audio: {audio_i.name}"):
                tmp_audio = save_temp_file(audio_i)
                # Whisper inference
                tmp_audio_file = whisper.load_audio(tmp_audio)
                timestamp_result = whisper.transcribe(
                    model, tmp_audio_file, language=LANGUAGES[language]
                )
                # Stamp level — the selectbox guarantees one of these two;
                # elif makes it explicit that exactly one branch runs.
                if timestamp_type == "Sentence-level":
                    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
                elif timestamp_type == "Word-level":
                    audio_i_df = get_word_data(audio_i.name, timestamp_result)
                # Accumulate this audio's timestamps into the global table.
                timestamps_df = pd.concat(
                    [timestamps_df, audio_i_df], ignore_index=True
                )
                generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
                generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)
        st.divider()
        st.markdown(
            "<h3 style='text-align: center;'>Timestamps</h3>",
            unsafe_allow_html=True,
        )
        st.dataframe(timestamps_df)
        st.divider()
        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download timestamps in .csv",
                timestamps_df.to_csv(index=False),
                file_name="timestamps.csv",
                mime="text/csv",
                use_container_width=True,
            )
        with col2:
            st.download_button(
                "Download LibriSpeech-like dataset",
                data=compress_utterances_folder(utterances_folder),
                file_name="librispeech-like-dataset.zip",
                mime="application/zip",
                use_container_width=True,
            )
# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main_app()