Spaces:
Runtime error
Runtime error
File size: 3,992 Bytes
cd79d05 9bdb941 cd79d05 7405904 9bdb941 7405904 cd79d05 7405904 cd79d05 7405904 cd79d05 7405904 cd79d05 7405904 cd79d05 7405904 9bdb941 7405904 9bdb941 7405904 9bdb941 7405904 9bdb941 7405904 9bdb941 7405904 cd79d05 7405904 cd79d05 7405904 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import streamlit as st
import whisper_transcriber as whisper
import pandas as pd
from utils.files import (
create_temp_directory,
save_temp_file,
compress_utterances_folder,
)
from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits, check_ut_min_duration
from utils.audio import generate_audio_splits
# Lookup tables for the three selectboxes: UI label -> short identifier.
# Key order is the order in which the options are listed in the UI.

# Division levels offered for the generated timestamps.
STAMP_TYPES = {
    "Sentence-level": "sentence",
    "Word-level": "word",
}

# Transcription languages; the value is passed as ``language=`` to Whisper.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
}

# Whisper checkpoints; the value is the size name given to the model loader.
MODEL_SIZES = {
    "Medium": "medium",
    "Large": "large",
}
@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Load the Whisper model for the selected size.

    The function is wrapped in ``st.cache_resource`` with Streamlit's own
    spinner disabled.

    Parameters
    ----------
    model_size : str
        UI label of the model size; must be a key of ``MODEL_SIZES``.

    Returns
    -------
    object
        The value returned by ``whisper.load_model``.

    Raises
    ------
    KeyError
        If ``model_size`` is not a key of ``MODEL_SIZES``.
    """
    # Translate the UI label into the size name expected by the loader.
    size_name = MODEL_SIZES[model_size]
    return whisper.load_model(size_name, device="cpu", download_root="models")
def main_app():
    """Render the Streamlit page and run the corpus-generation workflow.

    The page shows a file uploader (wav/mp3, multiple files) and three
    selectboxes (division level, language, model size). When the
    "Process audios" button is clicked, every uploaded file is saved to a
    temporary file, transcribed with Whisper, converted into a timestamp
    dataframe (sentence- or word-level), and split into audio and
    transcription pieces inside an ``utterances_segments`` folder. The
    combined timestamps are then displayed and offered as a CSV download,
    next to a .zip download built from the utterances folder.

    If the button is clicked while no file is uploaded, a warning is shown
    and nothing is processed.
    """
    st.title("🗣️💬 LibriSpeech Corpus Generator")
    st.divider()

    # Audio load
    audio_file = st.file_uploader(
        "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
    )
    st.divider()

    stamp_type, lang, size = st.columns(3)
    with stamp_type:
        timestamp_type = st.selectbox(
            "Division level", options=list(STAMP_TYPES.keys())
        )
    with lang:
        language = st.selectbox("Language", options=list(LANGUAGES.keys()))
    with size:
        model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
    st.divider()

    # Nothing else to do until the user asks for processing.
    if not st.button("Process audios", use_container_width=True):
        return

    # Bail out before the (expensive) model load when there is no input;
    # otherwise an empty table and an empty archive would be produced.
    if not audio_file:
        st.warning("Please upload at least one audio file before processing.")
        return

    with st.spinner("Loading model..."):
        model = load_model(model_size)

    temp_dir = create_temp_directory()
    utterances_folder = temp_dir / "utterances_segments"
    utterances_folder.mkdir(exist_ok=True)

    # Per-file frames are collected and concatenated once after the loop,
    # instead of re-copying the accumulated frame on every iteration.
    per_file_frames = []
    for audio_i in audio_file:
        with st.spinner(f"Processing audio: {audio_i.name}"):
            tmp_audio = save_temp_file(audio_i)

            # Whisper inference
            tmp_audio_file = whisper.load_audio(tmp_audio)
            timestamp_result = whisper.transcribe(
                model, tmp_audio_file, language=LANGUAGES[language]
            )

            # Stamp level: the selectbox only offers STAMP_TYPES keys, so
            # anything that is not sentence-level is word-level.
            if STAMP_TYPES[timestamp_type] == "sentence":
                audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
                # Checks utterance duration
                audio_i_df = check_ut_min_duration(audio_i_df)
            else:
                audio_i_df = get_word_data(audio_i.name, timestamp_result)

            per_file_frames.append(audio_i_df)

            generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
            generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)

    # Timestamps of all files in a single dataframe
    timestamps_df = pd.concat(per_file_frames, ignore_index=True)

    st.divider()
    st.markdown(
        "<h3 style='text-align: center;'>Timestamps</h3>",
        unsafe_allow_html=True,
    )
    st.dataframe(timestamps_df)
    st.divider()

    # NOTE(review): these download buttons live in the branch that only runs
    # on the script run triggered by "Process audios" — confirm the results
    # remain available after a download click reruns the script.
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            "Download timestamps in .csv",
            timestamps_df.to_csv(index=False),
            file_name="timestamps.csv",
            mime="text/csv",
            use_container_width=True,
        )
    with col2:
        st.download_button(
            "Download LibriSpeech-like dataset",
            data=compress_utterances_folder(utterances_folder),
            file_name="librispeech-like-dataset.zip",
            mime="application/zip",
            use_container_width=True,
        )
# Script entry point: build the page only when this module is run as the
# main script, not when it is imported.
if __name__ == "__main__":
    main_app()
|