File size: 3,992 Bytes
cd79d05
9bdb941
cd79d05
 
7405904
 
 
 
 
9bdb941
7405904
 
cd79d05
 
 
 
 
 
 
7405904
 
 
 
 
 
 
 
 
 
 
 
cd79d05
 
 
 
 
7405904
 
 
cd79d05
7405904
 
 
 
 
 
cd79d05
7405904
 
 
 
cd79d05
7405904
 
 
 
 
 
 
 
 
 
 
 
9bdb941
7405904
 
 
9bdb941
7405904
 
 
 
 
 
 
 
 
 
 
 
 
9bdb941
 
7405904
 
 
9bdb941
7405904
 
 
 
 
 
 
9bdb941
7405904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd79d05
7405904
 
 
 
 
 
 
cd79d05
 
 
7405904
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import whisper_transcriber as whisper
import pandas as pd

from utils.files import (
    create_temp_directory,
    save_temp_file,
    compress_utterances_folder,
)
from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits, check_ut_min_duration
from utils.audio import generate_audio_splits

STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
LANGUAGES = {"English": "en", "Spanish": "es"}
MODEL_SIZES = {"Medium": "medium", "Large": "large"}


@st.cache_resource(show_spinner=False)
def load_model(model_size: str):
    """Loads the Whisper model with size model_size

    Parameters
    ----------
    model_size : str
        Available size of the whisper model

    Returns
    -------
    _type_
        Whisper model
    """
    return whisper.load_model(
        MODEL_SIZES[model_size], device="cpu", download_root="models"
    )


def main_app():
    st.title("🗣️💬 LibriSpeech Corpus Generator")
    st.divider()

    # Audio load
    audio_file = st.file_uploader(
        "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
    )
    st.divider()
    stamp_type, lang, size = st.columns(3)

    with stamp_type:
        timestamp_type = st.selectbox(
            "Division level", options=list(STAMP_TYPES.keys())
        )

    with lang:
        language = st.selectbox("Language", options=list(LANGUAGES.keys()))

    with size:
        model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
    st.divider()

    if st.button("Process audios", use_container_width=True):
        with st.spinner("Loading model..."):
            model = load_model(model_size)

        timestamps_df = pd.DataFrame()
        
        temp_dir = create_temp_directory()
        utterances_folder = temp_dir / "utterances_segments"
        utterances_folder.mkdir(exist_ok=True)
        
        for audio_i in audio_file:
            with st.spinner(f"Processing audio: {audio_i.name}"):
                tmp_audio = save_temp_file(audio_i)

                # Whisper inference
                tmp_audio_file = whisper.load_audio(tmp_audio)
                timestamp_result = whisper.transcribe(
                    model, tmp_audio_file, language=LANGUAGES[language]
                )

                # Stamp level
                if timestamp_type == "Sentence-level":
                    audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
                    # Checks utterance duration
                    audio_i_df = check_ut_min_duration(audio_i_df)

                if timestamp_type == "Word-level":
                    audio_i_df = get_word_data(audio_i.name, timestamp_result)
                
                # Timestamps in dataframe
                timestamps_df = pd.concat(
                    [timestamps_df, audio_i_df], ignore_index=True
                )

                generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
                generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)
                
        st.divider()
        st.markdown(
            "<h3 style='text-align: center;'>Timestamps</h3>",
            unsafe_allow_html=True,
        )
        st.dataframe(timestamps_df)
        st.divider()
        col1, col2 = st.columns(2)

        with col1:
            st.download_button(
                "Download timestamps in .csv",
                timestamps_df.to_csv(index=False),
                file_name="timestamps.csv",
                mime="text/csv",
                use_container_width=True,
            )

        with col2:
            st.download_button(
                "Download LibriSpeech-like dataset",
                data=compress_utterances_folder(utterances_folder),
                file_name="librispeech-like-dataset.zip",
                mime="application/zip",
                use_container_width=True,
            )


if __name__ == "__main__":
    main_app()