nahue-passano committed
Commit 7405904
1 parent: 6125290

update: new release

Files changed (6):
  1. app.py +98 -74
  2. pyproject.toml +3 -0
  3. requirements.txt +4 -1
  4. utils/audio.py +96 -0
  5. utils/files.py +71 -0
  6. utils/text.py +142 -0
app.py CHANGED
@@ -1,97 +1,121 @@
- from io import StringIO
- import os
- import tempfile
  import streamlit as st
- import json
  import whisper_timestamped as whisper
  import pandas as pd

+ from utils.files import (
+     create_temp_directory,
+     save_temp_file,
+     compress_utterances_folder,
+ )
+ from utils.text import get_sentence_data, get_word_data, generate_transcriptions_splits
+ from utils.audio import generate_audio_splits
+
  STAMP_TYPES = {"Sentence-level": "sentence", "Word-level": "word"}
  LANGUAGES = {"English": "en", "Spanish": "es"}
  MODEL_SIZES = {"Medium": "medium", "Large": "large"}


- def save_temp_file(file):
-     temp_dir = tempfile.gettempdir()
-     temp_file_path = os.path.join(temp_dir, file.name)
-     with open(temp_file_path, "wb") as temp_file:
-         temp_file.write(file.getvalue())
-     return temp_file_path
-
-
  @st.cache_resource(show_spinner=False)
  def load_model(model_size: str):
-     print(f"model size : {MODEL_SIZES[model_size]}")
+     """Loads the Whisper model with size model_size
+
+     Parameters
+     ----------
+     model_size : str
+         Available size of the Whisper model
+
+     Returns
+     -------
+     whisper.Whisper
+         Whisper model
+     """
      return whisper.load_model(
          MODEL_SIZES[model_size], device="cpu", download_root="models"
      )


- def get_sentence_data(filename: str, timestamp_dict: dict):
-     sentence_df = pd.DataFrame(
-         columns=["Audio file", "Sentence", "Start", "End", "Duration"]
-     )
-     for sentence_i in timestamp_dict["segments"]:
-         sentence_i = pd.DataFrame(
-             {
-                 "Audio file": [filename],
-                 "Sentence": [str(sentence_i["text"])],
-                 "Start": [sentence_i["start"]],
-                 "End": [sentence_i["end"]],
-                 "Duration": [sentence_i["end"] - sentence_i["start"]],
-             }
-         )
-         sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
-     return sentence_df
-
-
- def get_word_data(filename: str, timestamp_dict: dict):
-     pass
-
-
- def get_word_data():
-     pass
-
-
- st.title("⏱️🧾 Timestamp generator")
-
- # Audio load
- audio_file = st.file_uploader(
-     "Load audio file to transcribe", type=["wav", "mp3"], accept_multiple_files=True
- )
-
- stamp_type, lang, size = st.columns(3)
-
- with stamp_type:
-     timestamp_type = st.selectbox("Timestamp type", options=list(STAMP_TYPES.keys()))
-
- with lang:
-     language = st.selectbox("Language", options=list(LANGUAGES.keys()))
-
- with size:
-     model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
-
- # Button to generate the timestamp
- if st.button("Generate Timestamp", use_container_width=True):
-     with st.spinner("Loading model..."):
-         model = load_model(model_size)
-     sentences_df = pd.DataFrame()
-     for audio_i in audio_file:
-         with st.spinner(f"Processing audio: {audio_i.name}"):
-             tmp_audio = save_temp_file(audio_i)
-             tmp_audio_file = whisper.load_audio(tmp_audio)
-             timestamp_result = whisper.transcribe(
-                 model, tmp_audio_file, language=LANGUAGES[language]
-             )
-             audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
-             sentences_df = pd.concat([sentences_df, audio_i_df], ignore_index=True)
-
-     st.dataframe(sentences_df)
-
-     st.download_button(
-         "Save timestamps",
-         sentences_df.to_csv(index=False),
-         file_name="timestamps.csv",
-         mime="text/csv",
-         use_container_width=True,
-     )
+ def main_app():
+     st.title("🗣️💬 LibriSpeech Corpus Generator")
+     st.divider()
+
+     # Audio load
+     audio_file = st.file_uploader(
+         "Load audio files to process", type=["wav", "mp3"], accept_multiple_files=True
+     )
+     st.divider()
+     stamp_type, lang, size = st.columns(3)
+
+     with stamp_type:
+         timestamp_type = st.selectbox(
+             "Division level", options=list(STAMP_TYPES.keys())
+         )
+
+     with lang:
+         language = st.selectbox("Language", options=list(LANGUAGES.keys()))
+
+     with size:
+         model_size = st.selectbox("Model size", options=list(MODEL_SIZES.keys()))
+     st.divider()
+
+     if st.button("Process audios", use_container_width=True):
+         with st.spinner("Loading model..."):
+             model = load_model(model_size)
+
+         timestamps_df = pd.DataFrame()
+         temp_dir = create_temp_directory()
+         utterances_folder = temp_dir / "utterances_segments"
+         utterances_folder.mkdir(exist_ok=True)
+         for audio_i in audio_file:
+             with st.spinner(f"Processing audio: {audio_i.name}"):
+                 tmp_audio = save_temp_file(audio_i)
+
+                 # Whisper inference
+                 tmp_audio_file = whisper.load_audio(tmp_audio)
+                 timestamp_result = whisper.transcribe(
+                     model, tmp_audio_file, language=LANGUAGES[language]
+                 )
+
+                 # Stamp level
+                 if timestamp_type == "Sentence-level":
+                     audio_i_df = get_sentence_data(audio_i.name, timestamp_result)
+
+                 if timestamp_type == "Word-level":
+                     audio_i_df = get_word_data(audio_i.name, timestamp_result)
+
+                 # Timestamps in dataframe
+                 timestamps_df = pd.concat(
+                     [timestamps_df, audio_i_df], ignore_index=True
+                 )
+
+                 generate_audio_splits(tmp_audio, audio_i_df, utterances_folder)
+                 generate_transcriptions_splits(tmp_audio, audio_i_df, utterances_folder)
+         st.divider()
+         st.markdown(
+             "<h3 style='text-align: center;'>Timestamps</h3>",
+             unsafe_allow_html=True,
+         )
+         st.dataframe(timestamps_df)
+         st.divider()
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.download_button(
+                 "Download timestamps in .csv",
+                 timestamps_df.to_csv(index=False),
+                 file_name="timestamps.csv",
+                 mime="text/csv",
+                 use_container_width=True,
+             )
+
+         with col2:
+             st.download_button(
+                 "Download LibriSpeech-like dataset",
+                 data=compress_utterances_folder(utterances_folder),
+                 file_name="librispeech-like-dataset.zip",
+                 mime="application/zip",
+                 use_container_width=True,
+             )
+
+
+ if __name__ == "__main__":
+     main_app()
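For context, `get_sentence_data` and `get_word_data` consume the dictionary returned by `whisper.transcribe`, which (per the code in utils/text.py below) holds a "segments" list whose items carry "text", "start", "end" and a nested "words" list. A minimal sketch of that data flow, using a hand-made stand-in dict rather than real model output:

    from utils.text import get_sentence_data, get_word_data

    # Hand-made stand-in for a whisper_timestamped result (illustrative values only)
    fake_result = {
        "segments": [
            {
                "text": " Hello there.",
                "start": 0.0,
                "end": 1.2,
                "words": [
                    {"text": "Hello", "start": 0.0, "end": 0.5},
                    {"text": "there.", "start": 0.6, "end": 1.2},
                ],
            }
        ]
    }

    print(get_sentence_data("sample.wav", fake_result))  # one row per segment
    print(get_word_data("sample.wav", fake_result))      # one row per word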
 
 
 
 
 
pyproject.toml CHANGED
@@ -14,6 +14,9 @@ openai-whisper = "*"
  torch = "1.13"
  matplotlib = "^3.7.1"
  streamlit = "^1.24.0"
+ sounddevice = "^0.4.6"
+ soundfile = "^0.12.1"
+ pydub = "^0.25.1"

  [build-system]
requirements.txt CHANGED
@@ -2,4 +2,7 @@ Cython
  dtw-python
  openai-whisper
  torch==1.13
- streamlit==1.24
+ streamlit==1.24
+ pandas
+ numpy
+ soundfile
utils/audio.py ADDED
@@ -0,0 +1,96 @@
+ from typing import Tuple, List
+ from pathlib import Path
+ import numpy as np
+ import soundfile as sf
+ import pandas as pd
+
+ from utils.text import filter_dataframe_by_audiofile
+
+
+ def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
+     """Loads an audio file given its path
+
+     Parameters
+     ----------
+     audio_path : Path
+         Path of the audio file
+
+     Returns
+     -------
+     Tuple[np.ndarray, float]
+         Audio array and sample rate
+     """
+     audio_array, sample_rate = sf.read(str(audio_path))
+     return audio_array, sample_rate
+
+
+ def split_audio(
+     audio_array: np.ndarray, sample_rate: float, timestamp_list: list
+ ) -> List[np.ndarray]:
+     """Slices audio_array with the timestamps in timestamp_list
+
+     Parameters
+     ----------
+     audio_array : np.ndarray
+         Array of the audio to be split
+     sample_rate : float
+         Audio sample rate
+     timestamp_list : list
+         List of tuples containing the start and end of each stamp.
+
+     Returns
+     -------
+     List[np.ndarray]
+         List of numpy arrays with the audio splits
+     """
+     audio_segments = []
+     for timestamp_i in timestamp_list:
+         # Convert start/end times in seconds to sample indices
+         start_sample = round(timestamp_i[0] * sample_rate)
+         end_sample = round(timestamp_i[1] * sample_rate)
+         audio_segments.append(audio_array[start_sample:end_sample])
+
+     return audio_segments
+
+
+ def save_audio_segments(
+     destination: Path,
+     audio_path: Path,
+     audio_segments: List[np.ndarray],
+     sample_rate: float,
+ ) -> None:
+     """Saves the audio segments from audio_segments in the destination path.
+
+     Parameters
+     ----------
+     destination : Path
+         Path where segments will be saved
+     audio_path : Path
+         Path of the original audio file
+     audio_segments : List[np.ndarray]
+         List containing numpy arrays with the audio segments
+     sample_rate : float
+         Sample rate of the original audio file
+     """
+     for i, segment in enumerate(audio_segments):
+         segment_path = destination / f"{audio_path.stem}-{i}.wav"
+         sf.write(str(segment_path), segment, sample_rate)
+
+
+ def generate_audio_splits(
+     audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+ ) -> None:
+     """Splits an audio file given its path and timestamps
+
+     Parameters
+     ----------
+     audio_path : Path
+         Path of the audio
+     timestamps_df : pd.DataFrame
+         DataFrame containing the start and end of the utterances
+     destination : Path
+         Path where segments will be saved.
+     """
+     audio_array, sample_rate = load_audio(audio_path)
+     timestamp_list = filter_dataframe_by_audiofile(timestamps_df, audio_path.name)
+     audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
+     save_audio_segments(destination, audio_path, audio_segments, sample_rate)
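A quick sanity check of the time-to-sample arithmetic in `split_audio`, using a synthetic signal so no audio file is needed (the sample rate and timestamps are illustrative):

    import numpy as np
    from utils.audio import split_audio

    sample_rate = 16000
    audio = np.zeros(3 * sample_rate)  # 3 seconds of silence

    # (start, end) pairs in seconds, as produced by filter_dataframe_by_audiofile
    segments = split_audio(audio, sample_rate, [(0.5, 1.25), (2.0, 3.0)])
    print([len(s) / sample_rate for s in segments])  # [0.75, 1.0]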
utils/files.py ADDED
@@ -0,0 +1,71 @@
+ from pathlib import Path
+ import zipfile
+ import shutil
+ import io
+ import streamlit as st
+
+
+ def save_temp_file(file: st.runtime.uploaded_file_manager.UploadedFile) -> Path:
+     """Saves a Streamlit uploaded file in a temporary directory
+
+     Parameters
+     ----------
+     file : st.runtime.uploaded_file_manager.UploadedFile
+         File returned by st.file_uploader
+
+     Returns
+     -------
+     Path
+         Path where the file is temporarily saved
+     """
+     temp_dir = Path(".temp")
+     temp_file_path = temp_dir.joinpath(file.name)
+     with open(str(temp_file_path), "wb") as temp_file:
+         temp_file.write(file.getvalue())
+     return temp_file_path
+
+
+ def create_temp_directory(dir_name: str = ".temp") -> Path:
+     """Creates a temporary directory.
+
+     Parameters
+     ----------
+     dir_name : str, optional
+         Name of the temporary directory, by default ".temp"
+
+     Returns
+     -------
+     Path
+         Path object representing the created temporary directory.
+     """
+     temp_dir = Path(dir_name)
+     temp_dir.mkdir(exist_ok=True)
+     return temp_dir
+
+
+ def clean_temp_directory() -> None:
+     """Removes the .temp directory and its contents"""
+     shutil.rmtree(Path(".temp"))
+
+
+ def compress_utterances_folder(utterances_folder: Path) -> io.BytesIO:
+     """Compresses the contents of utterances_folder into a zip file.
+
+     Parameters
+     ----------
+     utterances_folder : Path
+         Path to the folder containing utterances.
+
+     Returns
+     -------
+     io.BytesIO
+         A BytesIO object representing the compressed zip file.
+     """
+     memory_file = io.BytesIO()
+     with zipfile.ZipFile(memory_file, "w") as zip_file:
+         for file_i in utterances_folder.iterdir():
+             zip_file.write(str(file_i), arcname=file_i.name)
+
+     memory_file.seek(0)
+     # The temporary workspace is no longer needed once the zip is in memory
+     clean_temp_directory()
+     return memory_file
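One design note: the archive is built entirely in memory (`io.BytesIO`), which is what lets app.py hand it straight to `st.download_button` without ever writing a zip to disk. The call also deletes the whole `.temp` workspace, so it must run only after every segment has been written. A minimal sketch outside Streamlit (the text file is a stand-in for a real segment):

    from utils.files import create_temp_directory, compress_utterances_folder

    temp_dir = create_temp_directory()                 # creates .temp/
    utterances = temp_dir / "utterances_segments"
    utterances.mkdir(exist_ok=True)
    (utterances / "sample-0.txt").write_text("hello")  # stand-in segment

    zip_buffer = compress_utterances_folder(utterances)  # BytesIO; .temp is removed
    print(len(zip_buffer.getvalue()), "bytes zipped")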
utils/text.py ADDED
@@ -0,0 +1,142 @@
+ from typing import List
+ from pathlib import Path
+ import pandas as pd
+
+
+ def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+     """Extracts the sentences from the output dictionary of the Whisper inference
+
+     Parameters
+     ----------
+     filename : str
+         Name of the audio analyzed
+     timestamp_dict : dict
+         Output dictionary from the Whisper inference
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing audio filename, start, end and duration of sentences with
+         their transcriptions.
+     """
+     sentence_df = pd.DataFrame(
+         columns=["Audio file", "Sentence", "Start", "End", "Duration"]
+     )
+     for sentence_i in timestamp_dict["segments"]:
+         sentence_i = pd.DataFrame(
+             {
+                 "Audio file": [filename],
+                 "Sentence": [str(sentence_i["text"])],
+                 "Start": [sentence_i["start"]],
+                 "End": [sentence_i["end"]],
+                 "Duration": [sentence_i["end"] - sentence_i["start"]],
+             }
+         )
+         sentence_df = pd.concat([sentence_df, sentence_i], ignore_index=True)
+     return sentence_df
+
+
+ def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
+     """Extracts the words from the output dictionary of the Whisper inference
+
+     Parameters
+     ----------
+     filename : str
+         Name of the audio analyzed
+     timestamp_dict : dict
+         Output dictionary from the Whisper inference
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame containing audio filename, start, end and duration of words with
+         their transcriptions.
+     """
+     word_df = pd.DataFrame(columns=["Audio file", "Word", "Start", "End", "Duration"])
+     for sentence_i in timestamp_dict["segments"]:
+         for word_i in sentence_i["words"]:
+             word_i_df = pd.DataFrame(
+                 {
+                     "Audio file": [filename],
+                     "Word": [str(word_i["text"])],
+                     "Start": [word_i["start"]],
+                     "End": [word_i["end"]],
+                     "Duration": [word_i["end"] - word_i["start"]],
+                 }
+             )
+             word_df = pd.concat([word_df, word_i_df], ignore_index=True)
+     return word_df
+
+
+ def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str) -> List:
+     """Generates a list from timestamps_df with the timestamps belonging to audio_file
+
+     Parameters
+     ----------
+     timestamps_df : pd.DataFrame
+         DataFrame containing timestamps
+     audio_file : str
+         Name of the audio file.
+
+     Returns
+     -------
+     List
+         List of tuples containing the start and end of each stamp.
+         E.g.: [(start_1, end_1), ..., (start_n, end_n)]
+     """
+     audio_df = timestamps_df[timestamps_df["Audio file"] == audio_file]
+     return list(zip(audio_df["Start"], audio_df["End"]))
+
+
+ def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
+     """Returns the column with the transcriptions
+
+     Parameters
+     ----------
+     timestamps_df : pd.DataFrame
+         DataFrame with transcriptions
+
+     Returns
+     -------
+     List[str]
+         List of the transcriptions
+     """
+     # The second column is "Sentence" or "Word", depending on the division level
+     return timestamps_df.iloc[:, 1].tolist()
+
+
+ def save_transcriptions_segments(
+     audio_path: Path, transcriptions_list: List[str], destination: Path
+ ) -> None:
+     """Saves transcription segments to text files.
+
+     Parameters
+     ----------
+     audio_path : Path
+         Path to the audio file.
+     transcriptions_list : List[str]
+         List of transcriptions.
+     destination : Path
+         Destination path for the text files.
+     """
+     for i, transcription_i in enumerate(transcriptions_list):
+         transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
+         with open(str(transcription_i_path), "w") as file:
+             file.write(transcription_i)
+
+
+ def generate_transcriptions_splits(
+     audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
+ ) -> None:
+     """Generates and saves transcription splits based on timestamps.
+
+     Parameters
+     ----------
+     audio_path : Path
+         Path to the audio file.
+     timestamps_df : pd.DataFrame
+         DataFrame containing timestamps.
+     destination : Path
+         Destination path for the text files.
+     """
+     transcriptions_list = get_utterances_transcriptions(timestamps_df)
+     save_transcriptions_segments(audio_path, transcriptions_list, destination)
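To illustrate `filter_dataframe_by_audiofile` and `get_utterances_transcriptions`, here is a toy timestamps table in the column layout used throughout (all values made up):

    import pandas as pd
    from utils.text import filter_dataframe_by_audiofile, get_utterances_transcriptions

    timestamps_df = pd.DataFrame(
        {
            "Audio file": ["a.wav", "a.wav", "b.wav"],
            "Sentence": ["Hi.", "Bye.", "Other."],
            "Start": [0.0, 1.5, 0.0],
            "End": [1.2, 2.8, 0.9],
            "Duration": [1.2, 1.3, 0.9],
        }
    )

    print(filter_dataframe_by_audiofile(timestamps_df, "a.wav"))
    # [(0.0, 1.2), (1.5, 2.8)]
    print(get_utterances_transcriptions(timestamps_df))
    # ['Hi.', 'Bye.', 'Other.']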