Spaces:
Runtime error
Runtime error
nahue-passano
committed on
Commit
•
80f5b87
1
Parent(s):
b2f2444
update: added length fixing in short utterances
Browse files- utils/audio.py +2 -2
- utils/text.py +52 -6
utils/audio.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
|
|
4 |
import soundfile as sf
|
5 |
import pandas as pd
|
6 |
|
7 |
-
from utils.text import
|
8 |
|
9 |
|
10 |
def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
|
@@ -91,6 +91,6 @@ def generate_audio_splits(
|
|
91 |
Path were segments will be saved.
|
92 |
"""
|
93 |
audio_array, sample_rate = load_audio(audio_path)
|
94 |
-
timestamp_list =
|
95 |
audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
|
96 |
save_audio_segments(destination, audio_path, audio_segments, sample_rate)
|
|
|
4 |
import soundfile as sf
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from utils.text import get_utterance_boundaries
|
8 |
|
9 |
|
10 |
def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
|
|
|
91 |
Path were segments will be saved.
|
92 |
"""
|
93 |
audio_array, sample_rate = load_audio(audio_path)
|
94 |
+
timestamp_list = get_utterance_boundaries(timestamps_df)
|
95 |
audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
|
96 |
save_audio_segments(destination, audio_path, audio_segments, sample_rate)
|
utils/text.py
CHANGED
@@ -68,15 +68,13 @@ def get_word_data(filename: str, timestamp_dict: dict):
|
|
68 |
return word_df
|
69 |
|
70 |
|
71 |
-
def
|
72 |
-
"""Generates a list from
|
73 |
|
74 |
Parameters
|
75 |
----------
|
76 |
-
|
77 |
Dataframe containing timestamps
|
78 |
-
audio_file : str
|
79 |
-
Name of the audio file.
|
80 |
|
81 |
Returns
|
82 |
-------
|
@@ -84,10 +82,58 @@ def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str)
|
|
84 |
List of tuples containing the start and end of each stamp.
|
85 |
E.g: [(start_1, end_2), ..., (start_n, end_n)]
|
86 |
"""
|
87 |
-
audio_df = timestamps_df[timestamps_df["Audio file"] == audio_file]
|
88 |
return list(zip(audio_df["Start"], audio_df["End"]))
|
89 |
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
|
92 |
"""Gives column with transcriptions
|
93 |
|
|
|
68 |
return word_df
|
69 |
|
70 |
|
71 |
+
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Collect the (start, end) boundary pair of every utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        Dataframe containing timestamps; must expose "Start" and "End"
        columns.

    Returns
    -------
    List
        List of tuples containing the start and end of each stamp.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    return [boundary for boundary in zip(starts, ends)]
|
86 |
|
87 |
|
88 |
+
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """
    Concatenates audio segments that are shorter than the minimum utterance
    duration, only for sentence inference.

    Each segment shorter than ``min_duration`` is merged into its shorter
    neighbour: the boundaries are joined, the durations summed and the
    sentences concatenated. Merging repeats until no short segment remains
    (or only one segment is left).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Selected DataFrame to process. Must contain the columns
        "Audio file", "Sentence", "Start", "End" and "Duration".
    min_duration : float, optional
        Minimum allowed utterance duration in seconds (default 1.6,
        the project's previous hard-coded threshold).

    Returns
    -------
    pd.DataFrame
        DataFrame with corrected audio segments.
    """
    corrected_dataframe = pd.DataFrame()

    # Work on plain lists so in-place merging and deleting stay simple.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])

    # An empty dataframe has nothing to merge; previously this crashed on
    # zip(*segments) below, so short-circuit and return a copy unchanged.
    if not segments:
        return dataframe.copy()

    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # See if the segment can be re-attached to the left or the right
            # neighbour; a missing neighbour counts as infinitely long so it
            # is never chosen.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Re-attach the segment to the neighbour of shortest duration.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            # NOTE(review): sentences are joined without a separator — assumes
            # transcriptions carry their own whitespace; confirm upstream.
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        else:
            i += 1

    # Rebuild the dataframe from the corrected lists.
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe
|
135 |
+
|
136 |
+
|
137 |
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
|
138 |
"""Gives column with transcriptions
|
139 |
|