Spaces:
Runtime error
Runtime error
File size: 6,071 Bytes
7405904 80f5b87 7405904 80f5b87 7405904 80f5b87 7405904 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
from typing import List
from pathlib import Path
import pandas as pd
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract sentence-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio file analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        ``"segments"`` list whose items have ``"text"``, ``"start"`` and
        ``"end"`` keys.

    Returns
    -------
    pd.DataFrame
        One row per sentence with columns
        ``["Audio file", "Sentence", "Start", "End", "Duration"]``.
    """
    # Build all rows first and construct the DataFrame once: calling
    # pd.concat inside the loop is O(n^2) because each call copies the
    # accumulated frame.  This also removes the original's shadowing of
    # the loop variable (it was rebound to a DataFrame mid-iteration).
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )
def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract word-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio file analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        ``"segments"`` list whose items hold a ``"words"`` list of dicts
        with ``"text"``, ``"start"`` and ``"end"`` keys.

    Returns
    -------
    pd.DataFrame
        One row per word with columns
        ``["Audio file", "Word", "Start", "End", "Duration"]``.
    """
    # Collect rows across all segments and build the DataFrame once
    # instead of pd.concat per word (which is O(n^2) in total work).
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Collect (start, end) pairs for every utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        DataFrame holding "Start" and "End" timestamp columns.

    Returns
    -------
    List
        Tuples pairing each start with its end:
        ``[(start_1, end_1), ..., (start_n, end_n)]``.
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    return [(start, end) for start, end in zip(starts, ends)]
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """Merge audio segments shorter than the minimum utterance duration.

    Intended for sentence-level inference output: each too-short segment
    is re-attached to its shorter neighbour until every surviving
    segment meets ``min_duration`` (or only one segment remains).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame with columns "Audio file", "Sentence", "Start", "End",
        "Duration".
    min_duration : float, optional
        Minimum allowed segment duration in seconds (default 1.6,
        matching the previously hard-coded threshold).

    Returns
    -------
    pd.DataFrame
        Frame with the same columns and the corrected segments.
    """
    # Work on plain lists; in-place merging on a DataFrame would be awkward.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])
    if not segments:
        # Guard: zip(*segments) below would raise ValueError on an
        # empty frame; return an empty frame with the expected columns.
        return pd.DataFrame(
            columns=["Audio file", "Sentence", "Start", "End", "Duration"]
        )
    i = 0
    # len(segments) > 1: a lone segment is never merged, even if short.
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # inf is a sentinel for "no neighbour on that side", so the
            # other side always wins the min() below.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)
            # j indexes the surviving (left) member of the merged pair.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            texts[j] = texts[j] + texts[j + 1]
            # Drop the absorbed entry from every parallel list.
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        else:
            i += 1
    starts, ends = zip(*segments)
    return pd.DataFrame(
        {
            "Audio file": names,
            "Sentence": texts,
            "Start": list(starts),
            "End": list(ends),
            "Duration": segment_durations,
        }
    )
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Return the transcription column of a timestamps DataFrame as a list.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        Timestamps frame whose second column holds the transcriptions
        ("Sentence" or "Word").

    Returns
    -------
    List[str]
        The transcriptions, in row order.
    """
    # Positional access so both sentence- and word-level frames work:
    # the transcription text sits in column index 1 in either layout.
    transcription_column = timestamps_df.iloc[:, 1]
    return list(transcription_column)
def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to one text file each.

    Files are named ``<audio stem>-<index>.txt`` inside *destination*.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file (only its stem is used for naming).
    transcriptions_list : List[str]
        List of transcriptions, one per segment.
    destination : Path
        Directory that receives the text files; must already exist.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # Explicit UTF-8: the original relied on the platform default
        # encoding, which can corrupt non-ASCII transcriptions.
        transcription_i_path.write_text(transcription_i, encoding="utf-8")
def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Generate and save transcription splits based on timestamps.

    Pulls the transcription column from *timestamps_df* and writes one
    text file per utterance into *destination*.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps and transcriptions.
    destination : Path
        Destination directory for the text files.
    """
    save_transcriptions_segments(
        audio_path, get_utterances_transcriptions(timestamps_df), destination
    )
|