# Utilities for post-processing whisper inference timestamps into DataFrames
# and transcription files.
from typing import List
from pathlib import Path
import pandas as pd
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract sentence-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        "segments" list whose items have "text", "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        One row per segment with columns "Audio file", "Sentence",
        "Start", "End" and "Duration" (End - Start).
    """
    # Accumulate plain dicts and build the DataFrame once: calling
    # pd.concat inside the loop is quadratic and concatenating onto an
    # empty DataFrame is deprecated in recent pandas.
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    # Passing columns= keeps the schema even when "segments" is empty.
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )
def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract word-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        "segments" list whose items carry a "words" list with "text",
        "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        One row per word with columns "Audio file", "Word", "Start",
        "End" and "Duration" (End - Start).
    """
    # Flatten all words first, then build the DataFrame in one shot:
    # pd.concat per word is quadratic in the number of words.
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    # Passing columns= keeps the schema even when there are no words.
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Pair up the start and end of each utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        DataFrame with "Start" and "End" timestamp columns.

    Returns
    -------
    List
        Tuples of (start, end), one per row, in row order.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    return [(begin, finish) for begin, finish in zip(starts, ends)]
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """Merge sentence segments shorter than a minimum utterance duration.

    Each too-short segment is re-attached to whichever neighbour
    (previous or next row) has the smaller duration. Intended for the
    sentence-level DataFrame produced by ``get_sentence_data``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame with "Audio file", "Sentence", "Start", "End" and
        "Duration" columns.
    min_duration : float, optional
        Segments shorter than this (in the same units as "Duration",
        presumably seconds) are merged. Defaults to 1.6 to preserve the
        historical behavior.

    Returns
    -------
    pd.DataFrame
        New DataFrame with merged segments; the input is not modified.
    """
    # An empty input would make zip(*segments) fail below; short-circuit.
    if dataframe.empty:
        return dataframe.copy()
    corrected_dataframe = pd.DataFrame()
    # Work on plain lists; rows are deleted in place while scanning.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # Decide whether to re-attach with the left or right segment:
            # a missing neighbour gets infinite duration so it is never picked.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)
            # j is the index of the left element of the merged pair.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
            # NOTE(review): after a left merge (j == i - 1) the merged segment
            # is not re-checked against min_duration; kept as-is to preserve
            # the original behavior — confirm whether a re-check is wanted.
        else:
            i += 1
    # Rebuild the DataFrame from the (possibly shortened) lists.
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Return the transcription column of a timestamps DataFrame as a list.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame produced by this module; column index 1 holds the
        transcription text ("Sentence" or "Word").

    Returns
    -------
    List[str]
        The transcriptions, in row order.
    """
    # Positional access on purpose: works for both the sentence- and
    # word-level DataFrames, whose second column holds the text.
    transcription_column = timestamps_df.iloc[:, 1]
    return list(transcription_column)
def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to numbered text files.

    Writes one UTF-8 text file per transcription, named
    ``<audio stem>-<index>.txt``, into *destination*.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file (only its stem is used for naming).
    transcriptions_list : List[str]
        List of transcriptions, one per segment.
    destination : Path
        Existing directory where the text files are written.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # Explicit encoding: the bare open() relied on the platform
        # default, which corrupts non-ASCII transcriptions on Windows.
        transcription_i_path.write_text(transcription_i, encoding="utf-8")
def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Extract transcriptions from a timestamps DataFrame and save them.

    Thin convenience wrapper: pulls the transcription column via
    ``get_utterances_transcriptions`` and writes one text file per
    segment via ``save_transcriptions_segments``.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps and transcriptions.
    destination : Path
        Destination directory for the text files.
    """
    save_transcriptions_segments(
        audio_path, get_utterances_transcriptions(timestamps_df), destination
    )