# nahue-passano
# update: added length fixing in short utterances
# 80f5b87
from typing import List
from pathlib import Path
import pandas as pd
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extracts the sentences from the output dictionary of whisper inference

    Parameters
    ----------
    filename : str
        Name of the audio analyzed
    timestamp_dict : dict
        Output dictionary from whisper inference. Must contain a "segments"
        list whose items have "text", "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        DataFrame containing audio filename, start, end and duration of sentences with
        its transcriptions.
    """
    # Collect all rows first and build the DataFrame once: calling pd.concat
    # inside the loop copies the accumulated frame on every iteration
    # (quadratic in the number of segments).
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    # Explicit columns keep the schema even when "segments" is empty.
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )
def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extracts the words from the output dictionary of whisper inference

    Parameters
    ----------
    filename : str
        Name of the audio analyzed
    timestamp_dict : dict
        Output dictionary from whisper inference. Must contain a "segments"
        list whose items carry a "words" list with "text", "start" and
        "end" keys.

    Returns
    -------
    pd.DataFrame
        DataFrame containing audio filename, start, end and duration of words with
        its transcriptions.
    """
    # Flatten segments -> words into a row list, then build the frame once;
    # pd.concat in a loop re-copies the accumulated frame every iteration.
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    # Explicit columns keep the schema for inputs with no words at all.
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Generates a list from starts and ends of utterances in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        Dataframe containing timestamps (needs "Start" and "End" columns).

    Returns
    -------
    List
        List of tuples containing the start and end of each stamp.
        E.g: [(start_1, end_2), ..., (start_n, end_n)]
    """
    # itertuples over the two columns yields one (start, end) tuple per row.
    boundary_pairs = audio_df[["Start", "End"]].itertuples(index=False, name=None)
    return list(boundary_pairs)
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """Concatenates audio segments that are shorter than the minimum utterance
    duration; only meant for sentence inference.

    Each too-short segment is merged with whichever neighbour is shorter,
    repeating until every remaining segment lasts at least ``min_duration``
    seconds or only one segment remains.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Selected DataFrame to process. Must contain the columns
        "Audio file", "Sentence", "Start", "End" and "Duration".
    min_duration : float, optional
        Minimum allowed utterance duration in seconds. Defaults to 1.6,
        the value previously hard-coded.

    Returns
    -------
    pd.DataFrame
        DataFrame with corrected audio segments
    """
    corrected_dataframe = pd.DataFrame()
    # Work on plain lists: in-place merge/delete is simpler than on a frame.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # See if the segment can be re-attached with the right or the left
            # segment; inf marks a missing neighbour at either boundary.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)
            # Re-attach the segment with the neighbour of shortest duration;
            # j is the index of the left element of the merged pair.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        else:
            i += 1
    # Append modified lists to new Dataframe
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    if segments:
        corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    else:
        # zip(*[]) would raise ValueError; keep the schema for empty input.
        corrected_dataframe["Start"] = []
        corrected_dataframe["End"] = []
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Gives column with transcriptions

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame with transcriptions

    Returns
    -------
    List[str]
        List of the transcriptions
    """
    # The transcription text is always the second column, whatever its label
    # ("Sentence" or "Word"), so select it positionally.
    transcription_column = timestamps_df.iloc[:, 1]
    return list(transcription_column)
def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to text files.

    Writes one ``<audio stem>-<i>.txt`` file per transcription into
    ``destination``.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    transcriptions_list : List[str]
        List of transcriptions.
    destination : Path
        Destination path for the text files.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # write_text handles open/close; explicit UTF-8 avoids
        # locale-dependent failures on non-ASCII transcriptions.
        transcription_i_path.write_text(transcription_i, encoding="utf-8")
def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Generate and save transcription splits based on timestamps.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps.
    destination : Path
        Destination path for the text files.
    """
    # Pull the transcription column and persist one text file per utterance.
    save_transcriptions_segments(
        audio_path, get_utterances_transcriptions(timestamps_df), destination
    )