from pathlib import Path
from typing import List

import pandas as pd


def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extracts the sentences from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the analyzed audio file.
    timestamp_dict : dict
        Output dictionary from whisper inference.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the audio filename, start, end and duration of
        each sentence together with its transcription.
    """
    sentence_df = pd.DataFrame(
        columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )
    for sentence_i in timestamp_dict["segments"]:
        # Build a one-row DataFrame per segment and append it to the result.
        sentence_i_df = pd.DataFrame(
            {
                "Audio file": [filename],
                "Sentence": [str(sentence_i["text"])],
                "Start": [sentence_i["start"]],
                "End": [sentence_i["end"]],
                "Duration": [sentence_i["end"] - sentence_i["start"]],
            }
        )
        sentence_df = pd.concat([sentence_df, sentence_i_df], ignore_index=True)
    return sentence_df


def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extracts the words from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the analyzed audio file.
    timestamp_dict : dict
        Output dictionary from whisper inference.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the audio filename, start, end and duration of
        each word together with its transcription.
    """
    word_df = pd.DataFrame(columns=["Audio file", "Word", "Start", "End", "Duration"])
    for sentence_i in timestamp_dict["segments"]:
        for word_i in sentence_i["words"]:
            word_i_df = pd.DataFrame(
                {
                    "Audio file": [filename],
                    "Word": [str(word_i["text"])],
                    "Start": [word_i["start"]],
                    "End": [word_i["end"]],
                    "Duration": [word_i["end"] - word_i["start"]],
                }
            )
            word_df = pd.concat([word_df, word_i_df], ignore_index=True)
    return word_df


def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Generates a list of the starts and ends of utterances in an audio file.

    Parameters
    ----------
    audio_df : pd.DataFrame
        DataFrame containing timestamps.

    Returns
    -------
    List
        List of tuples containing the start and end of each utterance,
        e.g. [(start_1, end_1), ..., (start_n, end_n)].
    """
    return list(zip(audio_df["Start"], audio_df["End"]))
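
# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not part of this module's contract) of
# the timestamp dictionary shape the extractors above consume. It matches
# whisper_timestamped-style output, where each segment carries a "words" list
# and word entries use the "text" key, as read by get_word_data:
#
#   timestamp_dict = {
#       "segments": [
#           {
#               "text": " Hello world.",
#               "start": 0.0,
#               "end": 1.2,
#               "words": [
#                   {"text": "Hello", "start": 0.0, "end": 0.5},
#                   {"text": "world.", "start": 0.6, "end": 1.2},
#               ],
#           },
#       ],
#   }
#
# get_sentence_data("a.wav", timestamp_dict) yields one row per segment, and
# get_word_data("a.wav", timestamp_dict) yields one row per word.
# ---------------------------------------------------------------------------
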
def check_ut_min_duration(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Concatenates audio segments that are shorter than the minimum
    utterance duration. Applies only to sentence inference.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Selected DataFrame to process.

    Returns
    -------
    pd.DataFrame
        DataFrame with corrected audio segments.
    """
    corrected_dataframe = pd.DataFrame()

    # Get lists from the dataframe
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])

    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < 1.6:  # Minimum utterance duration, in seconds
            # See if the segment can be re-attached to the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Re-attach the segment to the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        else:
            i += 1

    # Append the modified lists to a new DataFrame
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe


def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Gives the column with transcriptions.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame with transcriptions.

    Returns
    -------
    List[str]
        List of the transcriptions.
    """
    # The second column holds the text ("Sentence" or "Word").
    return timestamps_df.iloc[:, 1].tolist()


def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to text files.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    transcriptions_list : List[str]
        List of transcriptions.
    destination : Path
        Destination path for the text files.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        # One text file per utterance: <audio stem>-<index>.txt
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        with open(transcription_i_path, "w") as file:
            file.write(transcription_i)


def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
) -> None:
    """Generate and save transcription splits based on timestamps.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps.
    destination : Path
        Destination path for the text files.
    """
    transcriptions_list = get_utterances_transcriptions(timestamps_df)
    save_transcriptions_segments(audio_path, transcriptions_list, destination)
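
# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative, not part of the module API): builds
# a toy whisper-style timestamp dictionary, merges utterances shorter than the
# minimum duration, and writes one text file per utterance. In real use the
# dictionary would come from whisper inference; the file and directory names
# below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    toy_dict = {
        "segments": [
            {"text": " Hi.", "start": 0.0, "end": 0.8},
            {"text": " This segment is long enough.", "start": 0.9, "end": 3.4},
            {"text": " Bye.", "start": 3.5, "end": 4.1},
        ]
    }
    audio_path = Path("example.wav")  # hypothetical audio file name
    destination = Path("transcriptions")  # hypothetical output directory
    destination.mkdir(exist_ok=True)

    sentences = get_sentence_data(audio_path.name, toy_dict)
    # Segments shorter than 1.6 s are merged into a neighbour, so the three
    # toy segments collapse into a single utterance here.
    merged = check_ut_min_duration(sentences)
    print(get_utterance_boundaries(merged))  # e.g. [(0.0, 4.1)]
    generate_transcriptions_splits(audio_path, merged, destination)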