Spaces:

nahue-passano
/

librispeech-corpus-generator

Runtime error

File size: 6,071 Bytes

from typing import List
from pathlib import Path
import pandas as pd


def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract the sentences from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed
    timestamp_dict : dict
        Output dictionary from whisper inference. Its "segments" entry is
        iterated and each segment must provide "text", "start" and "end".

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns "Audio file", "Sentence", "Start", "End" and
        "Duration", one row per segment. It is empty (but keeps its columns)
        when there are no segments.
    """
    columns = ["Audio file", "Sentence", "Start", "End", "Duration"]
    # Collect plain records and build the frame once: concatenating a one-row
    # frame per segment copies the accumulated data on every iteration and
    # relies on the deprecated handling of empty frames in pd.concat.
    records = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(records, columns=columns)


def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract the words from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed
    timestamp_dict : dict
        Output dictionary from whisper inference. Each entry of "segments"
        must provide a "words" list whose items have "text", "start" and "end".

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns "Audio file", "Word", "Start", "End" and
        "Duration", one row per word in segment order. It is empty (but keeps
        its columns) when there are no words.
    """
    columns = ["Audio file", "Word", "Start", "End", "Duration"]
    # Gather every word as a plain record and build the frame once instead of
    # growing it with one pd.concat call per word.
    records = []
    for segment in timestamp_dict["segments"]:
        for word in segment["words"]:
            records.append(
                {
                    "Audio file": filename,
                    "Word": str(word["text"]),
                    "Start": word["start"],
                    "End": word["end"],
                    "Duration": word["end"] - word["start"],
                }
            )
    return pd.DataFrame(records, columns=columns)


def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Pair up the start and end time of every utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        Dataframe with a "Start" and an "End" column

    Returns
    -------
    List
        One (start, end) tuple per row, in row order.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    boundaries = zip(starts, ends)
    return list(boundaries)


def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """
    Concatenates audio segments that are shorter than the minimum utterance
    duration (only meant for sentence inference).

    Parameters
    ----------
    dataframe: pd.DataFrame
        Selected DataFrame to process. It must contain the columns
        "Audio file", "Sentence", "Start", "End" and "Duration".
    min_duration: float
        Segments whose "Duration" is below this value are merged with a
        neighbour. Defaults to 1.6.

    Returns
    -------
    pd.DataFrame
        New DataFrame with the corrected audio segments. The input is not
        modified. An empty input yields an empty DataFrame with the same five
        columns.
    """
    columns = ["Audio file", "Sentence", "Start", "End", "Duration"]

    # Work on plain lists so segments can be merged and removed in place
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])

    # Nothing to merge; also avoids unpacking an empty zip further down
    if not segments:
        return pd.DataFrame(columns=columns)

    i = 0
    # A single remaining segment has no neighbour to merge with, so stop there
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] >= min_duration:
            i += 1
            continue

        # A missing neighbour (first/last segment) counts as infinitely long
        # so the existing neighbour is always the one selected.
        last = len(segments) - 1
        left_duration = float("inf") if i == 0 else segment_durations[i - 1]
        right_duration = float("inf") if i == last else segment_durations[i + 1]

        # Re-attach the segment to the shortest neighbour (left wins ties):
        # j is the index that survives, j + 1 is absorbed into it.
        j = i - 1 if left_duration <= right_duration else i
        segments[j] = (segments[j][0], segments[j + 1][1])
        # The merged duration is the sum of both durations; a silence gap
        # between the two segments is not counted.
        segment_durations[j] = segment_durations[j] + segment_durations[j + 1]
        texts[j] = texts[j] + texts[j + 1]
        del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        # i is left untouched: it now points at the merged segment (merge to
        # the right) or at the segment following it (merge to the left).

    starts, ends = zip(*segments)
    return pd.DataFrame(
        {
            "Audio file": names,
            "Sentence": texts,
            "Start": list(starts),
            "End": list(ends),
            "Duration": segment_durations,
        },
        columns=columns,
    )


def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Return the values of the second column as a list.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame whose second column (position 1) holds the transcriptions

    Returns
    -------
    List[str]
        Values of that column, in row order
    """
    # The column is picked by position, not by name
    transcription_column = timestamps_df.iloc[:, 1]
    return transcription_column.tolist()


def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to text files.

    Each transcription is written to "<audio stem>-<index>.txt" inside
    ``destination``, where the index is the position in the list (from 0).

    Parameters
    ----------
    audio_path : Path
        Path to the audio file. Only its stem is used to name the text files.
    transcriptions_list : List[str]
        List of transcriptions.
    destination : Path
        Destination path for the text files.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent non-ASCII transcriptions.
        transcription_i_path.write_text(transcription_i, encoding="utf-8")


def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Write one text file per transcription found in a timestamps DataFrame.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame the transcriptions are read from.
    destination : Path
        Destination path for the text files.
    """
    # Pull the transcriptions out of the frame and hand them straight to the
    # writer.
    save_transcriptions_segments(
        audio_path,
        get_utterances_transcriptions(timestamps_df),
        destination,
    )