Spaces:
Runtime error
Runtime error
File size: 4,383 Bytes
7405904 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from typing import List
from pathlib import Path
import pandas as pd
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract the sentences from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed. It is repeated on every row.
    timestamp_dict : dict
        Output dictionary from whisper inference. It must have a "segments"
        entry whose items provide the "text", "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns "Audio file", "Sentence", "Start", "End"
        and "Duration" (``end - start``), one row per segment, indexed from 0.
        When there are no segments the frame is empty but keeps its columns.

    Raises
    ------
    KeyError
        If "segments" or any of the per-segment keys is missing.
    """
    columns = ["Audio file", "Sentence", "Start", "End", "Duration"]
    # Collect plain row dicts and build the frame once: calling pd.concat for
    # every segment re-copies all previous rows (quadratic) and concatenating
    # with an empty frame is deprecated in recent pandas releases.
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    return pd.DataFrame(rows, columns=columns)
def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract the words from the output dictionary of whisper inference.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed. It is repeated on every row.
    timestamp_dict : dict
        Output dictionary from whisper inference. It must have a "segments"
        entry; every segment must have a "words" list whose items provide the
        "text", "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns "Audio file", "Word", "Start", "End" and
        "Duration" (``end - start``), one row per word in segment order,
        indexed from 0. When there are no words the frame is empty but keeps
        its columns.

    Raises
    ------
    KeyError
        If "segments", "words" or any of the per-word keys is missing.
    """
    columns = ["Audio file", "Word", "Start", "End", "Duration"]
    # Flatten segments -> words into plain row dicts and build the frame once:
    # calling pd.concat for every word re-copies all previous rows (quadratic)
    # and concatenating with an empty frame is deprecated in recent pandas.
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    return pd.DataFrame(rows, columns=columns)
def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str) -> List:
    """Collect the (start, end) pairs of the rows that belong to audio_file.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        Dataframe with the "Audio file", "Start" and "End" columns.
    audio_file : str
        Value of the "Audio file" column to keep.

    Returns
    -------
    List
        One tuple per matching row, in row order.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    belongs_to_audio = timestamps_df["Audio file"] == audio_file
    matching_rows = timestamps_df.loc[belongs_to_audio]
    starts = matching_rows["Start"]
    ends = matching_rows["End"]
    return [(start, end) for start, end in zip(starts, ends)]
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Return the transcription column as a plain list.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame whose second column (position 1) holds the transcriptions.

    Returns
    -------
    List[str]
        Values of the second column, in row order.
    """
    # The column is picked by position, not by name.
    transcription_column = timestamps_df.iloc[:, 1]
    return transcription_column.tolist()
def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to text files.

    Every transcription is written to its own file inside ``destination``,
    named ``<audio stem>-<index>.txt`` where the index is the position of the
    transcription in ``transcriptions_list`` (starting at 0).

    Parameters
    ----------
    audio_path : Path
        Path to the audio file. Only its stem is used to name the outputs.
    transcriptions_list : List[str]
        List of transcriptions.
    destination : Path
        Destination directory for the text files.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent non-ASCII characters found in transcriptions.
        with open(str(transcription_i_path), "w", encoding="utf-8") as file:
            file.write(transcription_i)
def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Write every transcription of timestamps_df to its own text file.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps and their transcriptions.
    destination : Path
        Destination path for the text files.
    """
    # Extract the transcriptions and hand them straight to the writer.
    save_transcriptions_segments(
        audio_path,
        get_utterances_transcriptions(timestamps_df),
        destination,
    )
|