# Utilities for post-processing whisper inference timestamps into DataFrames
# and transcription files.
from typing import List
from pathlib import Path
import pandas as pd
def get_sentence_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract sentence-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        "segments" list whose items have "text", "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        One row per segment with columns "Audio file", "Sentence",
        "Start", "End" and "Duration" (End - Start).
    """
    # Accumulate plain dicts and build the DataFrame once: calling
    # pd.concat inside the loop is quadratic and concatenating onto an
    # empty DataFrame is deprecated in recent pandas.
    rows = [
        {
            "Audio file": filename,
            "Sentence": str(segment["text"]),
            "Start": segment["start"],
            "End": segment["end"],
            "Duration": segment["end"] - segment["start"],
        }
        for segment in timestamp_dict["segments"]
    ]
    # Passing columns= keeps the schema even when "segments" is empty.
    return pd.DataFrame(
        rows, columns=["Audio file", "Sentence", "Start", "End", "Duration"]
    )
def get_word_data(filename: str, timestamp_dict: dict) -> pd.DataFrame:
    """Extract word-level timestamps from a whisper inference result.

    Parameters
    ----------
    filename : str
        Name of the audio analyzed.
    timestamp_dict : dict
        Output dictionary from whisper inference; must contain a
        "segments" list whose items carry a "words" list with "text",
        "start" and "end" keys.

    Returns
    -------
    pd.DataFrame
        One row per word with columns "Audio file", "Word", "Start",
        "End" and "Duration" (End - Start).
    """
    # Flatten all words first, then build the DataFrame in one shot:
    # pd.concat per word is quadratic in the number of words.
    rows = [
        {
            "Audio file": filename,
            "Word": str(word["text"]),
            "Start": word["start"],
            "End": word["end"],
            "Duration": word["end"] - word["start"],
        }
        for segment in timestamp_dict["segments"]
        for word in segment["words"]
    ]
    # Passing columns= keeps the schema even when there are no words.
    return pd.DataFrame(
        rows, columns=["Audio file", "Word", "Start", "End", "Duration"]
    )
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Pair up the start and end of each utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        DataFrame with "Start" and "End" timestamp columns.

    Returns
    -------
    List
        Tuples of (start, end), one per row, in row order.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    return [(begin, finish) for begin, finish in zip(starts, ends)]
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """Merge sentence segments shorter than a minimum utterance duration.

    Each too-short segment is re-attached to whichever neighbour
    (previous or next row) has the smaller duration. Intended for the
    sentence-level DataFrame produced by ``get_sentence_data``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame with "Audio file", "Sentence", "Start", "End" and
        "Duration" columns.
    min_duration : float, optional
        Segments shorter than this (in the same units as "Duration",
        presumably seconds) are merged. Defaults to 1.6 to preserve the
        historical behavior.

    Returns
    -------
    pd.DataFrame
        New DataFrame with merged segments; the input is not modified.
    """
    # An empty input would make zip(*segments) fail below; short-circuit.
    if dataframe.empty:
        return dataframe.copy()
    corrected_dataframe = pd.DataFrame()
    # Work on plain lists; rows are deleted in place while scanning.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # Decide whether to re-attach with the left or right segment:
            # a missing neighbour gets infinite duration so it is never picked.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)
            # j is the index of the left element of the merged pair.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
            # NOTE(review): after a left merge (j == i - 1) the merged segment
            # is not re-checked against min_duration; kept as-is to preserve
            # the original behavior — confirm whether a re-check is wanted.
        else:
            i += 1
    # Rebuild the DataFrame from the (possibly shortened) lists.
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
    """Return the transcription column of a timestamps DataFrame as a list.

    Parameters
    ----------
    timestamps_df : pd.DataFrame
        DataFrame produced by this module; column index 1 holds the
        transcription text ("Sentence" or "Word").

    Returns
    -------
    List[str]
        The transcriptions, in row order.
    """
    # Positional access on purpose: works for both the sentence- and
    # word-level DataFrames, whose second column holds the text.
    transcription_column = timestamps_df.iloc[:, 1]
    return list(transcription_column)
def save_transcriptions_segments(
    audio_path: Path, transcriptions_list: List[str], destination: Path
) -> None:
    """Save transcription segments to numbered text files.

    Writes one UTF-8 text file per transcription, named
    ``<audio stem>-<index>.txt``, into *destination*.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file (only its stem is used for naming).
    transcriptions_list : List[str]
        List of transcriptions, one per segment.
    destination : Path
        Existing directory where the text files are written.
    """
    for i, transcription_i in enumerate(transcriptions_list):
        transcription_i_path = destination / f"{audio_path.stem}-{i}.txt"
        # Explicit encoding: the bare open() relied on the platform
        # default, which corrupts non-ASCII transcriptions on Windows.
        transcription_i_path.write_text(transcription_i, encoding="utf-8")
def generate_transcriptions_splits(
    audio_path: Path, timestamps_df: pd.DataFrame, destination: Path
):
    """Extract transcriptions from a timestamps DataFrame and save them.

    Thin convenience wrapper: pulls the transcription column via
    ``get_utterances_transcriptions`` and writes one text file per
    segment via ``save_transcriptions_segments``.

    Parameters
    ----------
    audio_path : Path
        Path to the audio file.
    timestamps_df : pd.DataFrame
        DataFrame containing timestamps and transcriptions.
    destination : Path
        Destination directory for the text files.
    """
    save_transcriptions_segments(
        audio_path, get_utterances_transcriptions(timestamps_df), destination
    )