Spaces:
Runtime error
Runtime error
nahue-passano
committed on
Commit
•
80f5b87
1
Parent(s):
b2f2444
update: added length fixing in short utterances
Browse files- utils/audio.py +2 -2
- utils/text.py +52 -6
utils/audio.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
|
|
4 |
import soundfile as sf
|
5 |
import pandas as pd
|
6 |
|
7 |
-
from utils.text import
|
8 |
|
9 |
|
10 |
def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
|
@@ -91,6 +91,6 @@ def generate_audio_splits(
|
|
91 |
Path were segments will be saved.
|
92 |
"""
|
93 |
audio_array, sample_rate = load_audio(audio_path)
|
94 |
-
timestamp_list =
|
95 |
audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
|
96 |
save_audio_segments(destination, audio_path, audio_segments, sample_rate)
|
|
|
4 |
import soundfile as sf
|
5 |
import pandas as pd
|
6 |
|
7 |
+
from utils.text import get_utterance_boundaries
|
8 |
|
9 |
|
10 |
def load_audio(audio_path: Path) -> Tuple[np.ndarray, float]:
|
|
|
91 |
Path were segments will be saved.
|
92 |
"""
|
93 |
audio_array, sample_rate = load_audio(audio_path)
|
94 |
+
timestamp_list = get_utterance_boundaries(timestamps_df)
|
95 |
audio_segments = split_audio(audio_array, sample_rate, timestamp_list)
|
96 |
save_audio_segments(destination, audio_path, audio_segments, sample_rate)
|
utils/text.py
CHANGED
@@ -68,15 +68,13 @@ def get_word_data(filename: str, timestamp_dict: dict):
|
|
68 |
return word_df
|
69 |
|
70 |
|
71 |
-
def
|
72 |
-
"""Generates a list from
|
73 |
|
74 |
Parameters
|
75 |
----------
|
76 |
-
|
77 |
Dataframe containing timestamps
|
78 |
-
audio_file : str
|
79 |
-
Name of the audio file.
|
80 |
|
81 |
Returns
|
82 |
-------
|
@@ -84,10 +82,58 @@ def filter_dataframe_by_audiofile(timestamps_df: pd.DataFrame, audio_file: str)
|
|
84 |
List of tuples containing the start and end of each stamp.
|
85 |
E.g: [(start_1, end_2), ..., (start_n, end_n)]
|
86 |
"""
|
87 |
-
audio_df = timestamps_df[timestamps_df["Audio file"] == audio_file]
|
88 |
return list(zip(audio_df["Start"], audio_df["End"]))
|
89 |
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
|
92 |
"""Gives column with transcriptions
|
93 |
|
|
|
68 |
return word_df
|
69 |
|
70 |
|
71 |
+
def get_utterance_boundaries(audio_df: pd.DataFrame) -> List:
    """Collect the (start, end) boundary pair of every utterance in an audio.

    Parameters
    ----------
    audio_df : pd.DataFrame
        Dataframe containing timestamps; must expose "Start" and "End"
        columns.

    Returns
    -------
    List
        List of tuples containing the start and end of each stamp.
        E.g: [(start_1, end_1), ..., (start_n, end_n)]
    """
    starts = audio_df["Start"]
    ends = audio_df["End"]
    return [boundary for boundary in zip(starts, ends)]
|
86 |
|
87 |
|
88 |
+
def check_ut_min_duration(
    dataframe: pd.DataFrame, min_duration: float = 1.6
) -> pd.DataFrame:
    """
    Concatenates audio segments that are shorter than the minimum utterance
    duration, only for sentence inference.

    Each segment shorter than ``min_duration`` is merged into its shorter
    neighbour: the boundaries are joined, the durations summed and the
    sentences concatenated. Merging repeats until no short segment remains
    (or only one segment is left).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Selected DataFrame to process. Must contain the columns
        "Audio file", "Sentence", "Start", "End" and "Duration".
    min_duration : float, optional
        Minimum allowed utterance duration in seconds (default 1.6,
        the project's previous hard-coded threshold).

    Returns
    -------
    pd.DataFrame
        DataFrame with corrected audio segments.
    """
    corrected_dataframe = pd.DataFrame()

    # Work on plain lists so in-place merging and deleting stay simple.
    segments = list(zip(dataframe["Start"], dataframe["End"]))
    segment_durations = list(dataframe["Duration"])
    names = list(dataframe["Audio file"])
    texts = list(dataframe["Sentence"])

    # An empty dataframe has nothing to merge; previously this crashed on
    # zip(*segments) below, so short-circuit and return a copy unchanged.
    if not segments:
        return dataframe.copy()

    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < min_duration:
            # See if the segment can be re-attached to the left or the right
            # neighbour; a missing neighbour counts as infinitely long so it
            # is never chosen.
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = (
                float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            )
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Re-attach the segment to the neighbour of shortest duration.
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            # NOTE(review): sentences are joined without a separator — assumes
            # transcriptions carry their own whitespace; confirm upstream.
            texts[j] = texts[j] + texts[j + 1]
            del segments[j + 1], segment_durations[j + 1], names[j + 1], texts[j + 1]
        else:
            i += 1

    # Rebuild the dataframe from the corrected lists.
    corrected_dataframe["Audio file"] = names
    corrected_dataframe["Sentence"] = texts
    corrected_dataframe["Start"], corrected_dataframe["End"] = zip(*segments)
    corrected_dataframe["Duration"] = segment_durations
    return corrected_dataframe
|
135 |
+
|
136 |
+
|
137 |
def get_utterances_transcriptions(timestamps_df: pd.DataFrame) -> List[str]:
|
138 |
"""Gives column with transcriptions
|
139 |
|