import librosa
import numpy as np


def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Splits an audio signal into non-silent segments, treating stretches of
    roughly min_silence_len milliseconds below silence_thresh as silence.

    Parameters:
    - audio (np.ndarray): The audio signal to split.
    - sr (int): The sample rate of the input audio (default 16000).
    - silence_thresh (int): Threshold in dB relative to the signal peak below
      which audio is treated as silence (default -60 dB).
    - min_silence_len (int): Minimum silence duration in milliseconds
      (default 250 ms).

    Returns:
    - list of np.ndarray: The non-silent audio segments.
    - np.ndarray: The (start, end) sample intervals of those segments.
    """
    # Use the minimum silence length as the analysis frame so that shorter
    # dips in level do not, by themselves, trigger a split.
    frame_length = int(min_silence_len / 1000 * sr)
    hop_length = frame_length // 2

    # librosa.effects.split expects a positive top_db ("how many dB below the
    # reference to treat as silence"), hence the sign flip on silence_thresh.
    intervals = librosa.effects.split(
        audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length
    )
    audio_segments = [audio[start:end] for start, end in intervals]

    return audio_segments, intervals
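
# Usage sketch (the file name here is hypothetical):
#
#     y, sr = librosa.load("speech.wav", sr=16000)
#     segments, intervals = process_audio(y, sr=sr)
#     # segments: non-silent chunks; intervals: their (start, end) samples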


def merge_audio(audio_segments_org, audio_segments_new, intervals, sr_orig, sr_new):
    """
    Merges processed audio segments back into a single signal, restoring the
    silent gaps between them. Assumes the new segments are already at sr_new.

    Parameters:
    - audio_segments_org (list of np.ndarray): The non-silent segments of the
      original audio (at sr_orig).
    - audio_segments_new (list of np.ndarray): The processed counterparts of
      those segments (at sr_new).
    - intervals (np.ndarray): The (start, end) sample intervals used to split
      the original audio (at sr_orig).
    - sr_orig (int): The sample rate of the original audio.
    - sr_new (int): The sample rate of the processed segments.

    Returns:
    - np.ndarray: The merged audio signal with silent gaps restored. Silence
      after the last segment is not reconstructed.
    """
    merged_audio = np.array([], dtype=audio_segments_new[0].dtype)
    # Factor for converting sample positions from sr_orig to sr_new.
    sr_ratio = sr_new / sr_orig

    for i, (start, end) in enumerate(intervals):
        start_new = int(start * sr_ratio)
        end_new = int(end * sr_ratio)

        # How much longer (positive) or shorter (negative) the processed
        # segment is than its original, in seconds.
        original_duration = len(audio_segments_org[i]) / sr_orig
        new_duration = len(audio_segments_new[i]) / sr_new
        duration_diff = new_duration - original_duration

        silence_samples = int(abs(duration_diff) * sr_new)
        silence_compensation = np.zeros(
            silence_samples, dtype=audio_segments_new[0].dtype
        )

        # Restore any leading silence before the first segment.
        if i == 0 and start_new > 0:
            initial_silence = np.zeros(start_new, dtype=audio_segments_new[0].dtype)
            merged_audio = np.concatenate((merged_audio, initial_silence))

        merged_audio = np.concatenate((merged_audio, audio_segments_new[i]))

        # If the processed segment came back shorter than the original, pad it
        # with silence so it still fills its original time slot.
        if duration_diff < 0:
            merged_audio = np.concatenate((merged_audio, silence_compensation))

        # Re-insert the silent gap up to the next segment. If the processed
        # segment overran its slot (duration_diff > 0), shorten the gap by the
        # overrun so later segments stay aligned with the original timeline.
        if i < len(intervals) - 1:
            next_start_new = int(intervals[i + 1][0] * sr_ratio)
            silence_duration = next_start_new - end_new
            if duration_diff > 0:
                silence_duration -= silence_samples
            if silence_duration > 0:
                silence = np.zeros(silence_duration, dtype=audio_segments_new[0].dtype)
                merged_audio = np.concatenate((merged_audio, silence))

    return merged_audio
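

# A minimal round-trip sketch on synthetic audio, assuming the intended use
# case: split on silence, "process" each segment (modeled here as a plain
# resample, purely for illustration), then merge. The tone frequency,
# durations, and sample rates below are arbitrary choices for the demo.
if __name__ == "__main__":
    sr_orig, sr_new = 16000, 24000

    # Build: 0.5 s silence, 0.5 s of a 440 Hz tone, repeated, then silence.
    t = np.linspace(0, 0.5, int(sr_orig * 0.5), endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    gap = np.zeros(int(sr_orig * 0.5), dtype=np.float32)
    audio = np.concatenate((gap, tone, gap, tone, gap))

    segments, intervals = process_audio(audio, sr=sr_orig)
    print(f"split into {len(segments)} segments: {intervals.tolist()}")

    # Stand-in "processing": resample each segment to sr_new.
    processed = [
        librosa.resample(seg, orig_sr=sr_orig, target_sr=sr_new) for seg in segments
    ]

    merged = merge_audio(segments, processed, intervals, sr_orig, sr_new)
    print(f"merged: {len(merged)} samples (~{len(merged) / sr_new:.2f} s at {sr_new} Hz)")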