File size: 2,479 Bytes
2777fde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
""" Utility file for loaders """
import numpy as np
import soundfile as sf
import wave
# Function to convert frame level audio into atomic time
def frames_to_time(total_length, sr=44100):
in_time = total_length / sr
hour = int(in_time / 3600)
minute = int((in_time - hour*3600) / 60)
second = int(in_time - hour*3600 - minute*60)
return f"{hour:02d}:{minute:02d}:{second:02d}"
# Function to convert atomic labeled time into frames or seconds
def time_to_frames(input_time, to_frames=True, sr=44100):
hour, minute, second = input_time.split(':')
total_seconds = int(hour)*3600 + int(minute)*60 + int(second)
return total_seconds*sr if to_frames else total_seconds
# Function to convert seconds to atomic labeled time
def sec_to_time(input_time):
return frames_to_time(input_time, sr=1)
# Function to load total trainable raw audio lengths
def get_total_audio_length(audio_paths):
total_length = 0
for cur_audio_path in audio_paths:
cur_wav = wave.open(cur_audio_path, 'r')
total_length += cur_wav.getnframes() # here, length = # of frames
return total_length
# Function to load length of an input wav audio
def load_wav_length(audio_path):
pt_wav = wave.open(audio_path, 'r')
length = pt_wav.getnframes()
return length
# Function to load only selected 16 bit, stereo wav audio segment from an input wav audio
def load_wav_segment(audio_path, start_point=None, duration=None, axis=1, sample_rate=44100):
start_point = 0 if start_point==None else start_point
duration = load_wav_length(audio_path) if duration==None else duration
pt_wav = wave.open(audio_path, 'r')
if pt_wav.getframerate()!=sample_rate:
raise ValueError(f"ValueError: input audio's sample rate should be {sample_rate}")
pt_wav.setpos(start_point)
x = pt_wav.readframes(duration)
if pt_wav.getsampwidth()==2:
x = np.frombuffer(x, dtype=np.int16)
X = x / float(2**15) # needs to be 16 bit format
elif pt_wav.getsampwidth()==4:
x = np.frombuffer(x, dtype=np.int32)
X = x / float(2**31) # needs to be 32 bit format
else:
raise ValueError("ValueError: input audio's bit depth should be 16 or 32-bit")
# exception for stereo channels
if pt_wav.getnchannels()==2:
X_l = np.expand_dims(X[::2], axis=axis)
X_r = np.expand_dims(X[1::2], axis=axis)
X = np.concatenate((X_l, X_r), axis=axis)
return X
|