File size: 2,479 Bytes
2777fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
""" Utility file for loaders """

import numpy as np
import soundfile as sf
import wave



# Function to convert a length in frames into an "HH:MM:SS" time label
def frames_to_time(total_length, sr=44100):
    """Format a frame count as a zero-padded ``HH:MM:SS`` string.

    Args:
        total_length: Length in frames.
        sr: Sample rate (frames per second) used to convert frames to seconds.

    Returns:
        The elapsed time as ``"HH:MM:SS"``. Fractional seconds are truncated,
        and the hour field grows beyond two digits when needed.
    """
    elapsed = total_length / sr
    hours = int(elapsed / 3600)
    # Seconds left over once the whole hours have been removed.
    remainder = elapsed - hours * 3600
    minutes = int(remainder / 60)
    seconds = int(remainder - minutes * 60)
    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)


# Function to convert an "HH:MM:SS" time label into frames or seconds
def time_to_frames(input_time, to_frames=True, sr=44100):
    """Parse an ``HH:MM:SS`` string into a frame count or a number of seconds.

    Args:
        input_time: Time label made of three ':'-separated integer fields.
        to_frames: When true, return frames; otherwise return whole seconds.
        sr: Sample rate (frames per second) used when converting to frames.

    Returns:
        ``seconds * sr`` when ``to_frames`` is true, else the seconds.

    Raises:
        ValueError: If the label does not have exactly three fields or a
            field is not an integer.
    """
    fields = input_time.split(':')
    hour_str, minute_str, second_str = fields
    seconds = int(hour_str) * 3600 + int(minute_str) * 60 + int(second_str)
    if to_frames:
        return seconds * sr
    return seconds


# Function to convert a number of seconds into an "HH:MM:SS" time label
def sec_to_time(input_time):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    Delegates to ``frames_to_time`` with a sample rate of 1, so that one
    "frame" corresponds to one second.
    """
    return frames_to_time(total_length=input_time, sr=1)


# Function to load the total length (in frames) of a collection of wav files
def get_total_audio_length(audio_paths):
    """Sum the frame counts of every wav file in ``audio_paths``.

    Args:
        audio_paths: Iterable of paths to wav files readable by ``wave``.

    Returns:
        Total number of frames across all files (0 for an empty iterable).
    """
    total_length = 0
    for cur_audio_path in audio_paths:
        # The context manager closes each file as soon as its header is read,
        # so long path lists do not exhaust the process's file descriptors.
        with wave.open(cur_audio_path, 'r') as cur_wav:
            total_length += cur_wav.getnframes()    # here, length = # of frames
    return total_length


# Function to load the length (in frames) of an input wav audio
def load_wav_length(audio_path):
    """Return the number of frames stored in the wav file at ``audio_path``.

    The file is closed before returning, so no handle is left open.
    """
    with wave.open(audio_path, 'r') as pt_wav:
        return pt_wav.getnframes()


# Function to load only a selected segment of a 16- or 32-bit PCM wav audio
def load_wav_segment(audio_path, start_point=None, duration=None, axis=1, sample_rate=44100):
    """Read part of a wav file and return it as floats scaled by full scale.

    Args:
        audio_path: Path to a wav file readable by ``wave``.
        start_point: Frame index to start reading from; ``None`` means 0.
        duration: Number of frames to read; ``None`` requests the file's
            total frame count, i.e. everything from ``start_point`` onward.
        axis: For 2-channel audio, the axis along which the left and right
            channels are laid out (``1`` gives shape ``(frames, 2)``, ``0``
            gives ``(2, frames)``).
        sample_rate: Sample rate the file is required to have.

    Returns:
        A float array of samples divided by 2**15 (16-bit) or 2**31 (32-bit).
        2-channel audio is de-interleaved into two channels along ``axis``;
        for any other channel count the samples are returned as a flat 1-D
        array in the order they are stored in the file.

    Raises:
        ValueError: If the file's sample rate differs from ``sample_rate`` or
            its sample width is neither 2 nor 4 bytes.
    """
    # The context manager guarantees the file is closed on every path,
    # including the early ValueError exits.
    with wave.open(audio_path, 'r') as pt_wav:
        if pt_wav.getframerate() != sample_rate:
            raise ValueError(f"ValueError: input audio's sample rate should be {sample_rate}")

        start_point = 0 if start_point is None else start_point
        # Take the default length from the handle that is already open
        # instead of opening the file a second time.
        duration = pt_wav.getnframes() if duration is None else duration

        pt_wav.setpos(start_point)
        raw = pt_wav.readframes(duration)
        sample_width = pt_wav.getsampwidth()
        n_channels = pt_wav.getnchannels()

    if sample_width == 2:
        X = np.frombuffer(raw, dtype=np.int16) / float(2**15)    # 16 bit format
    elif sample_width == 4:
        X = np.frombuffer(raw, dtype=np.int32) / float(2**31)    # 32 bit format
    else:
        raise ValueError("ValueError: input audio's bit depth should be 16 or 32-bit")

    # Stereo frames are stored interleaved (L, R, L, R, ...): split them into
    # two channels and place the channels along the requested axis.
    if n_channels == 2:
        X = np.stack((X[::2], X[1::2]), axis=axis)
    return X