import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from librosa.filters import mel
from scipy import signal
from scipy.fftpack import fft
class Audio:
    """
    Audio class which holds music data and timestamps for notes.

    Args:
        data: audio data as a numpy array.
        samplerate: sampling rate of the audio data.
        stereo: whether the data is a Don/Ka stereo file. Normally True.

    Example:
        >>> from music_processor import *
        >>> song = Audio(*sf.read(filename))
        >>> # to get audio data
        >>> song.data
        >>> # to import .tja files:
        >>> song.import_tja(filename)
        >>> # to downmix the data to mono:
        >>> song.data = (song.data[:, 0] + song.data[:, 1]) / 2
        >>> fft_and_melscale(song, include_zero_cross=False)
    """
def __init__(self, data, samplerate, stereo=True):
self.data = data
self.samplerate = samplerate
        if stereo is False:
            # downmix to mono when the data is not a Don/Ka stereo file
            self.data = (self.data[:, 0] + self.data[:, 1]) / 2
self.timestamp = []
    def plotaudio(self, start_t, stop_t):
        # plot the first channel between sample indices start_t and stop_t
        plt.plot(
            np.linspace(start_t, stop_t, stop_t - start_t), self.data[start_t:stop_t, 0]
        )
        plt.show()
def save(self, filename, start_t=0, stop_t=None):
if stop_t is None:
stop_t = self.data.shape[0]
sf.write(filename, self.data[start_t:stop_t], self.samplerate)
    def synthesize(self, diff=True, don="./asset/don.wav", ka="./asset/ka.wav"):
        """Mixes the don/ka hit sounds into the audio at each note timestamp."""
        # load the hit sounds and downmix them to mono
        donsound = sf.read(don)[0]
        donsound = (donsound[:, 0] + donsound[:, 1]) / 2
        kasound = sf.read(ka)[0]
        kasound = (kasound[:, 0] + kasound[:, 1]) / 2
        donlen = len(donsound)
        kalen = len(kasound)
        if diff is True:
            for stamp in self.timestamp:
                timing = int(stamp[0] * self.samplerate)
                try:
                    if stamp[1] in (1, 3, 5, 6, 7):  # don-type notes
                        self.data[timing : timing + donlen] += donsound
                    elif stamp[1] in (2, 4):  # ka-type notes
                        self.data[timing : timing + kalen] += kasound
                except ValueError:
                    pass
elif diff == "don":
if isinstance(self.timestamp[0], tuple):
for stamp in self.timestamp:
if stamp * self.samplerate + donlen < self.data.shape[0]:
self.data[
int(stamp[0] * self.samplerate) : int(
stamp[0] * self.samplerate
)
+ donlen
] += donsound
else:
for stamp in self.timestamp:
if stamp * self.samplerate + donlen < self.data.shape[0]:
self.data[
int(stamp * self.samplerate) : int(stamp * self.samplerate)
+ donlen
] += donsound
elif diff == "ka":
if isinstance(self.timestamp[0], tuple):
for stamp in self.timestamp:
if stamp * self.samplerate + kalen < self.data.shape[0]:
self.data[
int(stamp[0] * self.samplerate) : int(
stamp[0] * self.samplerate
)
+ kalen
] += kasound
else:
for stamp in self.timestamp:
if stamp * self.samplerate + kalen < self.data.shape[0]:
self.data[
int(stamp * self.samplerate) : int(stamp * self.samplerate)
+ kalen
] += kasound
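
# A minimal usage sketch for Audio.synthesize (an illustration, not part of the
# original module). It assumes the asset .wav files exist and that
# song.timestamp holds (time_in_seconds, note_type) tuples, e.g. filled in by
# import_tja; the two-note chart below is hypothetical:
#
#     song = Audio(*sf.read("song.wav"))
#     song.timestamp = [(1.0, 1), (1.5, 2)]
#     song.synthesize(diff=True)
#     song.save("synthesized.wav")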
def make_frame(data, nhop, nfft):
    """
    Helper function for fft_and_melscale.
    Slices the signal into nfft-sample frames, advancing by nhop (512) samples
    each time, so that short time slices can serve as training data.
    """
length = data.shape[0]
framedata = np.concatenate((data, np.zeros(nfft))) # zero padding
return np.array(
[framedata[i * nhop : i * nhop + nfft] for i in range(length // nhop)]
)
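
# Shape sketch (an illustration, not part of the original module): with
# nhop=512 and nfft=1024, a one-second signal at 44.1 kHz is zero-padded and
# cut into 44100 // 512 = 86 frames of 1024 samples each:
#
#     make_frame(np.zeros(44100), 512, 1024).shape  # -> (86, 1024)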
# @jit
def fft_and_melscale(
    song,
    nhop=512,
    nffts=(1024, 2048, 4096),
    mel_nband=80,
    mel_freqlo=27.5,
    mel_freqhi=16000.0,
    include_zero_cross=False,
):
    """
    FFT and mel-scale method.
    fft: with nffts = (1024, 2048, 4096), windows of varying length are cut
        from the data as np.arrays and run through a fast Fourier transform.
    melscale: reduces the frequency dimension to mel bands and takes log10.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.windows.blackmanharris(nfft)
        # mel filterbank that maps FFT bins to mel_nband mel bands
        filt = mel(
            sr=song.samplerate,
            n_fft=nfft,
            n_mels=mel_nband,
            fmin=mel_freqlo,
            fmax=mel_freqhi,
        )
        # frame the signal, then window and FFT each frame,
        # keeping only the non-negative frequency bins
        frame = make_frame(song.data, nhop, nfft)
        # print(frame.shape)
        processedframe = fft(window * frame)[:, : nfft // 2 + 1]
        # apply the mel filterbank to the power spectrum and log-compress
        processedframe = np.dot(filt, np.transpose(np.abs(processedframe) ** 2))
        processedframe = 20 * np.log10(processedframe + 0.1)
# print(processedframe.shape)
feat_channels.append(processedframe)
    if include_zero_cross:
        # indices of samples where the waveform changes sign
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)
return np.array(feat_channels)
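

if __name__ == "__main__":
    # Minimal demo (a sketch; "example.wav" is a placeholder path, not part of
    # this repository): load a file, downmix to mono, and extract mel features.
    data, samplerate = sf.read("example.wav", always_2d=True)
    song = Audio(data, samplerate, stereo=False)  # stereo=False downmixes
    feats = fft_and_melscale(song)
    print(feats.shape)  # (len(nffts), mel_nband, n_frames)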