import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from librosa.filters import mel
from scipy import signal
from scipy.fftpack import fft
class Audio:
    """
    Audio class which holds music data and timestamps for notes.

    Args:
        data: audio data as a numpy array.
        samplerate: sampling rate of the audio data.
        stereo: whether the data is a Don/Ka stereo file. Normally True.

    Example:
        >>> from music_processor import *
        >>> song = Audio(*sf.read(filename))
        >>> # to get audio data
        >>> song.data
        >>> # to import .tja files:
        >>> song.import_tja(filename)
        >>> # to downmix the data to mono:
        >>> song.data = (song.data[:, 0] + song.data[:, 1]) / 2
        >>> fft_and_melscale(song, include_zero_cross=False)
    """
def __init__(self, data, samplerate, stereo=True):
self.data = data
self.samplerate = samplerate
        if stereo is False:
            # downmix to mono when the data is not a Don/Ka stereo file
            self.data = (self.data[:, 0] + self.data[:, 1]) / 2
self.timestamp = []
    def plotaudio(self, start_t, stop_t):
        # plot the first channel between sample indices start_t and stop_t
        plt.plot(
            np.linspace(start_t, stop_t, stop_t - start_t), self.data[start_t:stop_t, 0]
        )
        plt.show()
def save(self, filename, start_t=0, stop_t=None):
if stop_t is None:
stop_t = self.data.shape[0]
sf.write(filename, self.data[start_t:stop_t], self.samplerate)
    def synthesize(self, diff=True, don="./asset/don.wav", ka="./asset/ka.wav"):
        """Mixes the don/ka hit sounds into the audio at each note timestamp."""
        # load the hit sounds and downmix them to mono
        donsound = sf.read(don)[0]
        donsound = (donsound[:, 0] + donsound[:, 1]) / 2
        kasound = sf.read(ka)[0]
        kasound = (kasound[:, 0] + kasound[:, 1]) / 2
        donlen = len(donsound)
        kalen = len(kasound)
        if diff is True:
            for stamp in self.timestamp:
                timing = int(stamp[0] * self.samplerate)
                try:
                    if stamp[1] in (1, 3, 5, 6, 7):  # don-type notes
                        self.data[timing : timing + donlen] += donsound
                    elif stamp[1] in (2, 4):  # ka-type notes
                        self.data[timing : timing + kalen] += kasound
                except ValueError:
                    pass
elif diff == "don":
if isinstance(self.timestamp[0], tuple):
for stamp in self.timestamp:
if stamp * self.samplerate + donlen < self.data.shape[0]:
self.data[
int(stamp[0] * self.samplerate) : int(
stamp[0] * self.samplerate
)
+ donlen
] += donsound
else:
for stamp in self.timestamp:
if stamp * self.samplerate + donlen < self.data.shape[0]:
self.data[
int(stamp * self.samplerate) : int(stamp * self.samplerate)
+ donlen
] += donsound
elif diff == "ka":
if isinstance(self.timestamp[0], tuple):
for stamp in self.timestamp:
if stamp * self.samplerate + kalen < self.data.shape[0]:
self.data[
int(stamp[0] * self.samplerate) : int(
stamp[0] * self.samplerate
)
+ kalen
] += kasound
else:
for stamp in self.timestamp:
if stamp * self.samplerate + kalen < self.data.shape[0]:
self.data[
int(stamp * self.samplerate) : int(stamp * self.samplerate)
+ kalen
] += kasound
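
# A minimal usage sketch for Audio.synthesize (an illustration, not part of the
# original module). It assumes the asset .wav files exist and that
# song.timestamp holds (time_in_seconds, note_type) tuples, e.g. filled in by
# import_tja; the two-note chart below is hypothetical:
#
#     song = Audio(*sf.read("song.wav"))
#     song.timestamp = [(1.0, 1), (1.5, 2)]
#     song.synthesize(diff=True)
#     song.save("synthesized.wav")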
def make_frame(data, nhop, nfft):
    """
    Helper function for fft_and_melscale.
    Slices the signal into nfft-sample frames, advancing by nhop (512) samples
    each time, so that short time slices can serve as training data.
    """
length = data.shape[0]
framedata = np.concatenate((data, np.zeros(nfft))) # zero padding
return np.array(
[framedata[i * nhop : i * nhop + nfft] for i in range(length // nhop)]
)
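
# Shape sketch (an illustration, not part of the original module): with
# nhop=512 and nfft=1024, a one-second signal at 44.1 kHz is zero-padded and
# cut into 44100 // 512 = 86 frames of 1024 samples each:
#
#     make_frame(np.zeros(44100), 512, 1024).shape  # -> (86, 1024)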
# @jit
def fft_and_melscale(
    song,
    nhop=512,
    nffts=(1024, 2048, 4096),
    mel_nband=80,
    mel_freqlo=27.5,
    mel_freqhi=16000.0,
    include_zero_cross=False,
):
    """
    FFT and mel-scale method.
    fft: with nffts = (1024, 2048, 4096), windows of varying length are cut
        from the data as np.arrays and run through a fast Fourier transform.
    melscale: reduces the frequency dimension to mel bands and takes log10.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.windows.blackmanharris(nfft)
        # mel filterbank that maps FFT bins to mel_nband mel bands
        filt = mel(
            sr=song.samplerate,
            n_fft=nfft,
            n_mels=mel_nband,
            fmin=mel_freqlo,
            fmax=mel_freqhi,
        )
        # frame the signal, then window and FFT each frame,
        # keeping only the non-negative frequency bins
        frame = make_frame(song.data, nhop, nfft)
        # print(frame.shape)
        processedframe = fft(window * frame)[:, : nfft // 2 + 1]
        # apply the mel filterbank to the power spectrum and log-compress
        processedframe = np.dot(filt, np.transpose(np.abs(processedframe) ** 2))
        processedframe = 20 * np.log10(processedframe + 0.1)
# print(processedframe.shape)
feat_channels.append(processedframe)
    if include_zero_cross:
        # indices of samples where the waveform changes sign
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)
return np.array(feat_channels)
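

if __name__ == "__main__":
    # Minimal demo (a sketch; "example.wav" is a placeholder path, not part of
    # this repository): load a file, downmix to mono, and extract mel features.
    data, samplerate = sf.read("example.wav", always_2d=True)
    song = Audio(data, samplerate, stereo=False)  # stereo=False downmixes
    feats = fft_and_melscale(song)
    print(feats.shape)  # (len(nffts), mel_nband, n_frames)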