Spaces:
Build error
Build error
File size: 4,603 Bytes
ff43e05 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
By kyubyong park. kbpark.linguist@gmail.com.
https://www.github.com/kyubyong/dc_tts
'''
from __future__ import print_function, division
import numpy as np
import librosa
import os, copy
import matplotlib
matplotlib.use('pdf')
import matplotlib.pyplot as plt
from scipy import signal
from .audio_params import Hyperparams as hp
import tensorflow as tf
def get_spectrograms(fpath):
'''Parse the wave file in `fpath` and
Returns normalized melspectrogram and linear spectrogram.
Args:
fpath: A string. The full path of a sound file.
Returns:
mel: A 2d array of shape (T, n_mels) and dtype of float32.
mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
'''
# Loading sound file
y, sr = librosa.load(fpath, sr=hp.sr)
# Trimming
y, _ = librosa.effects.trim(y)
# Preemphasis
y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])
# stft
linear = librosa.stft(y=y,
n_fft=hp.n_fft,
hop_length=hp.hop_length,
win_length=hp.win_length)
# magnitude spectrogram
mag = np.abs(linear) # (1+n_fft//2, T)
# mel spectrogram
mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2)
mel = np.dot(mel_basis, mag) # (n_mels, t)
# to decibel
mel = 20 * np.log10(np.maximum(1e-5, mel))
mag = 20 * np.log10(np.maximum(1e-5, mag))
# normalize
mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
# Transpose
mel = mel.T.astype(np.float32) # (T, n_mels)
mag = mag.T.astype(np.float32) # (T, 1+n_fft//2)
return mel, mag
def spectrogram2wav(mag):
'''# Generate wave file from linear magnitude spectrogram
Args:
mag: A numpy array of (T, 1+n_fft//2)
Returns:
wav: A 1-D numpy array.
'''
# transpose
mag = mag.T
# de-noramlize
mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db
# to amplitude
mag = np.power(10.0, mag * 0.05)
# wav reconstruction
wav = griffin_lim(mag**hp.power)
# de-preemphasis
wav = signal.lfilter([1], [1, -hp.preemphasis], wav)
# trim
wav, _ = librosa.effects.trim(wav)
return wav.astype(np.float32)
def griffin_lim(spectrogram):
'''Applies Griffin-Lim's raw.'''
X_best = copy.deepcopy(spectrogram)
for i in range(hp.n_iter):
X_t = invert_spectrogram(X_best)
est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length)
phase = est / np.maximum(1e-8, np.abs(est))
X_best = spectrogram * phase
X_t = invert_spectrogram(X_best)
y = np.real(X_t)
return y
def invert_spectrogram(spectrogram):
'''Applies inverse fft.
Args:
spectrogram: [1+n_fft//2, t]
'''
return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann")
def plot_alignment(alignment, gs, dir=hp.logdir):
"""Plots the alignment.
Args:
alignment: A numpy array with shape of (encoder_steps, decoder_steps)
gs: (int) global step.
dir: Output path.
"""
if not os.path.exists(dir): os.mkdir(dir)
fig, ax = plt.subplots()
im = ax.imshow(alignment)
fig.colorbar(im)
plt.title('{} Steps'.format(gs))
plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png')
plt.close(fig)
def guided_attention(g=0.2):
'''Guided attention. Refer to page 3 on the paper.'''
W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32)
for n_pos in range(W.shape[0]):
for t_pos in range(W.shape[1]):
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2 / (2 * g * g))
return W
def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0):
'''Noam scheme from tensor2tensor'''
step = tf.to_float(global_step + 1)
return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)
def load_spectrograms(fpath):
'''Read the wave file in `fpath`
and extracts spectrograms'''
fname = os.path.basename(fpath)
mel, mag = get_spectrograms(fpath)
t = mel.shape[0]
# Marginal padding for reduction shape sync.
num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0
mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant")
mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")
# Reduction
mel = mel[::hp.r, :]
return fname, mel, mag
|