# Comparative-Analysis-of-Speech-Synthesis-Models/TensorFlowTTS/examples/tacotron2/export_align.py
import argparse
import os
import shutil
from pathlib import Path

import numpy as np
from scipy.ndimage import zoom
from scipy.spatial.distance import cdist
from skimage.data import camera
from tqdm import tqdm
def safemkdir(dirn):
    """Create directory ``dirn`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        dirn: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirn`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() + mkdir() and also copes with a missing parent folder.
    os.makedirs(dirn, exist_ok=True)
def duration_to_alignment(in_duration):
    """Expand per-token durations into a hard 0/1 alignment matrix.

    Row ``i`` of the result holds 1.0 on the ``in_duration[i]`` consecutive
    columns that follow the columns claimed by rows ``0..i-1`` and 0.0
    everywhere else.

    Args:
        in_duration: 1-D sequence of non-negative durations. Values are cast
            to integers, so an empty list or a float-typed array is accepted.

    Returns:
        ``np.float32`` array of shape ``(len(in_duration), sum(in_duration))``.

    Raises:
        ValueError: If a duration is negative.
    """
    # Cast up front: np.sum of an empty list or of a float array is a float,
    # which cannot be used as an array dimension.
    durations = np.asarray(in_duration).astype(np.int64)
    num_chars = len(durations)
    total_len = int(durations.sum())

    attention = np.zeros(shape=(num_chars, total_len), dtype=np.float32)

    # frame_owner[j] is the row that owns column j.
    frame_owner = np.repeat(np.arange(num_chars), durations)
    attention[frame_owner, np.arange(total_len)] = 1.0
    return attention
def rescale_alignment(in_alignment, in_targcharlen):
    """Resample an alignment along its first axis and binarise the result.

    The matrix is zoomed so that its first dimension becomes
    ``in_targcharlen`` while the second dimension keeps its size. Every
    interpolated value below 0.5 becomes 0.0, everything else becomes 1.0.

    Args:
        in_alignment: 2-D alignment array.
        in_targcharlen: Desired size of the first dimension.

    Returns:
        Tuple ``(zoomed, pivot_points)`` where ``zoomed`` is the binarised
        array with ``in_targcharlen`` rows and ``pivot_points`` is the list
        of ``(row, col)`` tuples of its 1.0 cells in row-major order.
    """
    current_x = in_alignment.shape[0]
    x_ratio = in_targcharlen / current_x

    zoomed = zoom(in_alignment, (x_ratio, 1.0), mode="nearest")
    # Interpolation produces fractional values; snap them back to 0/1.
    zoomed = np.where(zoomed < 0.5, 0.0, 1.0).astype(zoomed.dtype)

    if zoomed.shape[0] != in_targcharlen:
        print("Zooming didn't rshape well, explicitly reshaping")
        # Truncate surplus rows or pad missing rows with zeros. Copying into
        # a fresh array avoids the reference checks of in-place resize().
        fixed = np.zeros(
            (in_targcharlen, in_alignment.shape[1]), dtype=zoomed.dtype
        )
        n_rows = min(in_targcharlen, zoomed.shape[0])
        n_cols = min(in_alignment.shape[1], zoomed.shape[1])
        fixed[:n_rows, :n_cols] = zoomed[:n_rows, :n_cols]
        zoomed = fixed

    # Collect pivots only once the final shape is settled, so every returned
    # coordinate is guaranteed to lie inside the returned matrix.
    pivot_points = [tuple(pt) for pt in np.argwhere(zoomed == 1.0).tolist()]
    return zoomed, pivot_points
def gather_dist(in_mtr, in_points):
    """Compute Euclidean distances from every matrix cell to a set of points.

    Args:
        in_mtr: 2-D array; only its shape is used.
        in_points: Sequence of ``(row, col)`` coordinates.

    Returns:
        Array of shape ``(rows * cols, len(in_points))``. Row ``k`` belongs to
        cell ``(k // cols, k % cols)``, i.e. cells are listed in row-major
        order. An empty ``in_points`` yields an array with zero columns.
    """
    n_rows, n_cols = in_mtr.shape[0], in_mtr.shape[1]

    # np.indices gives the row and column index of every cell; flattening
    # and transposing yields one (row, col) pair per cell in row-major order.
    full_coords = (
        np.indices((n_rows, n_cols)).reshape(2, -1).T.astype(np.float64)
    )

    if len(in_points) == 0:
        # cdist rejects an empty point list, so return the empty result here.
        return np.zeros((full_coords.shape[0], 0), dtype=np.float64)

    return cdist(full_coords, in_points, "euclidean")
def create_guided(in_align, in_pvt, looseness):
    """Build a guided-attention map from a set of pivot points.

    Each cell receives ``(d / real_loose) ** 2`` clipped to ``[0, 1]``, where
    ``d`` is the Euclidean distance to the nearest pivot point and
    ``real_loose = looseness / 100 * (rows + cols)``.

    Args:
        in_align: 2-D array; only its shape is used.
        in_pvt: Sequence of ``(row, col)`` pivot coordinates.
        looseness: User-facing looseness value; lower values give a tighter
            map.

    Returns:
        ``np.float32`` array with the shape of ``in_align``. When ``in_pvt``
        is empty every cell is 1.0.
    """
    if len(in_pvt) == 0:
        # No pivot means every cell is infinitely far from one; after
        # clipping that is 1.0 everywhere.
        return np.ones(in_align.shape, dtype=np.float32)

    # It is dramatically faster to gather all the points and calculate the
    # distances in one go than to do it manually for each cell.
    dist_arr = gather_dist(in_align, in_pvt)

    # Scale looseness based on attention size (addition works better than
    # mul). Also divide by 100 because having user input 3.35 is nicer.
    real_loose = (looseness / 100) * (in_align.shape[0] + in_align.shape[1])

    # dist_arr rows follow row-major cell order, so the per-cell minimum can
    # be reshaped straight back onto the grid.
    nearest = dist_arr.min(axis=1)
    scaled = np.power(nearest / real_loose, 2)
    new_att = scaled.reshape(in_align.shape[0], in_align.shape[1]).astype(
        np.float32
    )
    return np.clip(new_att, 0.0, 1.0)
def get_pivot_points(in_att):
    """Collect the coordinates of all cells whose value exceeds 0.8.

    Args:
        in_att: 2-D array.

    Returns:
        List of ``(row, col)`` tuples of plain ints in row-major order.
    """
    # argwhere scans in row-major order, matching a row-then-column loop.
    return [tuple(pt) for pt in np.argwhere(in_att > 0.8).tolist()]
def main():
    """Turn dumped duration files into guided-attention alignment files.

    For the ``train`` and ``valid`` sub-folders of ``--dump-dir``, every
    ``.npy`` file in ``durations/`` is loaded together with its counterpart
    in ``ids/`` (same name with ``-durations`` replaced by ``-ids``). The
    durations are expanded to a hard alignment, rescaled when the row count
    differs from the id count, converted to a guided-attention map and saved
    as float32 under ``alignments/``.
    """
    parser = argparse.ArgumentParser(
        description="Postprocess durations to become alignments"
    )
    parser.add_argument(
        "--dump-dir",
        default="dump",
        type=str,
        help="Path of dump directory",
    )
    parser.add_argument(
        "--looseness",
        default=3.5,
        type=float,
        help="Looseness of the generated guided attention map. Lower values = tighter",
    )
    args = parser.parse_args()

    dump_dir = args.dump_dir
    dump_sets = ["train", "valid"]

    for d_set in dump_sets:
        full_fol = os.path.join(dump_dir, d_set)
        align_path = os.path.join(full_fol, "alignments")
        ids_path = os.path.join(full_fol, "ids")
        durations_path = os.path.join(full_fol, "durations")

        safemkdir(align_path)

        for duration_fn in tqdm(os.listdir(durations_path)):
            # A substring test would also accept names such as "x.npy.tmp".
            if not duration_fn.endswith(".npy"):
                continue

            id_fn = duration_fn.replace("-durations", "-ids")
            id_path = os.path.join(ids_path, id_fn)
            duration_path = os.path.join(durations_path, duration_fn)

            duration_arr = np.load(duration_path)
            id_arr = np.load(id_path)
            id_true_size = len(id_arr)

            align = duration_to_alignment(duration_arr)

            # The alignment needs one row per id; rescale when the duration
            # file has a different number of entries.
            if align.shape[0] != id_true_size:
                align, points = rescale_alignment(align, id_true_size)
            else:
                points = get_pivot_points(align)

            if len(points) == 0:
                print("WARNING points are empty for", id_fn)

            align = create_guided(align, points, args.looseness)

            align_fn = id_fn.replace("-ids", "-alignment")
            align_full_fn = os.path.join(align_path, align_fn)
            np.save(align_full_fn, align.astype("float32"))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()