kevinwang676 committed
Commit
1f74c86
1 Parent(s): dd6f15d

Upload 6 files

Files changed (6)
  1. commons.py +171 -0
  2. mel_processing.py +112 -0
  3. models.py +351 -0
  4. modules.py +342 -0
  5. tts_voice.py +290 -0
  6. utils.py +305 -0
commons.py ADDED
@@ -0,0 +1,171 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    # Flatten a reversed list of [left, right] pairs into the layout F.pad expects.
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q) between two diagonal Gaussians given means and log-stds."""
    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    # Gather a fixed-size window from each example, starting at ids_str[i].
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def rand_spec_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    device = duration.device

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1. / norm_type)
    return total_norm
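
A minimal usage sketch for the masking and slicing helpers above (the shapes and values here are illustrative, not from the commit):

import torch
import commons

lengths = torch.tensor([3, 5])              # valid frames per example
mask = commons.sequence_mask(lengths)       # [2, 5] boolean mask
# [[True, True, True, False, False],
#  [True, True, True, True,  True ]]

x = torch.randn(2, 8, 10)                   # [batch, channels, time]
seg, ids = commons.rand_slice_segments(x, torch.tensor([10, 10]), segment_size=4)
# seg: [2, 8, 4] random windows; ids: the start frame chosen for each example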
mel_processing.py ADDED
@@ -0,0 +1,112 @@
import torch
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


# Caches keyed by "<param>_<dtype>_<device>" so mel bases and windows are built once per setting.
mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    # return_complex=False is deprecated in recent PyTorch; newer code would use
    # return_complex=True followed by torch.view_as_real.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
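
A sketch of computing a log-mel spectrogram with the helper above; the STFT settings are placeholders and should come from the model's config:

import torch
from mel_processing import mel_spectrogram_torch

wav = torch.randn(1, 16000).clamp(-1, 1)    # [batch, samples], float waveform in [-1, 1]
mel = mel_spectrogram_torch(wav, n_fft=1280, num_mels=80, sampling_rate=16000,
                            hop_size=320, win_size=1280, fmin=0, fmax=None)
print(mel.shape)                            # [1, 80, n_frames]

With fmax=None, librosa defaults the upper band edge to sampling_rate / 2.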
models.py ADDED
@@ -0,0 +1,351 @@
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules

from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding


class ResidualCouplingBlock(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class Encoder(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # reshape 1d waveform to 2d: [b, c, t] -> [b, c, t // period, period]
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class SpeakerEncoder(torch.nn.Module):
    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
        super(SpeakerEncoder, self).__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        embeds_raw = self.relu(self.linear(hidden[-1]))
        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        mel_slices = []
        for i in range(0, total_frames - partial_frames, partial_hop):
            mel_range = torch.arange(i, i + partial_frames)
            mel_slices.append(mel_range)

        return mel_slices

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        mel_len = mel.size(1)
        last_mel = mel[:, -partial_frames:]

        if mel_len > partial_frames:
            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
            mels = list(mel[:, s] for s in mel_slices)
            mels.append(last_mel)
            mels = torch.stack(tuple(mels), 0).squeeze(1)

            with torch.no_grad():
                partial_embeds = self(mels)
            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
            # embed = embed / torch.linalg.norm(embed, 2)
        else:
            with torch.no_grad():
                embed = self(last_mel)

        return embed


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 gin_channels,
                 ssl_dim,
                 use_spk,
                 **kwargs):

        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.ssl_dim = ssl_dim
        self.use_spk = use_spk

        self.enc_p = Encoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

        if not self.use_spk:
            self.enc_spk = SpeakerEncoder(model_hidden_size=gin_channels, model_embedding_size=gin_channels)

    def forward(self, c, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if spec_lengths is None:
            spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)

        if not self.use_spk:
            g = self.enc_spk(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        _, m_p, logs_p, _ = self.enc_p(c, c_lengths)
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
        z_p = self.flow(z, spec_mask, g=g)

        z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size)
        o = self.dec(z_slice, g=g)

        return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, c, g=None, mel=None, c_lengths=None):
        if c_lengths is None:
            c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        if not self.use_spk:
            g = self.enc_spk.embed_utterance(mel.transpose(1, 2))
        g = g.unsqueeze(-1)

        z_p, m_p, logs_p, c_mask = self.enc_p(c, c_lengths)
        z = self.flow(z_p, c_mask, g=g, reverse=True)
        o = self.dec(z * c_mask, g=g)

        return o
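
As a sketch, voice-conversion inference wires content features through enc_p, runs the flow in reverse, and decodes with the HiFi-GAN-style generator. The hyperparameters below are illustrative FreeVC-style values, not values stored in this commit:

import torch
from models import SynthesizerTrn

net_g = SynthesizerTrn(
    spec_channels=641, segment_size=8960, inter_channels=192,
    hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
    kernel_size=3, p_dropout=0.1, resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    gin_channels=256, ssl_dim=1024, use_spk=False).eval()

c = torch.randn(1, 1024, 100)    # content features, e.g. from WavLM: [batch, ssl_dim, frames]
mel = torch.randn(1, 80, 100)    # reference mel for the speaker encoder: [batch, 80, frames]
with torch.no_grad():
    audio = net_g.infer(c, mel=mel)   # [1, 1, frames * prod(upsample_rates)]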
modules.py ADDED
@@ -0,0 +1,342 @@
import torch
from torch import nn
from torch.nn import functional as F

from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm

import commons
from commons import init_weights, get_padding


LRELU_SLOPE = 0.1


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class WN(torch.nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # the last layer has no residual branch, only a skip output
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)


class ResBlock1(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x
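
A quick invertibility check for the coupling layer above: running the flow forward and then in reverse should recover the input (a sketch with toy shapes):

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=3,
                              dilation_rate=1, n_layers=2, mean_only=True).eval()
x = torch.randn(2, 4, 10)        # [batch, channels, time]
x_mask = torch.ones(2, 1, 10)
with torch.no_grad():
    y, logdet = layer(x, x_mask)             # forward direction
    x_rec = layer(y, x_mask, reverse=True)   # inverse direction
print(torch.allclose(x, x_rec, atol=1e-5))   # True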
tts_voice.py ADDED
@@ -0,0 +1,290 @@
tts_order_voice = {
    '英语 (美国)-Jenny-女': 'en-US-JennyNeural',
    '英语 (美国)-Guy-男': 'en-US-GuyNeural',
    '英语 (美国)-Ana-女': 'en-US-AnaNeural',
    '英语 (美国)-Aria-女': 'en-US-AriaNeural',
    '英语 (美国)-Christopher-男': 'en-US-ChristopherNeural',
    '英语 (美国)-Eric-男': 'en-US-EricNeural',
    '英语 (美国)-Michelle-女': 'en-US-MichelleNeural',
    '英语 (美国)-Roger-男': 'en-US-RogerNeural',
    '西班牙语 (墨西哥)-Dalia-女': 'es-MX-DaliaNeural',
    '西班牙语 (墨西哥)-Jorge-男': 'es-MX-JorgeNeural',
    '韩语 (韩国)-Sun-Hi-女': 'ko-KR-SunHiNeural',
    '韩语 (韩国)-InJoon-男': 'ko-KR-InJoonNeural',
    '泰语 (泰国)-Premwadee-女': 'th-TH-PremwadeeNeural',
    '泰语 (泰国)-Niwat-男': 'th-TH-NiwatNeural',
    '越南语 (越南)-HoaiMy-女': 'vi-VN-HoaiMyNeural',
    '越南语 (越南)-NamMinh-男': 'vi-VN-NamMinhNeural',
    '日语 (日本)-Nanami-女': 'ja-JP-NanamiNeural',
    '日语 (日本)-Keita-男': 'ja-JP-KeitaNeural',
    '法语 (法国)-Denise-女': 'fr-FR-DeniseNeural',
    '法语 (法国)-Eloise-女': 'fr-FR-EloiseNeural',
    '法语 (法国)-Henri-男': 'fr-FR-HenriNeural',
    '葡萄牙语 (巴西)-Francisca-女': 'pt-BR-FranciscaNeural',
    '葡萄牙语 (巴西)-Antonio-男': 'pt-BR-AntonioNeural',
    '印度尼西亚语 (印度尼西亚)-Ardi-男': 'id-ID-ArdiNeural',
    '印度尼西亚语 (印度尼西亚)-Gadis-女': 'id-ID-GadisNeural',
    '希伯来语 (以色列)-Avri-男': 'he-IL-AvriNeural',
    '希伯来语 (以色列)-Hila-女': 'he-IL-HilaNeural',
    '意大利语 (意大利)-Isabella-女': 'it-IT-IsabellaNeural',
    '意大利语 (意大利)-Diego-男': 'it-IT-DiegoNeural',
    '意大利语 (意大利)-Elsa-女': 'it-IT-ElsaNeural',
    '荷兰语 (荷兰)-Colette-女': 'nl-NL-ColetteNeural',
    '荷兰语 (荷兰)-Fenna-女': 'nl-NL-FennaNeural',
    '荷兰语 (荷兰)-Maarten-男': 'nl-NL-MaartenNeural',
    '马来语 (马来西亚)-Osman-男': 'ms-MY-OsmanNeural',
    '马来语 (马来西亚)-Yasmin-女': 'ms-MY-YasminNeural',
    '挪威语 (挪威)-Pernille-女': 'nb-NO-PernilleNeural',
    '挪威语 (挪威)-Finn-男': 'nb-NO-FinnNeural',
    '瑞典语 (瑞典)-Sofie-女': 'sv-SE-SofieNeural',
    '瑞典语 (瑞典)-Mattias-男': 'sv-SE-MattiasNeural',
    '阿拉伯语 (沙特阿拉伯)-Hamed-男': 'ar-SA-HamedNeural',
    '阿拉伯语 (沙特阿拉伯)-Zariyah-女': 'ar-SA-ZariyahNeural',
    '希腊语 (希腊)-Athina-女': 'el-GR-AthinaNeural',
    '希腊语 (希腊)-Nestoras-男': 'el-GR-NestorasNeural',
    '德语 (德国)-Katja-女': 'de-DE-KatjaNeural',
    '德语 (德国)-Amala-女': 'de-DE-AmalaNeural',
    '德语 (德国)-Conrad-男': 'de-DE-ConradNeural',
    '德语 (德国)-Killian-男': 'de-DE-KillianNeural',
    '阿拉伯语 (南非)-Adri-女': 'af-ZA-AdriNeural',
    '阿拉伯语 (南非)-Willem-男': 'af-ZA-WillemNeural',
    '阿姆哈拉语 (埃塞俄比亚)-Ameha-男': 'am-ET-AmehaNeural',
    '阿姆哈拉语 (埃塞俄比亚)-Mekdes-女': 'am-ET-MekdesNeural',
    '阿拉伯语 (阿拉伯联合酋长国)-Fatima-女': 'ar-AE-FatimaNeural',
    '阿拉伯语 (阿拉伯联合酋长国)-Hamdan-男': 'ar-AE-HamdanNeural',
    '阿拉伯语 (巴林)-Ali-男': 'ar-BH-AliNeural',
    '阿拉伯语 (巴林)-Laila-女': 'ar-BH-LailaNeural',
    '阿拉伯语 (阿尔及利亚)-Ismael-男': 'ar-DZ-IsmaelNeural',
    '阿拉伯语 (埃及)-Salma-女': 'ar-EG-SalmaNeural',
    '阿拉伯语 (埃及)-Shakir-男': 'ar-EG-ShakirNeural',
    '阿拉伯语 (伊拉克)-Bassel-男': 'ar-IQ-BasselNeural',
    '阿拉伯语 (伊拉克)-Rana-女': 'ar-IQ-RanaNeural',
    '阿拉伯语 (约旦)-Sana-女': 'ar-JO-SanaNeural',
    '阿拉伯语 (约旦)-Taim-男': 'ar-JO-TaimNeural',
    '阿拉伯语 (科威特)-Fahed-男': 'ar-KW-FahedNeural',
    '阿拉伯语 (科威特)-Noura-女': 'ar-KW-NouraNeural',
    '阿拉伯语 (黎巴嫩)-Layla-女': 'ar-LB-LaylaNeural',
    '阿拉伯语 (黎巴嫩)-Rami-男': 'ar-LB-RamiNeural',
    '阿拉伯语 (利比亚)-Iman-女': 'ar-LY-ImanNeural',
    '阿拉伯语 (利比亚)-Omar-男': 'ar-LY-OmarNeural',
    '阿拉伯语 (摩洛哥)-Jamal-男': 'ar-MA-JamalNeural',
    '阿拉伯语 (摩洛哥)-Mouna-女': 'ar-MA-MounaNeural',
    '阿拉伯语 (阿曼)-Abdullah-男': 'ar-OM-AbdullahNeural',
    '阿拉伯语 (阿曼)-Aysha-女': 'ar-OM-AyshaNeural',
    '阿拉伯语 (卡塔尔)-Amal-女': 'ar-QA-AmalNeural',
    '阿拉伯语 (卡塔尔)-Moaz-男': 'ar-QA-MoazNeural',
    '阿拉伯语 (叙利亚)-Amany-女': 'ar-SY-AmanyNeural',
    '阿拉伯语 (叙利亚)-Laith-男': 'ar-SY-LaithNeural',
    '阿拉伯语 (突尼斯)-Hedi-男': 'ar-TN-HediNeural',
    '阿拉伯语 (突尼斯)-Reem-女': 'ar-TN-ReemNeural',
    '阿拉伯语 (也门)-Maryam-女': 'ar-YE-MaryamNeural',
    '阿拉伯语 (也门)-Saleh-男': 'ar-YE-SalehNeural',
    '阿塞拜疆语 (阿塞拜疆)-Babek-男': 'az-AZ-BabekNeural',
    '阿塞拜疆语 (阿塞拜疆)-Banu-女': 'az-AZ-BanuNeural',
    '保加利亚语 (保加利亚)-Borislav-男': 'bg-BG-BorislavNeural',
    '保加利亚语 (保加利亚)-Kalina-女': 'bg-BG-KalinaNeural',
    '孟加拉语 (孟加拉国)-Nabanita-女': 'bn-BD-NabanitaNeural',
    '孟加拉语 (孟加拉国)-Pradeep-男': 'bn-BD-PradeepNeural',
    '孟加拉语 (印度)-Bashkar-男': 'bn-IN-BashkarNeural',
    '孟加拉语 (印度)-Tanishaa-女': 'bn-IN-TanishaaNeural',
    '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Goran-男': 'bs-BA-GoranNeural',
    '波斯尼亚语 (波斯尼亚和黑塞哥维那)-Vesna-女': 'bs-BA-VesnaNeural',
    '加泰罗尼亚语 (西班牙)-Joana-女': 'ca-ES-JoanaNeural',
    '加泰罗尼亚语 (西班牙)-Enric-男': 'ca-ES-EnricNeural',
    '捷克语 (捷克共和国)-Antonin-男': 'cs-CZ-AntoninNeural',
    '捷克语 (捷克共和国)-Vlasta-女': 'cs-CZ-VlastaNeural',
    '威尔士语 (英国)-Aled-男': 'cy-GB-AledNeural',
    '威尔士语 (英国)-Nia-女': 'cy-GB-NiaNeural',
    '丹麦语 (丹麦)-Christel-女': 'da-DK-ChristelNeural',
    '丹麦语 (丹麦)-Jeppe-男': 'da-DK-JeppeNeural',
    '德语 (奥地利)-Ingrid-女': 'de-AT-IngridNeural',
    '德语 (奥地利)-Jonas-男': 'de-AT-JonasNeural',
    '德语 (瑞士)-Jan-男': 'de-CH-JanNeural',
    '德语 (瑞士)-Leni-女': 'de-CH-LeniNeural',
    '英语 (澳大利亚)-Natasha-女': 'en-AU-NatashaNeural',
    '英语 (澳大利亚)-William-男': 'en-AU-WilliamNeural',
    '英语 (加拿大)-Clara-女': 'en-CA-ClaraNeural',
    '英语 (加拿大)-Liam-男': 'en-CA-LiamNeural',
    '英语 (英国)-Libby-女': 'en-GB-LibbyNeural',
    '英语 (英国)-Maisie-女': 'en-GB-MaisieNeural',
    '英语 (英国)-Ryan-男': 'en-GB-RyanNeural',
    '英语 (英国)-Sonia-女': 'en-GB-SoniaNeural',
    '英语 (英国)-Thomas-男': 'en-GB-ThomasNeural',
    '英语 (香港)-Sam-男': 'en-HK-SamNeural',
    '英语 (香港)-Yan-女': 'en-HK-YanNeural',
    '英语 (爱尔兰)-Connor-男': 'en-IE-ConnorNeural',
    '英语 (爱尔兰)-Emily-女': 'en-IE-EmilyNeural',
    '英语 (印度)-Neerja-女': 'en-IN-NeerjaNeural',
    '英语 (印度)-Prabhat-男': 'en-IN-PrabhatNeural',
    '英语 (肯尼亚)-Asilia-女': 'en-KE-AsiliaNeural',
    '英语 (肯尼亚)-Chilemba-男': 'en-KE-ChilembaNeural',
    '英语 (尼日利亚)-Abeo-男': 'en-NG-AbeoNeural',
    '英语 (尼日利亚)-Ezinne-女': 'en-NG-EzinneNeural',
    '英语 (新西兰)-Mitchell-男': 'en-NZ-MitchellNeural',
    '英语 (菲律宾)-James-男': 'en-PH-JamesNeural',
    '英语 (菲律宾)-Rosa-女': 'en-PH-RosaNeural',
    '英语 (新加坡)-Luna-女': 'en-SG-LunaNeural',
    '英语 (新加坡)-Wayne-男': 'en-SG-WayneNeural',
    '英语 (坦桑尼亚)-Elimu-男': 'en-TZ-ElimuNeural',
    '英语 (坦桑尼亚)-Imani-女': 'en-TZ-ImaniNeural',
    '英语 (南非)-Leah-女': 'en-ZA-LeahNeural',
    '英语 (南非)-Luke-男': 'en-ZA-LukeNeural',
    '西班牙语 (阿根廷)-Elena-女': 'es-AR-ElenaNeural',
    '西班牙语 (阿根廷)-Tomas-男': 'es-AR-TomasNeural',
    '西班牙语 (玻利维亚)-Marcelo-男': 'es-BO-MarceloNeural',
    '西班牙语 (玻利维亚)-Sofia-女': 'es-BO-SofiaNeural',
    '西班牙语 (哥伦比亚)-Gonzalo-男': 'es-CO-GonzaloNeural',
    '西班牙语 (哥伦比亚)-Salome-女': 'es-CO-SalomeNeural',
    '西班牙语 (哥斯达黎加)-Juan-男': 'es-CR-JuanNeural',
    '西班牙语 (哥斯达黎加)-Maria-女': 'es-CR-MariaNeural',
    '西班牙语 (古巴)-Belkys-女': 'es-CU-BelkysNeural',
    '西班牙语 (多米尼加共和国)-Emilio-男': 'es-DO-EmilioNeural',
    '西班牙语 (多米尼加共和国)-Ramona-女': 'es-DO-RamonaNeural',
    '西班牙语 (厄瓜多尔)-Andrea-女': 'es-EC-AndreaNeural',
    '西班牙语 (厄瓜多尔)-Luis-男': 'es-EC-LuisNeural',
    '西班牙语 (西班牙)-Alvaro-男': 'es-ES-AlvaroNeural',
    '西班牙语 (西班牙)-Elvira-女': 'es-ES-ElviraNeural',
    '西班牙语 (赤道几内亚)-Teresa-女': 'es-GQ-TeresaNeural',
    '西班牙语 (危地马拉)-Andres-男': 'es-GT-AndresNeural',
    '西班牙语 (危地马拉)-Marta-女': 'es-GT-MartaNeural',
    '西班牙语 (洪都拉斯)-Carlos-男': 'es-HN-CarlosNeural',
    '西班牙语 (洪都拉斯)-Karla-女': 'es-HN-KarlaNeural',
    '西班牙语 (尼加拉瓜)-Federico-男': 'es-NI-FedericoNeural',
    '西班牙语 (尼加拉瓜)-Yolanda-女': 'es-NI-YolandaNeural',
    '西班牙语 (巴拿马)-Margarita-女': 'es-PA-MargaritaNeural',
    '西班牙语 (巴拿马)-Roberto-男': 'es-PA-RobertoNeural',
    '西班牙语 (秘鲁)-Alex-男': 'es-PE-AlexNeural',
    '西班牙语 (秘鲁)-Camila-女': 'es-PE-CamilaNeural',
    '西班牙语 (波多黎各)-Karina-女': 'es-PR-KarinaNeural',
    '西班牙语 (波多黎各)-Victor-男': 'es-PR-VictorNeural',
    '西班牙语 (巴拉圭)-Mario-男': 'es-PY-MarioNeural',
    '西班牙语 (巴拉圭)-Tania-女': 'es-PY-TaniaNeural',
    '西班牙语 (萨尔瓦多)-Lorena-女': 'es-SV-LorenaNeural',
    '西班牙语 (萨尔瓦多)-Rodrigo-男': 'es-SV-RodrigoNeural',
    '西班牙语 (美国)-Alonso-男': 'es-US-AlonsoNeural',
    '西班牙语 (美国)-Paloma-女': 'es-US-PalomaNeural',
    '西班牙语 (乌拉圭)-Mateo-男': 'es-UY-MateoNeural',
    '西班牙语 (乌拉圭)-Valentina-女': 'es-UY-ValentinaNeural',
    '西班牙语 (委内瑞拉)-Paola-女': 'es-VE-PaolaNeural',
    '西班牙语 (委内瑞拉)-Sebastian-男': 'es-VE-SebastianNeural',
    '爱沙尼亚语 (爱沙尼亚)-Anu-女': 'et-EE-AnuNeural',
    '爱沙尼亚语 (爱沙尼亚)-Kert-男': 'et-EE-KertNeural',
    '波斯语 (伊朗)-Dilara-女': 'fa-IR-DilaraNeural',
    '波斯语 (伊朗)-Farid-男': 'fa-IR-FaridNeural',
    '芬兰语 (芬兰)-Harri-男': 'fi-FI-HarriNeural',
    '芬兰语 (芬兰)-Noora-女': 'fi-FI-NooraNeural',
    '法语 (比利时)-Charline-女': 'fr-BE-CharlineNeural',
    '法语 (比利时)-Gerard-男': 'fr-BE-GerardNeural',
    '法语 (加拿大)-Sylvie-女': 'fr-CA-SylvieNeural',
    '法语 (加拿大)-Antoine-男': 'fr-CA-AntoineNeural',
    '法语 (加拿大)-Jean-男': 'fr-CA-JeanNeural',
    '法语 (瑞士)-Ariane-女': 'fr-CH-ArianeNeural',
    '法语 (瑞士)-Fabrice-男': 'fr-CH-FabriceNeural',
    '爱尔兰语 (爱尔兰)-Colm-男': 'ga-IE-ColmNeural',
    '爱尔兰语 (爱尔兰)-Orla-女': 'ga-IE-OrlaNeural',
    '加利西亚语 (西班牙)-Roi-男': 'gl-ES-RoiNeural',
    '加利西亚语 (西班牙)-Sabela-女': 'gl-ES-SabelaNeural',
    '古吉拉特语 (印度)-Dhwani-女': 'gu-IN-DhwaniNeural',
    '古吉拉特语 (印度)-Niranjan-男': 'gu-IN-NiranjanNeural',
    '印地语 (印度)-Madhur-男': 'hi-IN-MadhurNeural',
    '印地语 (印度)-Swara-女': 'hi-IN-SwaraNeural',
    '克罗地亚语 (克罗地亚)-Gabrijela-女': 'hr-HR-GabrijelaNeural',
    '克罗地亚语 (克罗地亚)-Srecko-男': 'hr-HR-SreckoNeural',
    '匈牙利语 (匈牙利)-Noemi-女': 'hu-HU-NoemiNeural',
    '匈牙利语 (匈牙利)-Tamas-男': 'hu-HU-TamasNeural',
    '冰岛语 (冰岛)-Gudrun-女': 'is-IS-GudrunNeural',
    '冰岛语 (冰岛)-Gunnar-男': 'is-IS-GunnarNeural',
    '爪哇语 (印度尼西亚)-Dimas-男': 'jv-ID-DimasNeural',
    '爪哇语 (印度尼西亚)-Siti-女': 'jv-ID-SitiNeural',
    '格鲁吉亚语 (格鲁吉亚)-Eka-女': 'ka-GE-EkaNeural',
    '格鲁吉亚语 (格鲁吉亚)-Giorgi-男': 'ka-GE-GiorgiNeural',
    '哈萨克语 (哈萨克斯坦)-Aigul-女': 'kk-KZ-AigulNeural',
    '哈萨克语 (哈萨克斯坦)-Daulet-男': 'kk-KZ-DauletNeural',
    '高棉语 (柬埔寨)-Piseth-男': 'km-KH-PisethNeural',
    '高棉语 (柬埔寨)-Sreymom-女': 'km-KH-SreymomNeural',
    '卡纳达语 (印度)-Gagan-男': 'kn-IN-GaganNeural',
    '卡纳达语 (印度)-Sapna-女': 'kn-IN-SapnaNeural',
    '老挝语 (老挝)-Chanthavong-男': 'lo-LA-ChanthavongNeural',
    '老挝语 (老挝)-Keomany-女': 'lo-LA-KeomanyNeural',
    '立陶宛语 (立陶宛)-Leonas-男': 'lt-LT-LeonasNeural',
    '立陶宛语 (立陶宛)-Ona-女': 'lt-LT-OnaNeural',
    '拉脱维亚语 (拉脱维亚)-Everita-女': 'lv-LV-EveritaNeural',
    '拉脱维亚语 (拉脱维亚)-Nils-男': 'lv-LV-NilsNeural',
    '马其顿语 (北马其顿共和国)-Aleksandar-男': 'mk-MK-AleksandarNeural',
    '马其顿语 (北马其顿共和国)-Marija-女': 'mk-MK-MarijaNeural',
    '马拉雅拉姆语 (印度)-Midhun-男': 'ml-IN-MidhunNeural',
    '马拉雅拉姆语 (印度)-Sobhana-女': 'ml-IN-SobhanaNeural',
    '蒙古语 (蒙古)-Bataa-男': 'mn-MN-BataaNeural',
    '蒙古语 (蒙古)-Yesui-女': 'mn-MN-YesuiNeural',
    '马拉地语 (印度)-Aarohi-女': 'mr-IN-AarohiNeural',
    '马拉地语 (印度)-Manohar-男': 'mr-IN-ManoharNeural',
    '马耳他语 (马耳他)-Grace-女': 'mt-MT-GraceNeural',
    '马耳他语 (马耳他)-Joseph-男': 'mt-MT-JosephNeural',
    '缅甸语 (缅甸)-Nilar-女': 'my-MM-NilarNeural',
    '缅甸语 (缅甸)-Thiha-男': 'my-MM-ThihaNeural',
    '尼泊尔语 (尼泊尔)-Hemkala-女': 'ne-NP-HemkalaNeural',
    '尼泊尔语 (尼泊尔)-Sagar-男': 'ne-NP-SagarNeural',
    '荷兰语 (比利时)-Arnaud-男': 'nl-BE-ArnaudNeural',
    '荷兰语 (比利时)-Dena-女': 'nl-BE-DenaNeural',
    '波兰语 (波兰)-Marek-男': 'pl-PL-MarekNeural',
    '波兰语 (波兰)-Zofia-女': 'pl-PL-ZofiaNeural',
    '普什图语 (阿富汗)-Gul Nawaz-男': 'ps-AF-GulNawazNeural',
    '普什图语 (阿富汗)-Latifa-女': 'ps-AF-LatifaNeural',
    '葡萄牙语 (葡萄牙)-Duarte-男': 'pt-PT-DuarteNeural',
    '葡萄牙语 (葡萄牙)-Raquel-女': 'pt-PT-RaquelNeural',
    '罗马尼亚语 (罗马尼亚)-Alina-女': 'ro-RO-AlinaNeural',
    '罗马尼亚语 (罗马尼亚)-Emil-男': 'ro-RO-EmilNeural',
    '俄语 (俄罗斯)-Svetlana-女': 'ru-RU-SvetlanaNeural',
    '俄语 (俄罗斯)-Dmitry-男': 'ru-RU-DmitryNeural',
    '僧伽罗语 (斯里兰卡)-Sameera-男': 'si-LK-SameeraNeural',
    '僧伽罗语 (斯里兰卡)-Thilini-女': 'si-LK-ThiliniNeural',
    '斯洛伐克语 (斯洛伐克)-Lukas-男': 'sk-SK-LukasNeural',
    '斯洛伐克语 (斯洛伐克)-Viktoria-女': 'sk-SK-ViktoriaNeural',
    '斯洛文尼亚语 (斯洛文尼亚)-Petra-女': 'sl-SI-PetraNeural',
    '斯洛文尼亚语 (斯洛文尼亚)-Rok-男': 'sl-SI-RokNeural',
    '索马里语 (索马里)-Muuse-男': 'so-SO-MuuseNeural',
    '索马里语 (索马里)-Ubax-女': 'so-SO-UbaxNeural',
    '阿尔巴尼亚语 (阿尔巴尼亚)-Anila-女': 'sq-AL-AnilaNeural',
    '阿尔巴尼亚语 (阿尔巴尼亚)-Ilir-男': 'sq-AL-IlirNeural',
    '塞尔维亚语 (塞尔维亚)-Nicholas-男': 'sr-RS-NicholasNeural',
    '塞尔维亚语 (塞尔维亚)-Sophie-女': 'sr-RS-SophieNeural',
    '巽他语 (印度尼西亚)-Jajang-男': 'su-ID-JajangNeural',
    '巽他语 (印度尼西亚)-Tuti-女': 'su-ID-TutiNeural',
    '斯瓦希里语 (肯尼亚)-Rafiki-男': 'sw-KE-RafikiNeural',
    '斯瓦希里语 (肯尼亚)-Zuri-女': 'sw-KE-ZuriNeural',
    '斯瓦希里语 (坦桑尼亚)-Daudi-男': 'sw-TZ-DaudiNeural',
    '斯瓦希里语 (坦桑尼亚)-Rehema-女': 'sw-TZ-RehemaNeural',
    '泰米尔语 (印度)-Pallavi-女': 'ta-IN-PallaviNeural',
    '泰米尔语 (印度)-Valluvar-男': 'ta-IN-ValluvarNeural',
    '泰米尔语 (斯里兰卡)-Kumar-男': 'ta-LK-KumarNeural',
    '泰米尔语 (斯里兰卡)-Saranya-女': 'ta-LK-SaranyaNeural',
    '泰米尔语 (马来西亚)-Kani-女': 'ta-MY-KaniNeural',
    '泰米尔语 (马来西亚)-Surya-男': 'ta-MY-SuryaNeural',
    '泰米尔语 (新加坡)-Anbu-男': 'ta-SG-AnbuNeural',
    '泰卢固语 (印度)-Mohan-男': 'te-IN-MohanNeural',
    '泰卢固语 (印度)-Shruti-女': 'te-IN-ShrutiNeural',
    '土耳其语 (土耳其)-Ahmet-男': 'tr-TR-AhmetNeural',
    '土耳其语 (土耳其)-Emel-女': 'tr-TR-EmelNeural',
    '乌克兰语 (乌克兰)-Ostap-男': 'uk-UA-OstapNeural',
    '乌克兰语 (乌克兰)-Polina-女': 'uk-UA-PolinaNeural',
    '乌尔都语 (印度)-Gul-女': 'ur-IN-GulNeural',
    '乌尔都语 (印度)-Salman-男': 'ur-IN-SalmanNeural',
    '乌尔都语 (巴基斯坦)-Asad-男': 'ur-PK-AsadNeural',
    '乌尔都语 (巴基斯坦)-Uzma-女': 'ur-PK-UzmaNeural',
    '乌兹别克语 (乌兹别克斯坦)-Madina-女': 'uz-UZ-MadinaNeural',
    '乌兹别克语 (乌兹别克斯坦)-Sardor-男': 'uz-UZ-SardorNeural',
    '普通话 (中国大陆)-Xiaoxiao-女': 'zh-CN-XiaoxiaoNeural',
    '普通话 (中国大陆)-Yunyang-男': 'zh-CN-YunyangNeural',
    '普通话 (中国大陆)-Yunxi-男': 'zh-CN-YunxiNeural',
    '普通话 (中国大陆)-Xiaoyi-女': 'zh-CN-XiaoyiNeural',
    '普通话 (中国大陆)-Yunjian-男': 'zh-CN-YunjianNeural',
    '普通话 (中国大陆)-Yunxia-男': 'zh-CN-YunxiaNeural',
    '东北话 (中国大陆)-Xiaobei-女': 'zh-CN-liaoning-XiaobeiNeural',
    '中原官话 (中国陕西)-Xiaoni-女': 'zh-CN-shaanxi-XiaoniNeural',
    '粤语 (中国香港)-HiuMaan-女': 'zh-HK-HiuMaanNeural',
    '粤语 (中国香港)-HiuGaai-女': 'zh-HK-HiuGaaiNeural',
    '粤语 (中国香港)-WanLung-男': 'zh-HK-WanLungNeural',
    '台湾普通话-HsiaoChen-女': 'zh-TW-HsiaoChenNeural',
    '台湾普通话-HsiaoYu-女': 'zh-TW-HsiaoYuNeural',
    '台湾普通话-YunJhe-男': 'zh-TW-YunJheNeural',
    '祖鲁语 (南非)-Thando-女': 'zu-ZA-ThandoNeural',
    '祖鲁语 (南非)-Themba-男': 'zu-ZA-ThembaNeural',
}
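
The dictionary maps a human-readable label (language and region, voice name, gender) to a Microsoft Edge neural-TTS voice ID. A lookup sketch, assuming the third-party edge-tts package, which is not part of this commit:

import asyncio
import edge_tts  # assumed external dependency
from tts_voice import tts_order_voice

voice = tts_order_voice['英语 (美国)-Jenny-女']   # -> 'en-US-JennyNeural'

async def speak(text: str, voice: str, path: str = 'out.mp3'):
    # Communicate(...).save(...) is the edge-tts convenience API
    await edge_tts.Communicate(text, voice).save(path)

asyncio.run(speak('Hello there!', voice))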
utils.py ADDED
@@ -0,0 +1,305 @@
import os
import sys
import argparse
import glob
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch
from torch.nn import functional as F
from commons import sequence_mask

MATPLOTLIB_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging


def get_cmodel(rank):
    # WavLM and WavLMConfig are assumed to come from the repo's wavlm package,
    # which is not part of this commit (the checkpoint path below implies it).
    from wavlm import WavLM, WavLMConfig
    checkpoint = torch.load('wavlm/WavLM-Large.pt')
    cfg = WavLMConfig(checkpoint['cfg'])
    cmodel = WavLM(cfg).cuda(rank)
    cmodel.load_state_dict(checkpoint['model'])
    cmodel.eval()
    return cmodel


def get_content(cmodel, y):
    with torch.no_grad():
        c = cmodel.extract_features(y.squeeze(1))[0]
    c = c.transpose(1, 2)
    return c


def get_vocoder(rank):
    # hifigan is assumed to be a local package providing AttrDict and Generator;
    # it is not part of this commit.
    import hifigan
    with open("hifigan/config.json", "r") as f:
        config = json.load(f)
    config = hifigan.AttrDict(config)
    vocoder = hifigan.Generator(config)
    ckpt = torch.load("hifigan/generator_v1")
    vocoder.load_state_dict(ckpt["generator"])
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.cuda(rank)
    return vocoder


def transform(mel, height):  # 68-92
    import torchvision
    # r = np.random.random()
    # rate = r * 0.3 + 0.85  # 0.85-1.15
    # height = int(mel.size(-2) * rate)
    tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
    if height >= mel.size(-2):
        return tgt[:, :mel.size(-2), :]
    else:
        silence = tgt[:, -1:, :].repeat(1, mel.size(-2) - height, 1)
        silence += torch.randn_like(silence) / 10
        return torch.cat((tgt, silence), 1)


def stretch(mel, width):  # 0.5-2
    import torchvision
    return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))


def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'model': state_dict,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, checkpoint_path)


def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
        writer.add_image(k, v, global_step, dataformats='HWC')
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    print(x)
    return x


def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring is deprecated; frombuffer reads the canvas bytes directly
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model name')

    args = parser.parse_args()
    model_dir = os.path.join("./logs", args.model)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    if init:
        with open(config_path, "r") as f:
            data = f.read()
        with open(config_save_path, "w") as f:
            f.write(data)
    else:
        with open(config_save_path, "r") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_dir(model_dir):
    config_save_path = os.path.join(model_dir, "config.json")
    with open(config_save_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_file(config_path):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams


def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir))
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        open(path, "w").write(cur_hash)


def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger


class HParams:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)  # recurse so nested sections support attribute access
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
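
HParams wraps nested config dicts so values can be read as attributes or keys. A sketch, assuming a config file with a train section (the path and keys below are illustrative):

from utils import get_hparams_from_file

hps = get_hparams_from_file('configs/freevc.json')   # hypothetical path
print(hps.train.learning_rate)                       # nested dicts become nested HParams
print(hps['train']['batch_size'])                    # item access works too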