Spaces:
Runtime error
Runtime error
File size: 5,478 Bytes
436153a 8383235 ea04389 8383235 ea04389 8383235 ea04389 8383235 ea04389 8383235 436153a 8383235 ea04389 8383235 06297b7 c15733e 8383235 c15733e 8383235 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# git clone https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts
# git clone --recursive https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git
# cd "tacotron_pytorch/" && pip install -e .
import os

import gdown

# Pretrained checkpoints hosted on Google Drive, keyed by local output path.
# NOTE(review): 'disgust_tacotron.pth' points at the same Drive URL as
# 'angry_dctts.pth' — looks like a copy/paste slip; confirm the correct file id.
_CHECKPOINT_URLS = {
    'trained_models/angry_dctts.pth':
        'https://drive.google.com/file/d/1ARUhLvzfIAkrpH3X9wGz_3oA6ocn-w-o/view?usp=sharing',
    'trained_models/neutral_dctts.pth':
        'https://drive.google.com/file/d/1fwU_Kex9tYuwBMja3djJ4M1oBzhLaUU-/view?usp=sharing',
    'trained_models/ssrn.pth':
        'https://drive.google.com/file/d/1iVhLbBQVMYjO4L1yhz0_rfKLeIUwIUAB/view?usp=sharing',
    'trained_models/disgust_tacotron.pth':
        'https://drive.google.com/file/d/1ARUhLvzfIAkrpH3X9wGz_3oA6ocn-w-o/view?usp=sharing',
    'trained_models/amused_tacotron.pth':
        'https://drive.google.com/file/d/1xMGnS0vvgW703a9lGXeJNLK1G140RbNI/view?usp=share_link',
    'trained_models/sleepiness_tacotron.pth':
        'https://drive.google.com/file/d/1-uVf8-LZ935X3ZOjtw5DnuclH5Rlw_xP/view?usp=sharing',
}

# The original relied on a manual `mkdir trained_models` (commented out above);
# create the directory here so the downloads cannot fail on a missing path.
os.makedirs('trained_models', exist_ok=True)

for output, url in _CHECKPOINT_URLS.items():
    # fuzzy=True lets gdown extract the file id from a ".../view?usp=..." share
    # link; without it gdown fetches the HTML preview page, not the checkpoint.
    gdown.download(url, output, quiet=False, fuzzy=True)
# TF1 compatibility mode: the Tacotron fork predates TF2 eager execution.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#tensorflow_version 1.x
#pylab inline
# NOTE(review): `rcParams` is only in scope when the `%pylab inline` notebook
# magic above has actually run (it injects matplotlib names into the global
# namespace). Executed as a plain script this line raises NameError — confirm
# matplotlib/pylab is imported in the real runtime environment.
rcParams["figure.figsize"] = (10,5)
import os
import sys
import numpy as np
# Make the two cloned repositories (DC-TTS and Tacotron) importable.
sys.path.append('pytorch-dc-tts/')
sys.path.append('pytorch-dc-tts/models')
sys.path.append("tacotron_pytorch/")
sys.path.append("tacotron_pytorch/lib/tacotron")
# For the DC-TTS
import torch
from text2mel import Text2Mel
from ssrn import SSRN
from audio import save_to_wav, spectrogram2wav
from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
from datasets.emovdb import vocab, get_test_data
# For the Tacotron
from text import text_to_sequence, symbols
# from util import audio
from tacotron_pytorch import Tacotron
from synthesis import tts as _tts
# For Audio/Display purposes
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import display
from google.colab import widgets
from google.colab import output
import warnings
warnings.filterwarnings('ignore')
# Inference only: disable autograd globally to save memory and compute.
torch.set_grad_enabled(False)
# DC-TTS pipeline: Text2Mel predicts coarse mel frames, SSRN upsamples them to
# a full linear spectrogram. Both are put in eval mode for inference.
text2mel = Text2Mel(vocab).eval()
ssrn = SSRN().eval()
# The SSRN weights are shared across all emotions, so they are loaded once here;
# the per-emotion Text2Mel/Tacotron weights are loaded later, before synthesis.
load_checkpoint('trained_models/ssrn.pth', ssrn, None)
# Tacotron model used for the Disgust/Amused/Sleepiness emotions.
model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=80,
                 linear_dim=1025,
                 r=5,
                 padding_idx=None,
                 use_memory_mask=False,
                 )
def visualize(alignment, spectrogram, Emotion):
    """Render the attention alignment and the spectrogram in two Colab tabs.

    NOTE(review): `imshow`/`xlabel`/`ylabel` and the globals `fs` and
    `hop_length` come from the `%pylab inline` notebook environment —
    confirm they are in scope when running outside Colab.
    """
    label_fontsize = 16
    tabs = widgets.TabBar(['Alignment', 'Spectrogram'], location='top')

    with tabs.output_to('Alignment'):
        imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
        xlabel("Decoder timestamp", fontsize=label_fontsize)
        ylabel("Encoder timestamp", fontsize=label_fontsize)

    with tabs.output_to('Spectrogram'):
        # Tacotron emotions produce time-major spectrograms and need a
        # transpose before plotting; the DC-TTS ones are plotted as-is.
        needs_transpose = Emotion in ('Disgust', 'Amused', 'Sleepiness')
        data = spectrogram.T if needs_transpose else spectrogram
        librosa.display.specshow(data, sr=fs, hop_length=hop_length,
                                 x_axis="time", y_axis="linear")
        xlabel("Time", fontsize=label_fontsize)
        ylabel("Hz", fontsize=label_fontsize)
def tts_dctts(text2mel, ssrn, text):
    """Synthesise `text` with the DC-TTS pipeline.

    Runs Text2Mel autoregressively until attention reaches the EOS symbol
    (or a hard cap of 210 frames), then upsamples with SSRN.
    Returns (waveform, attention_matrix, mel_spectrogram) as numpy arrays.
    """
    batch = torch.from_numpy(get_test_data([text], len(text)))
    pad = torch.from_numpy(np.zeros((1, 80, 1), np.float32))

    mels = pad
    attn = None
    # Autoregressive decoding: feed all mel frames produced so far back in,
    # always keeping the zero frame prepended as the decoder's start frame.
    for _ in range(210):
        _, mel_step, attn = text2mel(batch, mels, monotonic_attention=True)
        mels = torch.cat((pad, mel_step), -1)
        # Input position most attended to at the newest output frame.
        _, focus = torch.max(attn[0, :, -1], 0)
        if batch[0, focus.item()] == vocab.index('E'):  # EOS reached
            break

    # SSRN turns the coarse mel frames into a full linear spectrogram.
    _, lin = ssrn(mels)

    mels = mels.cpu().detach().numpy()
    attn = attn.cpu().detach().numpy()
    lin = lin.cpu().detach().numpy()
    return spectrogram2wav(lin[0, :, :].T), attn[0, :, :], mels[0, :, :]
def tts_tacotron(model, text):
    """Synthesise `text` with the given Tacotron model.

    Thin wrapper around `synthesis.tts`; returns
    (waveform, alignment, spectrogram) exactly as produced by it.
    """
    wav, attn, spec = _tts(model, text)
    return wav, attn, spec
def present(waveform, Emotion, figures=False):
    """Play `waveform` in the notebook, optionally showing its figures first.

    `figures`, when not False, is an (alignment, spectrogram) pair that is
    forwarded to `visualize` together with the emotion label.
    """
    if figures != False:  # False doubles as the "no figures" sentinel
        alignment, spectrogram = figures[0], figures[1]
        visualize(alignment, spectrogram, Emotion)
    IPython.display.display(Audio(waveform, rate=fs))
fs = 20000 #20000  # playback/plot sample rate in Hz
hop_length = 250  # STFT hop length used when plotting spectrograms
# Cap Tacotron decoding so a failed alignment cannot loop indefinitely.
model.decoder.max_decoder_steps = 200
#@title Select the emotion and type the text
#pylab inline
Emotion = "Neutral" #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
Text = 'I am exhausted.' #@param {type:"string"}

wav, align, mel = None, None, None
if Emotion == "Neutral":
    # Neutral DC-TTS weights load with the regular checkpoint loader.
    load_checkpoint('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion == "Angry":
    # The angry checkpoint was saved in a different format and needs the
    # test-checkpoint loader.
    load_checkpoint_test('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion in ("Disgust", "Amused", "Sleepiness"):
    # Tacotron-based emotions: load the matching per-emotion weights on CPU.
    state = torch.load('trained_models/' + Emotion.lower() + '_tacotron.pth',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state["state_dict"])
    wav, align, mel = tts_tacotron(model, Text)

present(wav, Emotion, (align, mel))
|