Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Environment bootstrap: fetch the two model repositories, install the
# Tacotron package in editable mode and create the checkpoint directory.
# Written with subprocess/os instead of IPython "!" shell escapes so the
# file is valid Python when executed as a plain script.
import os
import subprocess
import sys

_REPOSITORIES = (
    (
        "pytorch-dc-tts",
        ["git", "clone",
         "https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts"],
    ),
    (
        "tacotron_pytorch",
        ["git", "clone", "--recursive",
         "https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git"],
    ),
)

for _target_dir, _clone_cmd in _REPOSITORIES:
    # git refuses to clone into an existing directory; skip on re-runs.
    if not os.path.isdir(_target_dir):
        subprocess.run(_clone_cmd, check=True)

# Use the running interpreter's pip so the package lands in this environment.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-e", "."],
    cwd="tacotron_pytorch",
    check=True,
)

os.makedirs("trained_models", exist_ok=True)
|
5 |
+
|
6 |
+
import gdown

# (Google Drive URL, local destination) for every checkpoint, listed in the
# order in which they are downloaded.
_CHECKPOINT_SOURCES = (
    ('https://drive.google.com/uc?id=1rmhtEl3N3kAfnQM6J0vDGSCCHlHLK6kw',
     'trained_models/angry_dctts.pth'),
    ('https://drive.google.com/uc?id=1bP0eJ6z4onr2klolzU17Y8SaNspxQjF-',
     'trained_models/neutral_dctts.pth'),
    ('https://drive.google.com/uc?id=1WWE9zxS3FRgD0Y5yIdNmLY9-t5gnBsNt',
     'trained_models/ssrn.pth'),
    ('https://drive.google.com/uc?id=1N6Ykrd1IaPiNdos_iv0J6JbY2gBDghod',
     'trained_models/disgust_tacotron.pth'),
    ('https://drive.google.com/uc?id=15m0PZ8xaBocb_6wDjAU6S4Aunbr3TKkM',
     'trained_models/amused_tacotron.pth'),
    ('https://drive.google.com/uc?id=1D6HGWYWvhdvLWQt4uOYqdmuVO7ZVLWNa',
     'trained_models/sleepiness_tacotron.pth'),
)

# `url` and `output` stay module-level names, ending on the last entry.
for url, output in _CHECKPOINT_SOURCES:
    gdown.download(url, output, quiet=False)
|
25 |
+
|
26 |
+
# Notebook line magics, spelled as the IPython API calls they stand for so
# that the file parses as ordinary Python. `get_ipython` is provided by the
# IPython/Colab runtime.
_ipython_shell = get_ipython()
_ipython_shell.run_line_magic('tensorflow_version', '1.x')
_ipython_shell.run_line_magic('pylab', 'inline')

# Default figure size for the plots drawn below; `rcParams` comes from pylab.
rcParams["figure.figsize"] = (10,5)
|
29 |
+
|
30 |
+
import os
|
31 |
+
import sys
|
32 |
+
import numpy as np
|
33 |
+
sys.path.append('pytorch-dc-tts/')
|
34 |
+
sys.path.append('pytorch-dc-tts/models')
|
35 |
+
sys.path.append("tacotron_pytorch/")
|
36 |
+
sys.path.append("tacotron_pytorch/lib/tacotron")
|
37 |
+
|
38 |
+
# For the DC-TTS
|
39 |
+
import torch
|
40 |
+
from text2mel import Text2Mel
|
41 |
+
from ssrn import SSRN
|
42 |
+
from audio import save_to_wav, spectrogram2wav
|
43 |
+
from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
|
44 |
+
from datasets.emovdb import vocab, get_test_data
|
45 |
+
|
46 |
+
# For the Tacotron
|
47 |
+
from text import text_to_sequence, symbols
|
48 |
+
# from util import audio
|
49 |
+
|
50 |
+
from tacotron_pytorch import Tacotron
|
51 |
+
from synthesis import tts as _tts
|
52 |
+
|
53 |
+
# For Audio/Display purposes
|
54 |
+
import librosa.display
|
55 |
+
import IPython
|
56 |
+
from IPython.display import Audio
|
57 |
+
from IPython.display import display
|
58 |
+
from google.colab import widgets
|
59 |
+
from google.colab import output
|
60 |
+
import warnings
|
61 |
+
warnings.filterwarnings('ignore')
|
62 |
+
|
63 |
+
|
64 |
+
# Inference only: switch autograd off globally before building the networks.
torch.set_grad_enabled(False)

# DC-TTS text-to-mel network. Its weights are loaded further down, once the
# emotion has been chosen.
text2mel = Text2Mel(vocab)
text2mel = text2mel.eval()

# DC-TTS super-resolution network, with its checkpoint loaded right away.
ssrn = SSRN()
ssrn = ssrn.eval()
load_checkpoint('trained_models/ssrn.pth', ssrn, None)

# Tacotron network used for the remaining emotions.
_tacotron_options = dict(
    n_vocab=len(symbols),
    embedding_dim=256,
    mel_dim=80,
    linear_dim=1025,
    r=5,
    padding_idx=None,
    use_memory_mask=False,
)
model = Tacotron(**_tacotron_options)
|
78 |
+
|
79 |
+
def visualize(alignment, spectrogram, Emotion):
    """Show the attention alignment and the spectrogram in a two-tab widget.

    Args:
        alignment: 2-D array-like; its transpose is drawn with ``imshow``.
        spectrogram: 2-D array-like passed to ``librosa.display.specshow``.
            It is transposed first when ``Emotion`` is 'Disgust', 'Amused'
            or 'Sleepiness'.
        Emotion: name of the selected emotion.

    Reads the module-level ``fs`` and ``hop_length`` for the time axis.
    """
    font_size = 16
    tabs = widgets.TabBar(['Alignment', 'Spectrogram'], location='top')

    with tabs.output_to('Alignment'):
        imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
        xlabel("Decoder timestamp", fontsize=font_size)
        ylabel("Encoder timestamp", fontsize=font_size)

    with tabs.output_to('Spectrogram'):
        # These three emotions need their spectrogram transposed before
        # plotting.
        needs_transpose = Emotion in ('Disgust', 'Amused', 'Sleepiness')
        to_plot = spectrogram.T if needs_transpose else spectrogram
        librosa.display.specshow(
            to_plot,
            sr=fs,
            hop_length=hop_length,
            x_axis="time",
            y_axis="linear",
        )

        xlabel("Time", fontsize=font_size)
        ylabel("Hz", fontsize=font_size)
|
94 |
+
|
95 |
+
def tts_dctts(text2mel, ssrn, text, max_steps=210):
    """Synthesise ``text`` with the DC-TTS pipeline (Text2Mel, then SSRN).

    Args:
        text2mel: network called as ``text2mel(L, Y, monotonic_attention=True)``;
            the second and third return values are used as the mel frames
            and the attention matrix.
        ssrn: network called as ``ssrn(Y)``; its second return value is
            converted to audio with ``spectrogram2wav``.
        text: the sentence to speak.
        max_steps: upper bound on the number of decoding iterations.
            Defaults to 210, the value that used to be hard-coded.

    Returns:
        A tuple ``(waveform, attention, mel)`` where ``waveform`` is the
        output of ``spectrogram2wav`` and the other two are NumPy arrays
        for the first (only) batch item.

    Raises:
        ValueError: if ``max_steps`` is smaller than 1, since no attention
            or mel output would be produced.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1, got {!r}".format(max_steps))

    sentences = [text]
    max_N = len(text)
    L = torch.from_numpy(get_test_data(sentences, max_N))

    # One all-zero frame is prepended to the generated frames on every step.
    zeros = torch.zeros((1, 80, 1), dtype=torch.float32)
    Y = zeros
    A = None

    # Index of the 'E' symbol, which marks end-of-sentence; looked up once.
    eos_index = vocab.index('E')

    for _ in range(max_steps):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        # Input position with the highest attention weight at the newest frame.
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == eos_index:  # EOS
            break

    _, Z = ssrn(Y)
    Y = Y.cpu().detach().numpy()
    A = A.cpu().detach().numpy()
    Z = Z.cpu().detach().numpy()

    return spectrogram2wav(Z[0, :, :].T), A[0, :, :], Y[0, :, :]
|
118 |
+
|
119 |
+
|
120 |
+
def tts_tacotron(model, text):
    """Synthesise ``text`` with the Tacotron model.

    Delegates to ``_tts`` (imported from ``synthesis``) and returns its three
    results, in order, as a ``(waveform, alignment, spectrogram)`` tuple.
    """
    audio, attn, spec = _tts(model, text)
    return audio, attn, spec
|
123 |
+
|
124 |
+
def present(waveform, Emotion, figures=False):
    """Play ``waveform`` and optionally plot its alignment and spectrogram.

    Args:
        waveform: audio samples handed to ``IPython.display.Audio``.
        Emotion: name of the selected emotion, forwarded to ``visualize``.
        figures: ``(alignment, spectrogram)`` pair to plot, or ``False`` /
            ``None`` to skip plotting.

    The audio widget is always displayed, using the module-level sample
    rate ``fs``.
    """
    # Identity checks instead of `!= False`: they also accept None and never
    # trigger an elementwise comparison when an array is passed.
    if figures is not None and figures is not False:
        visualize(figures[0], figures[1], Emotion)
    IPython.display.display(Audio(waveform, rate=fs))
|
128 |
+
|
129 |
+
|
130 |
+
# Sample rate and hop length read by `visualize` and `present`.
fs, hop_length = 20000, 250

# Decoder step limit for the Tacotron model (presumably caps the length of
# the generated output; confirm against tacotron_pytorch).
model.decoder.max_decoder_steps = 200
|
133 |
+
|
134 |
+
#@title Select the emotion and type the text

# Equivalent of the `%pylab inline` line magic, spelled as valid Python.
get_ipython().run_line_magic('pylab', 'inline')

Emotion = "Neutral" #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
Text = 'I am exhausted.' #@param {type:"string"}

# Neutral and Angry use DC-TTS; the other three emotions use Tacotron.
if Emotion == "Neutral":
    load_checkpoint('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion == "Angry":
    # NOTE(review): this checkpoint is loaded with load_checkpoint_test, not
    # load_checkpoint; presumably its layout differs. Confirm in utils.
    load_checkpoint_test('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion in ("Disgust", "Amused", "Sleepiness"):
    # NOTE(review): torch.load unpickles the file, and these checkpoints are
    # downloaded from the network; only load files from a trusted source.
    checkpoint = torch.load(
        'trained_models/' + Emotion.lower() + '_tacotron.pth',
        map_location=torch.device('cpu'),
    )
    model.load_state_dict(checkpoint["state_dict"])
    wav, align, mel = tts_tacotron(model, Text)
else:
    # Fail here with a clear message instead of passing None to present().
    raise ValueError("Unsupported emotion: {!r}".format(Emotion))

present(wav, Emotion, (align, mel))
|
156 |
+
|
157 |
+
|