sudip1310 committed
Commit 8383235
1 Parent(s): 4f94998

Create app.py

Files changed (1):
app.py +157 -0
app.py ADDED
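# NOTE: this file is Colab-notebook code saved as app.py. Lines starting with
# "!" and "%" are IPython/Colab magics (as are the #@title / #@param forms and
# the google.colab imports below); they run in Colab, not under `python app.py`.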
# Clone the DC-TTS and Tacotron repos and install the Tacotron package.
! git clone https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts
! git clone --recursive https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git
! cd "tacotron_pytorch/" && pip install -e .
! mkdir trained_models

import gdown

# Pretrained checkpoints hosted on Google Drive: (Drive file id, local path).
pretrained = [
    ('1rmhtEl3N3kAfnQM6J0vDGSCCHlHLK6kw', 'trained_models/angry_dctts.pth'),
    ('1bP0eJ6z4onr2klolzU17Y8SaNspxQjF-', 'trained_models/neutral_dctts.pth'),
    ('1WWE9zxS3FRgD0Y5yIdNmLY9-t5gnBsNt', 'trained_models/ssrn.pth'),
    ('1N6Ykrd1IaPiNdos_iv0J6JbY2gBDghod', 'trained_models/disgust_tacotron.pth'),
    ('15m0PZ8xaBocb_6wDjAU6S4Aunbr3TKkM', 'trained_models/amused_tacotron.pth'),
    ('1D6HGWYWvhdvLWQt4uOYqdmuVO7ZVLWNa', 'trained_models/sleepiness_tacotron.pth'),
]
for file_id, output in pretrained:
    gdown.download(f'https://drive.google.com/uc?id={file_id}', output, quiet=False)

# Colab line magics: pin TensorFlow 1.x and pull numpy/matplotlib
# (rcParams, imshow, xlabel, ...) into the global namespace.
%tensorflow_version 1.x
%pylab inline
rcParams["figure.figsize"] = (10, 5)

import os
import sys
import numpy as np

# Make the cloned repos importable.
sys.path.append('pytorch-dc-tts/')
sys.path.append('pytorch-dc-tts/models')
sys.path.append("tacotron_pytorch/")
sys.path.append("tacotron_pytorch/lib/tacotron")

# For the DC-TTS
import torch
from text2mel import Text2Mel
from ssrn import SSRN
from audio import save_to_wav, spectrogram2wav
from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
from datasets.emovdb import vocab, get_test_data

# For the Tacotron
from text import text_to_sequence, symbols
# from util import audio

from tacotron_pytorch import Tacotron
from synthesis import tts as _tts

# For audio/display purposes
import librosa.display
import IPython
from IPython.display import Audio, display
from google.colab import widgets
from google.colab import output
import warnings
warnings.filterwarnings('ignore')

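# Two synthesis back ends are set up below: DC-TTS (Text2Mel + SSRN) serves
# the Neutral and Angry voices, while Tacotron serves Disgust, Amused, and
# Sleepiness, matching the checkpoints downloaded above.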
torch.set_grad_enabled(False)  # inference only

text2mel = Text2Mel(vocab).eval()

ssrn = SSRN().eval()
load_checkpoint('trained_models/ssrn.pth', ssrn, None)

model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=80,
                 linear_dim=1025,
                 r=5,
                 padding_idx=None,
                 use_memory_mask=False,
                 )
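# These hyperparameters are fixed by the pretrained checkpoints: load_state_dict
# below only succeeds if embedding_dim, mel_dim, linear_dim, and r match the
# values the models were trained with.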

def visualize(alignment, spectrogram, Emotion):
    """Show the attention alignment and the spectrogram in Colab tabs."""
    label_fontsize = 16
    tb = widgets.TabBar(['Alignment', 'Spectrogram'], location='top')
    with tb.output_to('Alignment'):
        imshow(alignment.T, aspect="auto", origin="lower", interpolation="none")
        xlabel("Decoder timestamp", fontsize=label_fontsize)
        ylabel("Encoder timestamp", fontsize=label_fontsize)
    with tb.output_to('Spectrogram'):
        # Tacotron spectrograms arrive as (time, freq) and need transposing;
        # the DC-TTS mel is already (freq, time).
        if Emotion in ('Disgust', 'Amused', 'Sleepiness'):
            librosa.display.specshow(spectrogram.T, sr=fs, hop_length=hop_length,
                                     x_axis="time", y_axis="linear")
        else:
            librosa.display.specshow(spectrogram, sr=fs, hop_length=hop_length,
                                     x_axis="time", y_axis="linear")
        xlabel("Time", fontsize=label_fontsize)
        ylabel("Hz", fontsize=label_fontsize)

def tts_dctts(text2mel, ssrn, text):
    sentences = [text]

    max_N = len(text)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    zeros = torch.from_numpy(np.zeros((1, 80, 1), np.float32))
    Y = zeros
    A = None

    # Autoregressive decoding: generate up to 210 mel frames, stopping early
    # once the attention head reaches the EOS character.
    for t in range(210):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == vocab.index('E'):  # EOS
            break

    # SSRN upsamples the coarse mel to a full linear spectrogram, which
    # spectrogram2wav then inverts to a waveform.
    _, Z = ssrn(Y)
    Y = Y.cpu().detach().numpy()
    A = A.cpu().detach().numpy()
    Z = Z.cpu().detach().numpy()

    return spectrogram2wav(Z[0, :, :].T), A[0, :, :], Y[0, :, :]
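# Example call (hypothetical text; assumes a DC-TTS checkpoint has already
# been loaded into text2mel, as done in the branches below):
#   wav, attention, mel = tts_dctts(text2mel, ssrn, "Hello there.")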


def tts_tacotron(model, text):
    waveform, alignment, spectrogram = _tts(model, text)
    return waveform, alignment, spectrogram

def present(waveform, Emotion, figures=False):
    if figures:
        visualize(figures[0], figures[1], Emotion)
    IPython.display.display(Audio(waveform, rate=fs))


fs = 20000        # output sampling rate (Hz)
hop_length = 250  # hop length used for spectrogram display
model.decoder.max_decoder_steps = 200  # cap on Tacotron decoder steps
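# Rough bound on Tacotron output length: with r=5 frames per decoder step and
# assuming the display hop of 250 samples at fs=20000, 200 steps give at most
# 200 * 5 * 250 / 20000 = 12.5 seconds of audio.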

#@title Select the emotion and type the text

%pylab inline

Emotion = "Neutral"  #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
Text = 'I am exhausted.'  #@param {type:"string"}

wav, align, mel = None, None, None

if Emotion == "Neutral":
    load_checkpoint('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion == "Angry":
    load_checkpoint_test('trained_models/' + Emotion.lower() + '_dctts.pth', text2mel, None)
    wav, align, mel = tts_dctts(text2mel, ssrn, Text)
    # wav = wav.T
elif Emotion in ("Disgust", "Amused", "Sleepiness"):
    checkpoint = torch.load('trained_models/' + Emotion.lower() + '_tacotron.pth',
                            map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint["state_dict"])
    wav, align, mel = tts_tacotron(model, Text)

present(wav, Emotion, (align, mel))
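# The save_to_wav helper imported above (from the DC-TTS repo's audio.py) is
# available if the synthesized waveform should also be written to disk.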