# Setup (Colab shell commands; uncomment to run on a fresh machine):
# !git clone https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts
# !git clone --recursive https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git
# !cd "tacotron_pytorch/" && pip install -e .
# !mkdir trained_models

import os
import gdown

os.makedirs('trained_models', exist_ok=True)  # in case the setup mkdir above was skipped

# Download the pretrained checkpoints from Google Drive. fuzzy=True lets
# gdown resolve the share-style URLs below to direct downloads.
checkpoints = {
    'trained_models/angry_dctts.pth': 'https://drive.google.com/file/d/1ARUhLvzfIAkrpH3X9wGz_3oA6ocn-w-o/view?usp=sharing',
    'trained_models/neutral_dctts.pth': 'https://drive.google.com/file/d/1fwU_Kex9tYuwBMja3djJ4M1oBzhLaUU-/view?usp=sharing',
    'trained_models/ssrn.pth': 'https://drive.google.com/file/d/1iVhLbBQVMYjO4L1yhz0_rfKLeIUwIUAB/view?usp=sharing',
    # NOTE: this URL duplicates the angry_dctts entry above; it likely needs
    # the Drive ID of the actual disgust Tacotron checkpoint.
    'trained_models/disgust_tacotron.pth': 'https://drive.google.com/file/d/1ARUhLvzfIAkrpH3X9wGz_3oA6ocn-w-o/view?usp=sharing',
    'trained_models/amused_tacotron.pth': 'https://drive.google.com/file/d/1xMGnS0vvgW703a9lGXeJNLK1G140RbNI/view?usp=share_link',
    'trained_models/sleepiness_tacotron.pth': 'https://drive.google.com/file/d/1-uVf8-LZ935X3ZOjtw5DnuclH5Rlw_xP/view?usp=sharing',
}
for output, url in checkpoints.items():
    gdown.download(url, output, quiet=False, fuzzy=True)
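
# Optional sanity check (not in the original notebook): make sure every
# checkpoint actually landed on disk before the models try to load it.
for path in checkpoints:
    assert os.path.isfile(path), 'missing checkpoint: ' + path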

# %tensorflow_version 1.x  (Colab magic; the compat import below does the same)
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

%pylab inline
rcParams["figure.figsize"] = (10, 5)

import os
import sys
import numpy as np

# Make the cloned repos importable
sys.path.append('pytorch-dc-tts/')
sys.path.append('pytorch-dc-tts/models')
sys.path.append("tacotron_pytorch/")
sys.path.append("tacotron_pytorch/lib/tacotron")

# For the DC-TTS
import torch
from text2mel import Text2Mel
from ssrn import SSRN
from audio import save_to_wav, spectrogram2wav
from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
from datasets.emovdb import vocab, get_test_data

# For the Tacotron
from text import text_to_sequence, symbols
# from util import audio

from tacotron_pytorch import Tacotron
from synthesis import tts as _tts

# For Audio/Display purposes
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import display
from google.colab import widgets
from google.colab import output
import warnings
warnings.filterwarnings('ignore')


torch.set_grad_enabled(False)
text2mel = Text2Mel(vocab).eval()

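# SSRN upsamples Text2Mel's coarse mel output to a full linear-frequency
# spectrogram, which spectrogram2wav later inverts with Griffin-Lim.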
ssrn = SSRN().eval()
load_checkpoint('trained_models/ssrn.pth', ssrn, None)

# These hyperparameters must match the ones the Tacotron checkpoints were
# trained with, or load_state_dict below will fail with shape mismatches.
model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=80,
                 linear_dim=1025,
                 r=5,
                 padding_idx=None,
                 use_memory_mask=False,
                 )

def visualize(alignment, spectrogram, Emotion):
    label_fontsize = 16
    tb = widgets.TabBar(['Alignment', 'Spectrogram'], location='top')
    with tb.output_to('Alignment'):
        imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
        xlabel("Decoder timestamp", fontsize=label_fontsize)
        ylabel("Encoder timestamp", fontsize=label_fontsize)
    with tb.output_to('Spectrogram'):
        # The Tacotron models return a time-major spectrogram, hence the transpose
        if Emotion in ('Disgust', 'Amused', 'Sleepiness'):
            librosa.display.specshow(spectrogram.T, sr=fs, hop_length=hop_length, x_axis="time", y_axis="linear")
        else:
            librosa.display.specshow(spectrogram, sr=fs, hop_length=hop_length, x_axis="time", y_axis="linear")
        xlabel("Time", fontsize=label_fontsize)
        ylabel("Hz", fontsize=label_fontsize)

def tts_dctts(text2mel, ssrn, text):
  sentences = [text]

  max_N = len(text)
  L = torch.from_numpy(get_test_data(sentences, max_N))
  zeros = torch.from_numpy(np.zeros((1, 80, 1), np.float32))
  Y = zeros
  A = None

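  # Autoregressive decoding (descriptive note, not in the original): each
  # step feeds the text L and all mel frames generated so far (Y) back into
  # Text2Mel to produce the next frame; generation stops once the attention
  # focuses on the EOS symbol, or after 210 frames at most.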
  for t in range(210):
      _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
      Y = torch.cat((zeros, Y_t), -1)
      _, attention = torch.max(A[0, :, -1], 0)
      attention = attention.item()
      if L[0, attention] == vocab.index('E'):  # EOS
          break

  _, Z = ssrn(Y)
  Y = Y.cpu().detach().numpy()
  A = A.cpu().detach().numpy()
  Z = Z.cpu().detach().numpy()

  return spectrogram2wav(Z[0, :, :].T), A[0, :, :], Y[0, :, :]


def tts_tacotron(model, text):
    waveform, alignment, spectrogram = _tts(model, text)
    return waveform, alignment, spectrogram

def present(waveform, Emotion, figures=None):
  if figures:
    visualize(figures[0], figures[1], Emotion)
  IPython.display.display(Audio(waveform, rate=fs))


fs = 20000          # sampling rate (Hz)
hop_length = 250    # STFT hop used for spectrogram display
# Cap Tacotron decoding at 200 steps; with r=5 frames per step and a
# 250-sample hop at 20 kHz this bounds the output at ~12.5 s of audio.
model.decoder.max_decoder_steps = 200

#@title Select the emotion and type the text

%pylab inline

Emotion = "Neutral" #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
Text = 'I am exhausted.' #@param {type:"string"}

wav, align, mel = None, None, None

if Emotion == "Neutral":
  load_checkpoint('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
  wav, align, mel = tts_dctts(text2mel, ssrn, Text)
elif Emotion == "Angry":
  load_checkpoint_test('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
  wav, align, mel = tts_dctts(text2mel, ssrn, Text)
  # wav = wav.T
elif Emotion == "Disgust" or Emotion == "Amused" or Emotion == "Sleepiness":
  checkpoint = torch.load('trained_models/'+Emotion.lower()+'_tacotron.pth', map_location=torch.device('cpu'))
  model.load_state_dict(checkpoint["state_dict"])
  wav, align, mel = tts_tacotron(model, Text)

present(wav, Emotion, (align, mel))
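
# A minimal sketch (not part of the original demo) for saving the result;
# assumes `wav` is a float numpy array at the sampling rate `fs` set above.
from scipy.io import wavfile
wavfile.write(Emotion.lower() + '_sample.wav', fs, wav)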