##### Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Author : [jaeyoo@](https://github.com/jaeyoo), [khanhlvg@](https://github.com/khanhlvg), [abattery@](https://github.com/abattery), [thaink@](https://github.com/thaink) (Google Research)

Created : 2020-06-30 KST

Last updated : 2020-07-04 KST

-----
Change logs
* 2020-07-04 KST : Update notebook with the latest TensorflowTTS repo.
 * compatible with https://github.com/TensorSpeech/TensorflowTTS/pull/83
* 2020-07-02 KST : Third implementation (outputs : `tacotron2.tflite`) 
 * **varied-length** input tensor, **varied-length** output tensor
-----

**Status** : successfully converted (`tacotron2.tflite`)

**Disclaimer** 
- This colab does not optimize for latency; it compresses the model with quantization to reduce its size (129 MB -> 33 MB).
- The TFLite file does not include the text processor (`LJSpeechProcessor`), so you need to run the processor on the input text yourself and feed the resulting input vectors to the model.
- `tf-nightly>=2.4.0-dev20200630`


# Generate voice with Tacotron2

In [2]:
# Install the nightly TensorFlow build; the disclaimer above lists
# `tf-nightly>=2.4.0-dev20200630` as the required version.
!pip install tf-nightly



In [3]:
import numpy as np
import soundfile as sf
import yaml
import tensorflow as tf

from tensorflow_tts.inference import AutoProcessor
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel

from IPython.display import Audio
# Show which TensorFlow version is actually loaded in this kernel.
print(tf.__version__)

TensorFlow Addons offers no support for the nightly versions of TensorFlow. Some things might work, some other might not. 
If you encounter a bug, do not file an issue on GitHub.


2.4.0-dev20200716


In [4]:
# Load the pretrained MelGAN model. Later cells call it on the TFLite
# Tacotron2 outputs to obtain the waveforms played back with `Audio`.
melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en")

In [6]:
# Load the pretrained Tacotron2 model with `enable_tflite_convertible=True`.
# NOTE(review): presumably this flag is what makes `inference_tflite`
# convertible in the conversion cells below — confirm against tensorflow_tts.
tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", enable_tflite_convertible=True)

# Newly added :
# NOTE(review): presumably these configure the attention window (6 steps in
# front, 6 behind) and cap the number of decoder iterations at 3000 —
# confirm against the tensorflow_tts Tacotron2 implementation.
tacotron2.setup_window(win_front=6, win_back=6)
tacotron2.setup_maximum_iterations(3000)

# Print the per-layer parameter summary of the loaded model.
tacotron2.summary()

Model: "tacotron2v2"
_________________________________________________________________
Layer (type) Output Shape Param # 
encoder (TFTacotronEncoder) multiple 8218624 
_________________________________________________________________
decoder_cell (TFTacotronDeco multiple 18246402 
_________________________________________________________________
post_net (TFTacotronPostnet) multiple 5460480 
_________________________________________________________________
residual_projection (Dense) multiple 41040 
Total params: 31,966,546
Trainable params: 31,956,306
Non-trainable params: 10,240
_________________________________________________________________


# Convert to TF Lite

In [10]:
# Concrete Function
# Obtain a concrete function from the model's `inference_tflite` method; the
# next cell passes it to `TFLiteConverter.from_concrete_functions`.
tacotron2_concrete_function = tacotron2.inference_tflite.get_concrete_function()

In [11]:
# Build a TFLite converter around the Tacotron2 concrete function.
converter = tf.lite.TFLiteConverter.from_concrete_functions([tacotron2_concrete_function])

# Permit both the TFLite builtin op set and the select-TF-ops set in the
# converted model.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]

# Enable the default optimization set before converting.
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Run the conversion; the result is the serialized model written to disk in
# the next cell.
tflite_model = converter.convert()

In [12]:
# Write the converted model bytes to `tacotron2.tflite` in the working directory.
with open('tacotron2.tflite', 'wb') as f:
    f.write(tflite_model)

# Report the serialized size, converting bytes to MB in a single division.
print('Model size is %f MBs.' % (len(tflite_model) / (1024 * 1024.0)))

Model size is 33.242188 MBs.


In [13]:
# Download the TF Lite model (uncomment the two lines below when running in Google Colab)
# from google.colab import files
# files.download('tacotron2.tflite') 

# Inference from TFLite

In [14]:
import numpy as np
import tensorflow as tf

# Create an interpreter for the saved model file and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path='tacotron2.tflite')
interpreter.allocate_tensors()

# Keep the input/output tensor descriptions at module level; `infer` uses
# their 'index' entries to address the tensors.
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

# Prepare input data.
def prepare_input(input_ids):
    """Pack symbol ids into the three int32 tensors handed to the interpreter.

    Args:
        input_ids: Sequence of symbol ids for one utterance.

    Returns:
        A 3-tuple of int32 tensors, in this order:
        the ids with an extra leading axis of size 1,
        a one-element tensor holding `len(input_ids)`,
        a one-element tensor holding 0 (fed positionally to the model's
        third input, which the recorded input details name `speaker_ids`).
    """
    ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    batched_ids = tf.expand_dims(ids_tensor, 0)
    lengths = tf.convert_to_tensor([len(input_ids)], tf.int32)
    speaker = tf.convert_to_tensor([0], dtype=tf.int32)
    return (batched_ids, lengths, speaker)
 
# Run the TFLite Tacotron2 model on a piece of text.
def infer(input_text, processor=None):
    """Run the TFLite Tacotron2 interpreter on `input_text`.

    Args:
        input_text: Text to synthesize. It is lower-cased before being
            converted to symbol ids.
        processor: Optional text processor exposing `text_to_sequence`.
            When omitted, the pretrained LJSpeech processor is loaded with
            `AutoProcessor` (imported at the top of the notebook). Pass one
            in to avoid reloading it on every call.

    Returns:
        A tuple with copies of the interpreter's first and second output
        tensors, in that order.
    """
    # The original body used `LJSpeechProcessor` and `symbols`, neither of
    # which is imported or defined in this notebook, so it raised NameError
    # on a fresh kernel.
    if processor is None:
        processor = AutoProcessor.from_pretrained(
            "tensorspeech/tts-tacotron2-ljspeech-en")

    input_ids = list(processor.text_to_sequence(input_text.lower()))

    # Terminate the sequence with the end-of-sentence id exactly once.
    # NOTE(review): assumes the processor exposes `eos_id` and that recent
    # tensorflow_tts releases may already append it in `text_to_sequence` —
    # confirm against the installed version.
    eos_id = getattr(processor, "eos_id", None)
    if eos_id is not None and (not input_ids or input_ids[-1] != eos_id):
        input_ids.append(eos_id)
    input_ids = np.asarray(input_ids, dtype=np.int32)

    # The first model input has a variable-length second dimension, so it is
    # resized to this utterance and the tensors are reallocated.
    interpreter.resize_tensor_input(input_details[0]['index'],
                                    [1, len(input_ids)])
    interpreter.allocate_tensors()

    # `prepare_input` returns tensors in the same order as `input_details`.
    input_data = prepare_input(input_ids)
    for i, detail in enumerate(input_details):
        print(detail)
        interpreter.set_tensor(detail['index'], input_data[i])

    interpreter.invoke()

    # The function `get_tensor()` returns a copy of the tensor data.
    # Use `tensor()` in order to get a pointer to the tensor.
    return (interpreter.get_tensor(output_details[0]['index']),
            interpreter.get_tensor(output_details[1]['index']))

In [15]:
# Sentence to synthesize. Implicit string concatenation is used instead of
# backslash continuations: the original first line had no space before the
# backslash, so the model received "meditatingfor" as a single word.
input_text = (
    "Recent research at Harvard has shown meditating "
    "for as little as 8 weeks, can actually increase the grey matter in the "
    "parts of the brain responsible for emotional regulation, and learning."
)

# Run the TFLite model, then turn each of its two outputs into a waveform
# with MelGAN, keeping the first batch item and first channel.
decoder_output_tflite, mel_output_tflite = infer(input_text)
audio_before_tflite = melgan(decoder_output_tflite)[0, :, 0]
audio_after_tflite = melgan(mel_output_tflite)[0, :, 0]

{'name': 'input_ids', 'index': 0, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([ 1, -1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
{'name': 'input_lengths', 'index': 1, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
{'name': 'speaker_ids', 'index': 2, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


In [16]:
# Play the waveform produced from the first TFLite output (`decoder_output_tflite`) at a 22050 Hz sample rate.
Audio(data=audio_before_tflite, rate=22050)

In [17]:
# Play the waveform produced from the second TFLite output (`mel_output_tflite`) at a 22050 Hz sample rate.
Audio(data=audio_after_tflite, rate=22050)

In [18]:
# Second example sentence; this overwrites the `input_text` and audio
# variables set by the previous example.
input_text = "I love TensorFlow Lite converted Tacotron 2."

# Run the TFLite model, then turn each of its two outputs into a waveform
# with MelGAN, keeping the first batch item and first channel.
decoder_output_tflite, mel_output_tflite = infer(input_text)
audio_before_tflite = melgan(decoder_output_tflite)[0, :, 0]
audio_after_tflite = melgan(mel_output_tflite)[0, :, 0]

{'name': 'input_ids', 'index': 0, 'shape': array([1, 1], dtype=int32), 'shape_signature': array([ 1, -1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
{'name': 'input_lengths', 'index': 1, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
{'name': 'speaker_ids', 'index': 2, 'shape': array([1], dtype=int32), 'shape_signature': array([1], dtype=int32), 'dtype': , 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


In [19]:
# Play the waveform produced from the first TFLite output (`decoder_output_tflite`) at a 22050 Hz sample rate.
Audio(data=audio_before_tflite, rate=22050)

In [20]:
# Play the waveform produced from the second TFLite output (`mel_output_tflite`) at a 22050 Hz sample rate.
Audio(data=audio_after_tflite, rate=22050)