dathudeptrai committed on
Commit
7ca3b80
1 Parent(s): 0afa761

Update Tacotron2 French model

Browse files
Files changed (4) hide show
  1. README.md +93 -0
  2. config.yml +86 -0
  3. model.h5 +3 -0
  4. processor.json +1 -0
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - tensorflowtts
4
+ - audio
5
+ - text-to-speech
6
+ - text-to-mel
7
+ language: fr
8
+ license: apache-2.0
9
+ datasets:
10
+ - synpaflex
11
+ widget:
12
+ - text: "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"
13
+ ---
14
+
15
+ # Tacotron 2 with Guided Attention trained on Synpaflex (Fr)
16
+ This repository provides a pretrained [Tacotron2](https://arxiv.org/abs/1712.05884) trained with [Guided Attention](https://arxiv.org/abs/1710.08969) on the Synpaflex dataset (Fr). For details of the model, we encourage you to read more about
17
+ [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS).
18
+
19
+
20
+ ## Install TensorFlowTTS
21
+ First of all, please install TensorFlowTTS with the following command:
22
+ ```
23
+ pip install TensorFlowTTS
24
+ ```
25
+
26
+ ### Converting your Text to Mel Spectrogram
27
+ ```python
28
+ import numpy as np
29
+ import soundfile as sf
30
+ import yaml
31
+
32
+ import tensorflow as tf
33
+
34
+ from tensorflow_tts.inference import AutoProcessor
35
+ from tensorflow_tts.inference import TFAutoModel
36
+
37
+ processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")
38
+ tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")
39
+
40
+ text = "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"
41
+
42
+ input_ids = processor.text_to_sequence(text)
43
+
44
+ decoder_output, mel_outputs, stop_token_prediction, alignment_history = tacotron2.inference(
45
+ input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
46
+ input_lengths=tf.convert_to_tensor([len(input_ids)], tf.int32),
47
+ speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
48
+ )
49
+
50
+ ```
51
+
52
+ #### Referencing Tacotron 2
53
+ ```
54
+ @article{DBLP:journals/corr/abs-1712-05884,
55
+ author = {Jonathan Shen and
56
+ Ruoming Pang and
57
+ Ron J. Weiss and
58
+ Mike Schuster and
59
+ Navdeep Jaitly and
60
+ Zongheng Yang and
61
+ Zhifeng Chen and
62
+ Yu Zhang and
63
+ Yuxuan Wang and
64
+ R. J. Skerry{-}Ryan and
65
+ Rif A. Saurous and
66
+ Yannis Agiomyrgiannakis and
67
+ Yonghui Wu},
68
+ title = {Natural {TTS} Synthesis by Conditioning WaveNet on Mel Spectrogram
69
+ Predictions},
70
+ journal = {CoRR},
71
+ volume = {abs/1712.05884},
72
+ year = {2017},
73
+ url = {http://arxiv.org/abs/1712.05884},
74
+ archivePrefix = {arXiv},
75
+ eprint = {1712.05884},
76
+ timestamp = {Thu, 28 Nov 2019 08:59:52 +0100},
77
+ biburl = {https://dblp.org/rec/journals/corr/abs-1712-05884.bib},
78
+ bibsource = {dblp computer science bibliography, https://dblp.org}
79
+ }
80
+ ```
81
+
82
+ #### Referencing TensorFlowTTS
83
+ ```
84
+ @misc{TFTTS,
85
+ author = {Minh Nguyen and Alejandro Miguel Velasquez and Erogol and Kuan Chen and Dawid Kobus and Takuya Ebata and
86
+ Trinh Le and Yunchao He},
87
+ title = {TensorflowTTS},
88
+ year = {2020},
89
+ publisher = {GitHub},
90
+ journal = {GitHub repository},
91
+ howpublished = {\url{https://github.com/TensorSpeech/TensorFlowTTS}},
92
+ }
93
+ ```
config.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is the hyperparameter configuration file for Tacotron2 v1.
2
+ # Please make sure this is adjusted for the synpaflex dataset. If you want to
3
+ # apply to the other dataset, you might need to carefully change some parameters.
4
+ # This configuration performs 200k iters but 65k iters is enough to get a good model.
5
+
6
+ ###########################################################
7
+ # FEATURE EXTRACTION SETTING #
8
+ ###########################################################
9
+ hop_size: 256 # Hop size.
10
+ format: "npy"
11
+
12
+
13
+ ###########################################################
14
+ # NETWORK ARCHITECTURE SETTING #
15
+ ###########################################################
16
+ model_type: "tacotron2"
17
+
18
+ tacotron2_params:
19
+ dataset: synpaflex
20
+ embedding_hidden_size: 512
21
+ initializer_range: 0.02
22
+ embedding_dropout_prob: 0.1
23
+ n_speakers: 1
24
+ n_conv_encoder: 5
25
+ encoder_conv_filters: 512
26
+ encoder_conv_kernel_sizes: 5
27
+ encoder_conv_activation: 'relu'
28
+ encoder_conv_dropout_rate: 0.5
29
+ encoder_lstm_units: 256
30
+ n_prenet_layers: 2
31
+ prenet_units: 256
32
+ prenet_activation: 'relu'
33
+ prenet_dropout_rate: 0.5
34
+ n_lstm_decoder: 1
35
+ reduction_factor: 1
36
+ decoder_lstm_units: 1024
37
+ attention_dim: 128
38
+ attention_filters: 32
39
+ attention_kernel: 31
40
+ n_mels: 80
41
+ n_conv_postnet: 5
42
+ postnet_conv_filters: 512
43
+ postnet_conv_kernel_sizes: 5
44
+ postnet_dropout_rate: 0.1
45
+ attention_type: "lsa"
46
+
47
+ ###########################################################
48
+ # DATA LOADER SETTING #
49
+ ###########################################################
50
+ batch_size: 32 # Batch size for each GPU, assuming that gradient_accumulation_steps == 1.
51
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
52
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
53
+ mel_length_threshold: 32 # remove all targets that have mel_length <= 32
54
+ is_shuffle: true # shuffle dataset after each epoch.
55
+ use_fixed_shapes: true # use_fixed_shapes for training (2x speed-up)
56
+ # refer (https://github.com/dathudeptrai/TensorflowTTS/issues/34#issuecomment-642309118)
57
+
58
+ ###########################################################
59
+ # OPTIMIZER & SCHEDULER SETTING #
60
+ ###########################################################
61
+ optimizer_params:
62
+ initial_learning_rate: 0.001
63
+ end_learning_rate: 0.00001
64
+ decay_steps: 150000 # < train_max_steps is recommended.
65
+ warmup_proportion: 0.02
66
+ weight_decay: 0.001
67
+
68
+ gradient_accumulation_steps: 1
69
+ var_train_expr: null # trainable variable expr (eg. 'embeddings|decoder_cell' )
70
+ # patterns must be separated by |. If var_train_expr is null then we
71
+ # train all variables.
72
+ ###########################################################
73
+ # INTERVAL SETTING #
74
+ ###########################################################
75
+ train_max_steps: 200000 # Number of training steps.
76
+ save_interval_steps: 2000 # Interval steps to save checkpoint.
77
+ eval_interval_steps: 500 # Interval steps to evaluate the network.
78
+ log_interval_steps: 200 # Interval steps to record the training log.
79
+ start_schedule_teacher_forcing: 200001 # set above train_max_steps because scheduled teacher forcing is not needed.
80
+ start_ratio_value: 0.5 # start ratio of scheduled teacher forcing.
81
+ schedule_decay_steps: 50000 # decay step scheduled teacher forcing.
82
+ end_ratio_value: 0.0 # end ratio of scheduled teacher forcing.
83
+ ###########################################################
84
+ # OTHER SETTING #
85
+ ###########################################################
86
+ num_save_intermediate_results: 1 # Number of results to be saved as intermediate results.
model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7761e61d0dd3bbe9387ff6191d1507d9fd308d6117c8d3ec2f8151c6f9ea4470
3
+ size 127842184
processor.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"symbol_to_id": {"pad": 0, "!": 1, "/": 2, "'": 3, "(": 4, ")": 5, ",": 6, "-": 7, ".": 8, ":": 9, ";": 10, "?": 11, " ": 12, "A": 13, "B": 14, "C": 15, "D": 16, "E": 17, "F": 18, "G": 19, "H": 20, "I": 21, "J": 22, "K": 23, "L": 24, "M": 25, "N": 26, "O": 27, "P": 28, "Q": 29, "R": 30, "S": 31, "T": 32, "U": 33, "V": 34, "W": 35, "X": 36, "Y": 37, "Z": 38, "a": 39, "b": 40, "c": 41, "d": 42, "e": 43, "f": 44, "g": 45, "h": 46, "i": 47, "j": 48, "k": 49, "l": 50, "m": 51, "n": 52, "o": 53, "p": 54, "q": 55, "r": 56, "s": 57, "t": 58, "u": 59, "v": 60, "w": 61, "x": 62, "y": 63, "z": 64, "\u00e9": 65, "\u00e8": 66, "\u00e0": 67, "\u00f9": 68, "\u00e2": 69, "\u00ea": 70, "\u00ee": 71, "\u00f4": 72, "\u00fb": 73, "\u00e7": 74, "\u00e4": 75, "\u00eb": 76, "\u00ef": 77, "\u00f6": 78, "\u00fc": 79, "\u00ff": 80, "\u0153": 81, "\u00e6": 82, "eos": 83}, "id_to_symbol": {"0": "pad", "1": "!", "2": "/", "3": "'", "4": "(", "5": ")", "6": ",", "7": "-", "8": ".", "9": ":", "10": ";", "11": "?", "12": " ", "13": "A", "14": "B", "15": "C", "16": "D", "17": "E", "18": "F", "19": "G", "20": "H", "21": "I", "22": "J", "23": "K", "24": "L", "25": "M", "26": "N", "27": "O", "28": "P", "29": "Q", "30": "R", "31": "S", "32": "T", "33": "U", "34": "V", "35": "W", "36": "X", "37": "Y", "38": "Z", "39": "a", "40": "b", "41": "c", "42": "d", "43": "e", "44": "f", "45": "g", "46": "h", "47": "i", "48": "j", "49": "k", "50": "l", "51": "m", "52": "n", "53": "o", "54": "p", "55": "q", "56": "r", "57": "s", "58": "t", "59": "u", "60": "v", "61": "w", "62": "x", "63": "y", "64": "z", "65": "\u00e9", "66": "\u00e8", "67": "\u00e0", "68": "\u00f9", "69": "\u00e2", "70": "\u00ea", "71": "\u00ee", "72": "\u00f4", "73": "\u00fb", "74": "\u00e7", "75": "\u00e4", "76": "\u00eb", "77": "\u00ef", "78": "\u00f6", "79": "\u00fc", "80": "\u00ff", "81": "\u0153", "82": "\u00e6", "83": "eos"}, "speakers_map": {"synpaflex": 0}, "processor_name": "SynpaflexProcessor"}