Text-to-Speech
speechbrain
English
TTS
speech-synthesis
Tacotron2
Krisshvamsi committed on
Commit
09d1286
1 Parent(s): 5edb9a0

Upload 2 files

Browse files
Files changed (2) hide show
  1. hyperparams.yaml +60 -163
  2. model.ckpt +2 -2
hyperparams.yaml CHANGED
@@ -1,173 +1,70 @@
 
 
 
 
1
 
2
-
3
- ################################
4
- # Audio Parameters #
5
- ################################
6
- sample_rate: 22050
7
- hop_length: 256
8
- win_length: 1024
9
  n_mel_channels: 80
10
- n_fft: 1024
11
- mel_fmin: 0.0
12
- mel_fmax: 8000.0
13
- power: 1
14
- normalized: False
15
- min_max_energy_norm: True
16
- norm: "slaney"
17
- mel_scale: "slaney"
18
- dynamic_range_compression: True
19
- mel_normalized: False
20
- min_f0: 65 #(torchaudio pyin values)
21
- max_f0: 2093 #(torchaudio pyin values)
22
-
23
- positive_weight: 5.0
24
- lexicon:
25
- - AA
26
- - AE
27
- - AH
28
- - AO
29
- - AW
30
- - AY
31
- - B
32
- - CH
33
- - D
34
- - DH
35
- - EH
36
- - ER
37
- - EY
38
- - F
39
- - G
40
- - HH
41
- - IH
42
- - IY
43
- - JH
44
- - K
45
- - L
46
- - M
47
- - N
48
- - NG
49
- - OW
50
- - OY
51
- - P
52
- - R
53
- - S
54
- - SH
55
- - T
56
- - TH
57
- - UH
58
- - UW
59
- - V
60
- - W
61
- - Y
62
- - Z
63
- - ZH
64
- - ' '
65
- n_symbols: 42 #fixed depending on symbols in the lexicon +1 for a dummy symbol used for padding
66
- padding_idx: 0
67
-
68
- # Define model architecture
69
- d_model: 512
70
- nhead: 8
71
- num_encoder_layers: 6
72
- num_decoder_layers: 6
73
- dim_feedforward: 2048
74
- dropout: 0.2
75
- blank_index: 0 # This special token is for padding
76
- bos_index: 1
77
- eos_index: 2
78
- stop_weight: 0.45
79
- stop_threshold: 0.5
80
-
81
-
82
- ###################PRENET#######################
83
- enc_pre_net: !new:models.EncoderPrenet
84
- dec_pre_net: !new:models.DecoderPrenet
85
-
86
-
87
- encoder_emb: !new:torch.nn.Embedding
88
- num_embeddings: 128
89
- embedding_dim: !ref <d_model>
90
- padding_idx: !ref <blank_index>
91
-
92
- pos_emb_enc: !new:models.ScaledPositionalEncoding
93
- d_model: !ref <d_model>
94
-
95
- decoder_emb: !new:torch.nn.Embedding
96
- num_embeddings: 128
97
- embedding_dim: !ref <d_model>
98
- padding_idx: !ref <blank_index>
99
-
100
- pos_emb_dec: !new:models.ScaledPositionalEncoding
101
- d_model: !ref <d_model>
102
-
103
-
104
- Seq2SeqTransformer: !new:torch.nn.Transformer
105
- d_model: !ref <d_model>
106
- nhead: !ref <nhead>
107
- num_encoder_layers: !ref <num_encoder_layers>
108
- num_decoder_layers: !ref <num_decoder_layers>
109
- dim_feedforward: !ref <dim_feedforward>
110
- dropout: !ref <dropout>
111
- batch_first: True
112
-
113
- postnet: !new:models.PostNet
114
- mel_channels: !ref <n_mel_channels>
115
- postnet_channels: 512
116
- kernel_size: 5
117
- postnet_layers: 5
118
-
119
- mel_lin: !new:speechbrain.nnet.linear.Linear
120
- input_size: !ref <d_model>
121
- n_neurons: !ref <n_mel_channels>
122
-
123
- stop_lin: !new:speechbrain.nnet.linear.Linear
124
- input_size: !ref <d_model>
125
- n_neurons: 1
126
 
127
- mel_spec_feats: !name:speechbrain.lobes.models.FastSpeech2.mel_spectogram
128
- sample_rate: !ref <sample_rate>
129
- hop_length: !ref <hop_length>
130
- win_length: !ref <win_length>
131
- n_fft: !ref <n_fft>
132
- n_mels: !ref <n_mel_channels>
133
- f_min: !ref <mel_fmin>
134
- f_max: !ref <mel_fmax>
135
- power: !ref <power>
136
- normalized: !ref <normalized>
137
- min_max_energy_norm: !ref <min_max_energy_norm>
138
- norm: !ref <norm>
139
- mel_scale: !ref <mel_scale>
140
- compression: !ref <dynamic_range_compression>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  modules:
143
- enc_pre_net: !ref <enc_pre_net>
144
- encoder_emb: !ref <encoder_emb>
145
- pos_emb_enc: !ref <pos_emb_enc>
146
-
147
- dec_pre_net: !ref <dec_pre_net>
148
- #decoder_emb: !ref <decoder_emb>
149
- pos_emb_dec: !ref <pos_emb_dec>
150
-
151
- Seq2SeqTransformer: !ref <Seq2SeqTransformer>
152
- postnet: !ref <postnet>
153
- mel_lin: !ref <mel_lin>
154
- stop_lin: !ref <stop_lin>
155
  model: !ref <model>
156
 
157
- lookahead_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_lookahead_mask
158
- padding_mask: !name:speechbrain.lobes.models.transformer.Transformer.get_key_padding_mask
159
-
160
- model: !new:torch.nn.ModuleList
161
- - [!ref <enc_pre_net>, !ref <encoder_emb>, !ref <pos_emb_enc>, !ref <dec_pre_net>, !ref <pos_emb_dec>, !ref <Seq2SeqTransformer>, !ref <postnet>, !ref <mel_lin>, !ref <stop_lin>]
162
-
163
- label_encoder: !new:speechbrain.dataio.encoder.TextEncoder
164
-
165
- pretrained_path: /content/
166
-
167
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
168
  loadables:
169
  model: !ref <model>
170
- label_encoder: !ref <label_encoder>
171
- paths:
172
- model: !ref <pretrained_path>/model.ckpt
173
- label_encoder: !ref <pretrained_path>/label_encoder.txt
 
1
+ # ################################
2
+ # Model: Tacotron2 for TTS
3
+ # Authors: Artem Ploujnikov, Yingzhi Wang
4
+ # ################################
5
 
6
+ mask_padding: True
 
 
 
 
 
 
7
  n_mel_channels: 80
8
+ n_symbols: 148
9
+ symbols_embedding_dim: 512
10
+ encoder_kernel_size: 5
11
+ encoder_n_convolutions: 3
12
+ encoder_embedding_dim: 512
13
+ attention_rnn_dim: 1024
14
+ attention_dim: 128
15
+ attention_location_n_filters: 32
16
+ attention_location_kernel_size: 31
17
+ n_frames_per_step: 1
18
+ decoder_rnn_dim: 1024
19
+ prenet_dim: 256
20
+ max_decoder_steps: 1000
21
+ gate_threshold: 0.5
22
+ p_attention_dropout: 0.1
23
+ p_decoder_dropout: 0.1
24
+ postnet_embedding_dim: 512
25
+ postnet_kernel_size: 5
26
+ postnet_n_convolutions: 5
27
+ decoder_no_early_stopping: False
28
+ sample_rate: 22050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ # Model
31
+ model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
32
+ mask_padding: !ref <mask_padding>
33
+ n_mel_channels: !ref <n_mel_channels>
34
+ # symbols
35
+ n_symbols: !ref <n_symbols>
36
+ symbols_embedding_dim: !ref <symbols_embedding_dim>
37
+ # encoder
38
+ encoder_kernel_size: !ref <encoder_kernel_size>
39
+ encoder_n_convolutions: !ref <encoder_n_convolutions>
40
+ encoder_embedding_dim: !ref <encoder_embedding_dim>
41
+ # attention
42
+ attention_rnn_dim: !ref <attention_rnn_dim>
43
+ attention_dim: !ref <attention_dim>
44
+ # attention location
45
+ attention_location_n_filters: !ref <attention_location_n_filters>
46
+ attention_location_kernel_size: !ref <attention_location_kernel_size>
47
+ # decoder
48
+ n_frames_per_step: !ref <n_frames_per_step>
49
+ decoder_rnn_dim: !ref <decoder_rnn_dim>
50
+ prenet_dim: !ref <prenet_dim>
51
+ max_decoder_steps: !ref <max_decoder_steps>
52
+ gate_threshold: !ref <gate_threshold>
53
+ p_attention_dropout: !ref <p_attention_dropout>
54
+ p_decoder_dropout: !ref <p_decoder_dropout>
55
+ # postnet
56
+ postnet_embedding_dim: !ref <postnet_embedding_dim>
57
+ postnet_kernel_size: !ref <postnet_kernel_size>
58
+ postnet_n_convolutions: !ref <postnet_n_convolutions>
59
+ decoder_no_early_stopping: !ref <decoder_no_early_stopping>
60
+
61
+ # Function that converts the text into a sequence of valid characters.
62
+ text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
63
 
64
  modules:
 
 
 
 
 
 
 
 
 
 
 
 
65
  model: !ref <model>
66
 
 
 
 
 
 
 
 
 
 
 
67
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
68
  loadables:
69
  model: !ref <model>
70
+
 
 
 
model.ckpt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45e8e10afd13fa8bf1563f8babdc4779d3316ec227eaabf8c57dab9e4f794ded
3
- size 226346982
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02d1aa3fc1cc8fb0981895c765e9d0017416bb78861b5450e458dda92e567856
3
+ size 112830206