meets2tarun committed on
Commit
cb16f23
·
1 Parent(s): 09040e4
Files changed (5) hide show
  1. README.md +9 -0
  2. best_bundle.pth +3 -0
  3. config.json +68 -172
  4. giga330M.pth +3 -0
  5. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-sa-4.0
3
+ language:
4
+ - en
5
+ pipeline_tag: text-to-speech
6
+ ---
7
+
8
+ Repo: https://github.com/jasonppy/VoiceCraft
9
+ paper: https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf
best_bundle.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:979b3c7914add8734a5a9bdddd377fe928820bd708224ad2c133207c68eb5664
3
+ size 2689895581
config.json CHANGED
@@ -1,174 +1,70 @@
1
  {
2
- "args": null,
3
- "audio_embedding_dim": 1024,
4
- "audio_embedding_dropout": 0.0,
5
- "audio_max_length": 20.0,
6
- "audio_min_length": 2.0,
7
- "audio_pad_token": 2050,
8
- "audio_positional_embedding_dropout": 0.0,
9
- "audio_vocab_size": 2048,
10
- "batch_size": 100,
11
- "clipping_update_period": 1000,
12
- "codebook_weight": "[2,1,1,1]",
13
- "d_model": 1024,
14
- "dataset": "gigaspeech",
15
- "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
16
- "drop_long": 1,
17
- "dynamic_batching": 1,
18
- "early_stop_step": 3200,
19
- "early_stop_threshold": -1.0,
20
- "empty_token": 2048,
21
- "encodec_folder_name": "encodec_16khz_4codebooks",
22
- "encodec_sr": 50,
23
- "eog": 2049,
24
- "eos": 2051,
25
- "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
26
- "gradient_accumulation_steps": 24,
27
- "gradient_clip_val": 1.0,
28
- "load_model_from": null,
29
- "lr": 1e-05,
30
- "manifest_name": "manifest_large16khz_lessambi",
31
- "mask_len_max": 600,
32
- "mask_len_min": 1,
33
- "mask_sample_dist": "poisson1",
34
- "max_mask_portion": 0.9,
35
- "max_n_spans": 3,
36
- "max_num_tokens": 50000,
37
- "min_gap": 5,
38
- "n_codebooks": 4,
39
- "n_special": 4,
40
- "nhead": 16,
41
- "num_buckets": 10,
42
- "num_decoder_layers": 24,
43
- "num_epochs": 10,
44
- "num_steps": 500000,
45
- "num_workers": 8,
46
- "optimizer_name": "AdamW",
47
- "pad_x": 0,
48
- "phn2num": {
49
- "!": 17,
50
- "\"": 97,
51
- ",": 64,
52
- ".": 77,
53
- "1": 80,
54
- ":": 93,
55
- ";": 81,
56
- "<MUSIC>": 39,
57
- "<NOISE>": 52,
58
- "<OTHER>": 60,
59
- "<SIL>": 53,
60
- "?": 78,
61
- "_": 15,
62
- "a\u026a": 48,
63
- "a\u026a\u0259": 56,
64
- "a\u026a\u025a": 2,
65
- "a\u028a": 36,
66
- "b": 20,
67
- "d": 72,
68
- "d\u0292": 57,
69
- "e": 85,
70
- "e\u026a": 6,
71
- "f": 69,
72
- "h": 14,
73
- "i": 27,
74
- "i\u0259": 42,
75
- "i\u02d0": 68,
76
- "i\u02d0\u02d0": 51,
77
- "j": 67,
78
- "k": 41,
79
- "kh": 84,
80
- "l": 63,
81
- "m": 9,
82
- "n": 23,
83
- "n\u02b2": 8,
84
- "o": 86,
85
- "o\u028a": 25,
86
- "o\u02d0": 74,
87
- "o\u02d0\u0279": 40,
88
- "p": 34,
89
- "q": 96,
90
- "r": 79,
91
- "s": 66,
92
- "t": 73,
93
- "t\u0255": 87,
94
- "t\u0283": 75,
95
- "t\u02b0": 94,
96
- "u": 1,
97
- "u\u02d0": 47,
98
- "v": 31,
99
- "w": 19,
100
- "x": 4,
101
- "z": 22,
102
- "\u00a1": 98,
103
- "\u00ab": 88,
104
- "\u00bb": 89,
105
- "\u00bf": 95,
106
- "\u00e6": 32,
107
- "\u00e6\u00e6": 50,
108
- "\u00e7": 10,
109
- "\u00f0": 7,
110
- "\u014b": 58,
111
- "\u0250": 70,
112
- "\u0250\u0250": 71,
113
- "\u0251": 61,
114
- "\u0251\u02d0": 0,
115
- "\u0251\u02d0\u0279": 44,
116
- "\u0252": 83,
117
- "\u0254": 3,
118
- "\u0254\u026a": 13,
119
- "\u0254\u02d0": 29,
120
- "\u0254\u02d0\u0279": 33,
121
- "\u0259": 54,
122
- "\u0259l": 16,
123
- "\u0259\u028a": 90,
124
- "\u025a": 35,
125
- "\u025b": 18,
126
- "\u025b\u0279": 11,
127
- "\u025b\u02d0": 82,
128
- "\u025c\u02d0": 21,
129
- "\u0261": 49,
130
- "\u0261\u02b2": 37,
131
- "\u026a": 65,
132
- "\u026a\u0279": 76,
133
- "\u026a\u02d0": 100,
134
- "\u026c": 46,
135
- "\u026f": 91,
136
- "\u0279": 5,
137
- "\u027e": 24,
138
- "\u0283": 26,
139
- "\u028a": 43,
140
- "\u028a\u0279": 28,
141
- "\u028c": 38,
142
- "\u0292": 55,
143
- "\u0294": 59,
144
- "\u0303": 45,
145
- "\u0329": 12,
146
- "\u03b8": 30,
147
- "\u1d7b": 62,
148
- "\u2014": 99,
149
- "\u2026": 92
150
- },
151
- "phn_folder_name": "phonemes",
152
- "precision": "float16",
153
- "print_every_n_steps": 800,
154
- "pseudo_epoch_size": 3000,
155
- "reduce_lr_start_epoch": 4,
156
- "reduce_lr_start_step": 3000,
157
- "reduced_eog": 1,
158
- "resume": false,
159
- "seed": 1,
160
- "shuffle_mask_embedding": 0,
161
- "special_first": 0,
162
- "tb_write_every_n_steps": 100,
163
- "text_embedding_dropout": 0.0,
164
- "text_max_length": 400,
165
- "text_min_length": 10.0,
166
- "text_pad_token": 120,
167
- "text_positional_embedding_dropout": 0.0,
168
- "text_vocab_size": 120,
169
- "trm_dropout": 0.0,
170
- "val_every_n_steps": 3200,
171
- "val_max_num_tokens": 6000,
172
- "warmup_fraction": 0.1,
173
- "weight_decay": 0.0
174
  }
 
1
  {
2
+ "seed": 1,
3
+ "precision": "float16",
4
+ "num_workers": 8,
5
+ "resume": false,
6
+ "tb_write_every_n_steps": 100,
7
+ "print_every_n_steps": 400,
8
+ "val_every_n_steps": 1600,
9
+ "lr": 1e-05,
10
+ "batch_size": 100,
11
+ "weight_decay": 0.0,
12
+ "warmup_fraction": 0.1,
13
+ "num_epochs": 10,
14
+ "num_steps": 500000,
15
+ "gradient_accumulation_steps": 24,
16
+ "gradient_clip_val": 1.0,
17
+ "early_stop_step": 3200,
18
+ "early_stop_threshold": -1.0,
19
+ "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
20
+ "dataset": "gigaspeech",
21
+ "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
22
+ "pseudo_epoch_size": 3000,
23
+ "phn_folder_name": "phonemes",
24
+ "encodec_folder_name": "encodec_16khz_4codebooks",
25
+ "manifest_name": "manifest_large16khz_lessambi",
26
+ "pad_x": 0,
27
+ "max_num_tokens": 20000,
28
+ "val_max_num_tokens": 6000,
29
+ "num_buckets": 10,
30
+ "dynamic_batching": 1,
31
+ "audio_max_length": 16.0,
32
+ "audio_min_length": 1.0,
33
+ "text_max_length": 400,
34
+ "text_min_length": 10.0,
35
+ "encodec_sr": 50,
36
+ "mask_len_min": 1,
37
+ "mask_len_max": 600,
38
+ "drop_long": 1,
39
+ "eos": 2051,
40
+ "reduced_eog": 1,
41
+ "special_first": 0,
42
+ "n_special": 4,
43
+ "codebook_weight": "[2,1,1,1]",
44
+ "empty_token": 2048,
45
+ "optimizer_name": "AdamW",
46
+ "reduce_lr_start_step": 3000,
47
+ "reduce_lr_start_epoch": 4,
48
+ "clipping_update_period": 1000,
49
+ "max_mask_portion": 0.9,
50
+ "max_n_spans": 3,
51
+ "shuffle_mask_embedding": 0,
52
+ "mask_sample_dist": "poisson1",
53
+ "min_gap": 5,
54
+ "n_codebooks": 4,
55
+ "text_vocab_size": 120,
56
+ "text_pad_token": 120,
57
+ "audio_vocab_size": 2048,
58
+ "eog": 2049,
59
+ "audio_pad_token": 2050,
60
+ "d_model": 1024,
61
+ "audio_embedding_dim": 1024,
62
+ "text_embedding_dropout": 0.0,
63
+ "audio_embedding_dropout": 0.0,
64
+ "text_positional_embedding_dropout": 0.0,
65
+ "audio_positional_embedding_dropout": 0.0,
66
+ "trm_dropout": 0.0,
67
+ "nhead": 16,
68
+ "num_decoder_layers": 24,
69
+ "load_model_from": "./pretrained_models/giga330M.pth"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
giga330M.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e028b8c5237cb4a6050ca81d4569b98e3a34ad9175fa252f7b1d13e6a9ad26
3
+ size 1746844161
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec2638d007ccb427de8b7778d9c35d3b1bf41b3aa86dd62c0bd5a4d182cc837
3
+ size 1293853416