taras-sereda committed on
Commit
3980644
1 Parent(s): 586e19e

speech tokenizer, requirements

Browse files
ckpt/speechtokenizer/SpeechTokenizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d04593b6c9a4b475f91ca481141a6ef5b23e6ac112f347dd2b2717f193c1c728
3
+ size 481906997
ckpt/speechtokenizer/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 3,
4
+ "batch_size": 60,
5
+ "learning_rate": 0.0001,
6
+ "adam_b1": 0.5,
7
+ "adam_b2": 0.9,
8
+ "lr_decay": 0.98,
9
+ "seed": 1234,
10
+ "lambda_distill": 0.15,
11
+
12
+ "n_filters": 64,
13
+ "strides": [8,5,4,2],
14
+ "dimension": 1024,
15
+ "semantic_dimension": 768,
16
+ "bidirectional": true,
17
+ "dilation_base": 2,
18
+ "residual_kernel_size": 3,
19
+ "n_residual_layers": 1,
20
+ "lstm_layers": 2,
21
+ "activation": "ELU",
22
+
23
+
24
+ "segment_size": 48000,
25
+ "num_mels": 80,
26
+ "num_freq": 1025,
27
+ "n_fft": 1024,
28
+ "hop_size": 240,
29
+ "win_size": 1024,
30
+
31
+ "sampling_rate": 16000,
32
+ "sample_rate": 16000,
33
+
34
+ "codebook_size": 1024,
35
+ "n_q": 8,
36
+
37
+ "fmin": 0,
38
+ "fmax": 8000,
39
+ "fmax_for_loss": null,
40
+
41
+ "num_workers": 12,
42
+
43
+ "dist_config": {
44
+ "dist_backend": "nccl",
45
+ "dist_url": "tcp://localhost:54322",
46
+ "world_size": 1
47
+ }
48
+ }
ckpt/unique_text_tokens.k2symbols ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <eps> 0
2
+ ! 1
3
+ " 2
4
+ ( 3
5
+ ) 4
6
+ , 5
7
+ . 6
8
+ : 7
9
+ ; 8
10
+ ? 9
11
+ _ 10
12
+ aɪ 11
13
+ aɪə 12
14
+ aɪɚ 13
15
+ aɪʊ 14
16
+ aɪʊɹ 15
17
+ aʊ 16
18
+ b 17
19
+ d 18
20
+ dʒ 19
21
+ e 20
22
+ enus 21
23
+ es 22
24
+ eɪ 23
25
+ f 24
26
+ fr 25
27
+ h 26
28
+ i 27
29
+ iə 28
30
+ iː 29
31
+ j 30
32
+ k 31
33
+ l 32
34
+ m 33
35
+ n 34
36
+ nʲ 35
37
+ oʊ 36
38
+ oː 37
39
+ oːɹ 38
40
+ p 39
41
+ r 40
42
+ s 41
43
+ t 42
44
+ tʃ 43
45
+ uː 44
46
+ v 45
47
+ w 46
48
+ x 47
49
+ z 48
50
+ æ 49
51
+ ç 50
52
+ ð 51
53
+ ø 52
54
+ ŋ 53
55
+ ɐ 54
56
+ ɑ 55
57
+ ɑː 56
58
+ ɑːɹ 57
59
+ ɔ 58
60
+ ɔɪ 59
61
+ ɔː 60
62
+ ɔːɹ 61
63
+ ə 62
64
+ əl 63
65
+ ɚ 64
66
+ ɛ 65
67
+ ɛɹ 66
68
+ ɛː 67
69
+ ɜː 68
70
+ ɡ 69
71
+ ɡʲ 70
72
+ ɣ 71
73
+ ɪ 72
74
+ ɪɹ 73
75
+ ɫ 74
76
+ ɬ 75
77
+ ɲ 76
78
+ ɹ 77
79
+ ɾ 78
80
+ ʃ 79
81
+ ʊ 80
82
+ ʊɹ 81
83
+ ʌ 82
84
+ ʒ 83
85
+ ʔ 84
86
+ ̃ 85
87
+ ̩ 86
88
+ θ 87
89
+ ᵻ 88
90
+ — 89
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
  torchvision
2
  torchaudio
3
  torch
 
 
 
 
 
 
 
1
  torchvision
2
  torchaudio
3
  torch
4
+ transformers
5
+ einops
6
+ librosa
7
+ pyannote.audio @ https://github.com/pyannote/pyannote-audio/archive/develop.zip
8
+ wheel
9
+ phonemizer