csukuangfj committed
Commit
8a484fa
1 Parent(s): 4440717

add more files

Files changed (7)
  1. .gitignore +1 -0
  2. generate-lexicon.py +57 -0
  3. lexicon.txt +0 -0
  4. test.py +189 -0
  5. tokens.txt +178 -0
  6. vits-ljs.int8.onnx +3 -0
  7. vits-ljs.onnx +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ test.wav
generate-lexicon.py ADDED
@@ -0,0 +1,57 @@
+ #!/usr/bin/env python3
+ # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
+
+ from phonemizer import phonemize
+ import re
+
+
+ def read_lexicon():
+     in_file = "./CMU.in.IPA.txt"
+     words = set()
+     pattern = re.compile(r"^[a-zA-Z'\-.]+$")  # letters, apostrophes, hyphens, dots
+     with open(in_file) as f:
+         for line in f:
+             try:
+                 line = line.strip()
+                 word, _ = line.split(",")
+                 word = word.strip()
+                 if not pattern.match(word):
+                     # print(line, "word is", word)
+                     continue
+             except ValueError:
+                 # print(line)
+                 continue
+
+             assert word not in words, word
+             words.add(word)
+     return list(words)
+
+
+ def main():
+     words = read_lexicon()
+     num_words = len(words)
+     batch = 5000
+     i = 0
+     word2ipa = dict()
+     while i < num_words:
+         print(f"{i}/{num_words}, {i/num_words*100:.3f}%")
+         this_batch = words[i : i + batch]
+         i += batch
+         phonemes = phonemize(
+             this_batch,
+             language="en-us",
+             backend="espeak",
+             strip=True,
+             preserve_punctuation=True,
+             with_stress=True,
+         )
+         for w, p in zip(this_batch, phonemes):
+             word2ipa[w] = " ".join(list(p))
+
+     with open("lexicon.txt", "w", encoding="utf-8") as f:
+         for w, p in word2ipa.items():
+             f.write(f"{w} {p}\n")
+
+
+ if __name__ == "__main__":
+     main()
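Note: each line of the generated lexicon.txt holds a word followed by its espeak IPA transcription split into individual symbols (one token per symbol). As a rough illustration only (not copied from the generated file), an entry for "hello" would look something like: hello h ə l ˈ o ʊ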
lexicon.txt ADDED
The diff for this file is too large to render. See raw diff
 
test.py ADDED
@@ -0,0 +1,189 @@
+ #!/usr/bin/env python3
+ # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
+
+ from typing import Dict, List
+
+ import onnxruntime
+ import soundfile
+ import torch
+
+
+ def display(sess):
+     for i in sess.get_inputs():
+         print(i)
+
+     print("-" * 10)
+     for o in sess.get_outputs():
+         print(o)
+
+
+ class OnnxModel:
+     def __init__(
+         self,
+         model: str,
+     ):
+         session_opts = onnxruntime.SessionOptions()
+         session_opts.inter_op_num_threads = 1
+         session_opts.intra_op_num_threads = 4
+
+         self.session_opts = session_opts
+
+         self.model = onnxruntime.InferenceSession(
+             model,
+             sess_options=self.session_opts,
+         )
+         display(self.model)
+
+         meta = self.model.get_modelmeta().custom_metadata_map
+         self.add_blank = int(meta["add_blank"])
+         self.sample_rate = int(meta["sample_rate"])
+         self.punctuation = meta["punctuation"].split()
+         print(meta)
+
+     def __call__(
+         self,
+         x: torch.Tensor,
+     ) -> torch.Tensor:
+         """
+         Args:
+           x:
+             An int64 tensor of shape (L,)
+         """
+         x = x.unsqueeze(0)
+         x_length = torch.tensor([x.shape[1]], dtype=torch.int64)
+         noise_scale = torch.tensor([1], dtype=torch.float32)
+         length_scale = torch.tensor([1], dtype=torch.float32)
+         noise_scale_w = torch.tensor([1], dtype=torch.float32)
+
+         y = self.model.run(
+             [
+                 self.model.get_outputs()[0].name,
+             ],
+             {
+                 self.model.get_inputs()[0].name: x.numpy(),
+                 self.model.get_inputs()[1].name: x_length.numpy(),
+                 self.model.get_inputs()[2].name: noise_scale.numpy(),
+                 self.model.get_inputs()[3].name: length_scale.numpy(),
+                 self.model.get_inputs()[4].name: noise_scale_w.numpy(),
+             },
+         )[0]
+         return torch.from_numpy(y).squeeze()
+
+
+ def read_lexicon() -> Dict[str, List[str]]:
+     ans = dict()
+     with open("./lexicon.txt", encoding="utf-8") as f:
+         for line in f:
+             w_p = line.split()
+             w = w_p[0]
+             p = w_p[1:]
+             ans[w] = p
+     return ans
+
+
+ def read_tokens() -> Dict[str, int]:
+     ans = dict()
+     with open("./tokens.txt", encoding="utf-8") as f:
+         for line in f:
+             t_i = line.strip().split()
+             if len(t_i) == 1:
+                 token = " "  # a single-field line means the token is a space
+                 idx = t_i[0]
+             else:
+                 assert len(t_i) == 2, (t_i, line)
+                 token = t_i[0]
+                 idx = t_i[1]
+             ans[token] = int(idx)
+     return ans
+
+
+ def convert_lexicon(lexicon, tokens):
+     for w in lexicon:
+         phones = lexicon[w]
+         try:
+             p = [tokens[i] for i in phones]
+             lexicon[w] = p
+         except Exception:
+             # print("skip", w)
+             continue
+
+
+ """
+ skip rapprochement
+ skip croissants
+ skip aix-en-provence
+ skip provence
+ skip croissant
+ skip denouement
+ skip hola
+ skip blanc
+ """
+
+
+ def get_text(text, lexicon, tokens, punctuation):
+     text = text.lower().split()
+     ans = []
+     for i in range(len(text)):
+         w = text[i]
+         punct = None
+
+         if w[0] in punctuation:
+             ans.append(tokens[w[0]])
+             w = w[1:]
+
+         if w[-1] in punctuation:
+             punct = tokens[w[-1]]
+             w = w[:-1]
+
+         if w in lexicon:
+             ans.extend(lexicon[w])
+             if punct:
+                 ans.append(punct)
+
+             if i != len(text) - 1:
+                 ans.append(tokens[" "])
+             continue
+         print("ignore", w)
+     return ans
+
+
+ def main():
+     model = OnnxModel("./vits-ljs.onnx")
+
+     lexicon = read_lexicon()
+     tokens = read_tokens()
+     convert_lexicon(lexicon, tokens)
+
+     x = get_text(
+         "Liliana, our most beautiful and lovely assistant",
+         lexicon,
+         tokens,
+         model.punctuation,
+     )
+     # x = get_text(
+     #     "Ask not what your country can do for you; ask what you can do for your country.",
+     #     lexicon,
+     #     tokens,
+     #     model.punctuation,
+     # )
+
+     x = get_text(
+         "Success is not final, failure is not fatal, it is the courage to continue that counts!",
+         lexicon,
+         tokens,
+         model.punctuation,
+     )
+
+     if model.add_blank:
+         x2 = [0] * (2 * len(x) + 1)  # interleave blank tokens (ID 0) between symbols
+         x2[1::2] = x
+         x = x2
+
+     x = torch.tensor(x, dtype=torch.int64)
+
+     y = model(x)
+     soundfile.write("test.wav", y.numpy(), model.sample_rate)
+
+
+ if __name__ == "__main__":
+     main()
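Note: test.py reads ./lexicon.txt, ./tokens.txt, and ./vits-ljs.onnx from the current directory and writes test.wav; it needs onnxruntime, soundfile, and torch installed. Regenerating the lexicon with generate-lexicon.py additionally needs phonemizer with the espeak backend and a CMU.in.IPA.txt file, which is not among the files added in this commit.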
tokens.txt ADDED
@@ -0,0 +1,178 @@
+ _ 0
+ ; 1
+ : 2
+ , 3
+ . 4
+ ! 5
+ ? 6
+ ¡ 7
+ ¿ 8
+ — 9
+ … 10
+ " 11
+ « 12
+ » 13
+ “ 14
+ ” 15
+ 16
+ A 17
+ B 18
+ C 19
+ D 20
+ E 21
+ F 22
+ G 23
+ H 24
+ I 25
+ J 26
+ K 27
+ L 28
+ M 29
+ N 30
+ O 31
+ P 32
+ Q 33
+ R 34
+ S 35
+ T 36
+ U 37
+ V 38
+ W 39
+ X 40
+ Y 41
+ Z 42
+ a 43
+ b 44
+ c 45
+ d 46
+ e 47
+ f 48
+ g 49
+ h 50
+ i 51
+ j 52
+ k 53
+ l 54
+ m 55
+ n 56
+ o 57
+ p 58
+ q 59
+ r 60
+ s 61
+ t 62
+ u 63
+ v 64
+ w 65
+ x 66
+ y 67
+ z 68
+ ɑ 69
+ ɐ 70
+ ɒ 71
+ æ 72
+ ɓ 73
+ ʙ 74
+ β 75
+ ɔ 76
+ ɕ 77
+ ç 78
+ ɗ 79
+ ɖ 80
+ ð 81
+ ʤ 82
+ ə 83
+ ɘ 84
+ ɚ 85
+ ɛ 86
+ ɜ 87
+ ɝ 88
+ ɞ 89
+ ɟ 90
+ ʄ 91
+ ɡ 92
+ ɠ 93
+ ɢ 94
+ ʛ 95
+ ɦ 96
+ ɧ 97
+ ħ 98
+ ɥ 99
+ ʜ 100
+ ɨ 101
+ ɪ 102
+ ʝ 103
+ ɭ 104
+ ɬ 105
+ ɫ 106
+ ɮ 107
+ ʟ 108
+ ɱ 109
+ ɯ 110
+ ɰ 111
+ ŋ 112
+ ɳ 113
+ ɲ 114
+ ɴ 115
+ ø 116
+ ɵ 117
+ ɸ 118
+ θ 119
+ œ 120
+ ɶ 121
+ ʘ 122
+ ɹ 123
+ ɺ 124
+ ɾ 125
+ ɻ 126
+ ʀ 127
+ ʁ 128
+ ɽ 129
+ ʂ 130
+ ʃ 131
+ ʈ 132
+ ʧ 133
+ ʉ 134
+ ʊ 135
+ ʋ 136
+ ⱱ 137
+ ʌ 138
+ ɣ 139
+ ɤ 140
+ ʍ 141
+ χ 142
+ ʎ 143
+ ʏ 144
+ ʑ 145
+ ʐ 146
+ ʒ 147
+ ʔ 148
+ ʡ 149
+ ʕ 150
+ ʢ 151
+ ǀ 152
+ ǁ 153
+ ǂ 154
+ ǃ 155
+ ˈ 156
+ ˌ 157
+ ː 158
+ ˑ 159
+ ʼ 160
+ ʴ 161
+ ʰ 162
+ ʱ 163
+ ʲ 164
+ ʷ 165
+ ˠ 166
+ ˤ 167
+ ˞ 168
+ ↓ 169
+ ↑ 170
+ → 171
+ ↗ 172
+ ↘ 173
+ ' 174
+ ̩ 175
+ ' 176
+ ᵻ 177
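Note: each line of tokens.txt maps one symbol to its integer ID, separated by a space. The entry with ID 16 carries only the ID because its symbol is the space character itself, which is why read_tokens() in test.py treats a single-field line as the space token.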
vits-ljs.int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6913156be3452aa77f626fc79c04a129ead5be3186f411080feb8a32ce559105
+ size 37423543
vits-ljs.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf2959231e8474ba6c567794859527488c579fb4f7d9c7b2b1b686db521974fd
+ size 114124439
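Note: both .onnx entries are Git LFS pointer files; the actual weights are stored via LFS (roughly 37 MB for vits-ljs.int8.onnx and 114 MB for vits-ljs.onnx, per the size fields).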