csukuangfj
committed on
Commit
•
8a484fa
1
Parent(s):
4440717
add more files
Browse files- .gitignore +1 -0
- generate-lexicon.py +57 -0
- lexicon.txt +0 -0
- test.py +189 -0
- tokens.txt +178 -0
- vits-ljs.int8.onnx +3 -0
- vits-ljs.onnx +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
test.wav
|
generate-lexicon.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
3 |
+
|
4 |
+
from phonemizer import phonemize
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
def read_lexicon():
    """Read ./CMU.in.IPA.txt and return the list of unique words.

    Each line looks like ``word, ipa``; only the word column is used.
    Words containing characters other than letters, apostrophe, hyphen,
    or period are skipped, as are lines that do not split into exactly
    two comma-separated fields.
    """
    in_file = "./CMU.in.IPA.txt"
    words = set()
    # Raw string, with '-' placed last so it is a literal hyphen.
    # (The old pattern "^[a-zA-Z'-\.]+$" made '-\. a character *range*,
    # which accidentally also accepted '(', ')', '*', '+', and ','.)
    pattern = re.compile(r"^[a-zA-Z'.-]+$")
    with open(in_file, encoding="utf-8") as f:
        for line in f:
            try:
                line = line.strip()
                word, _ = line.split(",")
                word = word.strip()
                if not pattern.match(word):
                    # print(line, "word is", word)
                    continue
            except ValueError:
                # Line does not have exactly one comma separator.
                # print(line)
                continue

            # The input is expected to list each word once.
            assert word not in words, word
            words.add(word)
    return list(words)
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Phonemize every lexicon word in batches and write lexicon.txt."""
    words = read_lexicon()
    num_words = len(words)
    batch = 5000
    word2ipa = {}
    for i in range(0, num_words, batch):
        print(f"{i}/{num_words}, {i/num_words*100:.3f}%")
        chunk = words[i : i + batch]
        phonemes = phonemize(
            chunk,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=True,
            with_stress=True,
        )
        # Space-separate every IPA symbol so each one becomes its own token.
        for w, p in zip(chunk, phonemes):
            word2ipa[w] = " ".join(p)

    with open("lexicon.txt", "w", encoding="utf-8") as f:
        for w, p in word2ipa.items():
            f.write(f"{w} {p}\n")


if __name__ == "__main__":
    main()
|
lexicon.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
3 |
+
|
4 |
+
from typing import Dict, List
|
5 |
+
|
6 |
+
import onnxruntime
|
7 |
+
import soundfile
|
8 |
+
import torch
|
9 |
+
|
10 |
+
|
11 |
+
def display(sess):
    """Print the input descriptors, a separator, then the output descriptors
    of an onnxruntime inference session."""
    for node in sess.get_inputs():
        print(node)

    print("-" * 10)
    for node in sess.get_outputs():
        print(node)
|
18 |
+
|
19 |
+
|
20 |
+
class OnnxModel:
    """Wrapper around an onnxruntime InferenceSession for the VITS model."""

    def __init__(
        self,
        model: str,
    ):
        """Load the onnx file at *model* and read its custom metadata."""
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 4

        self.session_opts = opts

        self.model = onnxruntime.InferenceSession(
            model,
            sess_options=self.session_opts,
        )
        display(self.model)

        # Metadata baked into the model at export time.
        meta = self.model.get_modelmeta().custom_metadata_map
        self.add_blank = int(meta["add_blank"])
        self.sample_rate = int(meta["sample_rate"])
        self.punctuation = meta["punctuation"].split()
        print(meta)

    def __call__(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
          x:
            A int64 tensor of shape (L,)
        """
        batch = x.unsqueeze(0)
        batch_len = torch.tensor([batch.shape[1]], dtype=torch.int64)
        noise_scale = torch.tensor([1], dtype=torch.float32)
        length_scale = torch.tensor([1], dtype=torch.float32)
        noise_scale_w = torch.tensor([1], dtype=torch.float32)

        inputs = self.model.get_inputs()
        feeds = {
            inputs[0].name: batch.numpy(),
            inputs[1].name: batch_len.numpy(),
            inputs[2].name: noise_scale.numpy(),
            inputs[3].name: length_scale.numpy(),
            inputs[4].name: noise_scale_w.numpy(),
        }
        out_name = self.model.get_outputs()[0].name
        (audio,) = self.model.run([out_name], feeds)
        return torch.from_numpy(audio).squeeze()
|
71 |
+
|
72 |
+
|
73 |
+
def read_lexicon() -> Dict[str, List[str]]:
    """Load ./lexicon.txt into a dict mapping word -> list of phoneme symbols.

    Each line is a word followed by whitespace-separated phoneme symbols.
    Blank lines are skipped (the original crashed on them with IndexError).
    """
    ans = dict()
    with open("./lexicon.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of indexing into an empty list.
                continue
            ans[fields[0]] = fields[1:]
    return ans
|
82 |
+
|
83 |
+
|
84 |
+
def read_tokens() -> Dict[str, int]:
    """Load ./tokens.txt into a dict mapping symbol -> integer id.

    A line with a single field after stripping means the symbol is the
    space character itself (the whitespace was eaten by strip()).
    """
    ans = dict()
    with open("./tokens.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) == 1:
                token, idx = " ", fields[0]
            else:
                assert len(fields) == 2, (fields, line)
                token, idx = fields
            ans[token] = int(idx)
    return ans
|
98 |
+
|
99 |
+
|
100 |
+
def convert_lexicon(lexicon, tokens):
    """Replace each word's phoneme symbols with token ids, in place.

    Entries containing any symbol missing from *tokens* are left untouched.
    """
    for word, phones in lexicon.items():
        try:
            ids = [tokens[ph] for ph in phones]
        except Exception:
            # Some symbol is not in the token table; keep the original entry.
            # print("skip", word)
            continue
        lexicon[word] = ids
|
109 |
+
|
110 |
+
|
111 |
+
"""
|
112 |
+
skip rapprochement
|
113 |
+
skip croissants
|
114 |
+
skip aix-en-provence
|
115 |
+
skip provence
|
116 |
+
skip croissant
|
117 |
+
skip denouement
|
118 |
+
skip hola
|
119 |
+
skip blanc
|
120 |
+
"""
|
121 |
+
|
122 |
+
|
123 |
+
def get_text(text, lexicon, tokens, punctuation):
    """Convert a sentence into a list of token ids.

    Args:
      text: Input sentence; lower-cased and split on whitespace.
      lexicon: Maps lowercase word -> list of token ids (see convert_lexicon).
      tokens: Maps symbol -> token id; must contain " ".
      punctuation: Symbols treated as punctuation when attached to a word.

    Returns:
      A list of integer token ids. Words missing from the lexicon are
      printed ("ignore ...") and dropped.
    """
    words = text.lower().split()
    ans = []
    for i, w in enumerate(words):
        punct = None

        # Leading punctuation is emitted immediately, before the word.
        if w and w[0] in punctuation:
            ans.append(tokens[w[0]])
            w = w[1:]

        # Trailing punctuation is held back until after the word.
        # The `w and` guards fix an IndexError when a token is pure
        # punctuation and becomes empty after the leading strip.
        if w and w[-1] in punctuation:
            punct = tokens[w[-1]]
            w = w[:-1]

        if w in lexicon:
            ans.extend(lexicon[w])
            # "is not None" (not truthiness): token id 0 is a valid id.
            if punct is not None:
                ans.append(punct)

            if i != len(words) - 1:
                ans.append(tokens[" "])
            continue
        print("ignore", w)
    return ans
|
148 |
+
|
149 |
+
|
150 |
+
def main():
    """Synthesize a demo sentence with the VITS LJS onnx model into test.wav."""
    model = OnnxModel("./vits-ljs.onnx")

    lexicon = read_lexicon()
    tokens = read_tokens()
    convert_lexicon(lexicon, tokens)

    # NOTE(review): this first result is dead — it is overwritten by the
    # second get_text() call below; its only visible effect is printing
    # any "ignore ..." lines for words missing from the lexicon.
    x = get_text(
        "Liliana, our most beautiful and lovely assistant",
        lexicon,
        tokens,
        model.punctuation,
    )
    # x = get_text(
    #     "Ask not what your country can do for you; ask what you can do for your country.",
    #     lexicon,
    #     tokens,
    #     model.punctuation,
    # )

    x = get_text(
        "Success is not final, failure is not fatal, it is the courage to continue that counts!",
        lexicon,
        tokens,
        model.punctuation,
    )

    if model.add_blank:
        # Interleave blank (id 0) around every token: [a, b] -> [0, a, 0, b, 0].
        x2 = [0] * (2 * len(x) + 1)
        x2[1::2] = x
        x = x2

    x = torch.tensor(x, dtype=torch.int64)

    y = model(x)
    # y is the generated waveform; presumably float samples in [-1, 1] — TODO confirm.
    soundfile.write("test.wav", y.numpy(), model.sample_rate)


if __name__ == "__main__":
    main()
|
tokens.txt
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_ 0
|
2 |
+
; 1
|
3 |
+
: 2
|
4 |
+
, 3
|
5 |
+
. 4
|
6 |
+
! 5
|
7 |
+
? 6
|
8 |
+
¡ 7
|
9 |
+
¿ 8
|
10 |
+
— 9
|
11 |
+
… 10
|
12 |
+
" 11
|
13 |
+
« 12
|
14 |
+
» 13
|
15 |
+
“ 14
|
16 |
+
” 15
|
17 |
+
16
|
18 |
+
A 17
|
19 |
+
B 18
|
20 |
+
C 19
|
21 |
+
D 20
|
22 |
+
E 21
|
23 |
+
F 22
|
24 |
+
G 23
|
25 |
+
H 24
|
26 |
+
I 25
|
27 |
+
J 26
|
28 |
+
K 27
|
29 |
+
L 28
|
30 |
+
M 29
|
31 |
+
N 30
|
32 |
+
O 31
|
33 |
+
P 32
|
34 |
+
Q 33
|
35 |
+
R 34
|
36 |
+
S 35
|
37 |
+
T 36
|
38 |
+
U 37
|
39 |
+
V 38
|
40 |
+
W 39
|
41 |
+
X 40
|
42 |
+
Y 41
|
43 |
+
Z 42
|
44 |
+
a 43
|
45 |
+
b 44
|
46 |
+
c 45
|
47 |
+
d 46
|
48 |
+
e 47
|
49 |
+
f 48
|
50 |
+
g 49
|
51 |
+
h 50
|
52 |
+
i 51
|
53 |
+
j 52
|
54 |
+
k 53
|
55 |
+
l 54
|
56 |
+
m 55
|
57 |
+
n 56
|
58 |
+
o 57
|
59 |
+
p 58
|
60 |
+
q 59
|
61 |
+
r 60
|
62 |
+
s 61
|
63 |
+
t 62
|
64 |
+
u 63
|
65 |
+
v 64
|
66 |
+
w 65
|
67 |
+
x 66
|
68 |
+
y 67
|
69 |
+
z 68
|
70 |
+
ɑ 69
|
71 |
+
ɐ 70
|
72 |
+
ɒ 71
|
73 |
+
æ 72
|
74 |
+
ɓ 73
|
75 |
+
ʙ 74
|
76 |
+
β 75
|
77 |
+
ɔ 76
|
78 |
+
ɕ 77
|
79 |
+
ç 78
|
80 |
+
ɗ 79
|
81 |
+
ɖ 80
|
82 |
+
ð 81
|
83 |
+
ʤ 82
|
84 |
+
ə 83
|
85 |
+
ɘ 84
|
86 |
+
ɚ 85
|
87 |
+
ɛ 86
|
88 |
+
ɜ 87
|
89 |
+
ɝ 88
|
90 |
+
ɞ 89
|
91 |
+
ɟ 90
|
92 |
+
ʄ 91
|
93 |
+
ɡ 92
|
94 |
+
ɠ 93
|
95 |
+
ɢ 94
|
96 |
+
ʛ 95
|
97 |
+
ɦ 96
|
98 |
+
ɧ 97
|
99 |
+
ħ 98
|
100 |
+
ɥ 99
|
101 |
+
ʜ 100
|
102 |
+
ɨ 101
|
103 |
+
ɪ 102
|
104 |
+
ʝ 103
|
105 |
+
ɭ 104
|
106 |
+
ɬ 105
|
107 |
+
ɫ 106
|
108 |
+
ɮ 107
|
109 |
+
ʟ 108
|
110 |
+
ɱ 109
|
111 |
+
ɯ 110
|
112 |
+
ɰ 111
|
113 |
+
ŋ 112
|
114 |
+
ɳ 113
|
115 |
+
ɲ 114
|
116 |
+
ɴ 115
|
117 |
+
ø 116
|
118 |
+
ɵ 117
|
119 |
+
ɸ 118
|
120 |
+
θ 119
|
121 |
+
œ 120
|
122 |
+
ɶ 121
|
123 |
+
ʘ 122
|
124 |
+
ɹ 123
|
125 |
+
ɺ 124
|
126 |
+
ɾ 125
|
127 |
+
ɻ 126
|
128 |
+
ʀ 127
|
129 |
+
ʁ 128
|
130 |
+
ɽ 129
|
131 |
+
ʂ 130
|
132 |
+
ʃ 131
|
133 |
+
ʈ 132
|
134 |
+
ʧ 133
|
135 |
+
ʉ 134
|
136 |
+
ʊ 135
|
137 |
+
ʋ 136
|
138 |
+
ⱱ 137
|
139 |
+
ʌ 138
|
140 |
+
ɣ 139
|
141 |
+
ɤ 140
|
142 |
+
ʍ 141
|
143 |
+
χ 142
|
144 |
+
ʎ 143
|
145 |
+
ʏ 144
|
146 |
+
ʑ 145
|
147 |
+
ʐ 146
|
148 |
+
ʒ 147
|
149 |
+
ʔ 148
|
150 |
+
ʡ 149
|
151 |
+
ʕ 150
|
152 |
+
ʢ 151
|
153 |
+
ǀ 152
|
154 |
+
ǁ 153
|
155 |
+
ǂ 154
|
156 |
+
ǃ 155
|
157 |
+
ˈ 156
|
158 |
+
ˌ 157
|
159 |
+
ː 158
|
160 |
+
ˑ 159
|
161 |
+
ʼ 160
|
162 |
+
ʴ 161
|
163 |
+
ʰ 162
|
164 |
+
ʱ 163
|
165 |
+
ʲ 164
|
166 |
+
ʷ 165
|
167 |
+
ˠ 166
|
168 |
+
ˤ 167
|
169 |
+
˞ 168
|
170 |
+
↓ 169
|
171 |
+
↑ 170
|
172 |
+
→ 171
|
173 |
+
↗ 172
|
174 |
+
↘ 173
|
175 |
+
' 174
|
176 |
+
̩ 175
|
177 |
+
' 176
|
178 |
+
ᵻ 177
|
vits-ljs.int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6913156be3452aa77f626fc79c04a129ead5be3186f411080feb8a32ce559105
|
3 |
+
size 37423543
|
vits-ljs.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf2959231e8474ba6c567794859527488c579fb4f7d9c7b2b1b686db521974fd
|
3 |
+
size 114124439
|