csukuangfj
committed on
Commit
•
8a484fa
1
Parent(s):
4440717
add more files
Browse files- .gitignore +1 -0
- generate-lexicon.py +57 -0
- lexicon.txt +0 -0
- test.py +189 -0
- tokens.txt +178 -0
- vits-ljs.int8.onnx +3 -0
- vits-ljs.onnx +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
test.wav
|
generate-lexicon.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
3 |
+
|
4 |
+
from phonemizer import phonemize
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
def read_lexicon():
    """Read ./CMU.in.IPA.txt and return the list of unique words.

    Each line looks like ``word, ipa``; only the word column is used.
    Words containing characters other than letters, apostrophe, hyphen,
    or period are skipped, as are lines that do not split into exactly
    two comma-separated fields.
    """
    in_file = "./CMU.in.IPA.txt"
    words = set()
    # Raw string, with '-' placed last so it is a literal hyphen.
    # (The old pattern "^[a-zA-Z'-\.]+$" made '-\. a character *range*,
    # which accidentally also accepted '(', ')', '*', '+', and ','.)
    pattern = re.compile(r"^[a-zA-Z'.-]+$")
    with open(in_file, encoding="utf-8") as f:
        for line in f:
            try:
                line = line.strip()
                word, _ = line.split(",")
                word = word.strip()
                if not pattern.match(word):
                    # print(line, "word is", word)
                    continue
            except ValueError:
                # Line does not have exactly one comma separator.
                # print(line)
                continue

            # The input is expected to list each word once.
            assert word not in words, word
            words.add(word)
    return list(words)
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Phonemize every lexicon word in batches and write lexicon.txt."""
    words = read_lexicon()
    num_words = len(words)
    batch = 5000
    word2ipa = {}
    for i in range(0, num_words, batch):
        print(f"{i}/{num_words}, {i/num_words*100:.3f}%")
        chunk = words[i : i + batch]
        phonemes = phonemize(
            chunk,
            language="en-us",
            backend="espeak",
            strip=True,
            preserve_punctuation=True,
            with_stress=True,
        )
        # Space-separate every IPA symbol so each one becomes its own token.
        for w, p in zip(chunk, phonemes):
            word2ipa[w] = " ".join(p)

    with open("lexicon.txt", "w", encoding="utf-8") as f:
        for w, p in word2ipa.items():
            f.write(f"{w} {p}\n")


if __name__ == "__main__":
    main()
|
lexicon.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
3 |
+
|
4 |
+
from typing import Dict, List
|
5 |
+
|
6 |
+
import onnxruntime
|
7 |
+
import soundfile
|
8 |
+
import torch
|
9 |
+
|
10 |
+
|
11 |
+
def display(sess):
    """Print the input descriptors, a separator, then the output descriptors
    of an onnxruntime inference session."""
    for node in sess.get_inputs():
        print(node)

    print("-" * 10)
    for node in sess.get_outputs():
        print(node)
|
18 |
+
|
19 |
+
|
20 |
+
class OnnxModel:
    """Wrapper around an onnxruntime InferenceSession for the VITS model."""

    def __init__(
        self,
        model: str,
    ):
        """Load the onnx file at *model* and read its custom metadata."""
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 4

        self.session_opts = opts

        self.model = onnxruntime.InferenceSession(
            model,
            sess_options=self.session_opts,
        )
        display(self.model)

        # Metadata baked into the model at export time.
        meta = self.model.get_modelmeta().custom_metadata_map
        self.add_blank = int(meta["add_blank"])
        self.sample_rate = int(meta["sample_rate"])
        self.punctuation = meta["punctuation"].split()
        print(meta)

    def __call__(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
          x:
            A int64 tensor of shape (L,)
        """
        batch = x.unsqueeze(0)
        batch_len = torch.tensor([batch.shape[1]], dtype=torch.int64)
        noise_scale = torch.tensor([1], dtype=torch.float32)
        length_scale = torch.tensor([1], dtype=torch.float32)
        noise_scale_w = torch.tensor([1], dtype=torch.float32)

        inputs = self.model.get_inputs()
        feeds = {
            inputs[0].name: batch.numpy(),
            inputs[1].name: batch_len.numpy(),
            inputs[2].name: noise_scale.numpy(),
            inputs[3].name: length_scale.numpy(),
            inputs[4].name: noise_scale_w.numpy(),
        }
        out_name = self.model.get_outputs()[0].name
        (audio,) = self.model.run([out_name], feeds)
        return torch.from_numpy(audio).squeeze()
|
71 |
+
|
72 |
+
|
73 |
+
def read_lexicon() -> Dict[str, List[str]]:
    """Load ./lexicon.txt into a dict mapping word -> list of phoneme symbols.

    Each line is a word followed by whitespace-separated phoneme symbols.
    Blank lines are skipped (the original crashed on them with IndexError).
    """
    ans = dict()
    with open("./lexicon.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of indexing into an empty list.
                continue
            ans[fields[0]] = fields[1:]
    return ans
|
82 |
+
|
83 |
+
|
84 |
+
def read_tokens() -> Dict[str, int]:
    """Load ./tokens.txt into a dict mapping symbol -> integer id.

    A line with a single field after stripping means the symbol is the
    space character itself (the whitespace was eaten by strip()).
    """
    ans = dict()
    with open("./tokens.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) == 1:
                token, idx = " ", fields[0]
            else:
                assert len(fields) == 2, (fields, line)
                token, idx = fields
            ans[token] = int(idx)
    return ans
|
98 |
+
|
99 |
+
|
100 |
+
def convert_lexicon(lexicon, tokens):
    """Replace each word's phoneme symbols with token ids, in place.

    Entries containing any symbol missing from *tokens* are left untouched.
    """
    for word, phones in lexicon.items():
        try:
            ids = [tokens[ph] for ph in phones]
        except Exception:
            # Some symbol is not in the token table; keep the original entry.
            # print("skip", word)
            continue
        lexicon[word] = ids
|
109 |
+
|
110 |
+
|
111 |
+
"""
|
112 |
+
skip rapprochement
|
113 |
+
skip croissants
|
114 |
+
skip aix-en-provence
|
115 |
+
skip provence
|
116 |
+
skip croissant
|
117 |
+
skip denouement
|
118 |
+
skip hola
|
119 |
+
skip blanc
|
120 |
+
"""
|
121 |
+
|
122 |
+
|
123 |
+
def get_text(text, lexicon, tokens, punctuation):
    """Convert a sentence into a list of token ids.

    Args:
      text: Input sentence; lower-cased and split on whitespace.
      lexicon: Maps lowercase word -> list of token ids (see convert_lexicon).
      tokens: Maps symbol -> token id; must contain " ".
      punctuation: Symbols treated as punctuation when attached to a word.

    Returns:
      A list of integer token ids. Words missing from the lexicon are
      printed ("ignore ...") and dropped.
    """
    words = text.lower().split()
    ans = []
    for i, w in enumerate(words):
        punct = None

        # Leading punctuation is emitted immediately, before the word.
        if w and w[0] in punctuation:
            ans.append(tokens[w[0]])
            w = w[1:]

        # Trailing punctuation is held back until after the word.
        # The `w and` guards fix an IndexError when a token is pure
        # punctuation and becomes empty after the leading strip.
        if w and w[-1] in punctuation:
            punct = tokens[w[-1]]
            w = w[:-1]

        if w in lexicon:
            ans.extend(lexicon[w])
            # "is not None" (not truthiness): token id 0 is a valid id.
            if punct is not None:
                ans.append(punct)

            if i != len(words) - 1:
                ans.append(tokens[" "])
            continue
        print("ignore", w)
    return ans
|
148 |
+
|
149 |
+
|
150 |
+
def main():
    """Synthesize a demo sentence with the VITS LJS onnx model into test.wav."""
    model = OnnxModel("./vits-ljs.onnx")

    lexicon = read_lexicon()
    tokens = read_tokens()
    convert_lexicon(lexicon, tokens)

    # NOTE(review): this first result is dead — it is overwritten by the
    # second get_text() call below; its only visible effect is printing
    # any "ignore ..." lines for words missing from the lexicon.
    x = get_text(
        "Liliana, our most beautiful and lovely assistant",
        lexicon,
        tokens,
        model.punctuation,
    )
    # x = get_text(
    #     "Ask not what your country can do for you; ask what you can do for your country.",
    #     lexicon,
    #     tokens,
    #     model.punctuation,
    # )

    x = get_text(
        "Success is not final, failure is not fatal, it is the courage to continue that counts!",
        lexicon,
        tokens,
        model.punctuation,
    )

    if model.add_blank:
        # Interleave blank (id 0) around every token: [a, b] -> [0, a, 0, b, 0].
        x2 = [0] * (2 * len(x) + 1)
        x2[1::2] = x
        x = x2

    x = torch.tensor(x, dtype=torch.int64)

    y = model(x)
    # y is the generated waveform; presumably float samples in [-1, 1] — TODO confirm.
    soundfile.write("test.wav", y.numpy(), model.sample_rate)


if __name__ == "__main__":
    main()
|
tokens.txt
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_ 0
|
2 |
+
; 1
|
3 |
+
: 2
|
4 |
+
, 3
|
5 |
+
. 4
|
6 |
+
! 5
|
7 |
+
? 6
|
8 |
+
¡ 7
|
9 |
+
¿ 8
|
10 |
+
— 9
|
11 |
+
… 10
|
12 |
+
" 11
|
13 |
+
« 12
|
14 |
+
» 13
|
15 |
+
“ 14
|
16 |
+
” 15
|
17 |
+
16
|
18 |
+
A 17
|
19 |
+
B 18
|
20 |
+
C 19
|
21 |
+
D 20
|
22 |
+
E 21
|
23 |
+
F 22
|
24 |
+
G 23
|
25 |
+
H 24
|
26 |
+
I 25
|
27 |
+
J 26
|
28 |
+
K 27
|
29 |
+
L 28
|
30 |
+
M 29
|
31 |
+
N 30
|
32 |
+
O 31
|
33 |
+
P 32
|
34 |
+
Q 33
|
35 |
+
R 34
|
36 |
+
S 35
|
37 |
+
T 36
|
38 |
+
U 37
|
39 |
+
V 38
|
40 |
+
W 39
|
41 |
+
X 40
|
42 |
+
Y 41
|
43 |
+
Z 42
|
44 |
+
a 43
|
45 |
+
b 44
|
46 |
+
c 45
|
47 |
+
d 46
|
48 |
+
e 47
|
49 |
+
f 48
|
50 |
+
g 49
|
51 |
+
h 50
|
52 |
+
i 51
|
53 |
+
j 52
|
54 |
+
k 53
|
55 |
+
l 54
|
56 |
+
m 55
|
57 |
+
n 56
|
58 |
+
o 57
|
59 |
+
p 58
|
60 |
+
q 59
|
61 |
+
r 60
|
62 |
+
s 61
|
63 |
+
t 62
|
64 |
+
u 63
|
65 |
+
v 64
|
66 |
+
w 65
|
67 |
+
x 66
|
68 |
+
y 67
|
69 |
+
z 68
|
70 |
+
ɑ 69
|
71 |
+
ɐ 70
|
72 |
+
ɒ 71
|
73 |
+
æ 72
|
74 |
+
ɓ 73
|
75 |
+
ʙ 74
|
76 |
+
β 75
|
77 |
+
ɔ 76
|
78 |
+
ɕ 77
|
79 |
+
ç 78
|
80 |
+
ɗ 79
|
81 |
+
ɖ 80
|
82 |
+
ð 81
|
83 |
+
ʤ 82
|
84 |
+
ə 83
|
85 |
+
ɘ 84
|
86 |
+
ɚ 85
|
87 |
+
ɛ 86
|
88 |
+
ɜ 87
|
89 |
+
ɝ 88
|
90 |
+
ɞ 89
|
91 |
+
ɟ 90
|
92 |
+
ʄ 91
|
93 |
+
ɡ 92
|
94 |
+
ɠ 93
|
95 |
+
ɢ 94
|
96 |
+
ʛ 95
|
97 |
+
ɦ 96
|
98 |
+
ɧ 97
|
99 |
+
ħ 98
|
100 |
+
ɥ 99
|
101 |
+
ʜ 100
|
102 |
+
ɨ 101
|
103 |
+
ɪ 102
|
104 |
+
ʝ 103
|
105 |
+
ɭ 104
|
106 |
+
ɬ 105
|
107 |
+
ɫ 106
|
108 |
+
ɮ 107
|
109 |
+
ʟ 108
|
110 |
+
ɱ 109
|
111 |
+
ɯ 110
|
112 |
+
ɰ 111
|
113 |
+
ŋ 112
|
114 |
+
ɳ 113
|
115 |
+
ɲ 114
|
116 |
+
ɴ 115
|
117 |
+
ø 116
|
118 |
+
ɵ 117
|
119 |
+
ɸ 118
|
120 |
+
θ 119
|
121 |
+
œ 120
|
122 |
+
ɶ 121
|
123 |
+
ʘ 122
|
124 |
+
ɹ 123
|
125 |
+
ɺ 124
|
126 |
+
ɾ 125
|
127 |
+
ɻ 126
|
128 |
+
ʀ 127
|
129 |
+
ʁ 128
|
130 |
+
ɽ 129
|
131 |
+
ʂ 130
|
132 |
+
ʃ 131
|
133 |
+
ʈ 132
|
134 |
+
ʧ 133
|
135 |
+
ʉ 134
|
136 |
+
ʊ 135
|
137 |
+
ʋ 136
|
138 |
+
ⱱ 137
|
139 |
+
ʌ 138
|
140 |
+
ɣ 139
|
141 |
+
ɤ 140
|
142 |
+
ʍ 141
|
143 |
+
χ 142
|
144 |
+
ʎ 143
|
145 |
+
ʏ 144
|
146 |
+
ʑ 145
|
147 |
+
ʐ 146
|
148 |
+
ʒ 147
|
149 |
+
ʔ 148
|
150 |
+
ʡ 149
|
151 |
+
ʕ 150
|
152 |
+
ʢ 151
|
153 |
+
ǀ 152
|
154 |
+
ǁ 153
|
155 |
+
ǂ 154
|
156 |
+
ǃ 155
|
157 |
+
ˈ 156
|
158 |
+
ˌ 157
|
159 |
+
ː 158
|
160 |
+
ˑ 159
|
161 |
+
ʼ 160
|
162 |
+
ʴ 161
|
163 |
+
ʰ 162
|
164 |
+
ʱ 163
|
165 |
+
ʲ 164
|
166 |
+
ʷ 165
|
167 |
+
ˠ 166
|
168 |
+
ˤ 167
|
169 |
+
˞ 168
|
170 |
+
↓ 169
|
171 |
+
↑ 170
|
172 |
+
→ 171
|
173 |
+
↗ 172
|
174 |
+
↘ 173
|
175 |
+
' 174
|
176 |
+
̩ 175
|
177 |
+
' 176
|
178 |
+
ᵻ 177
|
vits-ljs.int8.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6913156be3452aa77f626fc79c04a129ead5be3186f411080feb8a32ce559105
|
3 |
+
size 37423543
|
vits-ljs.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf2959231e8474ba6c567794859527488c579fb4f7d9c7b2b1b686db521974fd
|
3 |
+
size 114124439
|