PB Unity commited on
Commit
23787a8
1 Parent(s): 0f72756

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ jets-text-to-speech.sentis filter=lfs diff=lfs merge=lfs -text
RunJets.cs ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ using System.Collections.Generic;
2
+ using UnityEngine;
3
+ using Unity.Sentis;
4
+ using System.IO;
5
+
6
+ // Jets Text-To-Speech Inference
7
+ // =============================
8
+ //
9
+ // This file implements the Jets Text-to-speech model in Unity Sentis
10
+ // The model uses phonemes instead of raw text so you have to convert it first.
11
+ // Place this file on the Main Camera
12
+ // Add an audio source
13
+ // Change the inputText
14
+ // When running you can press space bar to play it again
15
+
16
/// <summary>
/// Runs the Jets text-to-speech model through Unity Sentis: converts <see cref="inputText"/>
/// to phonemes, maps the phonemes to token ids, executes the model and plays the resulting
/// waveform on the AudioSource attached to the same GameObject.
/// </summary>
public class RunJets : MonoBehaviour
{
    public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
    //string inputText = "The quick brown fox jumped over the lazy dog";
    //string inputText = "Hello, my name is Ginger the Giraffe!";
    //string inputText = "There are many uses of the things she uses!";

    // Set to true if phoneme_dict.txt has been placed in the Assets/StreamingAssets folder.
    // ReadDictionary() switches this off at runtime when the file cannot be found.
    bool hasPhenomeDictionary = true;

    // Phoneme vocabulary; the index of an entry in this array is the token id fed to the model.
    readonly string[] phonemes = new string[] {
        "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", "','", "AA1", "B",
        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
        "AH2", "AY0", "IY2", "AW2", "AA0", "''''", "ER2", "UH2", "'?'", "OY2", "'!'", "AW0",
        "UH0", "OY0", "..", "<sos/eos>" };

    // 26 entries, one per letter A-Z. Not referenced anywhere in this class.
    // NOTE(review): presumably intended as a per-letter fallback for unknown words - confirm.
    readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');

    //Can change pitch and speed with this for a slightly different voice:
    const int samplerate = 22050;

    // Maps an upper-case word (or punctuation mark) to its space-separated phoneme string.
    Dictionary<string, string> dict = new ();

    IWorker engine;

    AudioClip clip;

    void Start()
    {
        LoadModel();
        ReadDictionary();
        TextToSpeech();
    }

    /// <summary>Loads the Sentis model from StreamingAssets and creates a GPU compute worker for it.</summary>
    void LoadModel()
    {
        var model = ModelLoader.Load(Application.streamingAssetsPath + "/jets-text-to-speech.sentis");
        engine = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);
    }

    /// <summary>
    /// Converts <see cref="inputText"/> to phonemes (or uses a hard-coded example when no
    /// dictionary is available) and runs inference on the result.
    /// </summary>
    void TextToSpeech()
    {
        string ptext;
        if (hasPhenomeDictionary)
        {
            ptext = TextToPhonemes(inputText);
            Debug.Log(ptext);
        }
        else
        {
            //If we have no phoneme dictionary we can use one of these examples:
            ptext = "DH AH0 K W IH1 K B R AW1 N F AA1 K S JH AH1 M P S OW1 V ER0 DH AH0 L EY1 Z IY0 D AO1 G .";
            //ptext = "W AH1 N S AH0 P AA1 N AH0 T AY1 M , AH0 F R AA1 G M EH1 T AH0 P R IH1 N S EH0 S . DH AH0 F R AA1 G K IH1 S T DH AH0 P R IH1 N S EH0 S AH0 N D B IH0 K EY1 M AH0 P R IH1 N S .";
            //ptext = "D UW1 P L AH0 K EY2 T";
        }
        DoInference(ptext);
    }

    /// <summary>
    /// Fills <c>dict</c> from StreamingAssets/phoneme_dict.txt. Each usable line is
    /// "WORD PHONEME PHONEME ..."; lines whose first token starts with ";;;" are comments.
    /// Blank lines, lines without phonemes and duplicate words are skipped instead of throwing.
    /// If the file is missing, an error is logged and the component falls back to the
    /// built-in example phonemes.
    /// </summary>
    void ReadDictionary()
    {
        if (!hasPhenomeDictionary) return;

        string path = Application.streamingAssetsPath + "/phoneme_dict.txt";
        if (!File.Exists(path))
        {
            Debug.LogError($"Phoneme dictionary not found at {path}; using the built-in example phonemes instead.");
            hasPhenomeDictionary = false;
            return;
        }

        foreach (string line in File.ReadAllLines(path))
        {
            string[] parts = line.Split(' ', System.StringSplitOptions.RemoveEmptyEntries);

            // Need a key plus at least one phoneme; this also rejects blank lines.
            if (parts.Length < 2) continue;
            if (parts[0].StartsWith(";;;", System.StringComparison.Ordinal)) continue;

            // Re-join the phonemes so the value does not depend on how many
            // separator characters follow the word. The first definition of a word wins.
            dict.TryAdd(parts[0], string.Join(" ", parts, 1, parts.Length - 1));
        }

        // Add codes for punctuation to the dictionary. The indexer is used so these
        // always take effect, even if the file happens to define the same keys.
        dict[","] = "','";
        dict["."] = ".";
        dict["!"] = "'!'";
        dict["?"] = "'?'";
        dict["\""] = "''''";
    }

    /// <summary>Replaces every decimal digit in <paramref name="text"/> with its upper-case English name, padded by spaces.</summary>
    /// <param name="text">Text that may contain digits.</param>
    /// <returns>The text with each digit spelled out individually (e.g. "10" becomes " ONE  ZERO ").</returns>
    public string ExpandNumbers(string text)
    {
        return text
            .Replace("0", " ZERO ")
            .Replace("1", " ONE ")
            .Replace("2", " TWO ")
            .Replace("3", " THREE ")
            .Replace("4", " FOUR ")
            .Replace("5", " FIVE ")
            .Replace("6", " SIX ")
            .Replace("7", " SEVEN ")
            .Replace("8", " EIGHT ")
            .Replace("9", " NINE ");
    }

    /// <summary>Converts raw text to a space-separated phoneme string using the loaded dictionary.</summary>
    /// <param name="text">The text to convert; null or empty yields an empty string.</param>
    /// <returns>The phonemes of every decodable word, each followed by a single space.</returns>
    public string TextToPhonemes(string text)
    {
        if (string.IsNullOrEmpty(text)) return "";

        // Dictionary keys are upper case; use the invariant culture so the result
        // does not depend on the user's locale (e.g. Turkish dotted/dotless i).
        string[] words = ExpandNumbers(text).ToUpperInvariant().Split();

        var output = new System.Text.StringBuilder();
        foreach (string word in words)
        {
            output.Append(DecodeWord(word));
        }
        return output.ToString();
    }

    //Decode the word into phonemes by looking for the longest word in the dictionary that matches
    //the first part of the word and so on.
    //This works fairly well but could be improved. The original paper had a model that
    //dealt with guessing the phonemes of words
    /// <summary>Greedily splits <paramref name="word"/> into the longest dictionary entries and returns their phonemes.</summary>
    /// <param name="word">An upper-case word, possibly with attached punctuation.</param>
    /// <returns>
    /// The phonemes of each matched piece, each followed by a space. Decoding stops at the
    /// first position where no dictionary entry matches, so the result may be partial or empty.
    /// </returns>
    public string DecodeWord(string word)
    {
        if (string.IsNullOrEmpty(word)) return "";

        var output = new System.Text.StringBuilder();
        int start = 0;
        while (start < word.Length)
        {
            // Try the longest candidate first and never let the end drop to or below
            // start: that would request an empty or negative-length substring.
            string value = null;
            int end = word.Length;
            while (end > start && !dict.TryGetValue(word.Substring(start, end - start), out value))
            {
                end--;
            }

            // Nothing in the dictionary matches the remainder; keep what we have so far.
            if (end == start) break;

            output.Append(value).Append(' ');
            start = end;
        }
        return output.ToString();
    }

    /// <summary>Maps a whitespace-separated phoneme string to model token ids.</summary>
    /// <param name="ptext">Phonemes separated by whitespace.</param>
    /// <returns>One index into <c>phonemes</c> per phoneme; unknown phonemes map to 0.</returns>
    int[] GetTokens(string ptext)
    {
        // Drop empty entries so trailing or doubled spaces do not turn into extra blank tokens.
        string[] p = ptext.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
        var tokens = new int[p.Length];
        for (int i = 0; i < tokens.Length; i++)
        {
            // IndexOf returns -1 for an unknown phoneme; clamp it to 0.
            tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, p[i]));
        }
        return tokens;
    }

    /// <summary>Runs the model on the given phonemes, stores the generated audio in <c>clip</c> and plays it.</summary>
    /// <param name="ptext">Whitespace-separated phonemes.</param>
    public void DoInference(string ptext)
    {
        int[] tokens = GetTokens(ptext ?? "");
        if (tokens.Length == 0)
        {
            Debug.LogWarning("No phonemes to synthesize");
            return;
        }

        using var input = new TensorInt(new TensorShape(tokens.Length), tokens);
        var result = engine.Execute(input);

        var output = result.PeekOutput("wav") as TensorFloat;
        if (output == null)
        {
            Debug.LogError("Model output \"wav\" is missing or is not a float tensor");
            return;
        }
        output.MakeReadable();
        var samples = output.ToReadOnlyArray();
        if (samples.Length == 0)
        {
            Debug.LogWarning("Model produced no audio samples");
            return;
        }

        // Floating-point division so clips shorter than a second are not reported as 0.
        Debug.Log($"Audio size = {(float)samples.Length / samplerate} seconds");

        // Release the clip from the previous run; otherwise each run leaves one behind.
        if (clip != null)
        {
            Destroy(clip);
        }
        clip = AudioClip.Create("voice audio", samples.Length, 1, samplerate, false);
        clip.SetData(samples, 0);

        Speak();
    }

    /// <summary>Plays <c>clip</c> on this GameObject's AudioSource, or logs a message if there is none.</summary>
    private void Speak()
    {
        AudioSource audioSource = GetComponent<AudioSource>();
        if (audioSource != null)
        {
            audioSource.clip = clip;
            audioSource.Play();
        }
        else
        {
            Debug.Log("There is no audio source");
        }
    }

    void Update()
    {
        // Space bar re-runs the whole text-to-speech pipeline on the current inputText.
        if (Input.GetKeyDown(KeyCode.Space))
        {
            TextToSpeech();
        }
    }

    private void OnDestroy()
    {
        engine?.Dispose();
        if (clip != null)
        {
            Destroy(clip);
        }
    }
}
jets-text-to-speech.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f71523a869ae567d3f0b6db61a6a84a27288d8f794b564778f1c6cff79eef82
3
+ size 132619847
jets-text-to-speech.sentis ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1541f940099ef2d851adfd23b43b9d0226208ba0b062da7ff7038a0315295bd
3
+ size 138538708
phoneme_dict.txt ADDED
The diff for this file is too large to render. See raw diff