|
using System.Collections.Generic;
using System.IO;
using System.Text;
using Unity.Sentis;
using UnityEngine;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Converts <see cref="inputText"/> to phonemes, runs them through the
/// "jets-text-to-speech" Sentis model and plays the resulting waveform on the
/// <see cref="AudioSource"/> attached to the same GameObject.
/// Speech is generated once in <see cref="Start"/> and again whenever Space is pressed.
/// </summary>
public class RunJets : MonoBehaviour
{
    public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";

    // When true, phoneme_dict.txt is read from StreamingAssets and used to convert
    // inputText to phonemes. When false, a fixed phoneme sentence is spoken instead.
    // ReadDictionary() clears this flag if the dictionary file cannot be found.
    bool hasPhenomeDictionary = true;

    // Symbol table: the position of a symbol in this array is the token id
    // that GetTokens() feeds to the model.
    readonly string[] phonemes = new string[] {
        "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
        "AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
        "UH0", "OY0", "..", "<sos/eos>" };

    // One phoneme per letter, 26 entries. Not referenced anywhere in this class.
    // NOTE(review): presumably intended for spelling out unknown words - confirm before removing.
    readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');

    // Sample rate (Hz) used when creating the AudioClip from the model output.
    const int samplerate = 22050;

    // Maps an upper-case word (or punctuation mark) to its space-separated phoneme string.
    Dictionary<string, string> dict = new ();

    IWorker engine;

    AudioClip clip;

    void Start()
    {
        LoadModel();
        ReadDictionary();
        TextToSpeech();
    }

    /// <summary>
    /// Loads the model file from StreamingAssets and creates the GPU compute worker.
    /// </summary>
    void LoadModel()
    {
        var model = ModelLoader.Load(Path.Join(Application.streamingAssetsPath ,"jets-text-to-speech.sentis"));
        engine = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);
    }

    /// <summary>
    /// Builds the phoneme string (from <see cref="inputText"/> when the dictionary is
    /// available, otherwise a fixed sentence) and runs inference on it.
    /// </summary>
    void TextToSpeech()
    {
        string ptext;
        if (hasPhenomeDictionary)
        {
            ptext = TextToPhonemes(inputText);
            Debug.Log(ptext);
        }
        else
        {
            ptext = "DH AH0 K W IH1 K B R AW1 N F AA1 K S JH AH1 M P S OW1 V ER0 DH AH0 L EY1 Z IY0 D AO1 G .";
        }
        DoInference(ptext);
    }

    /// <summary>
    /// Fills <see cref="dict"/> from phoneme_dict.txt in StreamingAssets.
    /// Each usable line is "WORD", a two-character separator, then the phonemes;
    /// lines starting with ";;;" are comments.
    /// </summary>
    void ReadDictionary()
    {
        if (!hasPhenomeDictionary)
        {
            return;
        }

        string path = Path.Join(Application.streamingAssetsPath,"phoneme_dict.txt");
        if (!File.Exists(path))
        {
            // Fall back to the fixed phoneme sentence instead of aborting Start().
            Debug.LogWarning($"Phoneme dictionary not found at '{path}'; speaking the built-in phoneme sentence instead.");
            hasPhenomeDictionary = false;
            return;
        }

        foreach (string line in File.ReadAllLines(path))
        {
            if (string.IsNullOrWhiteSpace(line) || line.StartsWith(";;;", System.StringComparison.Ordinal))
            {
                continue;
            }

            string key = line.Split()[0];

            // The pronunciation starts two characters after the word; skip
            // malformed lines that have no word or no pronunciation.
            if (key.Length == 0 || line.Length <= key.Length + 2)
            {
                continue;
            }

            // Indexer instead of Add: a repeated word must not throw.
            dict[key] = line.Substring(key.Length + 2).Trim();
        }

        // Punctuation maps to itself so it survives DecodeWord().
        dict[","] = ",";
        dict["."] = ".";
        dict["!"] = "!";
        dict["?"] = "?";
        dict["\""] = "\"";
    }

    /// <summary>
    /// Replaces every decimal digit with its spelled-out, space-padded English name.
    /// Digits are handled one at a time, so "42" becomes " FOUR  TWO ".
    /// </summary>
    /// <param name="text">Text that may contain digits.</param>
    /// <returns>The text with each digit replaced.</returns>
    public string ExpandNumbers(string text)
    {
        return text
            .Replace("0", " ZERO ")
            .Replace("1", " ONE ")
            .Replace("2", " TWO ")
            .Replace("3", " THREE ")
            .Replace("4", " FOUR ")
            .Replace("5", " FIVE ")
            .Replace("6", " SIX ")
            .Replace("7", " SEVEN ")
            .Replace("8", " EIGHT ")
            .Replace("9", " NINE ");
    }

    /// <summary>
    /// Converts free text to a space-separated phoneme string using <see cref="dict"/>.
    /// </summary>
    /// <param name="text">Input text.</param>
    /// <returns>
    /// The phonemes of every whitespace-separated word, each followed by a single space.
    /// </returns>
    public string TextToPhonemes(string text)
    {
        // Invariant upper-casing: dictionary keys must match regardless of the
        // current culture (e.g. Turkish upper-cases 'i' to a dotted capital).
        text = ExpandNumbers(text).ToUpperInvariant();

        var output = new StringBuilder();
        foreach (string word in text.Split())
        {
            output.Append(DecodeWord(word));
        }
        return output.ToString();
    }

    /// <summary>
    /// Greedily splits <paramref name="word"/> into the longest substrings found in
    /// <see cref="dict"/> and concatenates their phonemes. Characters that start no
    /// dictionary entry are skipped.
    /// </summary>
    /// <param name="word">An upper-case word, possibly with attached punctuation.</param>
    /// <returns>
    /// The matched phoneme strings, each followed by a space; empty if nothing matched.
    /// </returns>
    public string DecodeWord(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return string.Empty;
        }

        var output = new StringBuilder();
        int start = 0;
        while (start < word.Length)
        {
            bool matched = false;

            // Try the longest remaining substring first, then shrink from the right.
            for (int end = word.Length; end > start; end--)
            {
                if (dict.TryGetValue(word.Substring(start, end - start), out string value))
                {
                    output.Append(value).Append(' ');
                    start = end;
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                // No entry begins at this character: drop it and continue.
                start++;
            }
        }
        return output.ToString();
    }

    /// <summary>
    /// Maps a whitespace-separated phoneme string to token ids (indices into
    /// <see cref="phonemes"/>). Symbols not in the table map to 0.
    /// </summary>
    int[] GetTokens(string ptext)
    {
        // Drop empty fragments so leading, trailing or doubled spaces do not
        // turn into spurious id-0 tokens.
        string[] symbols = ptext.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
        var tokens = new int[symbols.Length];
        for (int i = 0; i < tokens.Length; i++)
        {
            tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, symbols[i]));
        }
        return tokens;
    }

    /// <summary>
    /// Runs the model on the given phoneme string, stores the "wav" output in a new
    /// <see cref="AudioClip"/> and plays it.
    /// </summary>
    /// <param name="ptext">Space-separated phoneme symbols.</param>
    public void DoInference(string ptext)
    {
        int[] tokens = GetTokens(ptext);
        if (tokens.Length == 0)
        {
            Debug.LogWarning("No phonemes to speak.");
            return;
        }

        using var input = new TensorInt(new TensorShape(tokens.Length), tokens);
        var result = engine.Execute(input);

        var output = result.PeekOutput("wav") as TensorFloat;
        if (output == null)
        {
            Debug.LogError("Model output \"wav\" is missing or is not a float tensor.");
            return;
        }
        output.CompleteOperationsAndDownload();
        var samples = output.ToReadOnlyArray();
        if (samples.Length == 0)
        {
            Debug.LogWarning("Model produced no audio samples.");
            return;
        }

        // Float division so clips shorter than a second are not reported as 0.
        Debug.Log($"Audio size = {(float)samples.Length / samplerate:F2} seconds");

        // Release the previous clip; Unity objects are not reclaimed by the GC.
        if (clip != null)
        {
            Destroy(clip);
        }
        clip = AudioClip.Create("voice audio", samples.Length, 1, samplerate, false);
        clip.SetData(samples, 0);

        Speak();
    }

    /// <summary>
    /// Plays <see cref="clip"/> on this GameObject's AudioSource, if it has one.
    /// </summary>
    private void Speak()
    {
        AudioSource audioSource = GetComponent<AudioSource>();
        if (audioSource != null)
        {
            audioSource.clip = clip;
            audioSource.Play();
        }
        else
        {
            Debug.Log("There is no audio source");
        }
    }

    void Update()
    {
        // Space re-runs the whole text-to-speech pipeline.
        if (Input.GetKeyDown(KeyCode.Space))
        {
            TextToSpeech();
        }
    }

    private void OnDestroy()
    {
        engine?.Dispose();
        if (clip != null)
        {
            Destroy(clip);
        }
    }
}