csukuangfj committed on
Commit
1265f08
1 Parent(s): 963063e

update model

Browse files
app.js ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// DOM elements this page script reads from or writes to.
const generateBtn = document.getElementById('generateBtn');
const hint = document.getElementById('hint');
const speakerIdLabel = document.getElementById('speakerIdLabel');
const speakerIdInput = document.getElementById('speakerId');
const speedInput = document.getElementById('speed');
const speedValue = document.getElementById('speedValue');
const textArea = document.getElementById('text');
const soundClips = document.getElementById('sound-clips');

// Show the slider's initial value next to it (plain text, no markup).
speedValue.textContent = speedInput.value;

// Running counter used to number the generated clips.
let index = 0;

// TTS engine; stays null until Module.onRuntimeInitialized has run.
let tts = null;

// Audio context for playback; created on the first Generate click.
let audioCtx = null;

// Declared explicitly instead of assigning to an undeclared name, which only
// works in sloppy mode. `var` (not `let`/`const`) is deliberate: presumably the
// wasm glue script loaded after this file re-declares `Module` with `var`,
// which a lexical declaration here would turn into a redeclaration error
// -- TODO confirm against sherpa-onnx-wasm-main.js.
var Module = {};
Module.onRuntimeInitialized = function() {
  console.log('Model files downloaded!');

  console.log('Initializing tts ......');
  tts = initSherpaOnnxOfflineTts();
  if (tts.numSpeakers > 1) {
    // Tell the user which speaker ids are valid for a multi-speaker model.
    speakerIdLabel.textContent = `Speaker ID (0 - ${tts.numSpeakers - 1}):`;
  }

  hint.innerText =
      'Initialized! Please enter text and click the Generate button.';

  // The button is disabled in the markup until the engine is ready.
  generateBtn.disabled = false;
};

// Mirror the slider position into the label next to it.
speedInput.oninput = function() {
  speedValue.textContent = this.value;
};
41
+
42
// Click handler for the Generate button: validates the speaker id and text,
// synthesizes audio with the TTS engine, plays it immediately and appends a
// replayable clip to the page via createAudioTag().
generateBtn.onclick = function() {
  // Trim first so that surrounding whitespace is not reported as "not a number".
  const rawSpeakerId = speakerIdInput.value.trim();
  if (rawSpeakerId.length === 0) {
    alert('Please input a speakerId');
    return;
  }

  const maxSpeakerId = tts.numSpeakers - 1;
  if (!/^\d+$/.test(rawSpeakerId)) {
    alert(`Input speakerID ${
        rawSpeakerId} is not a number.\nPlease enter a number between 0 and ${
        maxSpeakerId}`);
    return;
  }

  const speakerId = parseInt(rawSpeakerId, 10);
  if (speakerId > maxSpeakerId) {
    alert(`Please enter a number between 0 and ${maxSpeakerId}`);
    return;
  }

  const text = textArea.value.trim();
  if (text.length === 0) {
    alert('Please input a non-blank text');
    return;
  }

  // Input values are strings; hand the engine a real number.
  const speed = Number(speedInput.value);

  console.log('speakerId', speakerId);
  console.log('speed', speed);
  console.log('text', text);

  const audio = tts.generate({text: text, sid: speakerId, speed: speed});

  console.log(audio.samples.length, audio.sampleRate);

  // createBuffer() rejects a zero-length buffer, so bail out before playback
  // instead of letting the handler die with an uncaught exception.
  if (audio.samples.length === 0) {
    alert('No audio was generated for the given text');
    return;
  }

  if (!audioCtx) {
    audioCtx = new AudioContext({sampleRate: tts.sampleRate});
  }

  // Describe the buffer with the sample rate reported for this clip.
  const buffer =
      audioCtx.createBuffer(1, audio.samples.length, audio.sampleRate);
  buffer.getChannelData(0).set(audio.samples);

  const source = audioCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(audioCtx.destination);
  source.start();

  createAudioTag(audio);
};
93
+
94
// Appends a playable clip (audio player, label and Delete button) for the
// generated audio to the #sound-clips container.
//
// generateAudio: {samples, sampleRate} as returned by tts.generate().
//
// The clip is labelled with the running clip index followed by the first 100
// characters of the current text box content. Clicking the label lets the user
// rename the clip; clicking Delete removes it and releases its blob URL.
function createAudioTag(generateAudio) {
  const blob = toWav(generateAudio.samples, generateAudio.sampleRate);

  const text = textArea.value.trim().substring(0, 100);
  const clipName = `${index} ${text} ...`;
  index += 1;

  const clipContainer = document.createElement('article');
  const clipLabel = document.createElement('p');
  const audio = document.createElement('audio');
  const deleteButton = document.createElement('button');

  clipContainer.classList.add('clip');
  audio.controls = true;
  deleteButton.textContent = 'Delete';
  deleteButton.className = 'delete';
  clipLabel.textContent = clipName;

  clipContainer.appendChild(audio);
  clipContainer.appendChild(clipLabel);
  clipContainer.appendChild(deleteButton);
  soundClips.appendChild(clipContainer);

  const audioURL = window.URL.createObjectURL(blob);
  audio.src = audioURL;

  deleteButton.onclick = function() {
    // Remove the clip first, then release the blob behind the object URL;
    // without the revoke every deleted clip would stay in memory until the
    // page is closed.
    clipContainer.remove();
    window.URL.revokeObjectURL(audioURL);
  };

  clipLabel.onclick = function() {
    const newClipName = prompt('Enter a new name for your sound clip?');
    // prompt() returns null when the dialog is cancelled: keep the old name.
    if (newClipName !== null) {
      clipLabel.textContent = newClipName;
    }
  };
}
138
+
139
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
//
// Encodes mono float samples as a 16-bit PCM WAV file.
//
// floatSamples: array-like of numbers; values outside [-1, 1] are clamped.
// sampleRate:   samples per second, written into the header.
//
// Returns a Blob of type 'audio/wav' holding a 44-byte header followed by the
// little-endian 16-bit samples.
function toWav(floatSamples, sampleRate) {
  const numSamples = floatSamples.length;
  const bytesPerSample = 2;
  const headerSize = 44;
  const dataSize = numSamples * bytesPerSample;

  const buf = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(buf);

  // http://soundfile.sapp.org/doc/WaveFormat/
  // The four-character tags are written as little-endian integers, so the
  // constants read back-to-front.
  //                   F F I R
  view.setUint32(0, 0x46464952, true);            // chunkID
  view.setUint32(4, 36 + dataSize, true);         // chunkSize
  //                   E V A W
  view.setUint32(8, 0x45564157, true);            // format
  //                      t m f
  view.setUint32(12, 0x20746d66, true);           // subchunk1ID
  view.setUint32(16, 16, true);                   // subchunk1Size, 16 for PCM
  // audioFormat is a 16-bit field; a 32-bit write here would spill into
  // numChannels.
  view.setUint16(20, 1, true);                    // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);                    // numChannels: 1 channel
  view.setUint32(24, sampleRate, true);           // sampleRate
  view.setUint32(28, sampleRate * bytesPerSample, true);  // byteRate
  view.setUint16(32, bytesPerSample, true);       // blockAlign
  view.setUint16(34, 16, true);                   // bitsPerSample
  //                   a t a d
  view.setUint32(36, 0x61746164, true);           // Subchunk2ID
  view.setUint32(40, dataSize, true);             // subchunk2Size

  let offset = headerSize;
  for (let i = 0; i < numSamples; ++i) {
    // Clamp to [-1, 1] and scale; setInt16 drops the fractional part.
    const clamped = Math.max(-1, Math.min(1, floatSamples[i]));
    view.setInt16(offset, clamped * 32767, true);
    offset += bytesPerSample;
  }

  return new Blob([view], {type: 'audio/wav'});
}
index.html CHANGED
@@ -1,19 +1,46 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    Next-gen Kaldi + WebAssembly<br/>
    Text-to-speech Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
  </h1>
  <div>
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
    <input type="text" id="speakerId" name="speakerId" value="0" />
    <br/>
    <br/>
    <label for="speed" id="speedLabel">Speed: </label>
    <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
    <span id="speedValue"></span>
    <br/>
    <br/>
    <textarea id="text" rows="10" placeholder="Please enter your text here and click the Generate button"></textarea>
    <br/>
    <br/>
    <!-- Enabled by app.js once the TTS engine has been initialized. -->
    <button id="generateBtn" disabled>Generate</button>
  </div>
  <!-- Generated clips are appended here by app.js. -->
  <section id="sound-clips" style="flex: 1; overflow: auto;">
  </section>

  <!-- app.js defines the global Module object and must stay ahead of the wasm
       glue script, which presumably reads it on load (TODO confirm). -->
  <script src="app.js"></script>
  <script src="sherpa-onnx.js"></script>
  <script src="sherpa-onnx-wasm-main.js"></script>
</body>

</html>
sherpa-onnx-wasm-main.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a3983e944453a7e940c9f91101a6d4aa650147b3c7b978001d829a0ce62492b
3
+ size 96574544
sherpa-onnx-wasm-main.js ADDED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main.wasm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0c1eadb865c1f26ec7c665138656b2444409d55e5a3365b7b76d80059e71113
3
+ size 11462957
sherpa-onnx.js ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
// Releases the heap memory held by a config descriptor produced by one of the
// initSherpaOnnx*Config helpers: its optional string `buffer`, then any nested
// descriptor stored under `config` (recursively), and finally its own `ptr`.
function freeConfig(config) {
  const ownsBuffer = 'buffer' in config;
  const hasNested = 'config' in config;

  if (ownsBuffer) {
    _free(config.buffer);
  }

  if (hasNested) {
    freeConfig(config.config);
  }

  _free(config.ptr);
}
13
+
14
// Writes the VITS model config into freshly allocated heap memory.
//
// The four path strings (model, lexicon, tokens, dataDir) are stored back to
// back, each NUL-terminated, in one `buffer`. `ptr` is a 7 * 4 byte record:
// four pointers into `buffer` followed by the floats noiseScale, noiseScaleW
// and lengthScale.
//
// The user should free the returned pointers (`buffer` and `ptr`).
function initSherpaOnnxOfflineTtsVitsModelConfig(config) {
  const paths = [config.model, config.lexicon, config.tokens, config.dataDir];
  const sizes = paths.map((path) => lengthBytesUTF8(path) + 1);
  const totalSize = sizes.reduce((sum, size) => sum + size, 0);

  const buffer = _malloc(totalSize);

  const len = 7 * 4;
  const ptr = _malloc(len);

  // Copy each string and record where it landed in the matching pointer slot.
  let cursor = buffer;
  paths.forEach((path, slot) => {
    stringToUTF8(path, cursor, sizes[slot]);
    setValue(ptr + 4 * slot, cursor, 'i8*');
    cursor += sizes[slot];
  });

  const scales = [config.noiseScale, config.noiseScaleW, config.lengthScale];
  scales.forEach((scale, slot) => {
    setValue(ptr + 16 + 4 * slot, scale, 'float');
  });

  return {buffer: buffer, ptr: ptr, len: len};
}
62
+
63
// Writes the model config into heap memory: a copy of the VITS record followed
// by numThreads (i32), debug (i32) and a pointer to the provider string.
//
// Returns {buffer, ptr, len, config}: `buffer` holds the provider string,
// `ptr`/`len` describe the new record, and `config` is the nested VITS
// descriptor so that freeConfig() can release it as well.
function initSherpaOnnxOfflineTtsModelConfig(config) {
  const vits =
      initSherpaOnnxOfflineTtsVitsModelConfig(config.offlineTtsVitsModelConfig);

  const len = vits.len + 3 * 4;
  const ptr = _malloc(len);

  // Field addresses inside the new record.
  const numThreadsAddr = ptr + vits.len;
  const debugAddr = numThreadsAddr + 4;
  const providerAddr = debugAddr + 4;

  _CopyHeap(vits.ptr, vits.len, ptr);
  setValue(numThreadsAddr, config.numThreads, 'i32');
  setValue(debugAddr, config.debug, 'i32');

  const providerSize = lengthBytesUTF8(config.provider) + 1;
  const buffer = _malloc(providerSize);
  stringToUTF8(config.provider, buffer, providerSize);
  setValue(providerAddr, buffer, 'i8*');

  return {buffer: buffer, ptr: ptr, len: len, config: vits};
}
89
+
90
// Writes the top-level TTS config into heap memory: a copy of the model
// record followed by a pointer to the ruleFsts string and maxNumSentences
// (i32).
//
// Returns {buffer, ptr, len, config}: `buffer` holds the ruleFsts string,
// `ptr`/`len` describe the new record, and `config` is the nested model
// descriptor so that freeConfig() can release the whole chain.
function initSherpaOnnxOfflineTtsConfig(config) {
  const model =
      initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig);

  const len = model.len + 2 * 4;
  const ptr = _malloc(len);

  // Field addresses inside the new record.
  const ruleFstsAddr = ptr + model.len;
  const maxNumSentencesAddr = ruleFstsAddr + 4;

  _CopyHeap(model.ptr, model.len, ptr);

  const ruleFstsSize = lengthBytesUTF8(config.ruleFsts) + 1;
  const buffer = _malloc(ruleFstsSize);
  stringToUTF8(config.ruleFsts, buffer, ruleFstsSize);
  setValue(ruleFstsAddr, buffer, 'i8*');

  setValue(maxNumSentencesAddr, config.maxNumSentences, 'i32');

  return {buffer: buffer, ptr: ptr, len: len, config: model};
}
112
+
113
// Thin JavaScript wrapper around the offline TTS functions exported by the
// wasm module.
class OfflineTts {
  // configObj: plain object in the shape built by initSherpaOnnxOfflineTts().
  // It is marshalled into heap memory only for the duration of the create
  // call and released again right after.
  constructor(configObj) {
    const config = initSherpaOnnxOfflineTtsConfig(configObj);
    const handle = _SherpaOnnxCreateOfflineTts(config.ptr);

    freeConfig(config);

    this.handle = handle;
    this.sampleRate = _SherpaOnnxOfflineTtsSampleRate(this.handle);
    this.numSpeakers = _SherpaOnnxOfflineTtsNumSpeakers(this.handle);
  }

  // Destroys the native TTS object. The instance must not be used afterwards.
  free() {
    _SherpaOnnxDestroyOfflineTts(this.handle);
    this.handle = 0;
  }

  // Synthesizes speech for the given request, e.g.
  // {
  //   text: "hello",
  //   sid: 1,
  //   speed: 1.0
  // }
  //
  // Returns {samples: Float32Array, sampleRate: number}. The samples are
  // copied out of the wasm heap, so they stay valid after the native result
  // has been destroyed.
  generate(config) {
    const textLen = lengthBytesUTF8(config.text) + 1;
    const textPtr = _malloc(textLen);

    let h;
    try {
      stringToUTF8(config.text, textPtr, textLen);
      h = _SherpaOnnxOfflineTtsGenerate(
          this.handle, textPtr, config.sid, config.speed);
    } finally {
      // The text buffer is only needed for the call itself; without this
      // every generate() call leaked textLen bytes of wasm heap.
      _free(textPtr);
    }

    // `h` points at a record of three 32-bit words:
    // [pointer to float samples, number of samples, sample rate].
    const base = h / 4;
    const numSamples = HEAP32[base + 1];
    const sampleRate = HEAP32[base + 2];

    // Convert the byte address of the samples into a HEAPF32 index.
    const firstSample = HEAP32[base] / 4;
    const samples = HEAPF32.slice(firstSample, firstSample + numSamples);

    _SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
    return {samples: samples, sampleRate: sampleRate};
  }
}
156
+
157
// Builds an OfflineTts instance with this demo's fixed configuration: a
// single-threaded CPU VITS model read from ./model.onnx, ./tokens.txt and
// ./espeak-ng-data, with no lexicon and no rule FSTs.
function initSherpaOnnxOfflineTts() {
  return new OfflineTts({
    offlineTtsModelConfig: {
      offlineTtsVitsModelConfig: {
        model: './model.onnx',
        lexicon: '',
        tokens: './tokens.txt',
        dataDir: './espeak-ng-data',
        noiseScale: 0.667,
        noiseScaleW: 0.8,
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    ruleFsts: '',
    maxNumSentences: 1,
  });
}