seathrun committed on
Commit
c78c2c3
·
1 Parent(s): ab90fa5

auto-speak is in!!

Browse files
Files changed (1) hide show
  1. app.py +185 -131
app.py CHANGED
@@ -8,10 +8,10 @@ Globals!
8
  shortcut_js = """
9
  <script>
10
  function shortcuts(e) {
11
- var event = document.all ? window.event : e;
12
  if (e.keyCode == 32) {
13
- rec = document.getElementsByClassName("record-button")[0];
14
- end = document.getElementsByClassName("stop-button")[0];
15
  if (rec) {
16
  rec.click();
17
  }
@@ -24,9 +24,67 @@ shortcut_js = """
24
  }
25
  }
26
  document.addEventListener("keypress", shortcuts, false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  </script>
28
  """
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def get_html(input_text):
31
  inner_html = """
32
  <!DOCTYPE html>
@@ -125,141 +183,134 @@ def get_html(input_text):
125
 
126
  <body>
127
  <h1>Speech System</h1>
128
-
129
- <p></p>
130
 
131
  <form>
132
- <label for="txt"> </label>
133
- <input id="txt" type="text" class="txt" value="SOME_DEFAULT_VALUE" readonly />
134
- <div>
135
- <label for="rate">Rate</label
136
- ><input type="range" min="0.5" max="2" value="1" step="0.1" id="rate" />
137
- <div class="rate-value">1</div>
138
- <div class="clearfix"></div>
139
- </div>
140
- <div>
141
- <label for="pitch">Pitch</label
142
- ><input type="range" min="0" max="2" value="1" step="0.1" id="pitch" />
143
- <div class="pitch-value">1</div>
144
- <div class="clearfix"></div>
145
- </div>
146
- <select></select>
147
- <div class="controls">
148
- <button id="play" type="submit">Play</button>
149
- </div>
150
  </form>
151
 
152
  <script>
153
- const synth = window.speechSynthesis;
154
- const inputForm = document.querySelector("form");
155
- const inputTxt = document.querySelector(".txt");
156
- const voiceSelect = document.querySelector("select");
157
- const pitch = document.querySelector("#pitch");
158
- const pitchValue = document.querySelector(".pitch-value");
159
- const rate = document.querySelector("#rate");
160
- const rateValue = document.querySelector(".rate-value");
161
- let voices = [];
162
-
163
- function populateVoiceList() {
164
- voices = synth.getVoices().sort(function (a, b) {
165
- const aname = a.name.toUpperCase();
166
- const bname = b.name.toUpperCase();
167
-
168
- if (aname < bname) {
169
- return -1;
170
- } else if (aname == bname) {
171
- return 0;
172
- } else {
173
- return +1;
174
- }
175
- });
176
- const selectedIndex =
177
- voiceSelect.selectedIndex < 0 ? 0 : voiceSelect.selectedIndex;
178
- voiceSelect.innerHTML = "";
179
-
180
- for (let i = 0; i < voices.length; i++) {
181
- if (voices[i].name === "Microsoft Sean - English (Ireland)") {
182
- const option = document.createElement("option");
183
- option.textContent = `${voices[i].name} (${voices[i].lang})`;
184
-
185
- if (voices[i].default) {
186
- option.textContent += " -- DEFAULT";
 
 
 
 
 
 
187
  }
188
-
189
- option.setAttribute("data-lang", voices[i].lang);
190
- option.setAttribute("data-name", voices[i].name);
191
- voiceSelect.appendChild(option);
192
  }
193
- }
194
- voiceSelect.selectedIndex = selectedIndex;
195
- }
196
 
197
- populateVoiceList();
198
 
199
- if (speechSynthesis.onvoiceschanged !== undefined) {
200
- speechSynthesis.onvoiceschanged = populateVoiceList;
201
- }
202
-
203
- function speak() {
204
- if (synth.speaking) {
205
- console.error("speechSynthesis.speaking");
206
- return;
207
  }
208
 
209
- if (inputTxt.value !== "") {
210
- const utterThis = new SpeechSynthesisUtterance(inputTxt.value);
211
-
212
- utterThis.onend = function (event) {
213
- console.log("SpeechSynthesisUtterance.onend");
214
- };
215
-
216
- utterThis.onerror = function (event) {
217
- console.error("SpeechSynthesisUtterance.onerror");
218
- };
219
-
220
- const selectedOption =
221
- voiceSelect.selectedOptions[0].getAttribute("data-name");
222
 
223
- for (let i = 0; i < voices.length; i++) {
224
- if (voices[i].name === selectedOption) {
225
- utterThis.voice = voices[i];
226
- break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  }
228
  }
229
- utterThis.pitch = pitch.value;
230
- utterThis.rate = rate.value;
231
- synth.speak(utterThis);
232
- }
233
- }
234
-
235
- inputForm.onsubmit = function (event) {
236
- event.preventDefault();
237
-
238
- speak();
239
 
240
- inputTxt.blur();
241
- };
 
 
 
242
 
243
- pitch.onchange = function () {
244
- pitchValue.textContent = pitch.value;
245
- };
246
 
247
- rate.onchange = function () {
248
- rateValue.textContent = rate.value;
249
- };
250
 
251
- voiceSelect.onchange = function () {
252
- speak();
253
- };
254
  </script>
255
  </body>
256
  </html>
257
  """
258
- input_text = input_text.replace("'", "")
259
- input_text = input_text.replace('"', '')
260
  inner_html = inner_html.replace("SOME_DEFAULT_VALUE", input_text)
261
  html = f"""
262
- <iframe srcdoc='{inner_html}' frameBorder="0" height="450" width="100%" title="Speaker" allow="autoplay; fullscreen; layout-animations; microphone" sandbox="allow-forms allow-modals allow-popups allow-popups-to-escape-sandbox allow-same-origin allow-scripts allow-downloads"></iframe>
263
  """
264
  return html
265
 
@@ -271,23 +322,26 @@ transcriber = pipeline(
271
  )
272
 
273
  def transcribe(stream, new_chunk):
274
- sr, y = new_chunk
275
- y = y.astype(np.float32)
276
- y /= np.max(np.abs(y))
277
-
278
- if stream is not None:
279
- stream = np.concatenate([stream, y])
280
- else:
281
- stream = y
282
-
283
- text = transcriber(
284
- {
285
- "sampling_rate": sr,
286
- "raw": stream
287
- }
288
- )["text"]
289
- html = get_html(text)
290
- return stream, html
 
 
 
291
 
292
  def run_demo():
293
  demo = gr.Interface(
 
8
  shortcut_js = """
9
  <script>
10
  function shortcuts(e) {
11
+ let event = document.all ? window.event : e;
12
  if (e.keyCode == 32) {
13
+ let rec = document.getElementsByClassName("record-button")[0];
14
+ let end = document.getElementsByClassName("stop-button")[0];
15
  if (rec) {
16
  rec.click();
17
  }
 
24
  }
25
  }
26
  document.addEventListener("keypress", shortcuts, false);
27
+
28
+ let has_spoken = false;
29
+ function repeat() {
30
+ let iframe_handle = document.getElementById("inner-iframe");
31
+ console.log(iframe_handle);
32
+ if (iframe_handle)
33
+ {
34
+ let inner_doc = iframe_handle.contentDocument || iframe_handle.contentWindow.document;
35
+ if (inner_doc)
36
+ {
37
+ let speak_button = inner_doc.getElementById("play");
38
+ if (speak_button) {
39
+ speak_button.click();
40
+ has_spoken = true;
41
+ }
42
+ }
43
+ }
44
+ if (!has_spoken) {
45
+ setTimeout(repeat, 1000);
46
+ }
47
+ }
48
+ repeat();
49
  </script>
50
  """
51
 
def fix_fmt(input_text):
    """Make transcribed text safe to embed in quoted HTML attributes.

    The result contains no straight single or double quotes. Before the
    quotes are stripped, pronoun contractions are spelled out, because
    several of them turn into a different word once the apostrophe is gone
    ("we're" -> "were", "she'd" -> "shed", "I'll" -> "Ill").

    Args:
        input_text: The text to clean up.

    Returns:
        The text with pronoun contractions expanded and every ``'`` and
        ``"`` removed.
    """
    import re

    # Suffix -> spoken expansion.  "'d" is read as "would" and "'s" is left
    # alone on purpose: both are ambiguous ("had" / "is" / "has").
    expansions = {
        "m": "am",
        "ll": "will",
        "d": "would",
        "re": "are",
        "ve": "have",
    }

    def expand(match):
        pronoun, suffix = match.group(1), match.group(2)
        words = expansions[suffix.lower()]
        # Keep shouted text shouted: "WE'LL" -> "WE WILL".
        if suffix.isupper():
            words = words.upper()
        return f"{pronoun} {words}"

    # Word boundaries stop "he'd" from matching inside longer words, and the
    # typographic apostrophe is accepted alongside the straight one.
    input_text = re.sub(
        r"\b(I|you|he|she|it|we|they)['\u2019](m|ll|d|re|ve)\b",
        expand,
        input_text,
        flags=re.IGNORECASE,
    )

    # Most other contractions still read acceptably without the apostrophe,
    # so the remaining quotes are simply dropped.
    return input_text.translate(str.maketrans("", "", "'\""))
87
+
88
  def get_html(input_text):
89
  inner_html = """
90
  <!DOCTYPE html>
 
183
 
184
  <body>
185
  <h1>Speech System</h1>
 
 
186
 
187
  <form>
188
+ <label for="txt"> </label>
189
+ <input id="txt" type="text" class="txt" value="SOME_DEFAULT_VALUE" readonly />
190
+ <div>
191
+ <label for="rate">Rate</label
192
+ ><input type="range" min="0.5" max="2" value="1" step="0.1" id="rate" />
193
+ <div class="rate-value">1</div>
194
+ <div class="clearfix"></div>
195
+ </div>
196
+ <div>
197
+ <label for="pitch">Pitch</label
198
+ ><input type="range" min="0" max="2" value="1" step="0.1" id="pitch" />
199
+ <div class="pitch-value">1</div>
200
+ <div class="clearfix"></div>
201
+ </div>
202
+ <select></select>
203
+ <div class="controls">
204
+ <button id="play" type="submit">Play</button>
205
+ </div>
206
  </form>
207
 
208
  <script>
209
+ const synth = window.speechSynthesis;
210
+ const inputForm = document.querySelector("form");
211
+ const inputTxt = document.querySelector(".txt");
212
+ const voiceSelect = document.querySelector("select");
213
+ const pitch = document.querySelector("#pitch");
214
+ const pitchValue = document.querySelector(".pitch-value");
215
+ const rate = document.querySelector("#rate");
216
+ const rateValue = document.querySelector(".rate-value");
217
+ let voices = [];
218
+
219
+ function populateVoiceList() {
220
+ voices = synth.getVoices().sort(function (a, b) {
221
+ const aname = a.name.toUpperCase();
222
+ const bname = b.name.toUpperCase();
223
+
224
+ if (aname < bname) {
225
+ return -1;
226
+ } else if (aname == bname) {
227
+ return 0;
228
+ } else {
229
+ return +1;
230
+ }
231
+ });
232
+ const selectedIndex =
233
+ voiceSelect.selectedIndex < 0 ? 0 : voiceSelect.selectedIndex;
234
+ voiceSelect.innerHTML = "";
235
+
236
+ for (let i = 0; i < voices.length; i++) {
237
+ if (voices[i].name === "Microsoft Sean - English (Ireland)") {
238
+ const option = document.createElement("option");
239
+ option.textContent = `${voices[i].name} (${voices[i].lang})`;
240
+
241
+ if (voices[i].default) {
242
+ option.textContent += " -- DEFAULT";
243
+ }
244
+
245
+ option.setAttribute("data-lang", voices[i].lang);
246
+ option.setAttribute("data-name", voices[i].name);
247
+ voiceSelect.appendChild(option);
248
+ }
249
  }
250
+ voiceSelect.selectedIndex = selectedIndex;
 
 
 
251
  }
 
 
 
252
 
253
+ populateVoiceList();
254
 
255
+ if (speechSynthesis.onvoiceschanged !== undefined) {
256
+ speechSynthesis.onvoiceschanged = populateVoiceList;
 
 
 
 
 
 
257
  }
258
 
259
+ function speak() {
260
+ if (synth.speaking) {
261
+ console.error("speechSynthesis.speaking");
262
+ return;
263
+ }
 
 
 
 
 
 
 
 
264
 
265
+ if (inputTxt.value !== "") {
266
+ const utterThis = new SpeechSynthesisUtterance(inputTxt.value);
267
+
268
+ utterThis.onend = function (event) {
269
+ console.log("SpeechSynthesisUtterance.onend");
270
+ };
271
+ utterThis.onerror = function (event) {
272
+ console.error("SpeechSynthesisUtterance.onerror");
273
+ };
274
+
275
+ const selectedOption =
276
+ voiceSelect.selectedOptions[0].getAttribute("data-name");
277
+ for (let i = 0; i < voices.length; i++) {
278
+ if (voices[i].name === selectedOption) {
279
+ utterThis.voice = voices[i];
280
+ break;
281
+ }
282
+ }
283
+ utterThis.pitch = pitch.value;
284
+ utterThis.rate = rate.value;
285
+ synth.speak(utterThis);
286
  }
287
  }
 
 
 
 
 
 
 
 
 
 
288
 
289
+ inputForm.onsubmit = function (event) {
290
+ event.preventDefault();
291
+ speak();
292
+ inputTxt.blur();
293
+ };
294
 
295
+ pitch.onchange = function () {
296
+ pitchValue.textContent = pitch.value;
297
+ };
298
 
299
+ rate.onchange = function () {
300
+ rateValue.textContent = rate.value;
301
+ };
302
 
303
+ voiceSelect.onchange = function () {
304
+ speak();
305
+ };
306
  </script>
307
  </body>
308
  </html>
309
  """
310
+ input_text = fix_fmt(input_text)
 
311
  inner_html = inner_html.replace("SOME_DEFAULT_VALUE", input_text)
312
  html = f"""
313
+ <iframe srcdoc='{inner_html}' id="inner-iframe" frameBorder="0" height="450" width="100%" title="Speaker" allow="autoplay" sandbox="allow-forms allow-same-origin allow-scripts"></iframe>
314
  """
315
  return html
316
 
 
322
  )
323
 
def transcribe(stream, new_chunk):
    """Append a new audio chunk to the running stream and transcribe it.

    Args:
        stream: Audio accumulated by earlier calls, or ``None`` on the
            first call.
        new_chunk: A ``(sampling_rate, samples)`` pair.  ``samples`` must
            provide ``astype`` (presumably a NumPy array coming from the
            audio input -- confirm against the caller).

    Returns:
        ``(stream, html)``: the extended stream and the markup built by
        ``get_html`` from the transcript of the whole stream.  If anything
        fails, ``(None, None)`` is returned, which also drops the audio
        accumulated so far.
    """
    import logging

    try:
        sr, y = new_chunk
        y = y.astype(np.float32)

        # Peak-normalise, but leave silent or empty chunks untouched:
        # dividing by a zero peak would fill the stream with NaNs, and
        # np.max raises on an empty array.
        peak = np.max(np.abs(y)) if y.size else 0.0
        if peak > 0:
            y /= peak

        stream = y if stream is None else np.concatenate([stream, y])

        text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
        return stream, get_html(text)
    except Exception:
        # Deliberately best-effort: a bad chunk resets the stream instead of
        # crashing the demo.  The failure is logged rather than hidden, and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        logging.getLogger(__name__).exception(
            "transcribe failed; resetting stream"
        )
        return None, None
 
346
  def run_demo():
347
  demo = gr.Interface(