Spaces:

seathrun
/

astt

Sleeping

App Files Files Community

seathrun committed on May 18, 2024

Commit

c78c2c3

1 Parent(s): ab90fa5

auto-speak is in!!

Browse files

Files changed (1) hide show

app.py +185 -131

app.py CHANGED Viewed

@@ -8,10 +8,10 @@ Globals!
 shortcut_js = """
 <script>
     function shortcuts(e) {
-        var event = document.all ? window.event : e;
         if (e.keyCode == 32) {
-            rec = document.getElementsByClassName("record-button")[0];
-            end = document.getElementsByClassName("stop-button")[0];
             if (rec) {
                 rec.click();
             }
@@ -24,9 +24,67 @@ shortcut_js = """
         }
     }
     document.addEventListener("keypress", shortcuts, false);
 </script>
 """
 def get_html(input_text):
     inner_html = """
         <!DOCTYPE html>
@@ -125,141 +183,134 @@ def get_html(input_text):
         <body>
             <h1>Speech System</h1>
-            <p></p>
             <form>
-            <label for="txt"> </label>
-            <input id="txt" type="text" class="txt" value="SOME_DEFAULT_VALUE" readonly />
-            <div>
-                <label for="rate">Rate</label
-                ><input type="range" min="0.5" max="2" value="1" step="0.1" id="rate" />
-                <div class="rate-value">1</div>
-                <div class="clearfix"></div>
-            </div>
-            <div>
-                <label for="pitch">Pitch</label
-                ><input type="range" min="0" max="2" value="1" step="0.1" id="pitch" />
-                <div class="pitch-value">1</div>
-                <div class="clearfix"></div>
-            </div>
-            <select></select>
-            <div class="controls">
-                <button id="play" type="submit">Play</button>
-            </div>
             </form>
             <script>
-            const synth = window.speechSynthesis;
-            const inputForm = document.querySelector("form");
-            const inputTxt = document.querySelector(".txt");
-            const voiceSelect = document.querySelector("select");
-            const pitch = document.querySelector("#pitch");
-            const pitchValue = document.querySelector(".pitch-value");
-            const rate = document.querySelector("#rate");
-            const rateValue = document.querySelector(".rate-value");
-            let voices = [];
-            function populateVoiceList() {
-                voices = synth.getVoices().sort(function (a, b) {
-                const aname = a.name.toUpperCase();
-                const bname = b.name.toUpperCase();
-                if (aname < bname) {
-                    return -1;
-                } else if (aname == bname) {
-                    return 0;
-                } else {
-                    return +1;
-                }
-                });
-                const selectedIndex =
-                voiceSelect.selectedIndex < 0 ? 0 : voiceSelect.selectedIndex;
-                voiceSelect.innerHTML = "";
-                for (let i = 0; i < voices.length; i++) {
-                if (voices[i].name === "Microsoft Sean - English (Ireland)") {
-                    const option = document.createElement("option");
-                    option.textContent = `${voices[i].name} (${voices[i].lang})`;
-                    if (voices[i].default) {
-                        option.textContent += " -- DEFAULT";
                     }
-                    option.setAttribute("data-lang", voices[i].lang);
-                    option.setAttribute("data-name", voices[i].name);
-                    voiceSelect.appendChild(option);
                 }
-                }
-                voiceSelect.selectedIndex = selectedIndex;
-            }
-            populateVoiceList();
-            if (speechSynthesis.onvoiceschanged !== undefined) {
-                speechSynthesis.onvoiceschanged = populateVoiceList;
-            }
-            function speak() {
-                if (synth.speaking) {
-                console.error("speechSynthesis.speaking");
-                return;
                 }
-                if (inputTxt.value !== "") {
-                const utterThis = new SpeechSynthesisUtterance(inputTxt.value);
-                utterThis.onend = function (event) {
-                    console.log("SpeechSynthesisUtterance.onend");
-                };
-                utterThis.onerror = function (event) {
-                    console.error("SpeechSynthesisUtterance.onerror");
-                };
-                const selectedOption =
-                    voiceSelect.selectedOptions[0].getAttribute("data-name");
-                for (let i = 0; i < voices.length; i++) {
-                    if (voices[i].name === selectedOption) {
-                    utterThis.voice = voices[i];
-                    break;
                     }
                 }
-                utterThis.pitch = pitch.value;
-                utterThis.rate = rate.value;
-                synth.speak(utterThis);
-                }
-            }
-            inputForm.onsubmit = function (event) {
-                event.preventDefault();
-                speak();
-                inputTxt.blur();
-            };
-            pitch.onchange = function () {
-                pitchValue.textContent = pitch.value;
-            };
-            rate.onchange = function () {
-                rateValue.textContent = rate.value;
-            };
-            voiceSelect.onchange = function () {
-                speak();
-            };
             </script>
         </body>
         </html>
     """
-    input_text = input_text.replace("'", "")
-    input_text = input_text.replace('"', '')
     inner_html = inner_html.replace("SOME_DEFAULT_VALUE", input_text)
     html = f"""
-        <iframe srcdoc='{inner_html}' frameBorder="0" height="450" width="100%" title="Speaker" allow="autoplay; fullscreen; layout-animations; microphone" sandbox="allow-forms allow-modals allow-popups allow-popups-to-escape-sandbox allow-same-origin allow-scripts allow-downloads"></iframe>
     """
     return html
@@ -271,23 +322,26 @@ transcriber = pipeline(
 )
 def transcribe(stream, new_chunk):
-    sr, y = new_chunk
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    text = transcriber(
-        {
-            "sampling_rate": sr,
-            "raw": stream
-        }
-    )["text"]
-    html = get_html(text)
-    return stream, html
 def run_demo():
     demo = gr.Interface(

 shortcut_js = """
 <script>
     function shortcuts(e) {
+        let event = document.all ? window.event : e;
         if (e.keyCode == 32) {
+            let rec = document.getElementsByClassName("record-button")[0];
+            let end = document.getElementsByClassName("stop-button")[0];
             if (rec) {
                 rec.click();
             }
         }
     }
     document.addEventListener("keypress", shortcuts, false);
+    let has_spoken = false;
+    function repeat() {
+        let iframe_handle = document.getElementById("inner-iframe");
+        console.log(iframe_handle);
+        if (iframe_handle)
+        {
+            let inner_doc = iframe_handle.contentDocument || iframe_handle.contentWindow.document;
+            if (inner_doc)
+            {
+                let speak_button = inner_doc.getElementById("play");
+                if (speak_button) {
+                    speak_button.click();
+                    has_spoken = true;
+                }
+            }
+        }
+        if (!has_spoken) {
+            setTimeout(repeat, 1000);
+        }
+    }
+    repeat();
 </script>
 """
def fix_fmt(input_text):
    """Return *input_text* with apostrophes and double quotes removed.

    The text is later embedded inside quoted HTML attributes, so neither
    quote character may survive.  Contractions that would read badly once
    the apostrophe is dropped ("I'll" -> "Ill", "we'd" -> "wed") are
    expanded first; every other contraction simply loses its apostrophe.

    Args:
        input_text: The text to sanitize.

    Returns:
        The sanitized text, containing neither ``'`` nor ``"``.
    """
    # Contractions that must be spelled out because we can't keep the
    # apostrophe.  "'d" is always read as "would".
    expansions = {"I'm": "I am", "I'll": "I will", "I'd": "I would"}
    for pronoun in ("you", "he", "she", "they", "we"):
        for subject in (pronoun, pronoun.capitalize()):
            expansions[f"{subject}'d"] = f"{subject} would"
            expansions[f"{subject}'ll"] = f"{subject} will"

    # The replacements never produce an apostrophe, and no pronoun contains
    # the letters of a suffix, so the order of application does not matter.
    for contraction, expanded in expansions.items():
        input_text = input_text.replace(contraction, expanded)

    # Other contractions can be said without the apostrophe, so the remaining
    # quote characters are dropped in a single pass.
    return input_text.translate(str.maketrans("", "", "'\""))
 def get_html(input_text):
     inner_html = """
         <!DOCTYPE html>
         <body>
             <h1>Speech System</h1>
             <form>
+                <label for="txt"> </label>
+                <input id="txt" type="text" class="txt" value="SOME_DEFAULT_VALUE" readonly />
+                <div>
+                    <label for="rate">Rate</label
+                    ><input type="range" min="0.5" max="2" value="1" step="0.1" id="rate" />
+                    <div class="rate-value">1</div>
+                    <div class="clearfix"></div>
+                </div>
+                <div>
+                    <label for="pitch">Pitch</label
+                    ><input type="range" min="0" max="2" value="1" step="0.1" id="pitch" />
+                    <div class="pitch-value">1</div>
+                    <div class="clearfix"></div>
+                </div>
+                <select></select>
+                <div class="controls">
+                    <button id="play" type="submit">Play</button>
+                </div>
             </form>
             <script>
+                const synth = window.speechSynthesis;
+                const inputForm = document.querySelector("form");
+                const inputTxt = document.querySelector(".txt");
+                const voiceSelect = document.querySelector("select");
+                const pitch = document.querySelector("#pitch");
+                const pitchValue = document.querySelector(".pitch-value");
+                const rate = document.querySelector("#rate");
+                const rateValue = document.querySelector(".rate-value");
+                let voices = [];
+                function populateVoiceList() {
+                    voices = synth.getVoices().sort(function (a, b) {
+                        const aname = a.name.toUpperCase();
+                        const bname = b.name.toUpperCase();
+                        if (aname < bname) {
+                            return -1;
+                        } else if (aname == bname) {
+                            return 0;
+                        } else {
+                            return +1;
+                        }
+                    });
+                    const selectedIndex =
+                        voiceSelect.selectedIndex < 0 ? 0 : voiceSelect.selectedIndex;
+                    voiceSelect.innerHTML = "";
+                    for (let i = 0; i < voices.length; i++) {
+                        if (voices[i].name === "Microsoft Sean - English (Ireland)") {
+                            const option = document.createElement("option");
+                            option.textContent = `${voices[i].name} (${voices[i].lang})`;
+                            if (voices[i].default) {
+                                option.textContent += " -- DEFAULT";
+                            }
+                            option.setAttribute("data-lang", voices[i].lang);
+                            option.setAttribute("data-name", voices[i].name);
+                            voiceSelect.appendChild(option);
+                        }
                     }
+                    voiceSelect.selectedIndex = selectedIndex;
                 }
+                populateVoiceList();
+                if (speechSynthesis.onvoiceschanged !== undefined) {
+                    speechSynthesis.onvoiceschanged = populateVoiceList;
                 }
+                function speak() {
+                    if (synth.speaking) {
+                        console.error("speechSynthesis.speaking");
+                        return;
+                    }
+                    if (inputTxt.value !== "") {
+                        const utterThis = new SpeechSynthesisUtterance(inputTxt.value);
+                        utterThis.onend = function (event) {
+                            console.log("SpeechSynthesisUtterance.onend");
+                        };
+                        utterThis.onerror = function (event) {
+                            console.error("SpeechSynthesisUtterance.onerror");
+                        };
+                        const selectedOption =
+                            voiceSelect.selectedOptions[0].getAttribute("data-name");
+                        for (let i = 0; i < voices.length; i++) {
+                            if (voices[i].name === selectedOption) {
+                                utterThis.voice = voices[i];
+                                break;
+                            }
+                        }
+                        utterThis.pitch = pitch.value;
+                        utterThis.rate = rate.value;
+                        synth.speak(utterThis);
                     }
                 }
+                inputForm.onsubmit = function (event) {
+                    event.preventDefault();
+                    speak();
+                    inputTxt.blur();
+                };
+                pitch.onchange = function () {
+                    pitchValue.textContent = pitch.value;
+                };
+                rate.onchange = function () {
+                    rateValue.textContent = rate.value;
+                };
+                voiceSelect.onchange = function () {
+                    speak();
+                };
             </script>
         </body>
         </html>
     """
+    input_text = fix_fmt(input_text)
     inner_html = inner_html.replace("SOME_DEFAULT_VALUE", input_text)
     html = f"""
+        <iframe srcdoc='{inner_html}' id="inner-iframe" frameBorder="0" height="450" width="100%" title="Speaker" allow="autoplay" sandbox="allow-forms allow-same-origin allow-scripts"></iframe>
     """
     return html
 )
 def transcribe(stream, new_chunk):
+    try:
+        sr, y = new_chunk
+        y = y.astype(np.float32)
+        y /= np.max(np.abs(y))
+        if stream is not None:
+            stream = np.concatenate([stream, y])
+        else:
+            stream = y
+        text = transcriber(
+            {
+                "sampling_rate": sr,
+                "raw": stream
+            }
+        )["text"]
+        html = get_html(text)
+        return stream, html
+    except:
+        return None, None
 def run_demo():
     demo = gr.Interface(