# Gradio demo: record speech from the microphone, transcribe it with Whisper,
# and render the transcript through an HTML fragment.

import logging

import gradio as gr
import numpy as np
from transformers import pipeline

""" Globals! """

logger = logging.getLogger(__name__)

# Extra markup handed to gr.Interface(head=...).
shortcut_js = """ """

# Contractions that must be expanded because the apostrophe cannot be used
# downstream. Applied in this exact order by fix_fmt().
_CONTRACTIONS = (
    ("I'm", "I am"),
    ("I'll", "I will"),
    ("I'd", "I would"),
    ("you'd", "you would"),
    ("he'd", "he would"),
    ("she'd", "she would"),
    ("they'd", "they would"),
    ("we'd", "we would"),
    ("You'd", "You would"),
    ("He'd", "He would"),
    ("She'd", "She would"),
    ("They'd", "They would"),
    ("We'd", "We would"),
    ("you'll", "you will"),
    ("he'll", "he will"),
    ("she'll", "she will"),
    ("they'll", "they will"),
    ("we'll", "we will"),
    ("You'll", "You will"),
    ("He'll", "He will"),
    ("She'll", "She will"),
    ("They'll", "They will"),
    ("We'll", "We will"),
)

# Translation table that deletes every single and double quote character.
_QUOTE_STRIPPER = str.maketrans("", "", "'\"")


def fix_fmt(input_text: str) -> str:
    """Return *input_text* with contractions expanded and all quotes removed.

    The contractions listed in ``_CONTRACTIONS`` are rewritten to their long
    form (for example ``"I'm"`` becomes ``"I am"``). Afterwards every
    remaining ``'`` and ``"`` character is deleted.
    """
    # Correct contractions (we can't use the apostrophe).
    for short_form, long_form in _CONTRACTIONS:
        input_text = input_text.replace(short_form, long_form)

    # Other contractions can be said without the apostrophe -- that is not
    # an issue, so the remaining quote characters are simply dropped.
    return input_text.translate(_QUOTE_STRIPPER)


def get_html(input_text: str) -> str:
    """Return the HTML fragment shown for the transcript *input_text*.

    The text is first cleaned with ``fix_fmt`` and then substituted for the
    ``SOME_DEFAULT_VALUE`` placeholder of the inner template.
    """
    # NOTE(review): as it stands this template contains no SOME_DEFAULT_VALUE
    # placeholder, and the returned f-string below does not interpolate
    # inner_html -- the markup looks like it was lost. Confirm against the
    # intended template before relying on the rendered output.
    inner_html = """ Speech synthesiser

Speech System

"""
    input_text = fix_fmt(input_text)
    inner_html = inner_html.replace("SOME_DEFAULT_VALUE", input_text)
    html = f""" """
    return html


# This seems to have installed in a different location to where
# whisper.load_model(model_size) downloads to.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",  # large is also installed
)


def transcribe(stream, new_chunk):
    """Append *new_chunk* to the running audio and transcribe all of it.

    Args:
        stream: Audio accumulated by earlier calls as a float32 array, or
            ``None`` when nothing has been recorded yet.
        new_chunk: ``(sampling_rate, samples)`` pair for the new recording,
            or ``None`` when there is no audio.

    Returns:
        ``(stream, html)``: the updated audio array and the HTML produced by
        ``get_html`` for the transcript. ``(None, None)`` is returned when
        there is no audio or when processing fails, which resets the state.
    """
    if new_chunk is None:
        # Nothing recorded: reset both the accumulated audio and the display.
        return None, None

    try:
        sr, y = new_chunk
        y = y.astype(np.float32)

        # Down-mix (samples, channels) input; the recogniser expects mono.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Peak-normalise, but leave silent or empty chunks untouched:
        # dividing by a zero peak would fill the stream with NaNs and
        # corrupt every later transcription.
        peak = np.max(np.abs(y)) if y.size else 0.0
        if peak > 0:
            y /= peak

        stream = y if stream is None else np.concatenate([stream, y])

        text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
        return stream, get_html(text)
    except Exception:
        # Best effort: report the failure and reset the state rather than
        # crash the UI callback.
        logger.exception("Transcription failed; resetting audio stream")
        return None, None


def run_demo():
    """Build the Gradio interface around ``transcribe`` and launch it."""
    demo = gr.Interface(
        transcribe,
        inputs=[
            "state",
            gr.Audio(sources=["microphone"], streaming=False),
        ],
        outputs=[
            "state",
            gr.HTML(),
        ],
        live=True,
        head=shortcut_js,
    )
    demo.launch()


if __name__ == "__main__":
    run_demo()