"""Gradio demo: Czech speech recognition with a Coqui STT model.

Loads the acoustic model and external scorer at import time, then serves a
small web UI where the user records audio from the microphone and gets the
transcription back in a text box.
"""

from stt import Model
import gradio as gr
import numpy as np

# Paths to the acoustic model and the external language-model scorer.
model_path = 'stt-comodoro-czech-2022-05-31.tflite'
scorer = 'czech-large-vocab.scorer'

# Decoder settings passed to the model below.
beam_width = 512
lm_alpha = 0.94
lm_beta = 2.52

# The path gets its own name so `model` is only ever the loaded Model object.
model = Model(model_path)
model.enableExternalScorer(scorer)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    """Convert recorded audio to 16 kHz mono samples.

    Args:
        sr: Sample rate of ``y`` in Hz. Only 16000 and 48000 are accepted.
        y: Array of audio samples, either 1-D or ``(samples, channels)``.

    Returns:
        A ``(sample_rate, samples)`` tuple. For 48 kHz input the samples are
        peak-normalised, averaged in groups of three (48k -> 16k) and returned
        as ``int16`` with a sample rate of 16000. 16 kHz input is returned
        with its original dtype (down-mixed to mono if it had several
        channels).

    Raises:
        ValueError: If ``sr`` is neither 16000 nor 48000.
    """
    # The model only supports 16k; 48k is converted to 16k below.
    if sr not in (48000, 16000):
        raise ValueError("Unsupported rate", sr)

    y = np.asarray(y)
    if y.ndim > 1:
        # Average the channels; reshaping multi-channel data into triples
        # would otherwise mix samples from different channels.
        y = y.mean(axis=1).astype(y.dtype)

    if sr == 48000:
        # Drop up to two trailing samples so the data splits into triples.
        usable = y.size - y.size % 3
        # Work in float so abs() of the most negative int16 cannot overflow.
        samples = y[:usable].astype(np.float64)
        # Normalise by the absolute peak (at least 1) so that negative peaks
        # cannot exceed the int16 range after scaling.
        peak = max(np.max(np.abs(samples)), 1) if usable else 1
        y = (
            (samples / peak * 32767)
            .reshape((-1, 3))
            .mean(axis=1)
            .astype("int16")
        )
        sr = 16000
    return sr, y


def transcribe(speech):
    """Transcribe a recording delivered by the Gradio audio component.

    Args:
        speech: A ``(sample_rate, samples)`` tuple, or ``None`` when nothing
            has been recorded.

    Returns:
        The recognised text, or an empty string when ``speech`` is ``None``.

    Raises:
        ValueError: If the recording uses an unsupported sample rate.
    """
    if speech is None:
        # Button pressed without a recording: nothing to decode.
        return ""

    _, samples = reformat_freq(*speech)
    stream = model.createStream()
    stream.feedAudioContent(samples)
    # finishStream() performs the final decode and ends the stream, unlike
    # intermediateDecode(), which leaves the stream open.
    return stream.finishStream()


with gr.Blocks() as blocks:
    audio = gr.Audio(
        source="microphone",
        type="numpy",
        streaming=False,
        label=(
            'Pokud je to třeba, povolte mikrofon pro tuto stránku, '
            'klikněte na Record from microphone, po dokončení nahrávání '
            'na Stop recording a poté na Rozpoznat'
        ),
    )
    btn = gr.Button('Rozpoznat')
    output = gr.Textbox(show_label=False)
    # Run the transcription when the button is clicked.
    btn.click(fn=transcribe, inputs=[audio], outputs=[output])

blocks.launch(enable_queue=True, debug=True)