"""Gradio demo: speech recognition, then text correction, then punctuation.

Audio recorded from the microphone is converted to 16 kHz, transcribed with
PaddleSpeech, passed through the PaddleNLP ``text_correction`` task and
finally punctuated with the PaddleHub ``auto_punc`` module.
"""

import gradio as gr
import librosa
import numpy as np
import paddlehub as hub
from paddlenlp import Taskflow
from paddlespeech.cli import ASRExecutor
import soundfile as sf

# Sample rate the audio file is converted to before recognition.
# NOTE(review): presumably the rate the ASR model expects -- confirm.
TARGET_SAMPLE_RATE = 16000

# Models are created once at import time and shared by every request.
# NOTE: the PaddleHub module 'u2_conformer_aishell' was used for ASR in an
# earlier revision; ASRExecutor replaces it.
asr_executor = ASRExecutor()
text_correct_model = Taskflow("text_correction")
punc_model = hub.Module(name='auto_punc')


def speech_recognize(file):
    """Transcribe an audio file, then correct and punctuate the transcript.

    The file at ``file`` is overwritten in place with a 16 kHz version of
    its audio, because the recognizer is handed the file path rather than
    the decoded samples.

    Args:
        file: Path of the recorded audio file as supplied by Gradio, or
            ``None`` when nothing was recorded.

    Returns:
        A three-line string holding the raw transcript (``[ASR]``), the
        corrected transcript (``[COR]``) and the punctuated transcript
        (``[PUN]``). If ``file`` is ``None`` a short message is returned
        instead.
    """
    if file is None:
        # Submitting without a recording passes None; fail gracefully
        # instead of crashing inside librosa.
        return '[ASR] no audio received'

    # Let librosa decode and resample in a single step. Loading with the
    # default rate (22050 Hz) and resampling afterwards would resample the
    # signal twice.
    data, _ = librosa.load(file, sr=TARGET_SAMPLE_RATE)
    sf.write(file, data, samplerate=TARGET_SAMPLE_RATE)
    print(f'[Audio Input] shape: {data.shape}, dtype: {data.dtype}, file: {file}')

    text = asr_executor(file)

    # The correction task returns a list; the first entry carries the
    # corrected text under 'target' and the detected errors under 'errors'.
    text_correction = text_correct_model(text)[0]
    cor_text = text_correction['target']
    errors = text_correction['errors']
    print(f'[Text Correction] errors: {errors}')

    punc_text = punc_model.add_puncs(cor_text, device='cpu')[0]

    return '\n'.join((
        f'[ASR] {text}',
        f'[COR] {cor_text}',
        f'[PUN] {punc_text}',
    ))


iface = gr.Interface(
    fn=speech_recognize,
    inputs=gr.inputs.Audio(source="microphone", type='filepath'),
    outputs="text",
)
iface.launch()