"""Gradio demo that transcribes recorded or uploaded audio with a wav2vec2 model.

Importing this module loads the ASR model (``asr``) and builds the tabbed
Gradio interface (``demo``); running it as a script launches the app.
"""
import os
import sys

import gradio as gr
import librosa
import soundfile

from src.inference import Wav2Vec2Inference

model_name = "arifagustyawan/wav2vec2-large-xlsr-53-id"
asr = Wav2Vec2Inference(model_name)


def convert(inputfile, outfile):
    """Write a 16 kHz copy of ``inputfile`` to ``outfile``.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the resampled audio is written to.
    """
    target_sr = 16000
    # Let librosa resample straight to the target rate while loading, instead
    # of loading at its default rate and resampling a second time.
    data, _ = librosa.load(inputfile, sr=target_sr)
    soundfile.write(outfile, data, target_sr)


def _transcribe_path(audio_path):
    """Resample the audio at ``audio_path`` to 16 kHz and transcribe it.

    The resampled copy is written next to the input as ``<stem>16k.wav``.

    Args:
        audio_path: Filesystem path of the audio file.

    Returns:
        The ``(transcription, confidence)`` pair produced by
        ``asr.file_to_text``.

    Raises:
        gr.Error: If no audio path was supplied.
    """
    if not audio_path:
        raise gr.Error("No audio was provided.")
    # splitext removes only the final extension, so dots in directory names
    # (or a leading "./") no longer truncate the path.
    stem, _ = os.path.splitext(audio_path)
    resampled_path = stem + "16k.wav"
    convert(audio_path, resampled_path)
    transcription, confidence = asr.file_to_text(resampled_path)
    return transcription, confidence


def parse_transcription_record(wav_file):
    """Transcribe a microphone recording.

    Args:
        wav_file: Path of the recorded audio file (the audio input is
            configured with ``type="filepath"``).

    Returns:
        A ``(transcription, confidence)`` tuple.
    """
    return _transcribe_path(wav_file)


def parse_transcription_file(wav_file):
    """Transcribe an uploaded audio file.

    Args:
        wav_file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path string.

    Returns:
        A ``(transcription, confidence)`` tuple.
    """
    # NOTE(review): what gr.File hands over depends on the Gradio version;
    # accept both a ``.name``-carrying object and a bare path.
    audio_path = getattr(wav_file, "name", wav_file)
    return _transcribe_path(audio_path)


examples = [
    [os.path.join("assets", "halo.wav")]
]

record_audio = gr.Interface(
    fn=parse_transcription_record,
    inputs=gr.Audio(sources="microphone", type="filepath", label="Click button to record audio"),
    outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Confidence")],
    analytics_enabled=False,
    allow_flagging="never",
    title="Automatic Speech Recognition",
    description="Click the button bellow to record audio!",
)

upload_file = gr.Interface(
    fn=parse_transcription_file,
    inputs=gr.File(type="filepath", label="Upload file here"),
    outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Confidence")],
    examples=examples,
    analytics_enabled=False,
    title="Automatic Speech Recognition",
    allow_flagging="never",
    description="Upload or drag and drop the audio file here!",
)

demo = gr.TabbedInterface([record_audio, upload_file], ["Record Audio", "Upload Audio"])

if __name__ == "__main__":
    demo.launch()