"""Gradio speech-recognition demo backed by the ``ales/wav2vec2-cv-be`` Hub repo.

Audio recorded from the microphone is converted to mono float32, resampled to
16 kHz and passed to ``PreTrainedPipeline`` together with a language model
file downloaded from the same Hub repository.
"""
from functools import lru_cache
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from torchaudio.transforms import Resample

from pipeline import PreTrainedPipeline

HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'

# Sampling rate (Hz) the audio is resampled to before recognition.
TARGET_SAMPLING_RATE = 16_000


@lru_cache(maxsize=1)
def _get_pipeline() -> PreTrainedPipeline:
    """Download the language model and build the recognition pipeline once.

    The result is cached so that only the first request pays for the
    download and model initialisation; later requests reuse the same object.
    """
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
    return PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)


def _to_mono_float32(audio: np.ndarray) -> np.ndarray:
    """Return ``audio`` as a 1-D float32 waveform.

    Integer PCM samples are scaled into [-1, 1) using the range of their
    dtype; floating-point input is only cast. Arrays with more than one
    dimension are averaged over axis 1.
    """
    samples = np.asarray(audio)
    if np.issubdtype(samples.dtype, np.integer):
        # e.g. int16 -> divide by 32768. Assumes signed PCM, which is what
        # Gradio documents for microphone input.
        scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Gradio documents multi-channel audio as (samples, channels);
        # down-mix so that resampling runs along the time axis.
        samples = samples.mean(axis=1)
    return samples


def main(rate_audio_tuple: Optional[Tuple[int, np.ndarray]]) -> str:
    """Recognize speech in a microphone recording.

    Args:
        rate_audio_tuple: ``(sampling_rate, audio)`` pair as delivered by the
            Gradio microphone input, or ``None`` when nothing was recorded.

    Returns:
        The first element of the pipeline's ``'text'`` output, or an empty
        string when there is no audio to process.
    """
    if rate_audio_tuple is None:
        return ''

    sampling_rate, audio = rate_audio_tuple
    waveform = _to_mono_float32(audio)
    if waveform.size == 0:
        return ''

    # Resample requires a floating-point tensor, hence the conversion above.
    resampler = Resample(orig_freq=sampling_rate, new_freq=TARGET_SAMPLING_RATE)
    audio_resampled = resampler(torch.from_numpy(waveform)).numpy().flatten()

    return _get_pipeline()(inputs=audio_resampled)['text'][0]


iface = gr.Interface(
    fn=main,
    inputs='microphone',
    outputs="text"
)

iface.launch()