"""Gradio demo: transcribe audio with Rev's reverb ASR model.

A slider blends between two transcription styles by weighting the model's
category embeddings (cat_embs).
"""

import gradio as gr
import torch
from wenet.cli.model import load_model
from huggingface_hub import hf_hub_download
# import spaces

# Model checkpoint (TorchScript archive) and token-unit vocabulary on the Hub.
REPO_ID = "Revai/reverb-asr"
files = ['reverb_asr_v1.jit.zip', 'tk.units.txt']
downloaded_files = [hf_hub_download(repo_id=REPO_ID, filename=f) for f in files]

model = load_model(downloaded_files[0], downloaded_files[1])


def process_cat_embs(style):
    """Build the category-embedding weight tensor that steers output style.

    Args:
        style: either a comma-separated string of weights (the original
            interface, e.g. ``"0.3,0.7"``) or any iterable of numbers
            (backward-compatible generalization).

    Returns:
        A 1-D float CPU ``torch.Tensor`` of the weights.
    """
    device = torch.device("cpu")
    if isinstance(style, str):
        weights = [float(c) for c in style.split(',')]
    else:
        weights = [float(c) for c in style]
    return torch.tensor(weights).to(device)


# @spaces.GPU
def transcribe_audio(audio, style=0):
    """Transcribe an audio file, blending styles by `style` in [0, 1].

    Args:
        audio: filepath to the uploaded/recorded audio (Gradio ``filepath``
            mode); falsy when the user submitted nothing.
        style: slider value; weight of the first category embedding. The
            second embedding gets ``1 - style``.

    Returns:
        The transcription text, or a user-facing error message string.
    """
    if not audio:
        return "Input Error! Please enter one audio!"

    # Pass the two weights as numbers rather than formatting them into a
    # string: `1 - style` can have a noisy float repr (e.g.
    # 0.30000000000000004 for style=0.7), and the string round-trip added
    # nothing. process_cat_embs still accepts the old string form.
    cat_embs = process_cat_embs([style, 1 - style])

    result = model.transcribe(audio, cat_embs=cat_embs)
    if not result or 'text' not in result:
        return "ERROR! No text output! Please try again!"

    # The model emits SentencePiece word-boundary markers ('▁'); map them
    # back to plain spaces for display.
    text_output = result['text'].replace('▁', ' ')
    return text_output


audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
style_slider = gr.Slider(0, 1, value=0, step=0.1, label="Transcription Style", info="Adjust the transcription style: 0 (casual) to 1 (formal).")
output_textbox = gr.Textbox(label="Transcription Output")

description = "This tool transcribes audio using a customizable transcription style ranging from casual to formal. Upload or record an audio file to begin."

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[audio_input, style_slider],
    outputs=output_textbox,
    title="Audio Transcription",
    description=description,
    theme="default"
)

iface.launch()