import torch
import gradio as gr
from transformers import pipeline

MODEL_NAME_V1 = "rngzhi/cs3264-project"
MODEL_NAME_V2 = "rngzhi/cs3264-project-v2"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000  # intended upload size cap; not enforced below

# Use the first GPU if available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Cache loaded pipelines so each model is downloaded and initialised only once,
# rather than on every transcription request.
_pipelines = {}


def load_model(model_version):
    model_name = MODEL_NAME_V1 if model_version == "Model-v1" else MODEL_NAME_V2
    if model_name not in _pipelines:
        _pipelines[model_name] = pipeline(
            task="automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,  # split long audio into 30-second chunks
            device=device,
        )
    return _pipelines[model_name]


def transcribe(model_version, inputs, task):
    if inputs is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file "
            "before submitting your request."
        )
    pipe = load_model(model_version)
    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},  # Whisper task: "transcribe" or "translate"
        return_timestamps=True,
    )["text"]
    return text


demo = gr.Blocks()

# `transcribe` takes three arguments (model version, audio, task), so each
# interface must supply all three inputs. The task radio fills the third slot,
# which the example rows below also provide.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Dropdown(choices=["Model-v1", "Model-v2"], label="Choose Model Version"),
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Radio(choices=["transcribe", "translate"], value="transcribe", label="Task"),
    ],
    outputs="text",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Dropdown(choices=["Model-v1", "Model-v2"], label="Choose Model Version"),
        gr.Audio(sources=["upload"], type="filepath"),
        gr.Radio(choices=["transcribe", "translate"], value="transcribe", label="Task"),
    ],
    outputs="text",
    # Each example row must match the inputs: model version, audio file, task.
    # The original rows passed "upload" as the task, which is not a valid
    # Whisper task; "transcribe" is used instead.
    examples=[
        ["Model-v2", "samples/sample1.WAV", "transcribe"],
        ["Model-v2", "samples/sample2.WAV", "transcribe"],
    ],
)

with demo:
    gr.TabbedInterface([file_transcribe, mic_transcribe], ["Audio file", "Microphone"])

demo.launch(debug=True)