|
import gradio as gr |
|
import torch |
|
from transformers import ( |
|
AutomaticSpeechRecognitionPipeline, |
|
WhisperForConditionalGeneration, |
|
WhisperTokenizer, |
|
WhisperProcessor, |
|
) |
|
from peft import PeftModel, PeftConfig |
|
# Identifier of the PEFT (LoRA) adapter handed to the `from_pretrained` calls.
peft_model_id = "Boadiwaa/LORA-colab-Whisper-medium"
task = "transcribe"

# The adapter config carries the name/path of the base checkpoint it belongs
# to; every base component below is loaded from that same checkpoint.
peft_config = PeftConfig.from_pretrained(peft_model_id)
base_model_id = peft_config.base_model_name_or_path

# Load the base Whisper model, then wrap it with the adapter.
model = PeftModel.from_pretrained(
    WhisperForConditionalGeneration.from_pretrained(
        base_model_id,
        device_map="auto",
    ),
    peft_model_id,
)

tokenizer = WhisperTokenizer.from_pretrained(base_model_id, task=task)
processor = WhisperProcessor.from_pretrained(base_model_id, task=task)
feature_extractor = processor.feature_extractor

# Speech-recognition pipeline used by `transcribe`.
pipe = AutomaticSpeechRecognitionPipeline(
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)
|
|
|
|
|
def transcribe(audio) -> str:
    """Transcribe an audio clip to text with the module-level ``pipe``.

    Args:
        audio: Input forwarded to ``pipe``. The Gradio interface in this
            file is configured with ``type="filepath"``, so this is
            expected to be a path to the recorded file, or ``None`` when
            nothing was recorded.

    Returns:
        The ``"text"`` entry of the pipeline result, or an empty string
        when ``audio`` is ``None``.
    """
    # Submitting the form without a recording yields None; return an empty
    # transcript instead of letting the pipeline raise on it.
    if audio is None:
        return ""

    # `torch.autocast(device_type="cuda")` is the non-deprecated spelling of
    # `torch.cuda.amp.autocast()`.
    with torch.autocast(device_type="cuda"):
        result = pipe(audio, max_new_tokens=255)
    return result["text"]
|
|
|
# Web UI: record from the microphone, pass the recording to `transcribe`
# as a file path, and display the returned text.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Transcriber for Ghanaian-accented speech (English)",
    description="Realtime demo for Ghanaian-accented speech recognition (in English).",
)

# Launch exactly once, and only when executed as a script. Previously the app
# was launched unconditionally at module level (also on import) and then a
# second time under this guard. `share=True` is kept from the original
# unconditional launch.
if __name__ == "__main__":
    demo.launch(share=True)