# Whisper Fine-tuned Pronunciation Scorer
This model assesses pronunciation quality for Korean speech. It is based on the openai/whisper-small model, fine-tuned on the foreigner Korean pronunciation evaluation dataset from Korea's AI-Hub (https://www.aihub.or.kr/).
## Model Description
The Pronunciation Scorer takes an audio input along with its corresponding text transcript and produces a Korean pronunciation score on a scale of 1 to 5. It uses the encoder-decoder architecture of the Whisper model to extract speech features, and an additional linear layer, applied to the mean-pooled final decoder hidden states, to predict the pronunciation score.
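Concretely, if $h_1, \dots, h_T$ denote the final decoder hidden states for an (audio, transcript) pair, the score head computes (our notation, matching the code below):

$$\hat{s} = \mathbf{w}^\top \Big(\frac{1}{T}\sum_{t=1}^{T} h_t\Big) + b,$$

where $\mathbf{w} \in \mathbb{R}^{d_{\text{model}}}$ and $b \in \mathbb{R}$ are the parameters of the added linear layer.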
## How to Use
To use this model, follow these steps:
- Install required libraries
- Load the model and processor
- Prepare your audio file and text transcript
- Predict the pronunciation score
Here's a detailed example of how to use the model:
```python
# Requires: pip install torch torchaudio transformers
import torch
import torch.nn as nn
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration


class WhisperPronunciationScorer(nn.Module):
    """Whisper with a linear regression head that predicts a pronunciation score."""

    def __init__(self, pretrained_model):
        super().__init__()
        self.whisper = pretrained_model
        self.score_head = nn.Linear(self.whisper.config.d_model, 1)

    def forward(self, input_features, labels=None):
        outputs = self.whisper(input_features, labels=labels, output_hidden_states=True)
        # Mean-pool the final decoder hidden states, then project to a scalar score
        last_hidden_state = outputs.decoder_hidden_states[-1]
        scores = self.score_head(last_hidden_state.mean(dim=1)).squeeze()
        return scores


def load_model(model_path, device):
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    pretrained_model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model = WhisperPronunciationScorer(pretrained_model).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model, processor


def predict_pronunciation_score(model, processor, audio_path, transcript, device):
    # Load the audio, downmix to mono, and resample to Whisper's expected 16 kHz
    audio, sr = torchaudio.load(audio_path)
    audio = audio.mean(dim=0)  # handles multi-channel input; no-op for mono
    if sr != 16000:
        audio = torchaudio.functional.resample(audio, sr, 16000)
    input_features = processor(
        audio.numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features.to(device)

    # Tokenize the transcript to serve as decoder labels
    labels = processor(text=transcript, return_tensors="pt").input_ids.to(device)

    # Predict the score
    with torch.no_grad():
        score = model(input_features, labels)
    return score.item()


# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "path/to/your/model.pth"
model, processor = load_model(model_path, device)

# Run prediction
audio_path = "path/to/your/audio.wav"
transcript = "안녕하세요"
score = predict_pronunciation_score(model, processor, audio_path, transcript, device)
print(f"Predicted pronunciation score: {score:.2f}")
```