Edit model card

Whisper Fine-tuned Pronunciation Scorer

This model assesses pronunciation quality for Korean speech. It is based on the openai/whisper-small model and was fine-tuned on the Korea AI-Hub (https://www.aihub.or.kr/) Korean pronunciation evaluation dataset of non-native (foreign) speakers.

Model Description

The Pronunciation Scorer takes audio input along with its corresponding text transcript and provides a Korean pronunciation score on a scale of 1 to 5. It runs the audio features and the tokenized transcript through Whisper's encoder-decoder architecture, averages the decoder's final hidden states over the token positions, and employs an additional linear layer to predict the pronunciation score from that pooled representation.

How to Use

To use this model, follow these steps:

  1. Install required libraries (torch, torchaudio, transformers)
  2. Load the model and processor
  3. Prepare your audio file and text transcript
  4. Predict the pronunciation score

Here's a detailed example of how to use the model:

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch.nn as nn

class WhisperPronunciationScorer(nn.Module):
    """Whisper backbone with a one-output linear head that predicts a score."""

    def __init__(self, pretrained_model):
        """Wrap ``pretrained_model`` and attach the scoring head.

        Args:
            pretrained_model: Model exposing ``config.d_model`` which, when
                called with ``output_hidden_states=True``, returns an object
                carrying ``decoder_hidden_states``.
        """
        super().__init__()
        # These attribute names become the state-dict key prefixes, so they
        # must stay as they are for saved checkpoints to load.
        self.whisper = pretrained_model
        hidden_width = self.whisper.config.d_model
        self.score_head = nn.Linear(hidden_width, 1)

    def forward(self, input_features, labels=None):
        """Return the predicted score(s) for the given inputs.

        Args:
            input_features: Audio features forwarded to the wrapped model.
            labels: Optional transcript token ids forwarded to the wrapped
                model as ``labels``.

        Returns:
            The scoring-head output with every size-1 dimension removed.
        """
        whisper_out = self.whisper(
            input_features,
            labels=labels,
            output_hidden_states=True,
        )
        # Only the last entry of the decoder hidden states is used.
        final_decoder_states = whisper_out.decoder_hidden_states[-1]
        # Average over dim 1 (presumably the token axis - confirm against the
        # wrapped model) to get one vector per example.
        pooled = final_decoder_states.mean(dim=1)
        raw_scores = self.score_head(pooled)
        # Bare squeeze() drops all singleton dims, so a single example yields
        # a 0-d tensor.
        return raw_scores.squeeze()

def load_model(model_path, device):
    """Build the pronunciation scorer and load fine-tuned weights into it.

    The processor and base weights come from the ``openai/whisper-small``
    checkpoint; the fine-tuned state dict at ``model_path`` is then loaded on
    top of the wrapped model.

    Args:
        model_path: Path to the saved state dict of the scorer.
        device: Device the model is moved to and the checkpoint is mapped to.

    Returns:
        A ``(model, processor)`` tuple with the model in evaluation mode.
    """
    model_name = "openai/whisper-small"
    processor = WhisperProcessor.from_pretrained(model_name)
    pretrained_model = WhisperForConditionalGeneration.from_pretrained(model_name)
    model = WhisperPronunciationScorer(pretrained_model).to(device)

    # SECURITY: torch.load is pickle-based, and a checkpoint file obtained from
    # elsewhere can execute arbitrary code while being unpickled. Only a state
    # dict is needed here, so restrict loading to tensors and plain containers.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    # Switch to inference mode before handing the model back.
    model.eval()
    return model, processor

def predict_pronunciation_score(model, processor, audio_path, transcript, device):
    """Predict the pronunciation score of one recording given its transcript.

    Args:
        model: Scorer called as ``model(input_features, labels)``.
        processor: Processor used both to extract audio features and to
            tokenize the transcript.
        audio_path: Path of the audio file to score.
        transcript: Text the speaker was supposed to say.
        device: Device the input tensors are moved to.

    Returns:
        The predicted score as a Python float.
    """
    target_sr = 16000

    # Load and preprocess audio
    audio, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)

    # Downmix to a single 1-D waveform. A plain squeeze() leaves stereo audio
    # two-dimensional, which the feature extractor would read as a batch of
    # clips and the final .item() would then fail. Averaging over the channel
    # axis gives the same samples as squeeze() for mono files.
    # NOTE(review): assumes torchaudio.load returns (channels, samples) - confirm.
    if audio.dim() > 1:
        audio = audio.mean(dim=0)

    input_features = processor(
        audio.numpy(), sampling_rate=target_sr, return_tensors="pt"
    ).input_features.to(device)

    # Prepare transcript
    labels = processor(text=transcript, return_tensors="pt").input_ids.to(device)

    # Predict score; gradients are not needed for inference.
    with torch.no_grad():
        score = model(input_features, labels)
    return score.item()

# Placeholder inputs: point these at a real checkpoint and recording.
model_path = "path/to/your/model.pth"
audio_path = "path/to/your/audio.wav"
transcript = "안녕하세요"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the scorer with its processor, then score the sample recording.
model, processor = load_model(model_path, device)
score = predict_pronunciation_score(
    model,
    processor,
    audio_path,
    transcript,
    device,
)
print(f"Predicted pronunciation score: {score:.2f}")
Downloads last month
17
Safetensors
Model size
242M params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.