README.md · kingabzpro/wav2vec2-large-xlsr-53-wolof at refs/pr/1

metadata

language:
  - wo
datasets:
  - AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF
tags:
  - speech
  - audio
  - automatic-speech-recognition
license: apache-2.0
metrics:
  - WER

Evaluation on WOLOF Test

import pandas as pd
from datasets import load_dataset, load_metric,Dataset
from tqdm import tqdm
import torch
import soundfile as sf
import torchaudio
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2CTCTokenizer

# Hub identifier of the fine-tuned checkpoint and the device used for inference.
model_name = "kingabzpro/wav2vec2-large-xlsr-53-wolof"
device = "cuda"

# Load the processor and the CTC model from the same checkpoint; the model is
# moved to the inference device.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = model.to(device)

# Read the test metadata and derive each clip's .wav path from its "ID" column.
val = pd.read_csv("../input/automatic-speech-recognition-in-wolof/Test.csv")
val["path"] = (
    "../input/automatic-speech-recognition-in-wolof/Noise Removed/tmp/WOLOF_ASR_dataset/noise_remove/"
    + val["ID"]
    + ".wav"
)
# Rename "transcription" to "sentence" (a no-op when the column is absent).
val = val.rename(columns={"transcription": "sentence"})
common_voice_val = Dataset.from_pandas(val)

def speech_file_to_array_fn_test(batch):
    """Load the audio file referenced by ``batch["path"]``.

    The two values returned by ``sf.read`` are stored under
    ``batch["speech"]`` and ``batch["sampling_rate"]`` respectively, and the
    same ``batch`` mapping is returned.
    """
    # Per the original author's note the inputs are .wav files sampled at
    # 16000 Hz; nothing in this function verifies that.
    batch["speech"], batch["sampling_rate"] = sf.read(batch["path"])
    return batch

def prepare_dataset_test(batch):
    """Convert a batch of loaded clips into padded model inputs.

    Reads ``batch["speech"]`` and ``batch["sampling_rate"]``, runs the
    module-level ``processor`` over the speech with padding enabled, and
    stores the resulting ``input_values`` under ``batch["input_values"]``.
    Returns the same ``batch`` mapping.

    Raises:
        AssertionError: if the batch does not contain exactly one distinct
            sampling rate.
    """
    # NOTE(review): this only checks that the rates within the batch agree
    # with each other; it does not compare them with the processor's rate
    # that the message mentions.
    distinct_rates = set(batch["sampling_rate"])
    assert (
        len(distinct_rates) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    # All rates are identical at this point, so the first one stands for the batch.
    shared_rate = batch["sampling_rate"][0]
    features = processor(batch["speech"], sampling_rate=shared_rate, padding=True)
    batch["input_values"] = features.input_values
    return batch

# Drop the metadata columns that play no part in inference.
common_voice_val = common_voice_val.remove_columns(
    ["ID", "age", "down_votes", "gender", "up_votes"]
)
# Load every audio file; the pre-existing columns are removed so only the
# keys added by speech_file_to_array_fn_test remain.
common_voice_val = common_voice_val.map(
    speech_file_to_array_fn_test,
    remove_columns=common_voice_val.column_names,
)
# Turn the loaded audio into model inputs in batches of 8 across 4 worker
# processes, again keeping only the newly added column.
common_voice_val = common_voice_val.map(
    prepare_dataset_test,
    batched=True,
    batch_size=8,
    num_proc=4,
    remove_columns=common_voice_val.column_names,
)

# Transcribe every test clip one at a time with greedy (argmax) CTC decoding
# and collect the decoded strings in dataset order.
final_pred = []
# Inference only: disabling autograd stops PyTorch from building a graph and
# keeping activations for a backward pass that never happens.
with torch.no_grad():
    for example in tqdm(common_voice_val):  # Testing model on Wolof Dataset
        input_dict = processor(example["input_values"], return_tensors="pt", padding=True)

        # Send the inputs to the same device the model was moved to.
        logits = model(input_dict.input_values.to(device)).logits

        # Most likely token per frame; [0] selects the single clip in the batch.
        pred_ids = torch.argmax(logits, dim=-1)[0]
        final_pred.append(processor.decode(pred_ids))

You can check my result on Zindi: I ranked 8th in the AI4D Baamtu Datamation - Automatic Speech Recognition in WOLOF competition.

Result: 7.88 % WER