---
license: apache-2.0
tags:
- generated_from_trainer
datasets:
- librispeech_asr
metrics:
- f1
base_model: facebook/wav2vec2-xls-r-300m
model-index:
- name: weights
  results: []
---

# wav2vec2-large-xlsr-53-gender-recognition-librispeech

This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on Librispeech clean-100 for gender recognition.
It achieves the following results on the evaluation set:
- Loss: 0.0061
- F1: 0.9993

### Run inference

```python
import os
from typing import List, Optional, Union, Dict

import tqdm
import torch
import torchaudio
import numpy as np
from torch.utils.data import DataLoader
from torch.nn import functional as F

from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Wav2Vec2Processor
)


class CustomDataset(torch.utils.data.Dataset):
    """Loads audio files, converts them to mono 16 kHz and pads/truncates them to a fixed length."""

    def __init__(
        self,
        dataset: List,
        basedir: Optional[str] = None,
        sampling_rate: int = 16000,
        max_audio_len: int = 5,
    ):
        self.dataset = dataset
        self.basedir = basedir
        self.sampling_rate = sampling_rate
        self.max_audio_len = max_audio_len

    def __len__(self):
        """Return the length of the dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        if self.basedir is None:
            filepath = self.dataset[index]
        else:
            filepath = os.path.join(self.basedir, self.dataset[index])

        speech_array, sr = torchaudio.load(filepath)

        # Downmix multi-channel audio to mono
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        # Resample to the target sampling rate if necessary
        if sr != self.sampling_rate:
            transform = torchaudio.transforms.Resample(sr, self.sampling_rate)
            speech_array = transform(speech_array)
            sr = self.sampling_rate

        len_audio = speech_array.shape[1]

        # Pad or truncate the audio to match the desired length
        if len_audio < self.max_audio_len * self.sampling_rate:
            # Pad the audio if it's shorter than the desired length
            padding = torch.zeros(1, self.max_audio_len * self.sampling_rate - len_audio)
            speech_array = torch.cat([speech_array, padding], dim=1)
        else:
            # Truncate the audio if it's longer than the desired length
            speech_array = speech_array[:, :self.max_audio_len * self.sampling_rate]

        speech_array = speech_array.squeeze().numpy()

        return {"input_values": speech_array, "attention_mask": None}


class CollateFunc:
    """Batches raw waveforms with the feature extractor, padding them to a common length."""

    def __init__(
        self,
        processor: Wav2Vec2Processor,
        padding: Union[bool, str] = True,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: bool = True,
        sampling_rate: int = 16000,
        max_length: Optional[int] = None,
    ):
        self.sampling_rate = sampling_rate
        self.processor = processor
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, batch: List[Dict[str, np.ndarray]]):
        # Extract input_values from the batch
        input_values = [item["input_values"] for item in batch]

        batch = self.processor(
            input_values,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask
        )

        return {
            "input_values": batch.input_values,
            "attention_mask": batch.attention_mask if self.return_attention_mask else None
        }


def predict(test_dataloader, model, device: torch.device):
    """Predict the class of each audio clip in the dataloader."""
    model.to(device)
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloader):
            input_values = batch["input_values"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_values, attention_mask=attention_mask).logits
            scores = F.softmax(logits, dim=-1)
            pred = torch.argmax(scores, dim=1).cpu().detach().numpy()

            preds.extend(pred)

    return preds


def get_gender(model_name_or_path: str, audio_paths: List[str], label2id: Dict, id2label: Dict, device: torch.device):
    num_labels = 2

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
    model = AutoModelForAudioClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
    )

    test_dataset = CustomDataset(audio_paths, max_audio_len=5)  # clips are padded/truncated to 5 seconds

    data_collator = CollateFunc(
        processor=feature_extractor,
        padding=True,
        sampling_rate=16000,
    )

    test_dataloader = DataLoader(
        dataset=test_dataset,
        batch_size=16,
        collate_fn=data_collator,
        shuffle=False,
        num_workers=2
    )

    preds = predict(test_dataloader=test_dataloader, model=model, device=device)

    return preds


model_name_or_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"

audio_paths = []  # List of absolute paths to the audio files to run inference on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

label2id = {
    "female": 0,
    "male": 1
}

id2label = {
    0: "female",
    1: "male"
}

preds = get_gender(model_name_or_path, audio_paths, label2id, id2label, device)
```

## Training and evaluation data

The model was trained on Librispeech clean-100, with 70% of the data used for training, 10% for validation, and 20% for testing.

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 3e-05
- train_batch_size: 4
- eval_batch_size: 4
- seed: 42
- gradient_accumulation_steps: 4
- total_train_batch_size: 16
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_ratio: 0.1
- num_epochs: 1
- mixed_precision_training: Native AMP
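The original training script is not included in this card. As a rough guide only, the settings above map onto `transformers.TrainingArguments` as in the sketch below; `output_dir` is a placeholder, and the Adam betas/epsilon are simply the library defaults, which match the values listed:

```python
from transformers import TrainingArguments

# Sketch only, not the original training script.
training_args = TrainingArguments(
    output_dir="weights",            # placeholder output directory
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # effective train batch size of 16
    num_train_epochs=1,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    seed=42,
    fp16=True,                       # Native AMP mixed precision
    adam_beta1=0.9,                  # library defaults, shown for completeness
    adam_beta2=0.999,
    adam_epsilon=1e-8,
)
```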
### Training results

| Training Loss | Epoch | Step | Validation Loss | F1     |
|:-------------:|:-----:|:----:|:---------------:|:------:|
| 0.002         | 1.0   | 1248 | 0.0061          | 0.9993 |

### Framework versions

- Transformers 4.28.0
- Pytorch 2.0.0+cu118
- Tokenizers 0.13.3
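### Quick inference with the pipeline API

As a lighter-weight alternative to the inference script above, the checkpoint can also be loaded through the `transformers` audio-classification pipeline. This is a minimal sketch: `audio.wav` is a placeholder path, and decoding audio files this way relies on ffmpeg being available.

```python
from transformers import pipeline

# Wraps the same checkpoint behind the audio-classification pipeline
pipe = pipeline(
    "audio-classification",
    model="alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech",
)

# Returns a list of {"label", "score"} dicts sorted by score
print(pipe("audio.wav"))
```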