audio-sentiment-analysis / audio_train.py
supun9's picture
Upload 5 files
b7f4dbe
raw
history blame
No virus
6.86 kB
import os
import logging
import librosa
import wandb
import numpy as np
from datasets import DatasetDict, load_dataset, load_metric
from transformers import (
HubertForSequenceClassification,
PretrainedConfig,
Trainer,
TrainingArguments,
Wav2Vec2FeatureExtractor,
)
from utils import collator
# Root logger: timestamp, level and message for every record at INFO or above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s: %(message)s",
)

# Project root is the parent of the directory that contains this script.
_SCRIPT_DIR = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, os.pardir))

# Number of classes passed to the model configuration as `num_labels`.
NUM_LABELS = 6

# Weights & Biases entity and project that this run is reported under.
USER = "XXXX"  # TODO: replace with your username
WANDB_PROJECT = "XXXXX"  # TODO: replace with your project name

# NOTE: the W&B run is started as a side effect of executing this module.
wandb.init(project=WANDB_PROJECT, entity=USER)
# PROCESS THE DATASET TO THE FORMAT EXPECTED BY THE MODEL FOR TRAINING
# Forward-reference alias used only as the annotation of `feature_extractor`.
PreTrainedFeatureExtractor = "SequenceFeatureExtractor"  # noqa: F821
INPUT_FIELD = "input_values"
LABEL_FIELD = "labels"


def prepare_dataset(
    batch,
    feature_extractor: PreTrainedFeatureExtractor,
    *,
    sampling_rate: int = 16000,
):
    """Add model-ready input values and a ``labels`` column to one example.

    Args:
        batch: Mapping with an ``"array"`` entry (the raw audio samples) and a
            ``"label"`` entry. It is updated in place.
        feature_extractor: Callable invoked as
            ``feature_extractor(audio, sampling_rate=..., padding=True,
            return_tensors="pt")`` whose result exposes ``input_values``.
        sampling_rate: Sampling rate (Hz) reported to the feature extractor.
            Defaults to 16000, the value previously hard-coded here.

    Returns:
        The same ``batch`` object with ``INPUT_FIELD`` and ``LABEL_FIELD`` set.
    """
    # `features` rather than `input`, so the builtin is not shadowed.
    features = feature_extractor(
        batch["array"],
        sampling_rate=sampling_rate,
        padding=True,
        return_tensors="pt",
    )
    # Only the first entry of the extractor output is kept for this example.
    batch[INPUT_FIELD] = features.input_values[0]
    # The column name MUST be "labels": Trainer looks for it by default.
    batch[LABEL_FIELD] = batch["label"]
    return batch
model_id = "facebook/hubert-base-ls960"
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")


def _local_or_hub(local_dir, hub_id):
    """Return ``local_dir`` if it is an existing directory, otherwise ``hub_id``."""
    return local_dir if os.path.isdir(local_dir) else hub_id


# Prefer local copies when they exist. Each directory is checked on its own,
# because the previous "is MODELS_DIR empty?" test raised FileNotFoundError
# when the folder was missing and pointed at non-existent paths as soon as
# MODELS_DIR held anything else (e.g. a model saved by an earlier run).
extractor_path = _local_or_hub(
    os.path.join(MODELS_DIR, "feature_extractor"), model_id
)
model_path = _local_or_hub(os.path.join(MODELS_DIR, "pretrained_model"), model_id)

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(extractor_path)
# The config is loaded separately so num_labels can be set for our dataset.
config = PretrainedConfig.from_pretrained(model_path, num_labels=NUM_LABELS)
hubert_model = HubertForSequenceClassification.from_pretrained(
    model_path,
    config=config,
    # Avoids a classifier size mismatch error from from_pretrained.
    ignore_mismatched_sizes=True,
)
# FREEZE LAYERS
# Step 1: switch gradients off for every parameter of the model.
for frozen_param in hubert_model.parameters():
    frozen_param.requires_grad = False

# Step 2: switch gradients back on for the last `n_layers` parameter tensors.
layers_freeze_num = 2
# The 4 covers the projector and classifier weights and biases; the 16 is
# presumably the number of parameter tensors per encoder layer - TODO confirm.
n_layers = layers_freeze_num * 16 + 4
trainable_tail = list(hubert_model.named_parameters())[-n_layers:]
for _, tail_param in trainable_tail:
    tail_param.requires_grad = True

# Alternative kept for reference: train only the projector and classifier.
# for name, param in hubert_model.named_parameters():
#     if any(ext in name for ext in ["projector", "classifier"]):
#         param.requires_grad = True
# Training hyper-parameters and output locations used by this script.
# OUTPUT_DIR and MODEL_DIR are joined onto the project root where they are used.
trainer_config = dict(
    OUTPUT_DIR="results",
    TRAIN_EPOCHS=5,
    TRAIN_BATCH_SIZE=32,
    EVAL_BATCH_SIZE=32,
    GRADIENT_ACCUMULATION_STEPS=4,
    WARMUP_STEPS=500,
    DECAY=0.01,
    LOGGING_STEPS=10,
    MODEL_DIR="models/audio-model",
    LR=1e-3,
)
# Locations of the dataset loading script, the raw archive and the cache,
# all resolved against the project root.
dataset_config = dict(
    LOADING_SCRIPT_FILES=os.path.join(PROJECT_ROOT, "src/data/crema.py"),
    CONFIG_NAME="clean",
    DATA_DIR=os.path.join(PROJECT_ROOT, "data/archive.zip"),
    CACHE_DIR=os.path.join(PROJECT_ROOT, "cache_crema"),
)
ds = load_dataset(
    dataset_config["LOADING_SCRIPT_FILES"],
    dataset_config["CONFIG_NAME"],
    data_dir=dataset_config["DATA_DIR"],
    cache_dir=dataset_config["CACHE_DIR"],
)


def _decode_audio(example):
    """Load the example's audio file and return it under the "array" key."""
    # librosa.load returns (samples, sampling_rate); only the samples are kept.
    waveform, _ = librosa.load(example["file"], sr=16000, mono=False)
    return {"array": waveform}


# CONVERTING RAW AUDIO TO ARRAYS
ds = ds.map(_decode_audio, num_proc=2)
# LABEL TO ID
ds = ds.class_encode_column("label")
# ds["train"] = ds["train"].select(range(2500))
wandb.log({"dataset_size": len(ds["train"])})
# APPLY THE DATA PREP USING FEATURE EXTRACTOR TO ALL EXAMPLES
prep_kwargs = {"feature_extractor": feature_extractor}
ds = ds.map(prepare_dataset, fn_kwargs=prep_kwargs)  # num_proc=4 left disabled
logging.info("Finished extracting features from audio arrays.")

# INTRODUCE TRAIN TEST VAL SPLITS
# First cut: 90% train, 10% held out for test + validation.
train_testvalid = ds["train"].train_test_split(test_size=0.1, shuffle=True)
# Second cut: the held-out 10% is halved into test and validation.
held_out = train_testvalid["test"]
test_valid = held_out.train_test_split(test_size=0.5)
# Gather the three splits into a single DatasetDict.
splits = {
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "val": test_valid["train"],
}
ds = DatasetDict(splits)
# DEFINE DATA COLLATOR - TO PAD TRAINING BATCHES DYNAMICALLY
data_collator = collator.DataCollatorCTCWithPadding(
    padding=True,
    processor=feature_extractor,
)

# Fine-Tuning with Trainer
# Checkpoints and logs go under <project root>/<OUTPUT_DIR>.
results_dir = os.path.join(PROJECT_ROOT, trainer_config["OUTPUT_DIR"])
training_args = TrainingArguments(
    output_dir=results_dir,
    # Schedule
    num_train_epochs=trainer_config["TRAIN_EPOCHS"],
    learning_rate=trainer_config["LR"],  # library default is 5e-5
    warmup_steps=trainer_config["WARMUP_STEPS"],
    weight_decay=trainer_config["DECAY"],
    # Batching: gradients are accumulated before each optimization step.
    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],
    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],
    # Reporting: metrics at the end of each epoch, logged to W&B.
    logging_steps=trainer_config["LOGGING_STEPS"],
    evaluation_strategy="epoch",
    report_to="wandb",
)
def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    The accuracy is computed directly with numpy, so no metric script is
    loaded on every evaluation call.

    Args:
        eval_pred: A ``(logits, labels)`` pair (anything that unpacks into
            two items). ``logits`` has the class scores on its last axis;
            if it is a tuple, its first element is used.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose arg-max prediction equals the label, or NaN when there are no
        examples.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    labels = np.asarray(labels)
    if labels.size == 0:
        # Accuracy is undefined without examples; avoid numpy's empty-mean warning.
        return {"accuracy": float("nan")}
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}
# START TRAINING
trainer = Trainer(
    model=hubert_model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=ds["train"],  # training dataset
    eval_dataset=ds["val"],  # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()
# TO RESUME TRAINING FROM CHECKPOINT
# trainer.train("results/checkpoint-2000")

# VALIDATION SET RESULTS
logging.info("Eval Set Result: %s", trainer.evaluate())

# TEST RESULTS
test_results = trainer.predict(ds["test"])
logging.info("Test Set Result: %s", test_results.metrics)
wandb.log({"test_accuracy": test_results.metrics["test_accuracy"]})

# Save the fine-tuned model under <project root>/<MODEL_DIR>.
model_dir = os.path.join(PROJECT_ROOT, trainer_config["MODEL_DIR"])
trainer.save_model(model_dir)

# Log the trained model files to wandb. base_path is anchored under
# PROJECT_ROOT like the glob itself; it used to be the relative "models",
# which only resolved when the working directory was the project root.
wandb.save(
    os.path.join(model_dir, "*"),
    base_path=os.path.dirname(model_dir),
    policy="end",
)