Spaces:
Configuration error
Configuration error
import os | |
import logging | |
import librosa | |
import wandb | |
import numpy as np | |
from datasets import DatasetDict, load_dataset, load_metric | |
from transformers import ( | |
HubertForSequenceClassification, | |
PretrainedConfig, | |
Trainer, | |
TrainingArguments, | |
Wav2Vec2FeatureExtractor, | |
) | |
from utils import collator | |
# Root logger: timestamp | severity: message, at INFO verbosity.
_LOG_FORMAT = "%(asctime)s | %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Project root is the parent of the directory that contains this script.
_SCRIPT_DIR = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.join(_SCRIPT_DIR, os.pardir))

# Number of target classes handed to the model config as `num_labels`.
NUM_LABELS = 6

# Weights & Biases account and project that receive the run.
USER = "XXXX"  # TODO: replace with your username
WANDB_PROJECT = "XXXXX"  # TODO: replace with your project name
wandb.init(project=WANDB_PROJECT, entity=USER)
# PROCESS THE DATASET TO THE FORMAT EXPECTED BY THE MODEL FOR TRAINING
# String alias used purely as an annotation on `prepare_dataset`.
PreTrainedFeatureExtractor = "SequenceFeatureExtractor"  # noqa: F821
INPUT_FIELD = "input_values"
LABEL_FIELD = "labels"


def prepare_dataset(
    batch, feature_extractor: PreTrainedFeatureExtractor, sampling_rate: int = 16000
):
    """Add model-ready input values and labels to a single example.

    Args:
        batch: Mapping holding the raw audio under ``"array"`` and the class
            under ``"label"``. It is updated in place.
        feature_extractor: Callable invoked with the audio array plus
            ``sampling_rate``, ``padding`` and ``return_tensors`` keywords; its
            result must expose ``input_values``.
        sampling_rate: Sampling rate reported to the feature extractor.
            Defaults to 16000, the value that was previously hard-coded.

    Returns:
        The same ``batch`` mapping, extended with ``INPUT_FIELD`` and
        ``LABEL_FIELD`` entries.
    """
    features = feature_extractor(
        batch["array"],
        sampling_rate=sampling_rate,
        padding=True,
        return_tensors="pt",
    )
    # Only the first element of the extractor output is kept for the example.
    batch[INPUT_FIELD] = features.input_values[0]
    # Column name MUST be "labels": Trainer looks for it by default.
    batch[LABEL_FIELD] = batch["label"]
    return batch
model_id = "facebook/hubert-base-ls960"
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")


def _resolve_pretrained_source(local_name):
    """Return ``MODELS_DIR/local_name`` if that directory exists, else ``model_id``.

    Checking the specific sub-directory (rather than whether ``MODELS_DIR`` is
    empty) avoids a crash when ``MODELS_DIR`` is missing, and avoids pointing at
    a non-existent local copy when ``MODELS_DIR`` only holds unrelated content
    such as previously saved fine-tuned models.
    """
    local_path = os.path.join(MODELS_DIR, local_name)
    return local_path if os.path.isdir(local_path) else model_id


extractor_path = _resolve_pretrained_source("feature_extractor")
model_path = _resolve_pretrained_source("pretrained_model")

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(extractor_path)
# NOTE(review): the generic PretrainedConfig is used here rather than the
# HuBERT-specific config class — confirm it carries every attribute the
# classification head needs.
config = PretrainedConfig.from_pretrained(model_path, num_labels=NUM_LABELS)
hubert_model = HubertForSequenceClassification.from_pretrained(
    model_path,
    config=config,  # because we need to update num_labels as per our dataset
    ignore_mismatched_sizes=True,  # to avoid classifier size mismatch from from_pretrained.
)
# FREEZE LAYERS
# Step 1: switch off gradients for every parameter of the model.
for param in hubert_model.parameters():
    param.requires_grad = False

# Step 2: switch gradients back on for the trailing parameter tensors.
layers_freeze_num = 2
# The constant 4 covers the projector's and classifier's weights and biases.
# NOTE(review): 16 is presumably the number of parameter tensors in one
# encoder layer — confirm against the model definition.
n_layers = layers_freeze_num * 16 + 4
named_tail = list(hubert_model.named_parameters())[-n_layers:]
for name, param in named_tail:
    param.requires_grad = True

# Alternative kept for reference: train only the projector and classifier.
# for name, param in hubert_model.named_parameters():
#     if any(ext in name for ext in ["projector", "classifier"]):
#         param.requires_grad = True
# Hyper-parameters and output locations consumed when building the Trainer.
trainer_config = dict(
    OUTPUT_DIR="results",
    TRAIN_EPOCHS=5,
    TRAIN_BATCH_SIZE=32,
    EVAL_BATCH_SIZE=32,
    GRADIENT_ACCUMULATION_STEPS=4,
    WARMUP_STEPS=500,
    DECAY=0.01,
    LOGGING_STEPS=10,
    MODEL_DIR="models/audio-model",
    LR=1e-3,
)


def _project_path(relative):
    """Join a path relative to the project root onto PROJECT_ROOT."""
    return os.path.join(PROJECT_ROOT, relative)


# Arguments for `load_dataset`: loading script, config name, data and cache.
dataset_config = dict(
    LOADING_SCRIPT_FILES=_project_path("src/data/crema.py"),
    CONFIG_NAME="clean",
    DATA_DIR=_project_path("data/archive.zip"),
    CACHE_DIR=_project_path("cache_crema"),
)
# Load the dataset through the project's loading script.
_script = dataset_config["LOADING_SCRIPT_FILES"]
_config_name = dataset_config["CONFIG_NAME"]
ds = load_dataset(
    _script,
    _config_name,
    data_dir=dataset_config["DATA_DIR"],
    cache_dir=dataset_config["CACHE_DIR"],
)


# CONVERTING RAW AUDIO TO ARRAYS
def _decode_audio(example):
    """Load the example's audio file at 16 kHz and return it under "array"."""
    # NOTE(review): mono=False keeps all channels, so a multi-channel file
    # would yield a 2-D array — confirm every input file is single-channel.
    waveform, _rate = librosa.load(example["file"], sr=16000, mono=False)
    return {"array": waveform}


ds = ds.map(_decode_audio, num_proc=2)

# LABEL TO ID
ds = ds.class_encode_column("label")
# Optional subsampling of the training split for quick experiments:
# ds["train"] = ds["train"].select(range(2500))
_train_size = len(ds["train"])
wandb.log({"dataset_size": _train_size})

# APPLY THE DATA PREP USING FEATURE EXTRACTOR TO ALL EXAMPLES
_prep_kwargs = {"feature_extractor": feature_extractor}
ds = ds.map(
    prepare_dataset,
    fn_kwargs=_prep_kwargs,
    # num_proc=4,
)
logging.info("Finished extracting features from audio arrays.")
# INTRODUCE TRAIN TEST VAL SPLITS
# A fixed seed keeps the three splits identical from run to run, so the
# validation and test metrics of different runs are measured on the same data.
SPLIT_SEED = 42
# 90% train, 10% test + validation
train_testvalid = ds["train"].train_test_split(
    shuffle=True, test_size=0.1, seed=SPLIT_SEED
)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid["test"].train_test_split(
    test_size=0.5, seed=SPLIT_SEED
)
# Gather the three splits into a single DatasetDict.
ds = DatasetDict(
    {
        "train": train_testvalid["train"],
        "test": test_valid["test"],
        "val": test_valid["train"],
    }
)
# DEFINE DATA COLLATOR - TO PAD TRAINING BATCHES DYNAMICALLY
data_collator = collator.DataCollatorCTCWithPadding(
    padding=True, processor=feature_extractor
)

# Fine-Tuning with Trainer
# Keyword arguments are collected first, then expanded into TrainingArguments.
_training_kwargs = {
    # output directory
    "output_dir": os.path.join(PROJECT_ROOT, trainer_config["OUTPUT_DIR"]),
    # accumulate the gradients before running optimization step
    "gradient_accumulation_steps": trainer_config["GRADIENT_ACCUMULATION_STEPS"],
    # total number of training epochs
    "num_train_epochs": trainer_config["TRAIN_EPOCHS"],
    # batch size per device during training
    "per_device_train_batch_size": trainer_config["TRAIN_BATCH_SIZE"],
    # batch size for evaluation
    "per_device_eval_batch_size": trainer_config["EVAL_BATCH_SIZE"],
    # number of warmup steps for learning rate scheduler
    # NOTE(review): this is an absolute step count; if the whole run has fewer
    # optimizer steps than this, the peak learning rate is never reached —
    # confirm against the dataset size.
    "warmup_steps": trainer_config["WARMUP_STEPS"],
    # strength of weight decay
    "weight_decay": trainer_config["DECAY"],
    "logging_steps": trainer_config["LOGGING_STEPS"],
    # report metric at end of each epoch
    "evaluation_strategy": "epoch",
    # enable logging to W&B
    "report_to": "wandb",
    # default = 5e-5
    "learning_rate": trainer_config["LR"],
}
training_args = TrainingArguments(**_training_kwargs)
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with numpy, so the metric is not re-loaded
    on every evaluation call and does not depend on the deprecated
    ``datasets.load_metric`` API.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of predictions
        equal to ``labels``, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = np.mean(predictions == np.asarray(labels))
    return {"accuracy": float(accuracy)}
# START TRAINING
trainer = Trainer(
    model=hubert_model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=ds["train"],  # training dataset
    eval_dataset=ds["val"],  # evaluation dataset
    compute_metrics=compute_metrics,
)
trainer.train()
# TO RESUME TRAINING FROM CHECKPOINT
# trainer.train("results/checkpoint-2000")

# VALIDATION SET RESULTS
logging.info("Eval Set Result: %s", trainer.evaluate())

# TEST RESULTS
test_results = trainer.predict(ds["test"])
logging.info("Test Set Result: %s", test_results.metrics)
wandb.log({"test_accuracy": test_results.metrics["test_accuracy"]})

# Absolute location of the fine-tuned model, reused for saving and uploading.
model_dir = os.path.join(PROJECT_ROOT, trainer_config["MODEL_DIR"])
trainer.save_model(model_dir)

# logging trained models to wandb
# base_path is derived from the same absolute directory as the glob, so the
# upload does not depend on the current working directory being PROJECT_ROOT.
wandb.save(
    os.path.join(model_dir, "*"),
    base_path=os.path.dirname(model_dir),
    policy="end",
)