Spaces:

supun9
/

audio-sentiment-analysis

Configuration error

App Files Files Community

audio-sentiment-analysis / audio_train.py

supun9

Upload 5 files

b7f4dbe over 1 year ago

raw

history blame

No virus

6.86 kB

	import os
	import logging
	import librosa

	import wandb
	import numpy as np

	from datasets import DatasetDict, load_dataset, load_metric
	from transformers import (
	HubertForSequenceClassification,
	PretrainedConfig,
	Trainer,
	TrainingArguments,
	Wav2Vec2FeatureExtractor,
	)
	from utils import collator

	# Timestamped, level-tagged log lines for the whole script.
	# The separator is a plain "|": the previous "\|" was an invalid escape
	# sequence and leaked a literal backslash into every log line.
	logging.basicConfig(
	    format="%(asctime)s | %(levelname)s: %(message)s",
	    level=logging.INFO,
	)

	# Parent of the directory that contains this file; every other path in the
	# script is built relative to it.
	PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
	# Number of output classes requested from the model config.
	NUM_LABELS = 6


	# Weights & Biases run setup. Both identifiers are placeholders that must
	# be filled in before the script is launched.
	USER = "XXXX"  # TODO: replace with your username
	WANDB_PROJECT = "XXXXX"  # TODO: replace with your project name
	wandb.init(project=WANDB_PROJECT, entity=USER)


	# PROCESS THE DATASET TO THE FORMAT EXPECTED BY THE MODEL FOR TRAINING

	# String alias used purely as an annotation, so no class has to be imported
	# for the type hint below.
	PreTrainedFeatureExtractor = "SequenceFeatureExtractor"  # noqa: F821

	# Names of the columns written by ``prepare_dataset``.
	INPUT_FIELD = "input_values"
	LABEL_FIELD = "labels"


	def prepare_dataset(
	    batch,
	    feature_extractor: PreTrainedFeatureExtractor,
	    sampling_rate: int = 16000,
	):
	    """Add model-ready input values and a ``labels`` entry to one example.

	    Args:
	        batch: Mapping holding the audio samples under ``"array"`` and the
	            class id under ``"label"``. It is modified in place.
	        feature_extractor: Callable invoked with the audio samples plus
	            ``sampling_rate``, ``padding=True`` and ``return_tensors="pt"``;
	            its result must expose an ``input_values`` attribute.
	        sampling_rate: Sampling rate, in Hz, reported to the extractor.
	            Defaults to 16000, the value that used to be hard-coded.

	    Returns:
	        The same ``batch`` object, with ``INPUT_FIELD`` and ``LABEL_FIELD``
	        populated.
	    """
	    # Named ``features`` rather than ``input`` to avoid shadowing the builtin.
	    features = feature_extractor(
	        batch["array"],
	        sampling_rate=sampling_rate,
	        padding=True,
	        return_tensors="pt",
	    )

	    # Keep only the first entry of the extractor output.
	    batch[INPUT_FIELD] = features.input_values[0]
	    # colname MUST be labels as Trainer will look for it by default
	    batch[LABEL_FIELD] = batch["label"]

	    return batch


	model_id = "facebook/hubert-base-ls960"
	MODELS_DIR = os.path.join(PROJECT_ROOT, "models")


	def _local_or_hub(subdir):
	    """Return ``MODELS_DIR/subdir`` if that directory exists, else ``model_id``.

	    Checking the specific sub-directory (instead of listing ``MODELS_DIR``)
	    means a missing ``models`` folder no longer raises ``FileNotFoundError``,
	    and a ``models`` folder that holds only other artefacts falls back to the
	    hub id rather than producing a path that does not exist.
	    """
	    local_path = os.path.join(MODELS_DIR, subdir)
	    return local_path if os.path.isdir(local_path) else model_id


	# Prefer locally stored copies; otherwise load by hub id.
	extractor_path = _local_or_hub("feature_extractor")
	model_path = _local_or_hub("pretrained_model")

	feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(extractor_path)

	# Load the configuration through the model's own config class instead of the
	# generic ``PretrainedConfig``: the generic base class carries none of the
	# architecture-specific defaults, so any field absent from the checkpoint's
	# config file would otherwise be missing when the model is built.
	config = HubertForSequenceClassification.config_class.from_pretrained(
	    model_path, num_labels=NUM_LABELS
	)
	hubert_model = HubertForSequenceClassification.from_pretrained(
	    model_path,
	    config=config,  # because we need to update num_labels as per our dataset
	    ignore_mismatched_sizes=True,  # to avoid classifier size mismatch from from_pretrained.
	)


	# FREEZE LAYERS

	# Step 1: turn gradients off for every parameter of the model.
	for weight in hubert_model.parameters():
	    weight.requires_grad = False

	# Step 2: turn gradients back on for the last ``n_layers`` parameter tensors.
	# Despite its name, ``layers_freeze_num`` controls how many trailing layers
	# end up trainable.
	layers_freeze_num = 2
	# 4 refers to projector and classifier's weights and biases.
	# NOTE(review): 16 is presumably the number of parameter tensors per encoder
	# layer - confirm against the model definition.
	n_layers = layers_freeze_num * 16 + 4
	trainable_tail = list(hubert_model.parameters())[-n_layers:]
	for weight in trainable_tail:
	    weight.requires_grad = True

	# Alternative kept for reference: unfreeze only the projector and classifier.
	# for name, param in hubert_model.named_parameters():
	#     if any(ext in name for ext in ["projector", "classifier"]):
	#         param.requires_grad = True


	# Hyper-parameters and output locations consumed when the training
	# arguments are built and when the trained model is saved.
	trainer_config = dict(
	    OUTPUT_DIR="results",
	    TRAIN_EPOCHS=5,
	    TRAIN_BATCH_SIZE=32,
	    EVAL_BATCH_SIZE=32,
	    GRADIENT_ACCUMULATION_STEPS=4,
	    WARMUP_STEPS=500,
	    DECAY=0.01,
	    LOGGING_STEPS=10,
	    MODEL_DIR="models/audio-model",
	    LR=1e-3,
	)


	# Arguments later handed to ``load_dataset``; all paths are anchored on
	# PROJECT_ROOT.
	dataset_config = dict(
	    LOADING_SCRIPT_FILES=os.path.join(PROJECT_ROOT, "src/data/crema.py"),
	    CONFIG_NAME="clean",
	    DATA_DIR=os.path.join(PROJECT_ROOT, "data/archive.zip"),
	    CACHE_DIR=os.path.join(PROJECT_ROOT, "cache_crema"),
	)


	# Build the dataset from the loading script and config named in
	# ``dataset_config``.
	ds = load_dataset(
	    path=dataset_config["LOADING_SCRIPT_FILES"],
	    name=dataset_config["CONFIG_NAME"],
	    data_dir=dataset_config["DATA_DIR"],
	    cache_dir=dataset_config["CACHE_DIR"],
	)


	# CONVERTING RAW AUDIO TO ARRAYS
	def _load_waveform(example):
	    """Decode ``example["file"]`` at 16 kHz and return it under ``"array"``."""
	    samples, _ = librosa.load(example["file"], sr=16000, mono=False)
	    return {"array": samples}


	ds = ds.map(_load_waveform, num_proc=2)


	# LABEL TO ID
	ds = ds.class_encode_column("label")


	# ds["train"] = ds["train"].select(range(2500))
	train_size = len(ds["train"])
	wandb.log({"dataset_size": train_size})


	# APPLY THE DATA PREP USING FEATURE EXTRACTOR TO ALL EXAMPLES
	ds = ds.map(
	    prepare_dataset,
	    fn_kwargs=dict(feature_extractor=feature_extractor),
	    # num_proc=4,
	)
	logging.info("Finished extracting features from audio arrays.")


	# INTRODUCE TRAIN TEST VAL SPLITS

	# First cut: keep 90% for training and hold out 10%. No seed is passed, so
	# the split membership is not fixed by this script.
	train_testvalid = ds["train"].train_test_split(test_size=0.1, shuffle=True)
	# Second cut: divide the held-out 10% evenly into validation and test.
	test_valid = train_testvalid["test"].train_test_split(test_size=0.5)
	# Collect the three parts in a single DatasetDict.
	ds = DatasetDict(
	    train=train_testvalid["train"],
	    test=test_valid["test"],
	    val=test_valid["train"],
	)


	# DEFINE DATA COLLATOR - TO PAD TRAINING BATCHES DYNAMICALLY
	# The feature extractor is passed in as the collator's ``processor``.
	data_collator = collator.DataCollatorCTCWithPadding(
	    padding=True,
	    processor=feature_extractor,
	)


	# Fine-Tuning with Trainer
	results_dir = os.path.join(PROJECT_ROOT, trainer_config["OUTPUT_DIR"])
	training_args = TrainingArguments(
	    # where checkpoints and outputs are written
	    output_dir=results_dir,
	    # schedule
	    num_train_epochs=trainer_config["TRAIN_EPOCHS"],
	    warmup_steps=trainer_config["WARMUP_STEPS"],
	    # optimisation
	    learning_rate=trainer_config["LR"],  # default = 5e-5
	    weight_decay=trainer_config["DECAY"],
	    # gradients are accumulated over this many steps before each update
	    gradient_accumulation_steps=trainer_config["GRADIENT_ACCUMULATION_STEPS"],
	    # per-device batch sizes
	    per_device_train_batch_size=trainer_config["TRAIN_BATCH_SIZE"],
	    per_device_eval_batch_size=trainer_config["EVAL_BATCH_SIZE"],
	    # logging / evaluation
	    logging_steps=trainer_config["LOGGING_STEPS"],
	    evaluation_strategy="epoch",  # report metric at end of each epoch
	    report_to="wandb",  # enable logging to W&B
	)


	def compute_metrics(eval_pred):
	    """Compute classification accuracy for one evaluation pass.

	    The accuracy is calculated directly with NumPy, so nothing has to be
	    (re)loaded each time an evaluation runs.

	    Args:
	        eval_pred: Two-item iterable ``(logits, labels)``. The class axis of
	            ``logits`` is its last axis; ``labels`` holds the reference
	            class ids.

	    Returns:
	        ``{"accuracy": value}`` where ``value`` is the fraction of
	        predictions equal to their label, as a Python float (``nan`` when
	        there are no examples).

	    Raises:
	        ValueError: If predictions and labels do not have the same shape.
	    """
	    logits, labels = eval_pred
	    predictions = np.argmax(logits, axis=-1)
	    labels = np.asarray(labels)

	    # Refuse silent broadcasting between mismatched arrays.
	    if predictions.shape != labels.shape:
	        raise ValueError(
	            "predictions and labels must have the same shape, got "
	            f"{predictions.shape} and {labels.shape}"
	        )
	    if labels.size == 0:
	        return {"accuracy": float("nan")}

	    return {"accuracy": float(np.mean(predictions == labels))}


	# START TRAINING
	# Wire the model, its training arguments, the padding collator, the
	# train/validation splits and the accuracy metric into one Trainer.
	trainer = Trainer(
	    model=hubert_model,
	    args=training_args,
	    train_dataset=ds["train"],
	    eval_dataset=ds["val"],
	    data_collator=data_collator,
	    compute_metrics=compute_metrics,
	)


	trainer.train()

	# TO RESUME TRAINING FROM CHECKPOINT
	# trainer.train("results/checkpoint-2000")

	# VALIDATION SET RESULTS
	logging.info("Eval Set Result: %s", trainer.evaluate())

	# TEST RESULTS
	test_results = trainer.predict(ds["test"])
	logging.info("Test Set Result: %s", test_results.metrics)
	wandb.log({"test_accuracy": test_results.metrics["test_accuracy"]})

	# Absolute location of the saved model, anchored on PROJECT_ROOT.
	model_dir = os.path.join(PROJECT_ROOT, trainer_config["MODEL_DIR"])
	trainer.save_model(model_dir)

	# logging trained models to wandb
	# ``base_path`` is the absolute parent of the model directory so that it is
	# a prefix of the glob no matter which working directory the script is
	# started from (it used to be the relative path "models").
	wandb.save(
	    os.path.join(model_dir, "*"),
	    base_path=os.path.dirname(model_dir),
	    policy="end",
	)