# BERTley / training_script.py
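"""Fine-tune bert-base-uncased for sequence classification on the BERTley data.

The script expects flattened_data_new.json (a list of labelled records) and
aggregate_data_new.json (a mapping whose keys are the label names) in the working
directory, writes the label/id mappings and evaluation results to JSON files, and
saves checkpoints under ./BERTley.
"""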
import torch
import argparse
import json
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer,
EarlyStoppingCallback,
)
import evaluate
from datasets import Dataset
# the pretrained encoder we fine-tune:
# Google's BERT base model (uncased)
MODEL = "bert-base-uncased"
ACCURACY_METRIC = evaluate.load("accuracy")
F1_METRIC = evaluate.load("f1")
PRECISION_METRIC = evaluate.load("precision")
RECALL_METRIC = evaluate.load("recall")
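# each evaluate.load() call fetches and caches the metric's scoring script once;
# the .compute() calls below each return a dict keyed by the metric name, e.g. {"f1": ...}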
def compute_metrics(eval_pred):
    """Return accuracy plus weighted and macro F1/precision/recall for the Trainer."""
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
# weighted averages
f1_w = F1_METRIC.compute(
predictions=preds, references=labels, average="weighted"
)["f1"]
prec_w = PRECISION_METRIC.compute(
predictions=preds, references=labels, average="weighted"
)["precision"]
rec_w = RECALL_METRIC.compute(
predictions=preds, references=labels, average="weighted"
)["recall"]
# macro averages
f1_m = F1_METRIC.compute(
predictions=preds, references=labels, average="macro"
)["f1"]
prec_m = PRECISION_METRIC.compute(
predictions=preds, references=labels, average="macro"
)["precision"]
rec_m = RECALL_METRIC.compute(
predictions=preds, references=labels, average="macro"
)["recall"]
return {
"accuracy": ACCURACY_METRIC.compute(
predictions=preds, references=labels
)["accuracy"],
"f1_weighted": f1_w,
"precision_weighted": prec_w,
"recall_weighted": rec_w,
"f1_macro": f1_m,
"precision_macro": prec_m,
"recall_macro": rec_m,
}
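# Shape check for compute_metrics (illustrative numbers only): for a 3-class problem
# and two evaluation examples, the Trainer would pass something like
#   logits = np.array([[2.1, 0.3, -1.0],
#                      [0.2, 1.7,  0.1]])
#   labels = np.array([0, 1])
# so preds = logits.argmax(axis=-1) -> array([0, 1]), i.e. both predictions correct.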
# loads the data files, builds the dataset, fine-tunes the model, and writes the results
def main() -> None:
    # toggle: when truthy (and the records carry extra fields), the per-record
    # context string is prepended to the text before tokenization
    context = None
    flat_source = "./flattened_data_new.json"
    aggregate_source = "./aggregate_data_new.json"
with open(flat_source, "r", encoding="utf-8") as f:
data = json.load(f)
with open(aggregate_source, "r", encoding="utf-8") as f:
aggregate_data = json.load(f)
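    # expected input shapes (inferred from how the files are used below):
    # flattened_data_new.json is a JSON list of objects, each with at least "text" and
    # "label"; any other fields are folded into a "context" string.
    # aggregate_data_new.json is a JSON object whose keys are the label names.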
    try:
        # anything that is not the text or the label is folded into one "context" string
        for rec in data:
            rec["context"] = " ".join(
                str(v) for k, v in rec.items() if k not in ("text", "label")
            ).strip()
        ds = Dataset.from_list(data)
    except Exception as exc:
        raise RuntimeError("Error creating dataset from list") from exc
labels = list(aggregate_data.keys())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}
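    # e.g. with labels ["alpha", "beta"] (hypothetical names), label2id == {"alpha": 0,
    # "beta": 1} and id2label == {0: "alpha", 1: "beta"}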
    # build the field that will be tokenized: context + text when the context toggle
    # is on, otherwise just the raw text (the default here, since context is None)
    if context and "context" in data[0]:
        ds = ds.map(
            lambda x: {"input_text": x["context"] + " " + x["text"]},
            batched=False,
        )
    else:
        ds = ds.map(lambda x: {"input_text": x["text"]}, batched=False)
    text_field = "input_text"
# maps labels to integers
ds = ds.map(
lambda x: {"labels": label2id[x["label"]]},
remove_columns=(
["label", "text", "context"]
if "context" in data[0]
else ["label", "text"]
),
)
# quickly write the label/id mappings to files
with open("label2id.json", "w", encoding="utf-8") as f:
json.dump(label2id, f, indent=2)
with open("id2label.json", "w", encoding="utf-8") as f:
json.dump(id2label, f, indent=2)
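    # reloading the mapping later (a sketch, assuming the same working directory):
    #   with open("id2label.json", encoding="utf-8") as f:
    #       id2label_loaded = {int(k): v for k, v in json.load(f).items()}
    # json.dump turns the integer keys into strings, hence the int(k) conversion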
    # train_test_split returns a DatasetDict with "train" and "test" splits,
    # holding 80% and 20% of the examples respectively
    split = ds.train_test_split(test_size=0.2)
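    # the split is shuffled by default; pass seed=... here if it needs to be
    # reproducible across runs (not set in this script)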
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(
MODEL,
num_labels=len(labels),
id2label=id2label,
label2id=label2id,
)
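    # num_labels sizes the new classification head; id2label/label2id are written into
    # the model config so the mapping travels with any saved checkpoint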
tokenized = split.map(
lambda x: tokenizer(
x[text_field], padding="max_length", truncation=True
),
batched=True,
)
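    # padding="max_length" pads every example to the model's 512-token limit; a lighter
    # alternative would be dynamic padding with transformers.DataCollatorWithPadding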
tokenized.set_format(
"torch", columns=["input_ids", "attention_mask", "labels"]
)
    # training arguments; these defaults should be fine for a test run but not a
    # full-fledged one: once the dataset is larger, num_train_epochs should be raised
training_args = TrainingArguments(
output_dir="./BERTley",
learning_rate=2e-5,
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,  # simulates an effective batch size of 64 without OOM
num_train_epochs=5, # for a full run, more epochs may be needed
weight_decay=0.01,
dataloader_num_workers=4,
        eval_strategy="epoch",  # evaluate at the end of every epoch
fp16=True,
        logging_strategy="epoch",  # log once per epoch
logging_dir="./logs",
save_strategy="epoch",
        save_total_limit=1,  # keep at most one checkpoint on disk
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
        report_to=["tensorboard"],  # send training logs and metrics to TensorBoard
)
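    # effective train batch size per device: per_device_train_batch_size (32)
    # * gradient_accumulation_steps (2) = 64, multiplied again by the number of GPUs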
    # the Trainer ties together the model, the tokenized splits, the metrics, and early stopping
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["test"],
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
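    # EarlyStoppingCallback stops training once eval_loss (metric_for_best_model) has
    # failed to improve for 2 consecutive epoch evaluations; load_best_model_at_end
    # then restores the best checkpoint before the final evaluation below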
# training the model...
trainer.train()
# evaluate after training
evals = trainer.evaluate()
with open("evals.json", "w", encoding="utf-8") as f:
json.dump(evals, f, indent=2)
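    # trainer.evaluate() returns the compute_metrics values prefixed with "eval_"
    # (e.g. "eval_accuracy"), plus eval_loss and runtime statistics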
print("Evaluation results: ")
print(evals)
print("Accuracy, F1, Precision, and Recall metrics: ")
for key, value in evals.items():
print(f"{key}: {value}")
if __name__ == "__main__":
main()