esm2_t6_8m_qlora_binding_sites_v0 / qlora_train_v4.py

Upload qlora_train_v4.py

a928b59 11 months ago

No virus

11.6 kB

	import os
	import wandb
	import numpy as np
	import torch
	import torch.nn as nn
	from datetime import datetime
	from sklearn.utils.class_weight import compute_class_weight
	from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, matthews_corrcoef
	from transformers import (
	AutoModelForTokenClassification,
	AutoTokenizer,
	DataCollatorForTokenClassification,
	TrainingArguments,
	Trainer,
	BitsAndBytesConfig,
	default_data_collator
	)
	from torch.utils.data import Dataset as TorchDataset
	from accelerate import Accelerator
	from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
	import pickle
	import gc
	from tqdm import tqdm

	# Initialize accelerator and Weights & Biases.
	# `accelerator.device` is used further down to place the class-weight tensor,
	# and the accelerator also prepares the model inside train_function_no_sweeps.
	accelerator = Accelerator()
	# Set before wandb.init() so the value is already in the environment when the
	# run is created. NOTE(review): presumably this only labels the run with a
	# script name -- confirm against the W&B documentation.
	os.environ["WANDB_NOTEBOOK_NAME"] = 'qlora_train.py'
	# Starts a W&B run at import time; wandb.config is updated later by the
	# training function.
	wandb.init(project='binding_site_prediction')

	# Helper Functions and Data Preparation
	# -----------------------------------------------------------------------------

	def print_trainable_parameters(model):
	    """
	    Print how many of the model's parameters are trainable.

	    Args:
	        model: Any object exposing ``named_parameters()`` whose parameters
	            provide ``numel()`` and ``requires_grad``.

	    The output has the form
	    ``trainable params: T || all params: A || trainable%: P``.
	    A model without parameters reports ``0.0`` percent instead of raising
	    ``ZeroDivisionError``.
	    """
	    trainable_params = 0
	    all_param = 0
	    for _, param in model.named_parameters():
	        count = param.numel()
	        all_param += count
	        if param.requires_grad:
	            trainable_params += count
	    # Guard the ratio: an empty module has nothing to divide by.
	    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
	    print(
	        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
	    )

	def save_config_to_txt(config, filename):
	    """Save the configuration dictionary to a text file.

	    Args:
	        config: Mapping of option name to value; each pair is written as one
	            ``key: value`` line, in the mapping's iteration order.
	        filename: Path of the file to create or overwrite.

	    The file is written as UTF-8 so that non-ASCII values do not depend on
	    the platform's default encoding.
	    """
	    with open(filename, 'w', encoding='utf-8') as f:
	        f.writelines(f"{key}: {value}\n" for key, value in config.items())

	def truncate_labels(labels, max_length):
	    """Return a new list in which every label sequence is cut to ``max_length`` items.

	    Sequences that are already short enough are kept whole (as slices of the
	    originals); the input list itself is not modified.
	    """
	    clipped = []
	    for per_sequence in labels:
	        clipped.append(per_sequence[:max_length])
	    return clipped

	def compute_metrics(p):
	    """Compute token-level binary classification metrics for evaluation.

	    Args:
	        p: A ``(predictions, labels)`` pair. ``predictions`` holds the logits
	            with the class dimension on axis 2; ``labels`` uses ``-100`` for
	            positions that must be ignored.

	    Returns:
	        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc`` and
	        ``mcc``. ``auc`` is ``nan`` when the evaluated labels contain only one
	        class, because ROC-AUC is undefined in that case.
	    """
	    logits, labels = p
	    logits = np.asarray(logits)
	    labels = np.asarray(labels)

	    # Drop ignored positions (label == -100) before scoring.
	    keep = labels != -100
	    y_true = labels[keep]
	    y_pred = np.argmax(logits, axis=2)[keep]

	    accuracy = accuracy_score(y_true, y_pred)
	    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

	    # ROC-AUC needs a continuous score, not the thresholded prediction.
	    # For two classes the logit margin (class 1 minus class 0) ranks tokens
	    # exactly like the softmax probability of class 1.
	    y_score = (logits[..., 1] - logits[..., 0])[keep]
	    if np.unique(y_true).size < 2:
	        auc = float("nan")
	    else:
	        auc = roc_auc_score(y_true, y_score)

	    mcc = matthews_corrcoef(y_true, y_pred)
	    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc, 'mcc': mcc}

	def compute_loss(model, logits, inputs):
	    """Class-weighted cross-entropy over the attended token positions.

	    Args:
	        model: Kept for interface compatibility; not inspected. The number of
	            labels is read from the last dimension of ``logits`` so that the
	            function also works when ``model`` is a wrapper that does not
	            expose ``.config`` (e.g. DistributedDataParallel).
	        logits: Tensor whose last dimension indexes the classes.
	        inputs: Mapping with ``"labels"`` and ``"attention_mask"`` tensors
	            holding one entry per token.

	    Returns:
	        Scalar loss tensor. Positions whose attention mask is not 1 are
	        replaced by the loss's ``ignore_index`` and do not contribute.

	    Uses the module-level ``class_weights`` tensor as per-class weights.
	    """
	    criterion = nn.CrossEntropyLoss(weight=class_weights)
	    flat_labels = inputs["labels"].view(-1)
	    is_padding = inputs["attention_mask"].view(-1) != 1
	    # Overwrite the labels of padded positions so the loss skips them.
	    targets = flat_labels.masked_fill(is_padding, criterion.ignore_index)
	    flat_logits = logits.view(-1, logits.size(-1))
	    return criterion(flat_logits, targets)

	# Load data from pickle files
	def _read_pickle(path):
	    """Return the object stored in the pickle file at ``path``.

	    NOTE: unpickling can execute arbitrary code; only load files you trust.
	    """
	    with open(path, "rb") as handle:
	        payload = pickle.load(handle)
	    # Same explicit collection the script runs after every load.
	    gc.collect()
	    return payload

	train_sequences = _read_pickle("data/16M_data_big/v2_train_sequences_chunked_by_family.pkl")
	test_sequences = _read_pickle("data/16M_data_big/v2_test_sequences_chunked_by_family.pkl")
	train_labels = _read_pickle("data/16M_data_big/v2_train_labels_chunked_by_family.pkl")
	test_labels = _read_pickle("data/16M_data_big/v2_test_labels_chunked_by_family.pkl")

	# Derive the per-sequence length budget once special tokens are accounted for.
	desired_length = 1022
	tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
	# Tokenize a single-residue sequence: every token beyond that one residue is
	# counted as a special token added by the tokenizer.
	sample_sequence = "A"
	tokenized_sample = tokenizer(sample_sequence)

	# Debugging print statements
	print(f"Sample Sequence: {sample_sequence}")
	print(f"Tokenized Sample: {tokenized_sample}")
	print(f"Number of tokens in tokenized sample: {len(tokenized_sample['input_ids'])}")

	# Token count minus the one residue in the sample.
	num_special_tokens = len(tokenized_sample["input_ids"]) - 1
	print(f"Number of special tokens: {num_special_tokens}")

	# effective_length is used below both as the tokenizer's max_length and as
	# the label truncation length.
	# NOTE(review): if the tokenizer's max_length already counts special tokens,
	# sequences are cut shorter than desired_length intends -- confirm.
	effective_length = desired_length - num_special_tokens
	print(f"Effective sequence length (accounting for special tokens): {effective_length}")

	# Custom Dataset for on-the-fly tokenization
	class CustomDataset(TorchDataset):
	    """Dataset that tokenizes one sequence per access instead of up front.

	    Args:
	        sequences: Indexable collection of input sequences.
	        labels: Indexable collection of per-position label sequences, aligned
	            with ``sequences`` by index.
	        tokenizer: Callable applied to a single sequence; it is invoked with
	            ``padding='max_length'``, ``truncation=True`` and
	            ``return_tensors="pt"``.
	        max_length: Length passed to the tokenizer as ``max_length`` and used
	            to truncate the label sequence.
	    """

	    def __init__(self, sequences, labels, tokenizer, max_length):
	        self.sequences = sequences
	        self.labels = labels
	        self.tokenizer = tokenizer
	        self.max_length = max_length

	    def __len__(self):
	        """Return the number of sequences."""
	        return len(self.sequences)

	    def __getitem__(self, idx):
	        """Tokenize sequence ``idx`` and attach its truncated labels.

	        Returns the tokenizer output with the batch dimension removed and an
	        added ``'labels'`` entry (``torch.long``).
	        """
	        # Use the instance's own max_length rather than a module-level global,
	        # so the dataset honours the value it was constructed with.
	        tokenized = self.tokenizer(
	            self.sequences[idx],
	            padding='max_length',
	            truncation=True,
	            max_length=self.max_length,
	            return_tensors="pt",
	            is_split_into_words=False,
	        )

	        # Remove batch dimension
	        for key in list(tokenized.keys()):
	            tokenized[key] = tokenized[key][0]

	        # NOTE(review): labels are not shifted or padded here; if the tokenizer
	        # prepends a special token, label i lines up with token i rather than
	        # residue i -- confirm the intended alignment.
	        tokenized['labels'] = torch.tensor(self.labels[idx][:self.max_length], dtype=torch.long)

	        # No per-item gc.collect(): a full collection on every sample only
	        # slows data loading and frees nothing that refcounting does not.
	        return tokenized


	# Wrap the raw sequences/labels; tokenization happens lazily per item.
	train_dataset = CustomDataset(train_sequences, train_labels, tokenizer, effective_length)
	test_dataset = CustomDataset(test_sequences, test_labels, tokenizer, effective_length)


	# Compute Class Weights
	classes = [0, 1]
	# The flattened copy of every training label that used to be built here was
	# never read again (class weights are computed from `train_labels` in
	# batches), so that pass over the whole label set has been dropped.

	# Drop the module-level names; the datasets keep their own references.
	del train_sequences, test_sequences, test_labels
	gc.collect()

	def compute_average_class_weight(train_labels, classes, batch_size):
	    """Compute 'balanced' class weights over all labels, one batch at a time.

	    The labels are flattened ``batch_size`` sequences at a time to bound
	    memory, but only per-class *counts* are accumulated. The result is the
	    exact balanced weight ``n_labels / (n_classes * count[c])`` for the whole
	    data set. Averaging per-batch weights instead would be biased (a short
	    final batch would count as much as a full one) and would fail whenever a
	    single batch happened to lack one class.

	    Args:
	        train_labels: Sliceable sequence of per-sequence label iterables.
	        classes: The class labels, in the order the weights are returned.
	        batch_size: Number of label sequences flattened per step.

	    Returns:
	        ``numpy.ndarray`` of float64 weights, one per entry of ``classes``.

	    Raises:
	        ValueError: If a label outside ``classes`` is encountered, or if any
	            class (or every label) is absent from ``train_labels``.
	    """
	    class_ids = np.asarray(classes)
	    counts = np.zeros(len(class_ids), dtype=np.int64)
	    n_labels = 0

	    for start_idx in tqdm(range(0, len(train_labels), batch_size), desc="Computing class weights in batches"):
	        batch_labels = train_labels[start_idx:start_idx + batch_size]
	        # int8 keeps the temporary small; labels are expected to fit in it.
	        flat_labels = np.array([label for sublist in batch_labels for label in sublist], dtype=np.int8)
	        counts += np.array([np.count_nonzero(flat_labels == c) for c in class_ids], dtype=np.int64)
	        n_labels += flat_labels.size

	    if n_labels != counts.sum():
	        raise ValueError("classes should include all valid labels that can be in train_labels")
	    if n_labels == 0 or np.any(counts == 0):
	        raise ValueError("every class in classes must occur at least once in train_labels")

	    return n_labels / (len(class_ids) * counts.astype(np.float64))

	# Number of label sequences handed to compute_average_class_weight per step.
	batch_size = 100000 # You can adjust this based on your memory capacity
	class_weights = compute_average_class_weight(train_labels, classes, batch_size)
	# Convert to a float32 tensor on the accelerator's device; the module-level
	# compute_loss() reads this global as the CrossEntropyLoss class weights.
	class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)

	# Drop the module-level name; train_dataset still references the labels.
	del train_labels
	gc.collect()


	# Define Custom Trainer Class
	class WeightedTrainer(Trainer):
	    """Trainer that replaces the default loss with the class-weighted loss."""

	    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
	        """Run the model and score it with the module-level ``compute_loss``.

	        Args:
	            model: The model to run; called as ``model(**inputs)``.
	            inputs: Batch mapping; must contain the keys the module-level
	                ``compute_loss`` reads (``labels``, ``attention_mask``).
	            return_outputs: When true, return ``(loss, outputs)``.
	            num_items_in_batch: Accepted so the override stays callable by
	                Trainer versions that pass this argument; unused because the
	                weighted loss does its own normalisation.

	        Returns:
	            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
	        """
	        outputs = model(**inputs)
	        loss = compute_loss(model, outputs.logits, inputs)
	        if return_outputs:
	            return loss, outputs
	        return loss


	# Configure the quantization settings passed to from_pretrained() in
	# train_function_no_sweeps: 4-bit loading, NF4 quantization type, double
	# quantization enabled, bfloat16 compute dtype.
	# NOTE(review): the TrainingArguments below set fp16=True while the compute
	# dtype here is bfloat16 -- confirm that this combination is intended.
	bnb_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_use_double_quant=True,
	bnb_4bit_quant_type="nf4",
	bnb_4bit_compute_dtype=torch.bfloat16
	)


	def train_function_no_sweeps(train_dataset, test_dataset):
	    """Fine-tune the 4-bit quantized ESM-2 checkpoint with LoRA adapters.

	    Uses a fixed hyperparameter set (no sweep), logs it to W&B and to a text
	    file, trains with ``WeightedTrainer`` and saves the resulting model and
	    tokenizer.

	    Args:
	        train_dataset: Dataset used for training.
	        test_dataset: Dataset used for per-epoch evaluation.

	    Side effects:
	        * writes ``esm2_t33_650M_qlora_config_<timestamp>.txt`` in the
	          current directory;
	        * writes checkpoints under
	          ``esm2_t33_650M_qlora_binding_sites_<timestamp>``;
	        * saves the final model and tokenizer under
	          ``qlora_binding_sites/best_model_esm2_t33_650M_qlora_<timestamp>``.

	    All three paths share one timestamp so the artefacts of a run can be
	    matched by name.
	    """
	    # Directly set the config
	    config = {
	        "lora_alpha": 1,
	        "lora_dropout": 0.5,
	        "lr": 1.701568055793089e-04,
	        "lr_scheduler_type": "cosine",
	        "max_grad_norm": 0.5,
	        "num_train_epochs": 4,
	        "per_device_train_batch_size": 60,
	        "r": 2,
	        "weight_decay": 0.3,
	        # Add other hyperparameters as needed
	    }

	    # Log the config to W&B
	    wandb.config.update(config)

	    # One timestamp for the whole run: taking a second one after model loading
	    # would give the config file and the output directory different names.
	    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

	    # Save the config to a text file
	    config_filename = f"esm2_t33_650M_qlora_config_{timestamp}.txt"
	    save_config_to_txt(config, config_filename)

	    model_checkpoint = "facebook/esm2_t33_650M_UR50D"

	    # Define labels and model
	    id2label = {0: "No binding site", 1: "Binding site"}
	    label2id = {v: k for k, v in id2label.items()}

	    model = AutoModelForTokenClassification.from_pretrained(
	        model_checkpoint,
	        num_labels=len(id2label),
	        id2label=id2label,
	        label2id=label2id,
	        quantization_config=bnb_config,  # Apply quantization here
	    )

	    # Prepare the model for 4-bit quantization training
	    model.gradient_checkpointing_enable()
	    model = prepare_model_for_kbit_training(model)

	    # Convert the model into a PeftModel
	    # NOTE(review): the class-qualified entries ("EsmSelfOutput.dense", ...)
	    # look like class names rather than module paths; if PEFT matches
	    # target_modules against module names they never match and only
	    # query/key/value/classifier receive adapters -- confirm. Left unchanged
	    # so the trained architecture stays the same.
	    peft_config = LoraConfig(
	        task_type=TaskType.TOKEN_CLS,
	        inference_mode=False,
	        r=config["r"],
	        lora_alpha=config["lora_alpha"],
	        target_modules=[
	            "query",
	            "key",
	            "value",
	            "EsmSelfOutput.dense",
	            "EsmIntermediate.dense",
	            "EsmOutput.dense",
	            "EsmContactPredictionHead.regression",
	            "classifier",
	        ],
	        lora_dropout=config["lora_dropout"],
	        bias="none",  # or "all" or "lora_only"
	        # modules_to_save=["classifier"]
	    )
	    model = get_peft_model(model, peft_config)
	    print_trainable_parameters(model)  # added this in

	    # Use the accelerator
	    # NOTE(review): the Trainer also handles device placement itself, and
	    # preparing plain datasets is presumably a pass-through -- confirm whether
	    # these calls are still needed.
	    model = accelerator.prepare(model)
	    train_dataset = accelerator.prepare(train_dataset)
	    test_dataset = accelerator.prepare(test_dataset)

	    # Training setup
	    training_args = TrainingArguments(
	        output_dir=f"esm2_t33_650M_qlora_binding_sites_{timestamp}",
	        learning_rate=config["lr"],
	        lr_scheduler_type=config["lr_scheduler_type"],
	        gradient_accumulation_steps=4,  # changed from 1 to 4
	        # warmup_steps=2, # added this in
	        max_grad_norm=config["max_grad_norm"],
	        per_device_train_batch_size=config["per_device_train_batch_size"],
	        per_device_eval_batch_size=config["per_device_train_batch_size"],
	        num_train_epochs=config["num_train_epochs"],
	        weight_decay=config["weight_decay"],
	        evaluation_strategy="epoch",
	        save_strategy="epoch",
	        load_best_model_at_end=True,
	        metric_for_best_model="f1",
	        greater_is_better=True,
	        push_to_hub=False,
	        logging_dir=None,
	        logging_first_step=False,
	        logging_steps=200,
	        save_total_limit=7,
	        no_cuda=False,
	        seed=8893,
	        fp16=True,
	        report_to='wandb',
	        optim="paged_adamw_8bit",  # added this in
	    )

	    # Initialize Trainer
	    trainer = WeightedTrainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	        eval_dataset=test_dataset,
	        tokenizer=tokenizer,
	        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
	        compute_metrics=compute_metrics,
	    )

	    # Train and Save Model
	    trainer.train()
	    save_path = os.path.join("qlora_binding_sites", f"best_model_esm2_t33_650M_qlora_{timestamp}")
	    trainer.save_model(save_path)
	    tokenizer.save_pretrained(save_path)

	# Call the training function
	if __name__ == "__main__":
	    # Start training only when executed as a script; the data loading and
	    # class-weight computation above still run on plain import.
	    train_function_no_sweeps(train_dataset, test_dataset)