# --- Repository page metadata captured with the file (not part of the script) ---
# AmelieSchreiber's picture
# Upload qlora_train_v4.py
# a928b59
# raw
# history blame contribute delete
# No virus
# 11.6 kB
import os
import wandb
import numpy as np
import torch
import torch.nn as nn
from datetime import datetime
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, matthews_corrcoef
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
DataCollatorForTokenClassification,
TrainingArguments,
Trainer,
BitsAndBytesConfig,
default_data_collator
)
from torch.utils.data import Dataset as TorchDataset
from accelerate import Accelerator
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import pickle
import gc
from tqdm import tqdm
# Initialize accelerator and Weights & Biases.
# These statements run as soon as the module is executed, before any function
# defined below is called. The Accelerator instance is used further down for
# its `.device` (placement of the class-weight tensor) and for
# `accelerator.prepare(...)` inside the training function.
accelerator = Accelerator()
# Presumably read by wandb to label the source of the run — confirm against
# the wandb environment-variable documentation.
os.environ["WANDB_NOTEBOOK_NAME"] = 'qlora_train.py'
# Start a run in the 'binding_site_prediction' project; the later
# `wandb.config.update(...)` call attaches the hyperparameters to it.
wandb.init(project='binding_site_prediction')
# Helper Functions and Data Preparation
# -----------------------------------------------------------------------------
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()`` once, tallying the element count of
    every parameter and, separately, of those whose ``requires_grad`` flag is
    set, then prints both totals together with the trainable share in percent.
    """
    # (element count, trainable flag) for every parameter, gathered in one pass.
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, wants_grad in sizes if wants_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
def save_config_to_txt(config, filename):
    """Write every ``key: value`` pair of *config* to *filename*, one per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{name}: {setting}\n" for name, setting in config.items())
def truncate_labels(labels, max_length):
    """Return a new list holding the first *max_length* items of each label sequence."""
    clipped = []
    for sequence_labels in labels:
        clipped.append(sequence_labels[:max_length])
    return clipped
def compute_metrics(p):
    """
    Compute token-level binary classification metrics for evaluation.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds the raw
            two-class logits with the class scores on the last of three
            axes; ``labels`` holds the integer targets, with ``-100`` on
            every position that must be ignored.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc``
        and ``mcc`` computed over the non-ignored positions only.
    """
    logits, labels = p
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Positions that carry a real label; boolean indexing already flattens.
    keep = labels != -100
    predictions = np.argmax(logits, axis=2)[keep]
    # ROC AUC needs a continuous ranking score, not the thresholded class:
    # feeding hard 0/1 predictions collapses the curve to a single operating
    # point. The logit margin (class 1 minus class 0) orders tokens exactly
    # like the softmax probability of class 1 and needs no exponentiation.
    scores = logits[..., 1][keep].astype(np.float64) - logits[..., 0][keep]
    labels = labels[keep]

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    auc = roc_auc_score(labels, scores)
    mcc = matthews_corrcoef(labels, predictions)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc, 'mcc': mcc}
def compute_loss(model, logits, inputs):
    """
    Class-weighted cross-entropy over the attended tokens of one batch.

    Args:
        model: The model that produced *logits*. Kept for call
            compatibility; it is no longer inspected, so wrapped models that
            do not expose ``.config`` work too.
        logits: Per-token class scores; the last axis indexes the classes.
        inputs: Batch mapping with ``"labels"`` and ``"attention_mask"``
            tensors covering the same tokens as *logits*.

    Returns:
        Scalar loss tensor. Tokens whose attention mask is not 1 are replaced
        by the loss function's ``ignore_index`` and contribute nothing.

    Reads the module-level ``class_weights`` tensor for the per-class weights.
    """
    labels = inputs["labels"]
    loss_fct = nn.CrossEntropyLoss(weight=class_weights)

    attended = inputs["attention_mask"].view(-1) == 1
    # Take the class count from the logits themselves instead of
    # `model.config.num_labels`, which fails on wrappers without `.config`.
    flat_logits = logits.view(-1, logits.size(-1))
    flat_labels = labels.view(-1)
    # Same dtype/device as the labels, filled with the ignore marker.
    ignored = torch.full_like(flat_labels, loss_fct.ignore_index)
    targets = torch.where(attended, flat_labels, ignored)
    return loss_fct(flat_logits, targets)
# Load data from pickle files
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    # Same post-load collection pass the script has always run.
    gc.collect()
    return payload


# Loaded in the same order as before: train/test sequences, then train/test labels.
train_sequences = _load_pickle("data/16M_data_big/v2_train_sequences_chunked_by_family.pkl")
test_sequences = _load_pickle("data/16M_data_big/v2_test_sequences_chunked_by_family.pkl")
train_labels = _load_pickle("data/16M_data_big/v2_train_labels_chunked_by_family.pkl")
test_labels = _load_pickle("data/16M_data_big/v2_test_labels_chunked_by_family.pkl")
# Adjust max_sequence_length for special tokens
# Target length before room is reserved for tokenizer-added special tokens.
desired_length = 1022
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
# Tokenize a one-residue probe: every token beyond that single residue was
# added by the tokenizer, which yields the special-token count measured below.
# (assumes the tokenizer emits exactly one token for "A" — TODO confirm)
sample_sequence = "A"
tokenized_sample = tokenizer(sample_sequence)
# Debugging print statements
print(f"Sample Sequence: {sample_sequence}")
print(f"Tokenized Sample: {tokenized_sample}")
print(f"Number of tokens in tokenized sample: {len(tokenized_sample['input_ids'])}")
# One token belongs to the residue itself; the remainder are special tokens.
num_special_tokens = len(tokenized_sample["input_ids"]) - 1
print(f"Number of special tokens: {num_special_tokens}")
# Length budget passed to both CustomDataset instances as `max_length`.
# NOTE(review): a tokenizer `max_length` presumably already counts the special
# tokens, so subtracting them here may shorten sequences more than intended —
# confirm against the tokenizer documentation.
effective_length = desired_length - num_special_tokens
print(f"Effective sequence length (accounting for special tokens): {effective_length}")
# Custom Dataset for on-the-fly tokenization
class CustomDataset(TorchDataset):
    """
    Map-style dataset that tokenizes one sequence per access.

    Each item is the tokenizer output for a single sequence (batch axis
    removed) plus a ``labels`` tensor of the same length as ``input_ids``.
    Per-residue labels are written onto the residue tokens only; tokens the
    tokenizer added itself (start/end markers, padding) receive ``-100`` so
    that the loss and the metrics skip them.

    Args:
        sequences: Indexable collection of sequence strings.
        labels: Indexable collection of per-residue integer label sequences,
            parallel to *sequences*.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` (including ``return_special_tokens_mask``).
        max_length: Total token length every item is padded/truncated to.
    """

    def __init__(self, sequences, labels, tokenizer, max_length):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # Use the instance's own length budget rather than a module global so
        # the dataset honours the `max_length` it was constructed with.
        tokenized = self.tokenizer(
            self.sequences[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            is_split_into_words=False,
            return_special_tokens_mask=True,
        )
        # The mask is only needed for label alignment; it must not reach the
        # model's forward call, so take it out of the encoding.
        special_mask = tokenized.pop("special_tokens_mask")[0].bool()
        # Remove batch dimension
        for key in list(tokenized.keys()):
            tokenized[key] = tokenized[key][0]

        # Residue tokens: attended and not inserted by the tokenizer.
        residue_mask = ~special_mask & tokenized["attention_mask"].bool()
        residue_positions = torch.nonzero(residue_mask).flatten()
        residue_labels = torch.as_tensor(
            self.labels[idx][:residue_positions.numel()], dtype=torch.long
        )

        # Start from "ignore everywhere" and fill in the residue positions, so
        # label i lands on the token of residue i instead of being shifted
        # onto the leading special token.
        aligned = torch.full_like(tokenized["input_ids"], -100, dtype=torch.long)
        aligned[residue_positions[:residue_labels.numel()]] = residue_labels
        tokenized['labels'] = aligned
        # No per-item gc.collect(): a full collection on every sample only
        # slows data loading and frees nothing that refcounting does not.
        return tokenized
# Wrap the raw sequences/labels in datasets that tokenize on access.
train_dataset = CustomDataset(train_sequences, train_labels, tokenizer, effective_length)
test_dataset = CustomDataset(test_sequences, test_labels, tokenizer, effective_length)
# Compute Class Weights
classes = [0, 1]
# The class weights are derived from `train_labels` in batches further down,
# so no flattened copy of every training label is built here: that array was
# never read by anything in this script and only cost time and memory.
# Drop the module-level names that are no longer needed; the datasets keep
# their own references to the underlying objects.
del train_sequences, test_sequences, test_labels
gc.collect()
def compute_average_class_weight(train_labels, classes, batch_size):
    """
    Compute "balanced" class weights over all labels, one batch at a time.

    The labels are scanned in slices of *batch_size* sequences so that only
    one slice is flattened in memory at once. Per-class counts are
    accumulated across slices and turned into weights with the balanced
    formula ``n_samples / (n_classes * count)``. This gives the exact weights
    for the whole label set; averaging independently computed per-batch
    weights is biased (a short final batch counts as much as a full one) and
    fails outright when a batch happens to lack one class. The function name
    is kept for compatibility with existing callers.

    Args:
        train_labels: Sliceable collection of per-sequence label sequences.
        classes: Class label values, in the order the weights are returned.
        batch_size: Number of label sequences processed per step (> 0).

    Returns:
        ``numpy.ndarray`` of floats, one weight per entry of *classes*.

    Raises:
        ValueError: if *batch_size* is not positive, if a label outside
            *classes* is encountered, or if some class never occurs.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    class_values = np.asarray(classes)
    counts = np.zeros(len(class_values), dtype=np.int64)
    total = 0
    for start_idx in tqdm(range(0, len(train_labels), batch_size), desc="Computing class weights in batches"):
        batch_labels = train_labels[start_idx:start_idx + batch_size]
        flat_labels = np.array([label for sublist in batch_labels for label in sublist], dtype=np.int8)
        total += flat_labels.size
        counts += np.array(
            [np.count_nonzero(flat_labels == value) for value in class_values],
            dtype=np.int64,
        )
        # Release the flattened slice before the next one is built.
        del batch_labels, flat_labels

    if counts.sum() != total:
        raise ValueError("classes should include all valid labels that can be in y")
    if np.any(counts == 0):
        raise ValueError("classes should have valid labels that are in y")
    return total / (len(class_values) * counts)
# Number of label sequences handed to each class-weight batch.
batch_size = 100000 # You can adjust this based on your memory capacity
class_weights = compute_average_class_weight(train_labels, classes, batch_size)
# Convert to a float32 tensor on the accelerator's device. The module-level
# `compute_loss` reads this global when it builds its CrossEntropyLoss.
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)
# Removes only this module-level name: `train_dataset` was constructed from
# the same object and still holds a reference to it.
del train_labels
gc.collect()
# class_weights = torch.tensor(class_weights, dtype=np.int8).to(accelerator.device)
# Define Custom Trainer Class
class WeightedTrainer(Trainer):
    """Trainer whose loss is the module-level class-weighted ``compute_loss``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Run the model on *inputs* and return the class-weighted loss.

        Args:
            model: Model to evaluate.
            inputs: Batch mapping forwarded to the model as keyword arguments;
                also read by the loss for labels and attention mask.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted because newer Trainer versions pass
                it to ``compute_loss``; it is not used, since the weighted
                loss does its own averaging. The ``None`` default keeps older
                call sites working.
        """
        outputs = model(**inputs)
        logits = outputs.logits
        loss = compute_loss(model, logits, inputs)
        return (loss, outputs) if return_outputs else loss
# Configure the quantization settings
# Passed to `from_pretrained` as `quantization_config` in the training function.
# NOTE(review): the compute dtype here is bfloat16 while the TrainingArguments
# below set fp16=True — confirm that this combination is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the model weights in 4-bit form
    bnb_4bit_use_double_quant=True,  # turn on the double-quantization option
    bnb_4bit_quant_type="nf4",  # 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # dtype used for computation
)
def train_function_no_sweeps(train_dataset, test_dataset):
    """
    Fine-tune the 4-bit quantized ESM-2 checkpoint with LoRA adapters.

    Uses one fixed set of hyperparameters (no sweep). The function logs the
    hyperparameters to W&B and to a text file, loads the token-classification
    model with the module-level ``bnb_config``, wraps it with LoRA, trains it
    with ``WeightedTrainer`` and finally saves the model and the tokenizer.

    Every artifact of a run (config text file, Trainer output directory and
    final save directory) carries the same timestamp, taken once at the
    start, so that the files of one run can be matched to each other.

    Args:
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for the per-epoch evaluation.

    Returns:
        None. Results are written to disk and reported to W&B.
    """
    # Directly set the config
    config = {
        "lora_alpha": 1,
        "lora_dropout": 0.5,
        "lr": 1.701568055793089e-04,
        "lr_scheduler_type": "cosine",
        "max_grad_norm": 0.5,
        "num_train_epochs": 4,
        "per_device_train_batch_size": 60,
        "r": 2,
        "weight_decay": 0.3,
        # Add other hyperparameters as needed
    }

    # Log the config to W&B
    wandb.config.update(config)

    # Single timestamp shared by every artifact of this run.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    # Save the config to a text file
    config_filename = f"esm2_t33_650M_qlora_config_{timestamp}.txt"
    save_config_to_txt(config, config_filename)

    model_checkpoint = "facebook/esm2_t33_650M_UR50D"

    # Define labels and model
    id2label = {0: "No binding site", 1: "Binding site"}
    label2id = {v: k for k, v in id2label.items()}
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
        quantization_config=bnb_config,  # Apply quantization here
    )

    # Prepare the model for 4-bit quantization training
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Convert the model into a PeftModel
    # NOTE(review): entries such as "EsmSelfOutput.dense" look like class
    # names rather than module attribute paths — confirm that each entry
    # actually matches modules of this model.
    peft_config = LoraConfig(
        task_type=TaskType.TOKEN_CLS,
        inference_mode=False,
        r=config["r"],
        lora_alpha=config["lora_alpha"],
        target_modules=[
            "query",
            "key",
            "value",
            "EsmSelfOutput.dense",
            "EsmIntermediate.dense",
            "EsmOutput.dense",
            "EsmContactPredictionHead.regression",
            "classifier",
        ],
        lora_dropout=config["lora_dropout"],
        bias="none",  # or "all" or "lora_only"
        # modules_to_save=["classifier"]
    )
    model = get_peft_model(model, peft_config)
    print_trainable_parameters(model)

    # Use the accelerator
    model = accelerator.prepare(model)
    train_dataset = accelerator.prepare(train_dataset)
    test_dataset = accelerator.prepare(test_dataset)

    # Training setup
    training_args = TrainingArguments(
        output_dir=f"esm2_t33_650M_qlora_binding_sites_{timestamp}",
        learning_rate=config["lr"],
        lr_scheduler_type=config["lr_scheduler_type"],
        gradient_accumulation_steps=4,
        # warmup_steps=2,
        max_grad_norm=config["max_grad_norm"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        # Evaluation deliberately reuses the training batch size.
        per_device_eval_batch_size=config["per_device_train_batch_size"],
        num_train_epochs=config["num_train_epochs"],
        weight_decay=config["weight_decay"],
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Best checkpoint is chosen by the "f1" value from compute_metrics.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=False,
        logging_dir=None,
        logging_first_step=False,
        logging_steps=200,
        save_total_limit=7,
        no_cuda=False,
        seed=8893,
        fp16=True,
        report_to='wandb',
        optim="paged_adamw_8bit",
    )

    # Initialize Trainer
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    # Train and Save Model
    trainer.train()
    save_path = os.path.join("qlora_binding_sites", f"best_model_esm2_t33_650M_qlora_{timestamp}")
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
# Call the training function
# Only the training call is guarded; the statements earlier in this file
# (W&B init, data loading, tokenizer download, class-weight computation)
# appear to sit at module level and therefore run on import as well.
if __name__ == "__main__":
    train_function_no_sweeps(train_dataset, test_dataset)