import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; the PyTorch implementation is a drop-in replacement here
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler

from datasets import load_dataset

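# NOTE: data_path and model_path are intentionally left blank (fill in your own dataset location
# and model checkpoint). The CSV files are assumed to contain a "title" text column and a "labels"
# column, since those are the fields the training and evaluation loops read from each batch.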
data_path = ""
model_path = ""
data_files = {"train": "train_data.csv", "validation": "val_data.csv", "test": "test_data.csv"}

dataset_train = load_dataset(data_path, data_files=data_files, split="train")
dataset_val = load_dataset(data_path, data_files=data_files, split="validation")
dataset_test = load_dataset(data_path, data_files=data_files, split="test")

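# The Hugging Face datasets are wrapped directly in PyTorch DataLoaders: the default collate
# function keeps the "title" column as a list of strings and stacks integer labels into a tensor,
# which is the batch layout that train() and evaluate() below expect.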
train_loader = DataLoader(dataset_train, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=16)

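# dataset_val is loaded above but never wrapped in a DataLoader in this script; a validation loader
# could be created the same way, e.g. DataLoader(dataset_val, batch_size=16), if validation metrics
# between epochs are wanted.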


class CustomModel:
    def __init__(self, model_name="bert-base-uncased", num_labels=2, lr=5e-5, epochs=4, max_len=128):
        """
        Initialize the custom model with tokenizer, optimizer, scheduler, and training parameters.

        Args:
            model_name (str): Name of the pretrained BERT model.
            num_labels (int): Number of labels for the classification task.
            lr (float): Learning rate for the optimizer.
            epochs (int): Number of epochs for training.
            max_len (int): Maximum token length for sequences.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.epochs = epochs
        self.max_len = max_len

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        self.optimizer = AdamW(self.model.parameters(), lr=lr)

        self.scheduler = None

        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)

    def setup_scheduler(self, train_loader):
        """
        Set up a linear learning rate scheduler whose total step count is derived from the
        training data loader and the number of epochs.

        Args:
            train_loader (DataLoader): Training data loader.
        """
        num_training_steps = len(train_loader) * self.epochs
        self.scheduler = get_scheduler(
            "linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

    def tokenize_batch(self, texts):
        """
        Tokenize a batch of text inputs.

        Args:
            texts (list[str]): List of text strings to tokenize.

        Returns:
            dict: Tokenized inputs with attention masks and input IDs.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def train(self, train_loader):
        """
        Train the model with raw text inputs and labels.

        Args:
            train_loader (DataLoader): Training data loader containing text and labels.
        """
        # Create the learning rate scheduler lazily if setup_scheduler() was not called explicitly;
        # otherwise scheduler.step() below would fail on None.
        if self.scheduler is None:
            self.setup_scheduler(train_loader)

        self.model.train()
        for epoch in range(self.epochs):
            epoch_loss = 0
            for batch in train_loader:
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)

                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}
                tokenized_inputs['labels'] = labels

                outputs = self.model(**tokenized_inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                epoch_loss += loss.item()
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss / len(train_loader):.4f}")
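
    # Typical usage: call setup_scheduler(train_loader) once and then train(train_loader);
    # train() will also create the scheduler lazily on its first call if that step is skipped.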

    def evaluate(self, test_loader):
        """
        Evaluate the model with raw text inputs and labels.

        Args:
            test_loader (DataLoader): Test data loader containing text and labels.

        Returns:
            Tuple: True labels and predicted labels.
        """
        self.model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch in test_loader:
                texts, labels = batch['title'], batch['labels']
                labels = labels.to(self.device)

                tokenized_inputs = self.tokenize_batch(texts)
                tokenized_inputs = {key: val.to(self.device) for key, val in tokenized_inputs.items()}

                outputs = self.model(**tokenized_inputs)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                y_true.extend(labels.tolist())
                y_pred.extend(predictions.tolist())
        return y_true, y_pred

    def save_model(self, save_path):
        """
        Save the model locally in Hugging Face format.

        Args:
            save_path (str): Path to save the model.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
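
    # A directory written by save_model() can later be reloaded with
    # BertForSequenceClassification.from_pretrained(save_path) and BertTokenizer.from_pretrained(save_path),
    # or passed as model_name / model_path when constructing another CustomModel.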

    def push_model(self, repo_name):
        """
        Push the model to the Hugging Face Hub.

        Args:
            repo_name (str): Repository name on Hugging Face Hub.
        """
        self.model.push_to_hub(repo_name)
        self.tokenizer.push_to_hub(repo_name)


custom_model = CustomModel(model_name=model_path, num_labels=2, lr=5e-5, epochs=4)
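
# Optional fine-tuning step, sketched here as comments: model_path is assumed to point at an
# already fine-tuned checkpoint, so only evaluation runs below. To fine-tune from a base
# checkpoint instead, uncomment the following lines:
# custom_model.setup_scheduler(train_loader)
# custom_model.train(train_loader)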

y_true, y_pred = custom_model.evaluate(test_loader)

print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))
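
# The evaluated model could then be persisted or shared with the methods defined above; the path
# and repo name here are placeholders, not values from the original script:
# custom_model.save_model("finetuned-bert")                # hypothetical local directory
# custom_model.push_model("your-username/finetuned-bert")  # hypothetical Hub repository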