import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer, BertForSequenceClassification, BertConfig,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback, TrainerCallback
)
from datasets import Dataset
# ---------------- CONFIGURATION ---------------- #
# You can move these constants into `src/config.py` for a more modular layout.
MODEL_NAME = "indobenchmark/indobert-base-p1"
MODEL_DIR = "/content/drive/MyDrive/model-spam/"  # Change this to match your Drive path
MAX_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4
WEIGHT_DECAY = 0.01
# ---------------- TOKENIZE FUNCTION ---------------- #
def tokenize_function(tokenizer, examples):
    # Truncate only; padding is left to DataCollatorWithPadding below, which
    # pads each batch dynamically instead of to a fixed max_length width.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)
# ---------------- METRICS ---------------- #
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
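# Worked example (hypothetical values, for illustration only): given gold
# labels [0, 1, 1] and argmax predictions [0, 1, 0], compute_metrics returns
# accuracy = 2/3 ≈ 0.667, precision = 1/1 = 1.0, recall = 1/2 = 0.5, and
# f1 = 2 * 1.0 * 0.5 / 1.5 ≈ 0.667, since one spam message is missed.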
# ---------------- PLOTTING CALLBACK ---------------- #
class CustomCallback(TrainerCallback):
    """Collects loss/accuracy curves during training and saves them as a plot.

    Subclasses TrainerCallback rather than EarlyStoppingCallback: early
    stopping is handled by the separate EarlyStoppingCallback registered on
    the Trainer, so this callback does bookkeeping and plotting only.
    """
    def __init__(self):
        super().__init__()
        self.train_loss = []
        self.val_loss = []
        self.accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` can be None; guard before the membership test
        if logs and 'loss' in logs:
            self.train_loss.append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            if 'eval_loss' in metrics:
                self.val_loss.append(metrics['eval_loss'])
            if 'eval_accuracy' in metrics:
                self.accuracy.append(metrics['eval_accuracy'])
        # Redraw and overwrite the figure after every evaluation
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.plot(self.train_loss, label='Training Loss')
        plt.plot(self.val_loss, label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epoch')  # one point per epoch, since logging_strategy="epoch"
        plt.ylabel('Loss')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(self.accuracy, label='Accuracy', color='green')
        plt.title('Validation Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.savefig("training_metrics.png")
        plt.close()
# ---------------- CONFUSION MATRIX ---------------- #
def plot_confusion_matrix(trainer, dataset, filename="confusion_matrix.png"):
    predictions = trainer.predict(dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Non Spam', 'Spam'],
                yticklabels=['Non Spam', 'Spam'])
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(filename)
    plt.close()
# ---------------- MAIN FUNCTION ---------------- #
def train_model():
    # Load dataset
    df = pd.read_csv("data/datasetspam.csv")[["text", "label"]].dropna()
    # Stratified splits: 15% held out for test, then 15% of the remainder for validation
    df_train_val, df_test = train_test_split(
        df, test_size=0.15, random_state=42, stratify=df["label"]
    )
    df_train, df_val = train_test_split(
        df_train_val, test_size=0.15, random_state=42, stratify=df_train_val["label"]
    )
    print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")
    # Tokenizer and model configuration
    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
    config = BertConfig.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        hidden_dropout_prob=0.3,
        attention_probs_dropout_prob=0.3,
    )
    # Note: label_smoothing_factor is a TrainingArguments option, not a
    # BertConfig one; it is set in TrainingArguments below where it takes effect.
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
    # Hugging Face datasets
    train_dataset = Dataset.from_pandas(df_train.reset_index(drop=True))
    val_dataset = Dataset.from_pandas(df_val.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(df_test.reset_index(drop=True))
    train_dataset = train_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)
    val_dataset = val_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)
    test_dataset = test_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)
    # Collator: pads each batch to its longest sequence (dynamic padding)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest', max_length=MAX_LENGTH)
    # Training args
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        label_smoothing_factor=0.1,  # moved here from BertConfig (see note above)
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none",
        save_total_limit=1,
    )
    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[
            CustomCallback(),  # plotting/bookkeeping only
            EarlyStoppingCallback(early_stopping_patience=2)  # stop after 2 epochs without improvement
        ]
    )
    trainer.train()
    # Save the confusion-matrix plot (evaluated on the held-out test set)
    plot_confusion_matrix(trainer, test_dataset)
    # Save model and tokenizer
    os.makedirs(MODEL_DIR, exist_ok=True)
    # Option 1: native transformers format
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    # Option 2: raw PyTorch state_dict
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, "model.pt"))
    print("✅ Training finished. Model and tokenizer saved successfully.")
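# ---------------- INFERENCE SKETCH ---------------- #
# A minimal sketch (not part of the original pipeline) of how the artifacts
# saved above could be loaded back for prediction. `predict_text` is a
# hypothetical helper; it assumes MODEL_DIR contains the files written by
# train_model() and that label 1 means "Spam", matching the confusion-matrix
# tick labels used in plot_confusion_matrix.
def predict_text(text: str) -> str:
    tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
    model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
    model.eval()
    inputs = tokenizer(text, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return "Spam" if int(logits.argmax(dim=-1)) == 1 else "Non Spam"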
# ---------------- ENTRY POINT ---------------- #
if __name__ == "__main__":
    train_model()