"""Fine-tune IndoBERT for binary spam classification.

Trains indobenchmark/indobert-base-p1 on a labeled text dataset, plots
training metrics and a confusion matrix, and saves the model and tokenizer.
"""

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer, BertForSequenceClassification, BertConfig,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback, TrainerCallback
)
from datasets import Dataset


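# Hyperparameters. These sit within the ranges the original BERT paper
# recommends for fine-tuning (batch size 16 or 32; learning rate 5e-5,
# 3e-5, or 2e-5; 2-4 epochs).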
MODEL_NAME = "indobenchmark/indobert-base-p1"
MODEL_DIR = "/content/drive/MyDrive/model-spam/"  # assumes Colab with Drive mounted
MAX_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4
WEIGHT_DECAY = 0.01


def tokenize_function(tokenizer, examples):
    # Truncate only; DataCollatorWithPadding pads each batch dynamically,
    # which avoids padding every example to MAX_LENGTH up front.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    # average="binary" reports precision/recall/F1 for the positive class (spam).
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


class CustomCallback(TrainerCallback):
    """Collects loss and accuracy values during training and redraws the
    metrics plot after each evaluation."""

    def __init__(self):
        super().__init__()
        self.train_loss = []
        self.val_loss = []
        self.accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            self.train_loss.append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            if 'eval_loss' in metrics:
                self.val_loss.append(metrics['eval_loss'])
            if 'eval_accuracy' in metrics:
                self.accuracy.append(metrics['eval_accuracy'])

        # Overwrite the plot after every evaluation; with logging_strategy and
        # eval_strategy both set to "epoch", each point is one epoch.
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        plt.plot(self.train_loss, label='Training Loss')
        plt.plot(self.val_loss, label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.accuracy, label='Accuracy', color='green')
        plt.title('Validation Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.tight_layout()
        plt.savefig("training_metrics.png")
        plt.close()


def plot_confusion_matrix(trainer, dataset, filename="confusion_matrix.png"):
    predictions = trainer.predict(dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids

    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Non Spam', 'Spam'],
                yticklabels=['Non Spam', 'Spam'])
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(filename)
    plt.close()


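# The input CSV is expected to provide a free-text "text" column and a binary
# "label" column, with 0 = non-spam and 1 = spam (matching the
# confusion-matrix axis labels above).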
def train_model():
    df = pd.read_csv("data/datasetspam.csv")[["text", "label"]].dropna()

    # Stratified two-stage split: hold out 15% for testing, then 15% of the
    # remainder for validation.
    df_train_val, df_test = train_test_split(
        df, test_size=0.15, random_state=42, stratify=df["label"]
    )
    df_train, df_val = train_test_split(
        df_train_val, test_size=0.15, random_state=42, stratify=df_train_val["label"]
    )

    print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

    tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

    # Higher dropout than the default 0.1 for extra regularization.
    # Label smoothing is a Trainer option, not a BertConfig field, so it is
    # set in TrainingArguments below.
    config = BertConfig.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        hidden_dropout_prob=0.3,
        attention_probs_dropout_prob=0.3,
    )
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

    train_dataset = Dataset.from_pandas(df_train.reset_index(drop=True))
    val_dataset = Dataset.from_pandas(df_val.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(df_test.reset_index(drop=True))

    train_dataset = train_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)
    val_dataset = val_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)
    test_dataset = test_dataset.map(lambda x: tokenize_function(tokenizer, x), batched=True)

    # Pad each batch to its longest sequence rather than always to MAX_LENGTH.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        label_smoothing_factor=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none",
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,  # newer transformers releases prefer processing_class=tokenizer
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[
            CustomCallback(),
            EarlyStoppingCallback(early_stopping_patience=2),
        ],
    )

    trainer.train()

    # Evaluate the best checkpoint on the held-out test split.
    plot_confusion_matrix(trainer, test_dataset)

    os.makedirs(MODEL_DIR, exist_ok=True)
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

    # Also keep a raw state_dict alongside the save_pretrained checkpoint.
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, "model.pt"))

    print("✅ Training complete. Model and tokenizer saved successfully.")


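# A minimal inference sketch, separate from the training flow above: it
# assumes the artifacts saved to MODEL_DIR by train_model(), and the helper
# name classify_text is illustrative.
def classify_text(text, model_dir=MODEL_DIR):
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    inputs = tokenizer(text, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = logits.argmax(dim=-1).item()
    return "Spam" if pred == 1 else "Non Spam"

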
if __name__ == "__main__":
    train_model()