import torch
from transformers import (
DistilBertTokenizer,
DistilBertForSequenceClassification,
Trainer,
TrainingArguments,
)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from datasets import Dataset, load_from_disk
import os
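# Version tag used to namespace the output, log, and tokenized-data directories between runs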
version = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds)
return {
"accuracy": acc,
"f1": f1,
}
# Load the dataset
df = pd.read_csv("./data_2/WELFake_Dataset.csv")
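# The CSV provides "title", "text", and a binary "label", plus an index column that is dropped next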
# Drop index
df.drop(df.columns[0], axis=1, inplace=True)
df.dropna(inplace=True)
# Swap the labels, since the dataset's original encoding is the reverse of the convention used here
df["label"] = df["label"].map({0: 1, 1: 0})
df["text"] = df["title"] + " " + df["text"]
# Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
df["text"] = df["text"].str.replace(
r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
)
# Remove patterns like "Featured image via author name / image place"
df["text"] = df["text"].str.replace(r"Featured image via .+?\.($|\s)", "", regex=True)
df = df[["text", "label"]]
# Split the data into train, validate, and test sets
train_val, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(
train_val, test_size=0.25, random_state=42
) # 0.25 * 0.8 = 0.2
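# Resulting split: 60% train, 20% validation, 20% test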
# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
"distilbert-base-uncased", num_labels=2
)
model.to(device)
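# (The Trainer also places the model on the detected device itself; the explicit .to(device) just makes the intent clear.)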
if __name__ == "__main__":
tokenized_data_dir = f"./output/version_{version}/tokenized_data_{version}"
# Check if tokenized data exists
try:
print("Loading tokenized data from disk...")
train_dataset = load_from_disk(f"{tokenized_data_dir}/train")
val_dataset = load_from_disk(f"{tokenized_data_dir}/validation")
test_dataset = load_from_disk(f"{tokenized_data_dir}/test")
    except FileNotFoundError:  # nothing saved yet, so tokenize from scratch
print("Tokenizing...")
# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
def tokenize(examples):
return tokenizer(
examples["text"], padding=True, truncation=True, max_length=512
)
# Apply tokenization using map with multiprocessing
train_dataset = train_dataset.map(tokenize, batched=True, num_proc=8)
val_dataset = val_dataset.map(tokenize, batched=True, num_proc=8)
test_dataset = test_dataset.map(tokenize, batched=True, num_proc=8)
# Save the tokenized data
os.makedirs(tokenized_data_dir, exist_ok=True)
train_dataset.save_to_disk(f"{tokenized_data_dir}/train")
val_dataset.save_to_disk(f"{tokenized_data_dir}/validation")
test_dataset.save_to_disk(f"{tokenized_data_dir}/test")
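        # Later runs load these saved Arrow datasets directly and skip re-tokenization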
# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
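    # When batches are built, the data collator renames the "label" column to the
    # "labels" argument that DistilBertForSequenceClassification expects.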
print("Finished tokenizing.")
# Define training arguments
training_args = TrainingArguments(
output_dir=f"./output/version_{version}",
num_train_epochs=5,
per_device_train_batch_size=16,
per_device_eval_batch_size=64,
warmup_steps=500,
weight_decay=0.01,
logging_dir=f"./logs/version_{version}",
logging_steps=50,
eval_steps=1000,
save_steps=1000,
evaluation_strategy="steps",
save_strategy="steps",
load_best_model_at_end=True,
metric_for_best_model="accuracy",
greater_is_better=True,
save_total_limit=2,
)
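    # The model is evaluated and checkpointed every 1000 steps; load_best_model_at_end
    # restores the checkpoint with the highest validation accuracy once training finishes.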
    # Trainer with compute_metrics; passing the tokenizer makes the Trainer pad each
    # batch dynamically (so batches drawn from differently padded map chunks stack
    # cleanly) and saves the tokenizer alongside the model checkpoints.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
# Train the model
trainer.train()
# Save the best model
model_save_path = f"./output/version_{version}/best_model_{version}"
trainer.save_model(model_save_path)
    # Evaluation on the held-out test set
    predictions = trainer.predict(test_dataset)
    test_preds = predictions.predictions.argmax(-1)
    # AUC-ROC needs a continuous score, so use the positive-class probability from
    # the softmaxed logits rather than the hard argmax prediction.
    test_probs = torch.softmax(
        torch.tensor(predictions.predictions), dim=-1
    )[:, 1].numpy()
    test_accuracy = accuracy_score(predictions.label_ids, test_preds)
    test_f1 = f1_score(predictions.label_ids, test_preds)
    test_auc_roc = roc_auc_score(predictions.label_ids, test_probs)
print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")
print(f"Test Set AUC-ROC: {test_auc_roc}")
    # Confusion matrix: rows are true labels, columns are predicted labels
    conf_matrix = confusion_matrix(predictions.label_ids, test_preds)
    conf_matrix_df = pd.DataFrame(
        conf_matrix, index=["true_0", "true_1"], columns=["pred_0", "pred_1"]
    )
    conf_matrix_df.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv"
    )
    # Extract training and evaluation metrics from the trainer's log history
    metrics = pd.DataFrame(trainer.state.log_history)
    # Keep rows that carry either a training loss or an evaluation result;
    # filtering on "loss" alone would drop every evaluation entry.
    metrics = metrics[["epoch", "loss", "eval_loss", "eval_accuracy"]]
    metrics = metrics.dropna(subset=["loss", "eval_loss"], how="all")
metrics.rename(
columns={
"loss": "train_loss",
"eval_loss": "val_loss",
"eval_accuracy": "val_accuracy",
},
inplace=True,
)
metrics.to_csv(
f"./output/version_{version}/training_metrics_{version}.csv", index=False
)
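    # A minimal sketch of reusing the saved checkpoint for inference (hypothetical
    # example text; assumes the best_model directory produced above and that the
    # tokenizer was saved with it):
    #   model = DistilBertForSequenceClassification.from_pretrained(model_save_path)
    #   tokenizer = DistilBertTokenizer.from_pretrained(model_save_path)
    #   enc = tokenizer("Example headline and body", return_tensors="pt",
    #                   truncation=True, max_length=512)
    #   label = model(**enc).logits.argmax(-1).item()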