import os

import pandas as pd
import torch
from datasets import Dataset, load_from_disk
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

version = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
    }


df = pd.read_csv("./data_2/WELFake_Dataset.csv")
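
# Drop the unnamed index column from the CSV, then any rows with a
# missing title, text, or label.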
df.drop(df.columns[0], axis=1, inplace=True)
df.dropna(inplace=True)
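
# Invert the 0/1 label encoding from the raw dataset.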
df["label"] = df["label"].map({0: 1, 1: 0})
df["text"] = df["title"] + " " + df["text"]
df["text"] = df["text"].str.replace(
    r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
)
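
# Remove "Featured image via ..." photo credits for the same reason.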
df["text"] = df["text"].str.replace(r"Featured image via .+?\.($|\s)", "", regex=True)

df = df[["text", "label"]]
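
# 60/20/20 train/validation/test split: hold out 20% for test, then
# take 25% of the remaining 80% (20% of the total) for validation.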
train_val, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val, test_size=0.25, random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model.to(device)

if __name__ == "__main__":
    tokenized_data_dir = f"./output/version_{version}/tokenized_data_{version}"
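
    # Load cached tokenized splits from a previous run if they exist;
    # otherwise tokenize from scratch and cache the result.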
    try:
        print("Loading tokenized data from disk...")
        train_dataset = load_from_disk(f"{tokenized_data_dir}/train")
        val_dataset = load_from_disk(f"{tokenized_data_dir}/validation")
        test_dataset = load_from_disk(f"{tokenized_data_dir}/test")
    except FileNotFoundError:
        print("Tokenizing...")

        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)

        def tokenize(examples):
            # Truncate at DistilBERT's 512-token limit; padding=True
            # pads to the longest sequence in each map batch.
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        train_dataset = train_dataset.map(tokenize, batched=True, num_proc=8)
        val_dataset = val_dataset.map(tokenize, batched=True, num_proc=8)
        test_dataset = test_dataset.map(tokenize, batched=True, num_proc=8)

        os.makedirs(tokenized_data_dir, exist_ok=True)
        train_dataset.save_to_disk(f"{tokenized_data_dir}/train")
        val_dataset.save_to_disk(f"{tokenized_data_dir}/validation")
        test_dataset.save_to_disk(f"{tokenized_data_dir}/test")
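
    # Expose the model inputs and labels to the Trainer as torch
    # tensors; the raw "text" column stays hidden.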
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Datasets ready.")
    training_args = TrainingArguments(
        output_dir=f"./output/version_{version}",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"./logs/version_{version}",
        logging_steps=50,
        eval_steps=1000,
        save_steps=1000,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    model_save_path = f"./output/version_{version}/best_model_{version}"
    trainer.save_model(model_save_path)
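
    # Held-out test evaluation. AUC-ROC needs class scores rather than
    # hard argmax labels, hence the softmax probabilities below.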
    predictions = trainer.predict(test_dataset)
    test_preds = predictions.predictions.argmax(-1)
    test_probs = torch.softmax(
        torch.tensor(predictions.predictions), dim=-1
    )[:, 1].numpy()
    test_accuracy = accuracy_score(predictions.label_ids, test_preds)
    test_f1 = f1_score(predictions.label_ids, test_preds)
    test_auc_roc = roc_auc_score(predictions.label_ids, test_probs)

    print(f"Test Set Accuracy: {test_accuracy}")
    print(f"Test Set F1 Score: {test_f1}")
    print(f"Test Set AUC-ROC: {test_auc_roc}")
    conf_matrix = confusion_matrix(predictions.label_ids, test_preds)
    conf_matrix_df = pd.DataFrame(conf_matrix)
    conf_matrix_df.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
    )
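
    # trainer.state.log_history interleaves training rows (which carry
    # "loss") with evaluation rows (which carry "eval_loss"); filtering
    # on "loss" alone would drop every evaluation row, so split the two
    # and merge them on "step".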
    log_df = pd.DataFrame(trainer.state.log_history)
    train_logs = log_df.dropna(subset=["loss"])[["step", "epoch", "loss"]]
    eval_logs = log_df.dropna(subset=["eval_loss"])[
        ["step", "eval_loss", "eval_accuracy"]
    ]
    metrics = train_logs.merge(eval_logs, on="step", how="outer")
    metrics.rename(
        columns={
            "loss": "train_loss",
            "eval_loss": "val_loss",
            "eval_accuracy": "val_accuracy",
        },
        inplace=True,
    )
    metrics.to_csv(
        f"./output/version_{version}/training_metrics_{version}.csv", index=False
    )