import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from datasets import Dataset, load_from_disk
import os

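# Run identifier used to tag the output, log, and cached-tokenization directories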
version = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


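# Accuracy and binary F1 computed on the argmax of the logits; reported by the
# Trainer at every evaluation step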
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
    }


# Load the dataset
df = pd.read_csv("./data_2/WELFake_Dataset.csv")

# Drop the unnamed index column carried over from the CSV and any rows with missing values
df.drop(df.columns[0], axis=1, inplace=True)
df.dropna(inplace=True)

# Swap the labels: the CSV ships with the opposite encoding to the one used here
df["label"] = df["label"].map({0: 1, 1: 0})

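# Prepend the headline to the article body so both contribute to the model input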
df["text"] = df["title"] + " " + df["text"]

# Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
df["text"] = df["text"].str.replace(
    r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
)

# Remove patterns like "Featured image via author name / image place"
df["text"] = df["text"].str.replace(r"Featured image via .+?\.($|\s)", "", regex=True)

df = df[["text", "label"]]

# Split the data into train, validate, and test sets
train_val, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(
    train_val, test_size=0.25, random_state=42
)  # 0.25 * 0.8 = 0.2
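# Resulting proportions: 60% train, 20% validation, 20% test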

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

model.to(device)

if __name__ == "__main__":
    tokenized_data_dir = f"./output/version_{version}/tokenized_data_{version}"

    # Reuse cached tokenized datasets when available; otherwise tokenize from scratch
    try:
        print("Loading tokenized data from disk...")
        train_dataset = load_from_disk(f"{tokenized_data_dir}/train")
        val_dataset = load_from_disk(f"{tokenized_data_dir}/validation")
        test_dataset = load_from_disk(f"{tokenized_data_dir}/test")
    except FileNotFoundError:
        print("Tokenizing...")
        # Convert DataFrame to Hugging Face Dataset
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)

        def tokenize(examples):
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # Apply tokenization using map with multiprocessing
        train_dataset = train_dataset.map(tokenize, batched=True, num_proc=8)
        val_dataset = val_dataset.map(tokenize, batched=True, num_proc=8)
        test_dataset = test_dataset.map(tokenize, batched=True, num_proc=8)

        # Save the tokenized data
        os.makedirs(tokenized_data_dir, exist_ok=True)
        train_dataset.save_to_disk(f"{tokenized_data_dir}/train")
        val_dataset.save_to_disk(f"{tokenized_data_dir}/validation")
        test_dataset.save_to_disk(f"{tokenized_data_dir}/test")

    # Set format for PyTorch
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")

    # Training configuration: evaluate and checkpoint every 1,000 steps, keep only
    # the two most recent checkpoints, and reload the best one (highest validation
    # accuracy) at the end of training
    training_args = TrainingArguments(
        output_dir=f"./output/version_{version}",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"./logs/version_{version}",
        logging_steps=50,
        eval_steps=1000,
        save_steps=1000,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        save_total_limit=2,
    )

    # Trainer with compute_metrics; passing the tokenizer lets the Trainer pad each
    # batch dynamically and saves the tokenizer alongside model checkpoints
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the best model
    model_save_path = f"./output/version_{version}/best_model_{version}"
    trainer.save_model(model_save_path)

    # Evaluation on the held-out test set
    predictions = trainer.predict(test_dataset)
    test_preds = predictions.predictions.argmax(-1)
    test_accuracy = accuracy_score(predictions.label_ids, test_preds)
    test_f1 = f1_score(predictions.label_ids, test_preds)
    # AUC-ROC needs a score rather than a hard label: use the positive-class probability
    test_probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()
    test_auc_roc = roc_auc_score(predictions.label_ids, test_probs)

    print(f"Test Set Accuracy: {test_accuracy}")
    print(f"Test Set F1 Score: {test_f1}")
    print(f"Test Set AUC-ROC: {test_auc_roc}")

    # Confusion matrix (rows = true labels, columns = predicted labels)
    conf_matrix = confusion_matrix(
        predictions.label_ids, predictions.predictions.argmax(-1)
    )
    conf_matrix_df = pd.DataFrame(
        conf_matrix, index=["true_0", "true_1"], columns=["pred_0", "pred_1"]
    )
    conf_matrix_df.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv"
    )

    # Extract training metrics from the trainer's log history. Training-loss entries
    # and evaluation entries are logged as separate rows, so keep any row that has
    # either one rather than dropping everything without a training loss.
    metrics = pd.DataFrame(trainer.state.log_history)
    metrics = metrics.dropna(subset=["loss", "eval_loss"], how="all")
    metrics = metrics[["epoch", "loss", "eval_loss", "eval_accuracy"]]
    metrics.rename(
        columns={
            "loss": "train_loss",
            "eval_loss": "val_loss",
            "eval_accuracy": "val_accuracy",
        },
        inplace=True,
    )
    metrics.to_csv(
        f"./output/version_{version}/training_metrics_{version}.csv", index=False
    )
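
    # --- Illustrative sketch, not part of the original pipeline ----------------
    # Shows how the saved checkpoint could be reloaded and used to score a single
    # article. The sample text is a made-up placeholder, not data from WELFake.
    reloaded_model = DistilBertForSequenceClassification.from_pretrained(model_save_path)
    reloaded_model.to(device)
    reloaded_model.eval()

    sample_text = "Example headline. Example article body to classify."
    encoded = tokenizer(
        sample_text, truncation=True, max_length=512, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = reloaded_model(**encoded).logits
    predicted_label = logits.argmax(-1).item()
    print(f"Sample prediction (illustrative only): {predicted_label}")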