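"""Run inference with a fine-tuned DistilBERT fake-news classifier.

Loads the best checkpoint for the configured `version`, tokenizes (or loads a
cached tokenized copy of) the data_3 news-articles dataset, and reports
accuracy, F1, AUC-ROC, and a confusion matrix.
"""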
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
)
from datasets import Dataset, load_from_disk
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from functools import partial
import os

version = 3  # experiment version; selects the checkpoint and output directory used below

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned checkpoint for this version
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)
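    # Note: Trainer also places the model on the detected device during its own
    # initialization; the explicit .to(device) above just makes the intent visible.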

    # Load and prepare the new dataset
    if os.path.exists(cleaned_path):
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")
        # # Load the datasets
        # true_news = pd.read_csv("data_1/True.csv")
        # fake_news = pd.read_csv("data_1/Fake.csv")

        # # Add labels
        # true_news["label"] = 1
        # fake_news["label"] = 0

        # # Combine the datasets
        # df = pd.concat([true_news, fake_news], ignore_index=True)

        df = pd.read_csv("./data_3/news_articles.csv")
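        # Drop metadata columns that are not needed for text-only classification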
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )
        # Map "Real" to 1 and "Fake" to 0; any other value becomes NaN
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        # Keep only rows that mapped cleanly to a binary label
        df = df[df["label"].isin([1, 0])]

        # Drop rows where the language is not 'english'
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # Convert "no title" to empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)

        # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )

        # Remove patterns like "Featured image via author name / image place"
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )

        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)

        # preserve_index=False keeps the filtered DataFrame's index out of the dataset
        dataset = Dataset.from_pandas(df, preserve_index=False)

        def tokenize(tokenizer, examples):
            # padding=True pads each map() chunk to its own longest sequence; the
            # Trainer's collator (see below) re-pads each prediction batch.
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # Use partial to create a new function that has tokenizer as its first argument
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        # Use tokenize_with_tokenizer in the map function
        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)  # cache so later runs take the load_from_disk branch

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")

    # Trainer is used here purely as a batched inference loop; passing the tokenizer
    # gives it a padding data collator, so batches that span map() chunks padded to
    # different lengths still stack correctly.
    trainer = Trainer(model=model, tokenizer=tokenizer)
    predictions = trainer.predict(dataset)
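    # predictions.predictions holds the raw logits; predictions.label_ids the true labels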

    # Compute metrics
    true_labels = predictions.label_ids
    pred_labels = predictions.predictions.argmax(-1)
    accuracy = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    # Convert logits to class-1 probabilities for AUC-ROC; ranking by the raw class-1
    # logit alone is not equivalent, since softmax depends on both logits.
    probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()
    auc_roc = roc_auc_score(true_labels, probs)

    # Generate and save the confusion matrix (rows = true class, columns = predicted class)
    cm = confusion_matrix(true_labels, pred_labels)
    cm_df = pd.DataFrame(
        cm, index=["true_fake", "true_real"], columns=["pred_fake", "pred_real"]
    )
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename)
    print(f"Confusion Matrix saved to {cm_filename}")

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")