import torch
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
)
from datasets import Dataset, load_from_disk
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from functools import partial
import os

# Selects which training run's artifacts (tokenized data, fine-tuned model) to load
version = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Cache location for this version's tokenized inference dataset
    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned model checkpoint
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)
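    # (Trainer also places the model on the detected device itself, so the
    # explicit .to(device) above is defensive rather than strictly required.)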

    # Load and prepare the new dataset
    if os.path.exists(cleaned_path):
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")
        # (Earlier versions built the corpus from data_1/True.csv and
        # data_1/Fake.csv with labels 1 and 0; this version evaluates on the
        # data_3 news_articles dataset instead.)

        df = pd.read_csv("./data_3/news_articles.csv")
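        # Drop metadata columns that the text classifier does not use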
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )
        # Map "Real" to 1 and "Fake" to 0; any other value becomes NaN
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        # Keep only rows that received a valid binary label
        df = df[df["label"].isin([1, 0])]

        # Drop rows where the language is not 'english'
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # Convert "no title" to empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)

        # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )

        # Remove patterns like "Featured image via author name / image place"
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )

        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)

        # preserve_index=False stops the filtered DataFrame's index from being
        # carried along as an extra "__index_level_0__" column
        dataset = Dataset.from_pandas(df, preserve_index=False)

        def tokenize(tokenizer, examples):
            # Truncate to the model's 512-token limit; padding is deferred to
            # the data collator, which pads each prediction batch dynamically
            return tokenizer(examples["text"], truncation=True, max_length=512)

        # Bind the tokenizer as the first argument so map() receives a
        # single-argument function
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        # Tokenize in batches across 8 worker processes, then cache the result
        # so later runs can skip this preparation step
        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)

    # Expose only the tensor columns the model and metrics need
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Dataset ready for inference.")

    # A bare Trainer suffices for prediction; the collator pads each batch
    trainer = Trainer(model=model, data_collator=DataCollatorWithPadding(tokenizer))
    predictions = trainer.predict(dataset)
    preds = predictions.predictions.argmax(-1)
    # AUC-ROC needs positive-class probabilities, not argmax labels
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1)[:, 1]
    accuracy = accuracy_score(predictions.label_ids, preds)
    f1 = f1_score(predictions.label_ids, preds)
    auc_roc = roc_auc_score(predictions.label_ids, probs.numpy())

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")