# fake-news-detector-DistilBERT / inference_main.py
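"""Run inference with a fine-tuned DistilBERT fake-news classifier on a new
dataset and report accuracy, F1, AUC-ROC, and a confusion matrix."""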
import torch
from transformers import (
DistilBertTokenizer,
DistilBertForSequenceClassification,
Trainer,
)
from datasets import Dataset, load_from_disk
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from functools import partial
import os
version = 3  # selects which training run's artifacts (model, tokenized cache) to load
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned model
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)
    # Load and prepare the new dataset
    if os.path.exists(cleaned_path):
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")
        # # Load the datasets (earlier data_1 pipeline, kept for reference)
        # true_news = pd.read_csv("data_1/True.csv")
        # fake_news = pd.read_csv("data_1/Fake.csv")
        # # Add labels
        # true_news["label"] = 1
        # fake_news["label"] = 0
        # # Combine the datasets
        # df = pd.concat([true_news, fake_news], ignore_index=True)
        df = pd.read_csv("./data_3/news_articles.csv")
        # Drop metadata columns that are not used as model input
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )
        # Map Real to 1 and Fake to 0, then drop rows whose label failed to map
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]
        # Keep only English-language articles
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)
        # Convert "no title" placeholders to an empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)
        df.dropna(inplace=True)
        # Remove datelines like "COUNTRY or STATE NAME (Reuters) -" and bare "(Reuters)" tags
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )
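        # e.g. "WASHINGTON (Reuters) - Lawmakers voted..." -> " Lawmakers voted..."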
        # Remove photo credits like "Featured image via <author / source>."
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )
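        # e.g. "...end of article. Featured image via Getty Images." -> "...end of article. "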
        # Prepend the title to the body text and keep only the model inputs
        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)
        dataset = Dataset.from_pandas(df)
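        # Note: Dataset.from_pandas keeps the filtered pandas index as an
        # "__index_level_0__" column; Trainer drops unused columns by default,
        # so it does not affect prediction here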
        def tokenize(tokenizer, examples):
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )
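        # padding=True pads each mapped batch (1000 examples by default) to that
        # batch's longest sequence; truncation caps inputs at DistilBERT's 512-token limit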
        # Use partial to bind the tokenizer as the function's first argument
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        # Tokenize in parallel and cache the result for future runs
        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")

    # Trainer handles batching, device placement, and eval mode during predict
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)
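    # predictions is a PredictionOutput: .predictions holds the raw logits with
    # shape (n_samples, 2) and .label_ids holds the gold labels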
    # Compute metrics
    true_labels = predictions.label_ids
    pred_labels = predictions.predictions.argmax(-1)
    accuracy = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    # Softmax the logits first: the class-1 logit alone is not a monotonic
    # transform of the class-1 probability, which would skew the ranking AUC uses
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1)[:, 1]
    auc_roc = roc_auc_score(true_labels, probs.numpy())
    # Generate and save the confusion matrix with labeled rows and columns
    cm = confusion_matrix(true_labels, pred_labels)
    cm_df = pd.DataFrame(
        cm, index=["true_fake", "true_real"], columns=["pred_fake", "pred_real"]
    )
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename)
    print(f"Confusion Matrix saved to {cm_filename}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
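
# Expected layout for version = 3 (inferred from the paths above):
#   ./output/version_3/best_model_3/        fine-tuned model saved with save_pretrained
#   ./output/version_3/tokenized_data_3/    tokenized-dataset cache, created on first run
# Run with: python inference_main.py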