|
import os

import pandas as pd
import torch
from sklearn.metrics import confusion_matrix

from data_loader import create_data_loader
from inference import evaluate_model, load_model
from preprocessing import (
    load_glove_embeddings,
    load_tokenizer,
    prepare_data,
    preprocess_text,
)
|
|
|
# Artifact version: selects the ./output/version_<n>/ directory and the
# filename suffix used for the model, tokenizer, cleaned data and results.
version = 2
|
|
|
|
|
def _clean_raw_articles(raw_path="./data_3/news_articles.csv"):
    """Build the cleaned inference frame from the raw news-articles CSV.

    Drops the unused metadata columns, maps the "Real"/"Fake" labels to 1/0,
    keeps only English rows, blanks out "no title" placeholders, removes rows
    with missing values and runs ``preprocess_text`` over the title and text.

    Args:
        raw_path: Location of the raw CSV file.

    Returns:
        A new DataFrame holding the remaining columns of the raw file with
        ``title`` and ``text`` preprocessed and ``label`` encoded as 1/0.
    """
    articles = pd.read_csv(raw_path).drop(
        columns=[
            "author",
            "published",
            "site_url",
            "main_img_url",
            "type",
            "text_without_stopwords",
            "title_without_stopwords",
            "hasImage",
        ]
    )

    # Any label other than "Real"/"Fake" maps to NaN and is filtered out.
    articles = articles.assign(
        label=articles["label"].map({"Real": 1, "Fake": 0})
    )
    articles = articles[articles["label"].isin([1, 0])]

    articles = articles[articles["language"] == "english"].drop(
        columns=["language"]
    )

    # ``assign`` returns a fresh frame, so no column is ever written into a
    # filtered slice (the original in-place edits triggered pandas'
    # chained-assignment warning).
    articles = articles.assign(
        title=articles["title"].replace("no title", "")
    ).dropna()
    return articles.assign(
        title=articles["title"].apply(preprocess_text),
        text=articles["text"].apply(preprocess_text),
    )


def _load_cleaned_data(cleaned_path):
    """Return the cleaned inference data, building the CSV cache if needed.

    Args:
        cleaned_path: Path of the cached cleaned CSV file.

    Returns:
        The cleaned DataFrame as read back from ``cleaned_path`` with rows
        containing missing values removed.
    """
    if os.path.exists(cleaned_path):
        cleaned = pd.read_csv(cleaned_path).dropna()
        print("Cleaned data found.")
        return cleaned

    print("No cleaned data found. Cleaning data now...")
    _clean_raw_articles().to_csv(cleaned_path, index=False)

    # Reload through the same path the cached branch uses. Empty strings are
    # written as empty CSV fields, which ``read_csv`` parses as NaN, so
    # without this round trip the first run would keep rows (e.g. empty
    # titles) that every later cached run drops.
    cleaned = pd.read_csv(cleaned_path).dropna()
    print("Cleaned data saved.")
    return cleaned


def run_evaluation(model_path, tokenizer_path, device):
    """Evaluate a saved model on the news-articles inference set.

    Loads the cleaned data for the module-level ``version`` (building and
    caching it on first use), loads the tokenizer and the GloVe embedding
    matrix, runs ``evaluate_model`` and writes the confusion matrix to a CSV
    file in the version's output directory.

    Args:
        model_path: Path handed to ``load_model``.
        tokenizer_path: Path handed to ``load_tokenizer``.
        device: Device the model is moved to and that ``evaluate_model`` uses.

    Returns:
        The ``(accuracy, f1, auc_roc)`` values produced by ``evaluate_model``.
    """
    # Both CSV outputs live here; create it up front so ``to_csv`` cannot fail
    # on a missing directory after the expensive work has been done.
    os.makedirs(f"./output/version_{version}", exist_ok=True)

    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"
    df = _load_cleaned_data(cleaned_path)
    labels = df["label"].values

    tokenizer = load_tokenizer(tokenizer_path)
    embedding_matrix = load_glove_embeddings(
        "./GloVe/glove.6B.300d.txt", tokenizer.word_index, embedding_dim=300
    )

    model = load_model(model_path, embedding_matrix)
    model.to(device)

    titles = prepare_data(df["title"], tokenizer)
    texts = prepare_data(df["text"], tokenizer)

    # shuffle=False keeps the batches in DataFrame order, the same order as
    # the ``labels`` array passed to ``evaluate_model``.
    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

    accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
        model, data_loader, device, labels
    )

    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    pd.DataFrame(confusion_matrix(y_true, y_pred)).to_csv(cm_filename, index=False)
    print(f"Confusion Matrix saved to {cm_filename}")
    return accuracy, f1, auc_roc
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: evaluate the saved model for the configured version
    # and print the resulting metrics.

    # Use the GPU when torch reports one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Device: {device}")

    # Artifacts for the configured version.
    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
    model_path = f"./output/version_{version}/best_model_{version}.pth"

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
|
|