File size: 3,455 Bytes
c5cd586 1bb2bdd 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 64c01a0 c5cd586 1bb2bdd c5cd586 64c01a0 c5cd586 64c01a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import torch
import pandas as pd
from preprocessing import preprocess_text, load_tokenizer, prepare_data
from data_loader import create_data_loader
from inference import load_model, evaluate_model
from sklearn.metrics import confusion_matrix
import os
# Run/experiment identifier: selects the ./output/version_<n>/ directory and is
# embedded in the names of the artifact files this script reads and writes.
version = 9
def _build_cleaned_data(cleaned_path):
    """Clean the raw news-article CSV and write the result to ``cleaned_path``.

    Reads ``./data_3/news_articles.csv``, drops the metadata columns that are
    not used for evaluation, keeps only English rows labelled ``Real`` or
    ``Fake`` (encoded as 1 and 0), runs ``preprocess_text`` over the ``title``
    and ``text`` columns and saves the frame as CSV without the index.
    """
    articles = pd.read_csv("./data_3/news_articles.csv")
    articles = articles.drop(
        columns=[
            "author",
            "published",
            "site_url",
            "main_img_url",
            "type",
            "text_without_stopwords",
            "title_without_stopwords",
            "hasImage",
        ]
    )

    # Map Real to 1 and Fake to 0; any other label becomes NaN and is
    # rejected by the isin() filter below.
    articles["label"] = articles["label"].map({"Real": 1, "Fake": 0})
    keep = articles["label"].isin([1, 0]) & (articles["language"] == "english")
    # Explicit copy: the column assignments below must target an independent
    # frame, not a slice of ``articles`` (avoids chained-assignment warnings).
    cleaned = articles.loc[keep].copy()
    cleaned = cleaned.drop(columns=["language"])

    # The "no title" placeholder is treated as an empty title.
    cleaned["title"] = cleaned["title"].mask(cleaned["title"] == "no title", "")

    # Rows with any missing value are removed before text preprocessing.
    cleaned = cleaned.dropna()
    cleaned["title"] = cleaned["title"].apply(preprocess_text)
    cleaned["text"] = cleaned["text"].apply(preprocess_text)

    # to_csv does not create parent directories.
    out_dir = os.path.dirname(cleaned_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    cleaned.to_csv(cleaned_path, index=False)


def run_evaluation(model_path, tokenizer_path, device):
    """Evaluate a saved model on the news-article set and save its confusion matrix.

    The cleaned dataset is cached as a CSV under ``./output/version_<version>/``.
    When the cache file is missing it is rebuilt from
    ``./data_3/news_articles.csv`` first. The confusion matrix of the
    predictions is written as CSV to the same output directory.

    Args:
        model_path: Path handed to ``load_model`` together with the vocabulary
            size ``len(tokenizer.word_index) + 1``.
        tokenizer_path: Path handed to ``load_tokenizer``.
        device: Passed through unchanged to ``evaluate_model`` (the script
            entry point supplies a ``torch.device``).

    Returns:
        The ``(accuracy, f1, auc_roc)`` values reported by ``evaluate_model``.
    """
    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"

    # Load data
    if os.path.exists(cleaned_path):
        print("Cleaned data found.")
    else:
        print("No cleaned data found. Cleaning data now...")
        _build_cleaned_data(cleaned_path)
        print("Cleaned data saved.")

    # Always evaluate on the CSV as read back from disk. Empty strings are
    # written as empty fields and parsed as NaN on reload, so evaluating the
    # in-memory frame on the first run would keep rows that every later
    # (cached) run drops, making the metrics differ between runs.
    df = pd.read_csv(cleaned_path).dropna()

    labels = df["label"].values

    # Load tokenizer and model
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Prepare data
    titles = prepare_data(df["title"], tokenizer)
    texts = prepare_data(df["text"], tokenizer)

    # Create DataLoader (shuffle=False keeps batches in the same row order as
    # the ``labels`` array passed to evaluate_model).
    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
        model, data_loader, device, labels
    )

    # Generate and save confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    cm_df = pd.DataFrame(cm)
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename, index=False)
    print(f"Confusion Matrix saved to {cm_filename}")

    return accuracy, f1, auc_roc
if __name__ == "__main__":
    # Script entry point: evaluate the checkpoint and tokenizer saved for the
    # current ``version`` and print the resulting metrics.

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f"Device: {device}")

    # Artifacts for this version live side by side in its output directory.
    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
    model_path = f"./output/version_{version}/best_model_{version}.pth"

    metrics = run_evaluation(model_path, tokenizer_path, device)
    accuracy, f1, auc_roc = metrics
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")
|