"""Evaluate a saved model on the news-articles inference dataset."""

import os

import pandas as pd
import torch
from sklearn.metrics import confusion_matrix

from data_loader import create_data_loader
from inference import evaluate_model, load_model
from preprocessing import load_tokenizer, prepare_data, preprocess_text

# Run identifier embedded in every artefact path under ./output/version_<version>/.
version = 9

# Columns of the raw article dump that are not used for inference.
_UNUSED_COLUMNS = [
    "author",
    "published",
    "site_url",
    "main_img_url",
    "type",
    "text_without_stopwords",
    "title_without_stopwords",
    "hasImage",
]


def _clean_raw_articles():
    """Build the cleaned inference frame from ./data_3/news_articles.csv.

    Keeps only English rows labelled "Real" or "Fake", encodes the label as
    1/0, blanks the "no title" placeholder, drops rows with missing values and
    runs ``preprocess_text`` over the ``title`` and ``text`` columns.
    """
    df = pd.read_csv("./data_3/news_articles.csv")
    df = df.drop(columns=_UNUSED_COLUMNS)

    # Map Real to 1 and Fake to 0; any other label becomes NaN and is
    # removed by the isin() filter below.
    df["label"] = df["label"].map({"Real": 1, "Fake": 0})
    keep = df["label"].isin([1, 0]) & (df["language"] == "english")
    df = df[keep].drop(columns=["language"])

    # "no title" is a placeholder value; blank it before the NaN filter so
    # the row is not discarded at this stage.
    df["title"] = df["title"].mask(df["title"] == "no title", "")
    df = df.dropna()

    df["title"] = df["title"].apply(preprocess_text)
    df["text"] = df["text"].apply(preprocess_text)
    return df


def _load_cleaned_data(cleaned_path):
    """Return the cleaned frame stored at ``cleaned_path``, creating it if absent.

    The frame is always read back from the CSV, even right after it has been
    generated. ``pd.read_csv`` parses empty fields as NaN, so rows whose title
    or text is empty are dropped here; loading through the same path on every
    run guarantees the first run and all cached runs evaluate exactly the same
    rows (previously the first run kept those rows and later runs did not).
    """
    if os.path.exists(cleaned_path):
        print("Cleaned data found.")
    else:
        print("No cleaned data found. Cleaning data now...")
        # The output directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        _clean_raw_articles().to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    return pd.read_csv(cleaned_path).dropna()


def run_evaluation(model_path, tokenizer_path, device):
    """Evaluate the model on the inference dataset and save its confusion matrix.

    Args:
        model_path: Path of the saved model weights passed to ``load_model``.
        tokenizer_path: Path of the saved tokenizer passed to ``load_tokenizer``.
        device: Device handed to ``evaluate_model`` (the script passes a
            ``torch.device``).

    Returns:
        The ``(accuracy, f1, auc_roc)`` values reported by ``evaluate_model``.

    Side effects:
        Writes the cleaned dataset (when it is not cached yet) and the
        confusion matrix as CSV files under ``./output/version_<version>/``.
    """
    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"

    # Load data
    df = _load_cleaned_data(cleaned_path)
    labels = df["label"].values

    # Load tokenizer and model; the second argument of load_model is the
    # tokenizer vocabulary size plus one.
    tokenizer = load_tokenizer(tokenizer_path)
    model = load_model(model_path, len(tokenizer.word_index) + 1)

    # Prepare data
    titles = prepare_data(df["title"], tokenizer)
    texts = prepare_data(df["text"], tokenizer)

    # Create DataLoader; shuffle stays off so batches are built in the same
    # row order as `labels`.
    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
        model, data_loader, device, labels
    )

    # Generate and save confusion matrix
    cm_df = pd.DataFrame(confusion_matrix(y_true, y_pred))
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename, index=False)
    print(f"Confusion Matrix saved to {cm_filename}")

    return accuracy, f1, auc_roc


def main():
    """Run the evaluation for the artefacts of the current ``version``."""
    model_path = f"./output/version_{version}/best_model_{version}.pth"
    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")


if __name__ == "__main__":
    main()