import os

import pandas as pd
import torch
from sklearn.metrics import confusion_matrix

from data_loader import create_data_loader
from inference import load_model, evaluate_model
from preprocessing import (
    preprocess_text,
    load_tokenizer,
    prepare_data,
    load_glove_embeddings,
)

# Experiment version: selects the ./output/version_<n>/ directory and the
# numeric suffix of every artifact read or written by this script.
version = 2

# Columns of the raw dataset that are discarded before evaluation.
_UNUSED_COLUMNS = [
    "author",
    "published",
    "site_url",
    "main_img_url",
    "type",
    "text_without_stopwords",
    "title_without_stopwords",
    "hasImage",
]


def _load_inference_frame(cleaned_path):
    """Return the cleaned inference DataFrame, building and caching it if needed.

    If ``cleaned_path`` exists it is read back as-is. Otherwise the raw
    ``./data_3/news_articles.csv`` file is cleaned, preprocessed with
    ``preprocess_text`` and written to ``cleaned_path`` for later runs.

    Args:
        cleaned_path: Location of the cached, already-cleaned CSV file.

    Returns:
        A DataFrame with the ``title``, ``text`` and ``label`` columns.
    """
    if os.path.exists(cleaned_path):
        # Empty titles ("no title" rows) are written to CSV as empty fields.
        # With the default NA handling they would come back as NaN and be
        # dropped, so a cached run would evaluate fewer rows than the run that
        # created the cache. Reading the text columns verbatim keeps both
        # paths on the same rows.
        df = pd.read_csv(
            cleaned_path,
            keep_default_na=False,
            dtype={"title": str, "text": str},
        )
        df.dropna(inplace=True)
        print("Cleaned data found.")
        return df

    print("No cleaned data found. Cleaning data now...")
    df = pd.read_csv("./data_3/news_articles.csv").drop(columns=_UNUSED_COLUMNS)

    # Map Real to 1 and Fake to 0; any other label becomes NaN and is filtered
    # out below.
    df["label"] = df["label"].map({"Real": 1, "Fake": 0})
    # Convert the "no title" placeholder to an empty string.
    df["title"] = df["title"].replace("no title", "")

    # Keep only labelled, English rows. Working on an explicit copy avoids
    # pandas' chained-assignment (SettingWithCopy) pitfalls further down.
    keep = df["label"].isin([1, 0]) & (df["language"] == "english")
    df = df.loc[keep].drop(columns=["language"]).dropna().copy()

    df["title"] = df["title"].apply(preprocess_text)
    df["text"] = df["text"].apply(preprocess_text)
    # Drop any missing values produced by preprocessing *before* saving, so
    # the cached file holds exactly the rows evaluated in this run.
    df = df.dropna()

    out_dir = os.path.dirname(cleaned_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(cleaned_path, index=False)
    print("Cleaned data saved.")
    return df


def run_evaluation(model_path, tokenizer_path, device):
    """Evaluate a saved model on the inference dataset.

    Loads (or creates) the cleaned dataset, restores the tokenizer and the
    model, runs ``evaluate_model`` over the data and writes the resulting
    confusion matrix to
    ``./output/version_<version>/confusion_matrix_inference_<version>.csv``.

    Args:
        model_path: Path of the saved model, passed to ``load_model``.
        tokenizer_path: Path of the saved tokenizer, passed to
            ``load_tokenizer``.
        device: Torch device the model is moved to and evaluated on.

    Returns:
        The ``(accuracy, f1, auc_roc)`` values reported by ``evaluate_model``.
    """
    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"

    # Load data
    df = _load_inference_frame(cleaned_path)
    labels = df["label"].values

    # Load tokenizer and build the embedding matrix for its vocabulary
    tokenizer = load_tokenizer(tokenizer_path)
    embedding_matrix = load_glove_embeddings(
        "./GloVe/glove.6B.300d.txt", tokenizer.word_index, embedding_dim=300
    )

    model = load_model(model_path, embedding_matrix)
    model.to(device)

    # Prepare data
    titles = prepare_data(df["title"], tokenizer)
    texts = prepare_data(df["text"], tokenizer)

    # Create DataLoader; shuffle=False keeps batches in the same row order as
    # `labels`.
    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

    # Evaluate
    accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
        model, data_loader, device, labels
    )

    # Generate and save confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    cm_df = pd.DataFrame(cm)
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename, index=False)
    print(f"Confusion Matrix saved to {cm_filename}")

    return accuracy, f1, auc_roc


if __name__ == "__main__":
    model_path = f"./output/version_{version}/best_model_{version}.pth"
    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    accuracy, f1, auc_roc = run_evaluation(model_path, tokenizer_path, device)
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")