import torch
import torch.nn as nn
import pandas as pd
from model import LSTMModel
from preprocessing import preprocess_text, load_glove_embeddings
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import pickle
import train as tr
from torch.utils.data import DataLoader
from data_loader import NewsDataset
import os

version = 2

if __name__ == "__main__":
    data_path = "./data_2/WELFake_Dataset.csv"
    cleaned_path = f"./output/version_{version}/cleaned_news_data_{version}.csv"

    # Load data
    if os.path.exists(cleaned_path):
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    else:
        print("No cleaned data found. Cleaning data now...")
        df = pd.read_csv(data_path)
        # Drop the saved index column
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(inplace=True)
        # Swap the labels, since the original encoding is inverted
        df["label"] = df["label"].map({0: 1, 1: 0})
        df["title"] = df["title"].apply(preprocess_text)
        df["text"] = df["text"].apply(preprocess_text)
        # Create the output directory if it does not exist
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")

    # Split the data: 60% train, 20% validation, 20% test
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(
        train_val, test_size=0.25, random_state=42
    )  # 0.25 * 0.8 = 0.2

    # Fit the tokenizer on the training data only, so the validation and test
    # sets stay unseen. The " " separator keeps the last title word from
    # fusing with the first text word during element-wise concatenation.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train["title"] + " " + train["text"])

    with open(f"./output/version_{version}/tokenizer_{version}.pickle", "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Tokenize the data
    X_train_title = tokenizer.texts_to_sequences(train["title"])
    X_train_text = tokenizer.texts_to_sequences(train["text"])
    X_val_title = tokenizer.texts_to_sequences(val["title"])
    X_val_text = tokenizer.texts_to_sequences(val["text"])
    X_test_title = tokenizer.texts_to_sequences(test["title"])
    X_test_text = tokenizer.texts_to_sequences(test["text"])

    # GloVe embeddings
    embedding_matrix = load_glove_embeddings(
        "./GloVe/glove.6B.300d.txt", tokenizer.word_index, embedding_dim=300
    )

    # Pad all sequences to a fixed length
    max_length = 500
    X_train_title = pad_sequences(X_train_title, maxlen=max_length)
    X_train_text = pad_sequences(X_train_text, maxlen=max_length)
    X_val_title = pad_sequences(X_val_title, maxlen=max_length)
    X_val_text = pad_sequences(X_val_text, maxlen=max_length)
    X_test_title = pad_sequences(X_test_title, maxlen=max_length)
    X_test_text = pad_sequences(X_test_text, maxlen=max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    model = LSTMModel(embedding_matrix).to(device)

    # Convert data to PyTorch tensors
    train_data = NewsDataset(
        torch.tensor(X_train_title),
        torch.tensor(X_train_text),
        torch.tensor(train["label"].values),
    )
    val_data = NewsDataset(
        torch.tensor(X_val_title),
        torch.tensor(X_val_text),
        torch.tensor(val["label"].values),
    )
    test_data = NewsDataset(
        torch.tensor(X_test_title),
        torch.tensor(X_test_text),
        torch.tensor(test["label"].values),
    )

    train_loader = DataLoader(
        train_data,
        batch_size=32,
        shuffle=True,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
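    # For reference: `NewsDataset` lives in data_loader.py, which is not shown
    # here. Judging from how it is constructed and unpacked in this script, a
    # minimal sketch would look like the class below. The name
    # `_NewsDatasetSketch` and the implementation are assumptions, not the
    # project's actual code.
    class _NewsDatasetSketch(torch.utils.data.Dataset):
        def __init__(self, titles, texts, labels):
            self.titles = titles  # LongTensor of shape (num_samples, max_length)
            self.texts = texts    # LongTensor of shape (num_samples, max_length)
            self.labels = labels  # LongTensor of shape (num_samples,)

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            # Matches the (titles, texts, labels) unpacking in the loops below
            return self.titles[idx], self.texts[idx], self.labels[idx]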
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    trained_model, best_accuracy, best_epoch = tr.train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=10,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.01,
    )
    print(f"Best model was saved at epoch: {best_epoch}")

    # Load the best model before testing
    best_model_path = f"./output/version_{version}/best_model_{version}.pth"
    model.load_state_dict(torch.load(best_model_path, map_location=device))

    # Testing
    model.eval()
    true_labels = []
    predicted_labels = []
    predicted_probs = []
    with torch.no_grad():
        correct = 0
        total = 0
        for titles, texts, labels in test_loader:
            titles, texts, labels = (
                titles.to(device),
                texts.to(device),
                labels.to(device).float(),
            )
            # squeeze(-1) keeps a 1-D batch dimension even when the final
            # batch holds a single sample; a bare squeeze() would return a
            # 0-d tensor there and break the extend() calls below
            outputs = model(titles, texts).squeeze(-1)
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            predicted_probs.extend(outputs.cpu().numpy())

    test_accuracy = 100 * correct / total
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_probs)
    print(
        f"Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}"
    )

    # Save true/predicted label pairs for building a confusion matrix later
    confusion_data = pd.DataFrame({"True": true_labels, "Predicted": predicted_labels})
    confusion_data.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
    )
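
# For reference: `LSTMModel` is defined in model.py, which is not shown here.
# The setup above (nn.BCELoss, a 0.5 decision threshold, and a trailing
# squeeze) implies the model emits sigmoid probabilities of shape (batch, 1).
# A minimal two-branch sketch consistent with that interface follows; the
# hidden size and exact architecture are assumptions, not the real model.
class _LSTMModelSketch(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128):
        super().__init__()
        # Frozen GloVe vectors shared by the title and text branches
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32), freeze=True
        )
        embedding_dim = self.embedding.embedding_dim
        self.title_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.text_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(2 * hidden_dim, 1)

    def forward(self, titles, texts):
        # Use each branch's final hidden state as its sequence summary
        _, (title_h, _) = self.title_lstm(self.embedding(titles))
        _, (text_h, _) = self.text_lstm(self.embedding(texts))
        combined = torch.cat([title_h[-1], text_h[-1]], dim=1)
        return torch.sigmoid(self.fc(combined))  # (batch, 1) probabilities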