import os
import pickle

import pandas as pd
import torch
import torch.nn as nn
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

import train as tr
from data_loader import NewsDataset
from model import LSTMModel
from preprocessing import preprocess_text

version = 9

if __name__ == "__main__":
    data_path = "./data_2/WELFake_Dataset.csv"
    cleaned_path = f"./output/version_{version}/cleaned_news_data_{version}.csv"
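
    # Reuse the cleaned dataset if it has already been cached; otherwise clean
    # the raw WELFake CSV once and save the result for later runs.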
    try:
        df = pd.read_csv(cleaned_path)
        df.dropna(inplace=True)
        print("Cleaned data found.")
    except FileNotFoundError:
        print("No cleaned data found. Cleaning data now...")
        df = pd.read_csv(data_path)

        # Drop the unnamed index column, then remove rows with missing values.
        df.drop(df.columns[0], axis=1, inplace=True)
        df.dropna(inplace=True)

        # Swap the 0/1 labels to the convention used by the rest of the pipeline.
        df["label"] = df["label"].map({0: 1, 1: 0})

        df["title"] = df["title"].apply(preprocess_text)
        df["text"] = df["text"].apply(preprocess_text)

        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        df.to_csv(cleaned_path, index=False)
        print("Cleaned data saved.")
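
    # Hold out 20% of the data for testing, then 25% of the remainder for
    # validation, giving a 60/20/20 train/validation/test split.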
    train_val, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(train_val, test_size=0.25, random_state=42)
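
    # Fit the Keras tokenizer on the training split only, so the vocabulary
    # never sees validation or test text.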
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train["title"] + train["text"])
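
    # Persist the fitted tokenizer so the exact same vocabulary can be reused
    # at inference time.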
    with open(f"./output/version_{version}/tokenizer_{version}.pickle", "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
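
    # Convert titles and article bodies into integer sequences using the
    # fitted vocabulary.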
    X_train_title = tokenizer.texts_to_sequences(train["title"])
    X_train_text = tokenizer.texts_to_sequences(train["text"])
    X_val_title = tokenizer.texts_to_sequences(val["title"])
    X_val_text = tokenizer.texts_to_sequences(val["text"])
    X_test_title = tokenizer.texts_to_sequences(test["title"])
    X_test_text = tokenizer.texts_to_sequences(test["text"])
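
    # Pad or truncate every sequence to a fixed length so batches can be
    # stacked into tensors of uniform shape.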
    max_length = 500
    X_train_title = pad_sequences(X_train_title, maxlen=max_length)
    X_train_text = pad_sequences(X_train_text, maxlen=max_length)
    X_val_title = pad_sequences(X_val_title, maxlen=max_length)
    X_val_text = pad_sequences(X_val_text, maxlen=max_length)
    X_test_title = pad_sequences(X_test_title, maxlen=max_length)
    X_test_text = pad_sequences(X_test_text, maxlen=max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
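
    # LSTMModel (defined in model.py) is assumed to take the vocabulary size
    # as its constructor argument; the +1 accounts for index 0, which the
    # Keras tokenizer reserves for padding.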
    model = LSTMModel(len(tokenizer.word_index) + 1).to(device)
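
    # NewsDataset (from data_loader.py) is assumed to yield
    # (title, text, label) tuples, matching the unpacking in the test loop below.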
    train_data = NewsDataset(
        torch.tensor(X_train_title),
        torch.tensor(X_train_text),
        torch.tensor(train["label"].values),
    )
    val_data = NewsDataset(
        torch.tensor(X_val_title),
        torch.tensor(X_val_text),
        torch.tensor(val["label"].values),
    )
    test_data = NewsDataset(
        torch.tensor(X_test_title),
        torch.tensor(X_test_text),
        torch.tensor(test["label"].values),
    )
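
    # Batched loaders: only the training loader shuffles; pinned memory and
    # persistent workers speed up host-to-GPU transfer across epochs.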
    train_loader = DataLoader(
        train_data,
        batch_size=32,
        shuffle=True,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
    test_loader = DataLoader(
        test_data,
        batch_size=32,
        shuffle=False,
        num_workers=6,
        pin_memory=True,
        persistent_workers=True,
    )
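
    # BCELoss expects probabilities in [0, 1], so the model's final layer is
    # assumed to apply a sigmoid; otherwise BCEWithLogitsLoss would be needed.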
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
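
    # train.train is assumed to checkpoint the best model under
    # ./output/version_{version}/ and to return the trained model together
    # with the best validation accuracy and the epoch it occurred at.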
    trained_model, best_accuracy, best_epoch = tr.train(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        version=version,
        epochs=10,
        device=device,
        max_grad_norm=1.0,
        early_stopping_patience=3,
        early_stopping_delta=0.01,
    )

    print(f"Best model was saved at epoch: {best_epoch}")
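
    # Reload the best checkpoint saved during training before evaluating on
    # the test set.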
    best_model_path = f"./output/version_{version}/best_model_{version}.pth"
    model.load_state_dict(torch.load(best_model_path, map_location=device))
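
    # Evaluate on the held-out test set, collecting true labels, hard
    # predictions, and predicted probabilities for the metrics below.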
    model.eval()
    true_labels = []
    predicted_labels = []
    predicted_probs = []

    with torch.no_grad():
        correct = 0
        total = 0
        for titles, texts, labels in test_loader:
            titles, texts, labels = (
                titles.to(device),
                texts.to(device),
                labels.to(device).float(),
            )
            # Flatten to shape (batch,) so a final batch of size 1 does not
            # collapse to a 0-d tensor.
            outputs = model(titles, texts).reshape(-1)

            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            predicted_probs.extend(outputs.cpu().numpy())
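
    # Accuracy and F1 use the thresholded predictions; AUC-ROC uses the raw
    # probabilities.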
    test_accuracy = 100 * correct / total
    f1 = f1_score(true_labels, predicted_labels)
    auc_roc = roc_auc_score(true_labels, predicted_probs)

    print(
        f"Test Accuracy: {test_accuracy:.2f}%, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}"
    )
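
    # Save the true/predicted label pairs so a confusion matrix can be built
    # later without rerunning evaluation.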
    confusion_data = pd.DataFrame({"True": true_labels, "Predicted": predicted_labels})
    confusion_data.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
    )