kimic
/

fake-news-detector-LSTM-GloVe

Model card Files Files and versions Community

fake-news-detector-LSTM-GloVe / inference_main.py

kimic's picture

Initial commit for LSTM with GloVe embeddings

6f9bfc0 11 months ago

3.2 kB

	import torch
	import pandas as pd
	from preprocessing import (
	preprocess_text,
	load_tokenizer,
	prepare_data,
	load_glove_embeddings,
	)
	from data_loader import create_data_loader
	from inference import load_model, evaluate_model
	from sklearn.metrics import confusion_matrix
	import os

	# Artifact version number. It is interpolated into every ./output/version_{version}/
	# path this script uses (cleaned data, model checkpoint, tokenizer, confusion matrix).
	version = 2


	def _load_inference_frame(cleaned_path):
	    """Return the cleaned inference DataFrame, building and caching it if needed.

	    If ``cleaned_path`` exists it is read back as-is; otherwise the raw
	    ``./data_3/news_articles.csv`` file is cleaned, written to
	    ``cleaned_path`` and returned. Both paths yield the same rows.
	    """
	    if os.path.exists(cleaned_path):
	        # keep_default_na=False: titles that were deliberately cleaned to ""
	        # (the "no title" rows) are written as empty CSV fields. With the
	        # default NA parsing they come back as NaN and the dropna() below
	        # would silently remove them, so a cached run would evaluate fewer
	        # rows than the run that created the cache.
	        df = pd.read_csv(cleaned_path, keep_default_na=False)
	        df.dropna(inplace=True)
	        print("Cleaned data found.")
	        return df

	    print("No cleaned data found. Cleaning data now...")

	    df = pd.read_csv("./data_3/news_articles.csv")
	    df = df.drop(
	        columns=[
	            "author",
	            "published",
	            "site_url",
	            "main_img_url",
	            "type",
	            "text_without_stopwords",
	            "title_without_stopwords",
	            "hasImage",
	        ]
	    )

	    # Map Real to 1 and Fake to 0; any other label becomes NaN and is filtered out.
	    df["label"] = df["label"].map({"Real": 1, "Fake": 0})
	    df = df[df["label"].isin([1, 0])]

	    # Keep English rows only. The explicit copy makes the column assignments
	    # below operate on an independent frame instead of a filtered view.
	    df = df[df["language"] == "english"].copy()
	    df = df.drop(columns=["language"])

	    # Convert "no title" to empty string so the row survives the NaN drop.
	    df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

	    df.dropna(inplace=True)
	    df["title"] = df["title"].apply(preprocess_text)
	    df["text"] = df["text"].apply(preprocess_text)

	    # Drop before saving so the cache holds exactly the rows evaluated now.
	    df.dropna(inplace=True)

	    # Make sure the version directory exists before writing the cache.
	    os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
	    df.to_csv(cleaned_path, index=False)
	    print("Cleaned data saved.")
	    return df


	def run_evaluation(model_path, tokenizer_path, device):
	    """Evaluate a saved model on the news-articles inference set.

	    Loads (or builds) the cleaned data, loads the tokenizer and GloVe
	    embedding matrix, restores the model, runs ``evaluate_model`` and writes
	    the confusion matrix to
	    ``./output/version_{version}/confusion_matrix_inference_{version}.csv``.

	    Args:
	        model_path: Path passed to ``load_model``.
	        tokenizer_path: Path passed to ``load_tokenizer``.
	        device: Device the model is moved to and evaluated on.

	    Returns:
	        The ``(accuracy, f1, auc_roc)`` values produced by ``evaluate_model``.
	    """
	    cleaned_path = f"./output/version_{version}/cleaned_inference_data_{version}.csv"

	    # Load data
	    df = _load_inference_frame(cleaned_path)
	    labels = df["label"].values

	    # Load tokenizer
	    tokenizer = load_tokenizer(tokenizer_path)

	    embedding_matrix = load_glove_embeddings(
	        "./GloVe/glove.6B.300d.txt", tokenizer.word_index, embedding_dim=300
	    )

	    model = load_model(model_path, embedding_matrix)
	    model.to(device)

	    # Prepare data
	    titles = prepare_data(df["title"], tokenizer)
	    texts = prepare_data(df["text"], tokenizer)

	    # Create DataLoader; shuffle stays off so predictions line up with `labels`.
	    data_loader = create_data_loader(titles, texts, batch_size=32, shuffle=False)

	    # Evaluate
	    accuracy, f1, auc_roc, y_true, y_pred = evaluate_model(
	        model, data_loader, device, labels
	    )

	    # Generate and save confusion matrix
	    cm = confusion_matrix(y_true, y_pred)
	    cm_df = pd.DataFrame(cm)
	    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
	    cm_df.to_csv(cm_filename, index=False)
	    print(f"Confusion Matrix saved to {cm_filename}")
	    return accuracy, f1, auc_roc


	if __name__ == "__main__":
	    # Script entry point: pick a device, locate the versioned artifacts,
	    # run the evaluation and report the three summary metrics.
	    use_cuda = torch.cuda.is_available()
	    device = torch.device("cuda" if use_cuda else "cpu")
	    print(f"Device: {device}")

	    tokenizer_path = f"./output/version_{version}/tokenizer_{version}.pickle"
	    model_path = f"./output/version_{version}/best_model_{version}.pth"

	    metrics = run_evaluation(
	        model_path=model_path,
	        tokenizer_path=tokenizer_path,
	        device=device,
	    )
	    accuracy, f1, auc_roc = metrics
	    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}")