|
import torch |
|
from transformers import ( |
|
DistilBertTokenizer, |
|
DistilBertForSequenceClassification, |
|
Trainer, |
|
) |
|
from datasets import Dataset, load_from_disk |
|
import pandas as pd |
|
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score |
|
from functools import partial |
|
import os |
|
|
|
# Experiment tag: selects which versioned model/dataset artifacts to load.
version = 3

# Prefer the GPU when one is present; otherwise run inference on CPU.
_cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ok else "cpu")
print("Device:", device)
|
|
|
|
|
if __name__ == "__main__":

    # Tokenizer must match the base checkpoint the model was fine-tuned from.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # On-disk cache of the tokenized inference split for this version.
    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned classifier for this experiment version.
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)

    if os.path.exists(cleaned_path):
        # Reuse the previously tokenized dataset.
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")

        df = pd.read_csv("./data_3/news_articles.csv")
        # Drop metadata columns not used as model input.
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )

        # Map string labels to ints; unmapped values become NaN and are
        # removed by the isin() filter on the next line.
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]

        # Keep only English-language articles, then drop the helper column.
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # "no title" is the dataset's sentinel for a missing title.
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)

        # Strip Reuters datelines/markers so the model cannot key on them
        # as a trivial signal for the "Real" class.
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )

        # Remove "Featured image via ..." boilerplate sentences.
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )

        # Prepend the title to the body and keep only the model inputs.
        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)

        dataset = Dataset.from_pandas(df)

        def tokenize(tokenizer, examples):
            # Batched tokenization; truncate to DistilBERT's 512-token limit.
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # partial() rather than a closure so the function pickles cleanly
        # for the multiprocess map below.
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    print("Finished tokenizing.")

    # Trainer with default args is only used here as a batched predict loop.
    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)

    # Hard class predictions (argmax over logits) for accuracy and F1.
    preds = predictions.predictions.argmax(-1)
    # AUC-ROC requires a continuous score, not hard labels — computing it
    # from argmax degenerates to balanced accuracy. Use the softmax
    # probability of the positive ("Real" = 1) class instead.
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1)[:, 1]

    accuracy = accuracy_score(predictions.label_ids, preds)
    f1 = f1_score(predictions.label_ids, preds)
    auc_roc = roc_auc_score(predictions.label_ids, probs.numpy())

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {auc_roc}")
|
|