"""Run inference with a fine-tuned DistilBERT fake-news classifier on a new
dataset and report accuracy, F1, AUC-ROC, and a confusion matrix."""

import os
from functools import partial

import pandas as pd
import torch
from datasets import Dataset, load_from_disk
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
)

version = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

if __name__ == "__main__":
    # Load the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    cleaned_path = (
        f"./output/version_{version}/tokenized_data_{version}/inference_{version}"
    )

    # Load the fine-tuned model
    model = DistilBertForSequenceClassification.from_pretrained(
        f"./output/version_{version}/best_model_{version}"
    )
    model.to(device)

    # Load the cached tokenized dataset if it exists; otherwise build it from the raw CSV
    if os.path.exists(cleaned_path):
        print("Loading dataset...")
        dataset = load_from_disk(cleaned_path)
    else:
        print("No dataset found. Loading and preparing dataset now...")
        # # Load the datasets (earlier data_1 pipeline, kept for reference)
        # true_news = pd.read_csv("data_1/True.csv")
        # fake_news = pd.read_csv("data_1/Fake.csv")

        # # Add labels
        # true_news["label"] = 1
        # fake_news["label"] = 0

        # # Combine the datasets
        # df = pd.concat([true_news, fake_news], ignore_index=True)

        df = pd.read_csv("./data_3/news_articles.csv")
        df.drop(
            columns=[
                "author",
                "published",
                "site_url",
                "main_img_url",
                "type",
                "text_without_stopwords",
                "title_without_stopwords",
                "hasImage",
            ],
            inplace=True,
        )

        # Map Real to 1 and Fake to 0, then drop rows with any other label value
        df["label"] = df["label"].map({"Real": 1, "Fake": 0})
        df = df[df["label"].isin([1, 0])]

        # Drop rows where the language is not 'english'
        df = df[df["language"] == "english"]
        df.drop(columns=["language"], inplace=True)

        # Convert "no title" to an empty string
        df["title"] = df["title"].apply(lambda x: "" if x == "no title" else x)

        df.dropna(inplace=True)

        # Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
        df["text"] = df["text"].str.replace(
            r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
        )

        # Remove patterns like "Featured image via author name / image place"
        df["text"] = df["text"].str.replace(
            r"Featured image via .+?\.($|\s)", "", regex=True
        )

        # Prepend the title to the body text and keep only the model inputs
        df["text"] = df["title"] + " " + df["text"]
        df = df[["text", "label"]]
        df["label"] = df["label"].astype(int)

        dataset = Dataset.from_pandas(df)

        def tokenize(tokenizer, examples):
            # padding=True pads each map batch to its own longest sequence; this
            # collates cleanly because the default map batch size (1000) is a
            # multiple of the Trainer's default eval batch size (8)
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # Use partial to create a new function that has tokenizer as its first argument
        tokenize_with_tokenizer = partial(tokenize, tokenizer)

        # Use tokenize_with_tokenizer in the map function
        dataset = dataset.map(tokenize_with_tokenizer, batched=True, num_proc=8)
        dataset.save_to_disk(cleaned_path)
        print("Finished tokenizing.")

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    trainer = Trainer(model=model)
    predictions = trainer.predict(dataset)

    # Compute metrics
    true_labels = predictions.label_ids
    pred_labels = predictions.predictions.argmax(-1)

    accuracy = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)

    # Softmax the logits so AUC-ROC is computed on class-1 probabilities;
    # ranking by the raw class-1 logit alone ignores the class-0 logit
    probs = torch.softmax(torch.from_numpy(predictions.predictions), dim=-1).numpy()
    auc_roc = roc_auc_score(true_labels, probs[:, 1])

    # Generate and save a labeled confusion matrix (rows = true, columns = predicted)
    cm = confusion_matrix(true_labels, pred_labels)
    cm_df = pd.DataFrame(
        cm, index=["true_fake", "true_real"], columns=["pred_fake", "pred_real"]
    )
    cm_filename = f"./output/version_{version}/confusion_matrix_inference_{version}.csv"
    cm_df.to_csv(cm_filename)
    print(f"Confusion Matrix saved to {cm_filename}")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
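
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not verified against this repo's layout): the
# script expects the fine-tuned checkpoint saved by the training run under
# ./output/version_3/best_model_3 and the raw CSV at ./data_3/news_articles.csv.
# With those in place:
#
#     python inference.py          # "inference.py" is an assumed file name
#
# On the first run the cleaned, tokenized dataset is cached under cleaned_path,
# so subsequent runs skip the pandas preprocessing and tokenization entirely.
# ---------------------------------------------------------------------------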