import os

import pandas as pd
import torch
from datasets import Dataset, load_from_disk
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

version = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


def compute_metrics(pred):
    """Compute accuracy and F1 on the evaluation set."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }


# Load the dataset
df = pd.read_csv("./data_2/WELFake_Dataset.csv")

# Drop the index column and rows with missing values
df.drop(df.columns[0], axis=1, inplace=True)
df.dropna(inplace=True)

# Swap the labels, since the original encoding is the opposite of what we want
df["label"] = df["label"].map({0: 1, 1: 0})

# Merge title and body into a single text field
df["text"] = df["title"] + " " + df["text"]

# Remove patterns like "COUNTRY or STATE NAME (Reuters) -"
df["text"] = df["text"].str.replace(
    r"(\b[A-Z]{2,}(?:\s[A-Z]{2,})*\s\(Reuters\)\s-|\(Reuters\))", "", regex=True
)

# Remove patterns like "Featured image via author name / image place"
df["text"] = df["text"].str.replace(r"Featured image via .+?\.($|\s)", "", regex=True)

df = df[["text", "label"]]

# Split the data into train (60%), validation (20%), and test (20%) sets
train_val, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(
    train_val, test_size=0.25, random_state=42
)  # 0.25 * 0.8 = 0.2

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model.to(device)

if __name__ == "__main__":
    tokenized_data_dir = f"./output/version_{version}/tokenized_data_{version}"

    # Reuse tokenized data if it already exists on disk; otherwise tokenize and cache it
    try:
        print("Loading tokenized data from disk...")
        train_dataset = load_from_disk(f"{tokenized_data_dir}/train")
        val_dataset = load_from_disk(f"{tokenized_data_dir}/validation")
        test_dataset = load_from_disk(f"{tokenized_data_dir}/test")
    except FileNotFoundError:
        print("Tokenizing...")

        # Convert DataFrames to Hugging Face Datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)

        def tokenize(examples):
            return tokenizer(
                examples["text"], padding=True, truncation=True, max_length=512
            )

        # Apply tokenization using map with multiprocessing
        train_dataset = train_dataset.map(tokenize, batched=True, num_proc=8)
        val_dataset = val_dataset.map(tokenize, batched=True, num_proc=8)
        test_dataset = test_dataset.map(tokenize, batched=True, num_proc=8)

        # Save the tokenized data
        os.makedirs(tokenized_data_dir, exist_ok=True)
        train_dataset.save_to_disk(f"{tokenized_data_dir}/train")
        val_dataset.save_to_disk(f"{tokenized_data_dir}/validation")
        test_dataset.save_to_disk(f"{tokenized_data_dir}/test")

        print("Finished tokenizing.")

    # Set format for PyTorch (applies whether the data was just tokenized or loaded from disk)
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./output/version_{version}",
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"./logs/version_{version}",
        logging_steps=50,
        eval_steps=1000,
        save_steps=1000,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        save_total_limit=2,
    )

    # Trainer with compute_metrics
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the best model (and the tokenizer, so the directory is self-contained)
    model_save_path = f"./output/version_{version}/best_model_{version}"
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    # Evaluation on the held-out test set
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    # Use the positive-class probability (not the hard argmax labels) for AUC-ROC
    probs = torch.softmax(
        torch.from_numpy(predictions.predictions), dim=-1
    )[:, 1].numpy()

    test_accuracy = accuracy_score(predictions.label_ids, preds)
    test_f1 = f1_score(predictions.label_ids, preds)
    test_auc_roc = roc_auc_score(predictions.label_ids, probs)

    print(f"Test Set Accuracy: {test_accuracy}")
    print(f"Test Set F1 Score: {test_f1}")
    print(f"Test Set AUC-ROC: {test_auc_roc}")

    # Confusion matrix on the test set
    conf_matrix = confusion_matrix(predictions.label_ids, preds)
    conf_matrix_df = pd.DataFrame(conf_matrix)
    conf_matrix_df.to_csv(
        f"./output/version_{version}/confusion_matrix_data_{version}.csv", index=False
    )

    # Extract training metrics from the Trainer's log history.
    # Training-loss entries and evaluation entries are logged as separate rows,
    # so align them on the global step before exporting.
    log_df = pd.DataFrame(trainer.state.log_history)
    train_logs = log_df.dropna(subset=["loss"])[["step", "epoch", "loss"]].rename(
        columns={"loss": "train_loss"}
    )
    eval_logs = log_df.dropna(subset=["eval_loss"])[
        ["step", "eval_loss", "eval_accuracy"]
    ].rename(columns={"eval_loss": "val_loss", "eval_accuracy": "val_accuracy"})
    metrics = train_logs.merge(eval_logs, on="step", how="outer").sort_values("step")
    metrics.to_csv(
        f"./output/version_{version}/training_metrics_{version}.csv", index=False
    )
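
# Example usage (a minimal sketch, not part of the training run above): once the
# script has finished, the checkpoint saved in model_save_path can be reloaded for
# inference. The path below is illustrative and assumes version = 3; adjust it to
# match the actual output directory.
#
#     from transformers import pipeline
#
#     clf = pipeline(
#         "text-classification",
#         model="./output/version_3/best_model_3",
#         tokenizer="./output/version_3/best_model_3",
#     )
#     print(clf("Breaking: example headline followed by the article body..."))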