import torch
import pandas as pd
from torch.optim import AdamW  # transformers' own AdamW is deprecated; use the torch.optim version
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification

dataset_path = "BIAS_DATASET.csv"
df = pd.read_csv(dataset_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Extract text and labels from your dataset
texts = df['TEXT'].tolist()
text_labels = df['LABEL'].tolist()

# Create a label mapping from text labels to integers
label_mapping = {
    "gender_bias": 0,
    "religion_bias": 1,
    "country_bias": 2,
    "non_bias": 3,
}

# Encode text labels as integers using the label mapping
labels = [label_mapping[label] for label in text_labels]

# Tokenize the text data and create input tensors
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
input_ids = []
attention_masks = []
for text in texts:
    encoding = tokenizer(text, padding='max_length', truncation=True,
                         max_length=128, return_tensors='pt')
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# BERT model
num_classes = 4
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
model_bert.to(device)

# Create data loaders for the BERT model
batch_size = 8
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training settings. No separate criterion is needed:
# BertForSequenceClassification computes the cross-entropy loss internally
# whenever `labels` are passed to the forward call.
optimizer_bert = AdamW(model_bert.parameters(), lr=2e-5)
num_epochs_bert = 3

# Training loop for the BERT model
for epoch in range(num_epochs_bert):
    model_bert.train()
    total_loss_bert = 0.0
    for inputs, masks, labels in train_dataloader:
        optimizer_bert.zero_grad()
        # Move the batch to the device (GPU/CPU)
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)
        outputs_bert = model_bert(inputs, attention_mask=masks, labels=labels)
        # Training loss
        loss_bert = outputs_bert.loss
        # Backpropagation
        loss_bert.backward()
        optimizer_bert.step()
        total_loss_bert += loss_bert.item()

    average_loss_bert = total_loss_bert / len(train_dataloader)

    # Validation for the BERT model
    model_bert.eval()
    val_losses_bert = []
    val_predictions_bert = []
    with torch.no_grad():
        for inputs, masks, labels in val_dataloader:
            # Move the batch to the device (GPU/CPU)
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)
            # Make predictions
            outputs_bert = model_bert(inputs, attention_mask=masks, labels=labels)
            # Validation loss (collect plain floats via .item(), not tensors)
            val_losses_bert.append(outputs_bert.loss.item())
            # Predicted class per example, for the accuracy metric
            val_predictions_bert.extend(outputs_bert.logits.argmax(dim=1).tolist())

    val_loss_bert = sum(val_losses_bert) / len(val_losses_bert)

    # Calculate validation accuracy (accuracy_score accepts array-likes directly)
    accuracy_bert = accuracy_score(val_labels.numpy(), val_predictions_bert)
    print(f"Epoch {epoch + 1}/{num_epochs_bert} - BERT Model - "
          f"Training Loss: {average_loss_bert:.4f} - "
          f"Validation Loss: {val_loss_bert:.4f} - "
          f"Val Accuracy: {accuracy_bert:.4f}")

# Save the fine-tuned weights; a state_dict is more portable than pickling the whole module
torch.save(model_bert.state_dict(), "bert_model.pth")
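
# --- Optional inference sketch ---
# A minimal example (not part of the original script) of how the saved
# weights could be reloaded and used to classify a new sentence. The
# function name `predict_bias` and the sample input are illustrative
# assumptions; the sketch reuses `tokenizer`, `label_mapping`,
# `num_classes`, and `device` from the script above.
id_to_label = {v: k for k, v in label_mapping.items()}

def predict_bias(text, model):
    """Return the predicted bias label for a single piece of text."""
    encoding = tokenizer(text, padding='max_length', truncation=True,
                         max_length=128, return_tensors='pt')
    with torch.no_grad():
        logits = model(encoding['input_ids'].to(device),
                       attention_mask=encoding['attention_mask'].to(device)).logits
    return id_to_label[logits.argmax(dim=1).item()]

# Rebuild the architecture, then load the fine-tuned weights
model_loaded = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
model_loaded.load_state_dict(torch.load("bert_model.pth", map_location=device))
model_loaded.to(device)
model_loaded.eval()

print(predict_bias("Example sentence to classify.", model_loaded))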