import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging
import warnings
import time
import pickle

warnings.filterwarnings("ignore")
logging.set_verbosity_error()


# Function to set seeds for reproducibility
def seed_everything(seed_value):
    np.random.seed(seed_value)      # Set seed for NumPy random numbers
    torch.manual_seed(seed_value)   # Set seed for PyTorch random numbers
    if torch.cuda.is_available():   # If CUDA is available, set the CUDA seeds as well
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True  # Ensure deterministic cuDNN behavior
        torch.backends.cudnn.benchmark = False     # Disable cuDNN benchmarking so runs stay reproducible


seed_everything(86)  # Set seed value for reproducibility

model_name = "bluenguyen/longformer-phobert-base-4096"  # Pretrained model name
max_len = 512   # Maximum sequence length for the tokenizer (512 here; 256 is enough for phobert-base)
n_classes = 13  # Number of output classes

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)     # Load tokenizer
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')   # Use GPU if available, otherwise CPU

EPOCHS = 5    # Number of training epochs
N_SPLITS = 5  # Number of folds for cross-validation

TRAIN_PATH = "data/train_data_162k.json"
TEST_PATH = "data/test_data_162k.json"
VAL_PATH = "data/val_data_162k.json"


# Function to read data from a JSON-lines file
def get_data(path):
    df = pd.read_json(path, lines=True)
    return df


# Read the data from the JSON files
train_df = get_data(TRAIN_PATH)
test_df = get_data(TEST_PATH)
valid_df = get_data(VAL_PATH)

# Combine train and validation data
train_df = pd.concat([train_df, valid_df], ignore_index=True)

# Apply StratifiedKFold and record each row's fold assignment
skf = StratifiedKFold(n_splits=N_SPLITS)
for fold, (_, val_) in enumerate(skf.split(X=train_df, y=train_df.category)):
    train_df.loc[val_, "kfold"] = fold


class NewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        To customize a dataset, inherit from the Dataset class and implement
        __len__ and __getitem__. Here __getitem__ returns a dict with:
            input_ids
            attention_masks
            text
            targets
        """
        row = self.df.iloc[index]
        text, label = self.get_input_data(row)

        # encode_plus will:
        # (1) split the text into tokens
        # (2) add the '[CLS]' and '[SEP]' tokens to the start and end
        # (3) truncate/pad the sequence to max length
        # (4) map tokens to their IDs
        # (5) create the attention mask
        # (6) return a dictionary of outputs
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_masks': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long),
        }

    def labelencoder(self, text):
        label_map = {
            'Cong nghe': 0,
            'Doi song': 1,
            'Giai tri': 2,
            'Giao duc': 3,
            'Khoa hoc': 4,
            'Kinh te': 5,
            'Nha dat': 6,
            'Phap luat': 7,
            'The gioi': 8,
            'The thao': 9,
            'Van hoa': 10,
            'Xa hoi': 11,
            'Xe co': 12
        }
        return label_map.get(text, -1)

    def get_input_data(self, row):
        text = row['processed_content']
        label = self.labelencoder(row['category'])
        return text, label
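# Optional sanity check: inspect one encoded example from the combined training frame.
# This is only an illustrative snippet (not part of the training pipeline); it assumes
# train_df has the 'processed_content' and 'category' columns used above and can be
# removed without affecting training.
sample_ds = NewsDataset(train_df, tokenizer, max_len)
sample = sample_ds[0]
print(sample['input_ids'].shape)        # torch.Size([max_len])
print(sample['attention_masks'].shape)  # torch.Size([max_len])
print(sample['targets'])                # encoded label, e.g. tensor(5) for 'Kinh te'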
class NewsClassifier(nn.Module):
    def __init__(self, n_classes, model_name):
        super(NewsClassifier, self).__init__()
        # Load a pre-trained BERT model
        self.bert = AutoModel.from_pretrained(model_name)
        # Dropout layer to prevent overfitting
        self.drop = nn.Dropout(p=0.3)
        # Fully-connected layer to map BERT's hidden state to the number of classes
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        # Initialize the fully-connected layer's weights and biases from a normal distribution
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        # With return_dict=False the model returns a tuple: (last_hidden_state, pooled output)
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        # Apply dropout to the pooled output
        x = self.drop(output)
        # Pass through the fully-connected layer to get the class logits
        x = self.fc(x)
        return x


def prepare_loaders(df, fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = NewsDataset(df_train, tokenizer, max_len)
    valid_dataset = NewsDataset(df_valid, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=True, num_workers=2)
    return train_loader, valid_loader


# Function to train the model for one epoch
def train(model, criterion, optimizer, train_loader, lr_scheduler):
    model.train()  # Set the model to training mode
    losses = []    # Losses recorded during training
    correct = 0    # Number of correct predictions

    # Iterate over batches in the training data loader
    for batch_idx, data in enumerate(train_loader):
        input_ids = data['input_ids'].to(device)             # Move input_ids to GPU/CPU
        attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
        targets = data['targets'].to(device)                 # Move targets to GPU/CPU

        optimizer.zero_grad()  # Clear gradients from the previous iteration
        outputs = model(       # Forward pass through the model
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs, targets)   # Calculate the loss
        _, pred = torch.max(outputs, dim=1)  # Get the predicted labels

        correct += torch.sum(pred == targets)  # Count correct predictions
        losses.append(loss.item())             # Record the current loss value

        loss.backward()                                              # Backpropagation: compute gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)   # Clip gradients to prevent exploding gradients
        optimizer.step()                                             # Update model parameters
        lr_scheduler.step()                                          # Update the learning rate schedule

        # Print training progress every 1000 batches
        if batch_idx % 1000 == 0:
            print(f'Batch {batch_idx}/{len(train_loader)} - Loss: {loss.item():.4f}, '
                  f'Accuracy: {correct.double() / ((batch_idx + 1) * train_loader.batch_size):.4f}')

    train_accuracy = correct.double() / len(train_loader.dataset)  # Training accuracy over the full epoch
    avg_loss = np.mean(losses)                                     # Average loss over the epoch
    print(f'Train Accuracy: {train_accuracy:.4f} Loss: {avg_loss:.4f}')
# Function to evaluate the model on the validation (or test) set
def eval(model, criterion, valid_loader, test_loader=None):
    model.eval()  # Set the model to evaluation mode
    losses = []   # Losses recorded during evaluation
    correct = 0   # Number of correct predictions

    with torch.no_grad():  # Disable gradient calculation for evaluation
        data_loader = test_loader if test_loader else valid_loader  # Choose between test and validation data loader
        for batch_idx, data in enumerate(data_loader):
            input_ids = data['input_ids'].to(device)             # Move input_ids to GPU/CPU
            attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
            targets = data['targets'].to(device)                 # Move targets to GPU/CPU

            outputs = model(  # Forward pass through the model
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = criterion(outputs, targets)   # Calculate the loss
            _, pred = torch.max(outputs, dim=1)  # Get the predicted labels

            correct += torch.sum(pred == targets)  # Count correct predictions
            losses.append(loss.item())             # Record the current loss value

    dataset_size = len(test_loader.dataset) if test_loader else len(valid_loader.dataset)  # Determine dataset size
    accuracy = correct.double() / dataset_size  # Calculate accuracy
    avg_loss = np.mean(losses)                  # Calculate average loss

    # Print evaluation results (either test or validation)
    if test_loader:
        print(f'Test Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')
    else:
        print(f'Valid Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')

    return accuracy  # Return accuracy for further analysis or logging


total_start_time = time.time()

# Main training loop: train one model per fold
for fold in range(skf.n_splits):
    print(f'----------- Fold: {fold + 1} ------------------')
    train_loader, valid_loader = prepare_loaders(train_df, fold=fold)

    model = NewsClassifier(n_classes=n_classes, model_name=model_name).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * EPOCHS
    )

    best_acc = 0
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)

        train(model, criterion, optimizer, train_loader, lr_scheduler)
        val_acc = eval(model, criterion, valid_loader)

        # Keep the checkpoint with the best validation accuracy for this fold
        if val_acc > best_acc:
            torch.save(model.state_dict(), f'phobert_fold{fold + 1}.pth')
            best_acc = val_acc

        print(f'Best Accuracy for Fold {fold + 1}: {best_acc:.4f}')
        print()

    print(f'Finished Fold {fold + 1} with Best Accuracy: {best_acc:.4f}')
    print('--------------------------------------')

total_end_time = time.time()
total_duration = total_end_time - total_start_time
print(f'Total training time: {total_duration:.2f} seconds')
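# Optional: evaluate a saved checkpoint on the held-out test set.
# A minimal sketch, not part of the cross-validation loop above: it assumes training has
# finished and that 'phobert_fold1.pth' (the best fold-1 weights) exists on disk; adjust
# the checkpoint name to the fold you want to inspect.
test_dataset = NewsDataset(test_df, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=2)

model = NewsClassifier(n_classes=n_classes, model_name=model_name).to(device)
model.load_state_dict(torch.load('phobert_fold1.pth', map_location=device))
model.eval()

all_preds, all_targets = [], []
with torch.no_grad():
    for data in test_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_masks'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_targets.extend(data['targets'].tolist())

print(classification_report(all_targets, all_preds, digits=4))
print(confusion_matrix(all_targets, all_preds))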