import torch
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging

import warnings
import time
import pickle
warnings.filterwarnings("ignore")

logging.set_verbosity_error()

# Function to set seed for reproducibility
def seed_everything(seed_value):
    np.random.seed(seed_value)  # Set seed for numpy random numbers
    torch.manual_seed(seed_value)  # Set seed for PyTorch random numbers

    if torch.cuda.is_available():  # If CUDA is available, set CUDA seed
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior
        torch.backends.cudnn.benchmark = False  # Disable cudnn benchmarking so runs stay reproducible

seed_everything(86)  # Set seed value for reproducibility

model_name = "bluenguyen/longformer-phobert-base-4096"  # Pretrained model name
max_len = 512  # Maximum sequence length for the tokenizer (512 here; 256 is enough for phobert-base)
n_classes = 13  # Number of output classes
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)  # Load tokenizer

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # Set device to GPU if available, otherwise CPU
EPOCHS = 5  # Number of training epochs
N_SPLITS = 5  # Number of folds for cross-validation

TRAIN_PATH = "data/train_data_162k.json"  
TEST_PATH = "data/test_data_162k.json"  
VAL_PATH = "data/val_data_162k.json"  

# Function to read data from JSON file
def get_data(path):
    df = pd.read_json(path, lines=True)
    return df

# Read the data from JSON files
train_df = get_data(TRAIN_PATH)
test_df = get_data(TEST_PATH)
valid_df = get_data(VAL_PATH)

# Combine train and validation data
train_df = pd.concat([train_df, valid_df], ignore_index=True)

# Apply StratifiedKFold
skf = StratifiedKFold(n_splits=N_SPLITS)
for fold, (_, val_) in enumerate(skf.split(X=train_df, y=train_df.category)):
    train_df.loc[val_, "kfold"] = fold

class NewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        To customize dataset, inherit from Dataset class and implement
        __len__ & __getitem__
        __getitem__ should return
            data:
                input_ids
                attention_masks
                text
                targets
        """
        row = self.df.iloc[index]
        text, label = self.get_input_data(row)

        # encode_plus will:
        # (1) split the text into tokens
        # (2) add the model's special start and end tokens
        # (3) truncate/pad the sequence to max length
        # (4) map tokens to their IDs
        # (5) create the attention mask
        # (6) return a dictionary of outputs
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_masks': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long),
        }


    def labelencoder(self, text):
        label_map = {
            'Cong nghe': 0, 'Doi song': 1, 'Giai tri': 2, 'Giao duc': 3, 'Khoa hoc': 4,
            'Kinh te': 5, 'Nha dat': 6, 'Phap luat': 7, 'The gioi': 8, 'The thao': 9,
            'Van hoa': 10, 'Xa hoi': 11, 'Xe co': 12
        }
        return label_map.get(text, -1)

    def get_input_data(self, row):
        text = row['processed_content']
        label = self.labelencoder(row['category'])
        return text, label

class NewsClassifier(nn.Module):
    def __init__(self, n_classes, model_name):
        super(NewsClassifier, self).__init__()
        # Load a pre-trained BERT model
        self.bert = AutoModel.from_pretrained(model_name)
        # Dropout layer to prevent overfitting
        self.drop = nn.Dropout(p=0.3)
        # Fully-connected layer to convert BERT's hidden state to the number of classes to predict
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        # Initialize the fully-connected layer: small-std normal weights, zero bias
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        # Run the encoder; with return_dict=False it returns (last_hidden_state, pooled_output)
        last_hidden_state, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )
        # Apply dropout to the pooled representation
        x = self.drop(pooled_output)
        # Pass through the fully-connected layer to get predictions
        x = self.fc(x)
        return x

def prepare_loaders(df, fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = NewsDataset(df_train, tokenizer, max_len)
    valid_dataset = NewsDataset(df_valid, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, num_workers=2)  # No need to shuffle validation data

    return train_loader, valid_loader

# Function to train the model for one epoch
def train(model, criterion, optimizer, train_loader, lr_scheduler):
    model.train()  # Set the model to training mode
    losses = []  # List to store losses during training
    correct = 0  # Variable to store number of correct predictions

    # Iterate over batches in the training data loader
    for batch_idx, data in enumerate(train_loader):
        input_ids = data['input_ids'].to(device)  # Move input_ids to GPU/CPU
        attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
        targets = data['targets'].to(device)  # Move targets to GPU/CPU

        optimizer.zero_grad()  # Clear gradients from previous iteration
        outputs = model(  # Forward pass through the model
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs, targets)  # Calculate the loss
        _, pred = torch.max(outputs, dim=1)  # Get the predicted labels

        correct += torch.sum(pred == targets)  # Count correct predictions
        losses.append(loss.item())  # Append the current loss value to losses list
        loss.backward()  # Backpropagation: compute gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Clip gradients to prevent exploding gradients
        optimizer.step()  # Update model parameters
        lr_scheduler.step()  # Update learning rate scheduler

        # Print training progress every 1000 batches
        if batch_idx % 1000 == 0:
            print(f'Batch {batch_idx}/{len(train_loader)} - Loss: {loss.item():.4f}, Accuracy: {correct.double() / ((batch_idx + 1) * train_loader.batch_size):.4f}')

    train_accuracy = correct.double() / len(train_loader.dataset)  # Calculate training accuracy
    avg_loss = np.mean(losses)  # Calculate average loss
    print(f'Train Accuracy: {train_accuracy:.4f} Loss: {avg_loss:.4f}')

# Function to evaluate the model
def evaluate(model, criterion, valid_loader, test_loader=None):  # Named 'evaluate' to avoid shadowing the built-in eval
    model.eval()  # Set the model to evaluation mode
    losses = []  # List to store losses during evaluation
    correct = 0  # Variable to store number of correct predictions

    with torch.no_grad():  # Disable gradient calculation for evaluation
        data_loader = test_loader if test_loader else valid_loader  # Choose between test and validation data loader
        for batch_idx, data in enumerate(data_loader):
            input_ids = data['input_ids'].to(device)  # Move input_ids to GPU/CPU
            attention_mask = data['attention_masks'].to(device)  # Move attention_mask to GPU/CPU
            targets = data['targets'].to(device)  # Move targets to GPU/CPU

            outputs = model(  # Forward pass through the model
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = criterion(outputs, targets)  # Calculate the loss
            _, pred = torch.max(outputs, dim=1)  # Get the predicted labels

            correct += torch.sum(pred == targets)  # Count correct predictions
            losses.append(loss.item())  # Append the current loss value to losses list

    dataset_size = len(test_loader.dataset) if test_loader else len(valid_loader.dataset)  # Determine dataset size
    accuracy = correct.double() / dataset_size  # Calculate accuracy
    avg_loss = np.mean(losses)  # Calculate average loss

    # Print evaluation results (either test or validation)
    if test_loader:
        print(f'Test Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')
    else:
        print(f'Valid Accuracy: {accuracy:.4f} Loss: {avg_loss:.4f}')

    return accuracy  # Return accuracy for further analysis or logging

total_start_time = time.time()

# Main training loop
for fold in range(skf.n_splits):
    print(f'----------- Fold: {fold + 1} ------------------')
    train_loader, valid_loader = prepare_loaders(train_df, fold=fold)
    model = NewsClassifier(n_classes=n_classes, model_name=model_name).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * EPOCHS
    )
    best_acc = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)

        train(model, criterion, optimizer, train_loader, lr_scheduler)
        val_acc = evaluate(model, criterion, valid_loader)

        if val_acc > best_acc:
            torch.save(model.state_dict(), f'phobert_fold{fold + 1}.pth')
            best_acc = val_acc
        print(f'Best Accuracy for Fold {fold + 1}: {best_acc:.4f}')
        print()
    print(f'Finished Fold {fold + 1} with Best Accuracy: {best_acc:.4f}')
    print('--------------------------------------')


total_end_time = time.time()

total_duration = total_end_time - total_start_time
print(f'Total training time: {total_duration:.2f} seconds')
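
# Optional: a minimal single-text inference sketch. It assumes a fold checkpoint saved
# above (e.g. phobert_fold1.pth) exists on disk and reuses the globals defined earlier
# (tokenizer, max_len, n_classes, model_name, device); the checkpoint path is illustrative.
def predict(text, checkpoint_path='phobert_fold1.pth'):
    model = NewsClassifier(n_classes=n_classes, model_name=model_name).to(device)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()  # Switch to evaluation mode (disables dropout)

    # Tokenize exactly as during training
    encoding = tokenizer.encode_plus(
        text,
        truncation=True,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device)
        )
    return torch.argmax(logits, dim=1).item()  # Class index; see labelencoder for the mapping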