
OOM error when training on multiple GPUs

#2 by tobiszuts - opened

Hi NVIDIA Team,

I’m having issues fine-tuning the parakeet-ctc-0.6b model on multiple GPUs. I can train with batch_size = 8 on a single GPU, but only batch_size = 2 on 4 GPUs, and on 8 GPUs it won’t run even with batch_size = 1. Since each DDP process should only see its own per-device batch, I’d expect per-GPU memory use to stay roughly flat as I add GPUs. The OOM error message always looks something like this (the numbers vary slightly between runs):

torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0
has a total capacity of 15.77 GiB of which 3.38 MiB is free. Including non-PyTorch memory, 
this process has 14.70 GiB memory in use. Process 79480 has 364.00 MiB memory in use. 
Process 79481 has 364.00 MiB memory in use. Process 79479 has 364.00 MiB memory in 
use. Of the allocated memory 13.85 GiB is allocated by PyTorch, and 106.11 MiB is 
reserved by PyTorch but unallocated. If reserved but unallocated memory is large try 
setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  
See documentation for Memory Management 
(https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
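Note that besides the main process at 14.70 GiB, the trace shows three other training processes each holding 364 MiB on GPU 0. To see what each process is actually holding, I log allocator stats with a small helper like this (a throwaway sketch; the name log_gpu_memory and where I call it are just my choices, it isn't part of the training script below):

import os
import torch

def log_gpu_memory(tag: str) -> None:
    """Print allocator stats for the GPU this process is bound to."""
    dev = torch.cuda.current_device()
    free, total = torch.cuda.mem_get_info(dev)
    rank = os.environ.get("LOCAL_RANK", "0")  # set by Lightning's DDP launcher
    print(
        f"[{tag}] rank {rank} / cuda:{dev}: "
        f"allocated={torch.cuda.memory_allocated(dev) / 2**30:.2f} GiB, "
        f"reserved={torch.cuda.memory_reserved(dev) / 2**30:.2f} GiB, "
        f"free={free / 2**30:.2f} of {total / 2**30:.2f} GiB"
    )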

I’m using an AWS p3.16xlarge instance with 8 V100 GPUs.

Here is the training code:

import argparse
import copy

# To avoid memory errors, this has to be set before torch is imported
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import nemo.collections.asr as nemo_asr
import pytorch_lightning as ptl
from omegaconf import OmegaConf, open_dict
from nemo.utils import exp_manager

def setup_model_config(model, args):
    """Set up and return the model configuration."""
    cfg = copy.deepcopy(model.cfg)

    with open_dict(cfg):
        # Train dataset
        cfg.train_ds.manifest_filepath = args.train_manifest
        cfg.train_ds.normalize_transcripts = True
        cfg.train_ds.batch_size = 8         # this parameter changes
        cfg.train_ds.num_workers = 8
        cfg.train_ds.pin_memory = True
        cfg.train_ds.trim_silence = True
        cfg.train_ds.shuffle = True
        cfg.train_ds.is_tarred = False
        cfg.train_ds.augmentor = {}

        # Validation dataset 
        cfg.validation_ds.manifest_filepath = args.val_manifest
        cfg.validation_ds.normalize_transcripts = True
        cfg.validation_ds.batch_size = 8    # same as for train_ds.batch_size
        cfg.validation_ds.num_workers = 8
        cfg.validation_ds.pin_memory = True
        cfg.validation_ds.trim_silence = True

    return cfg

def setup_optimizer_config(model):
    """Set up and return the optimizer configuration."""
    with open_dict(model.cfg.optim):
        model.cfg.optim.lr = 0.05 
        model.cfg.optim.weight_decay = 0.001
        model.cfg.optim.sched.warmup_steps = None
        model.cfg.optim.sched.warmup_ratio = 0.10
        model.cfg.optim.sched.min_lr = 1e-9

    return model.cfg.optim

def setup_specaugment_config(model):
    """Set up and return the SpecAugment configuration."""
    with open_dict(model.cfg.spec_augment):
        model.cfg.spec_augment.freq_masks = 2
        model.cfg.spec_augment.freq_width = 25
        model.cfg.spec_augment.time_masks = 10
        model.cfg.spec_augment.time_width = 0.05

    return model.cfg.spec_augment

def setup_trainer():
    """Set up and return the PyTorch Lightning trainer."""
    return ptl.Trainer(
        devices=4,                    # this parameter changes (1, 4, or 8)
        accelerator='gpu',
        max_epochs=50,
        accumulate_grad_batches=16,   
        enable_checkpointing=False,
        logger=False,
        log_every_n_steps=100,
        val_check_interval=0.5,
    )

def setup_exp_manager_config(trainer):
    """Set up and return the experiment manager configuration."""

    config = exp_manager.ExpManagerConfig(
        exp_dir='experiments/',
        name="oom_testing2-bs8-d4", 
        use_datetime_version=True,
        checkpoint_callback_params=exp_manager.CallbackParams(
            monitor="val_wer",
            mode="min",
            always_save_nemo=True,
            save_best_model=True,
        ),
    )

    return OmegaConf.structured(config)

def main(args):
    """Main function to run the script."""
    model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name=args.model_name)

    cfg = setup_model_config(model, args)
    model.setup_training_data(cfg.train_ds)
    model.setup_validation_data(cfg.validation_ds)

    model.cfg.optim = setup_optimizer_config(model)
    model.cfg.spec_augment = setup_specaugment_config(model)
    model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)

    trainer = setup_trainer()
    model.set_trainer(trainer)
    # model.cfg = model._c

    config = setup_exp_manager_config(trainer)
    exp_manager.exp_manager(trainer, config)

    trainer.fit(model)

if __name__ == "__main__":
    # Command-line args for the manifest files and the pretrained model name
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-manifest", help="Path to training manifest file", required=True)
    parser.add_argument("--val-manifest", help="Path to validation manifest file", required=True)
    parser.add_argument("--model-name", help="Name or path of the pretrained model", default="nvidia/parakeet-ctc-0.6b")
    args = parser.parse_args()
    main(args)
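For a rough sense of the static footprint, here is the back-of-the-envelope check I ran separately (a sketch: the 4 bytes per parameter and the extra gradient/Adam copies are my assumptions about plain fp32 training):

import torch
import nemo.collections.asr as nemo_asr

# Load on CPU so the check doesn't itself touch GPU memory.
model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    model_name="nvidia/parakeet-ctc-0.6b", map_location=torch.device("cpu")
)
n_params = sum(p.numel() for p in model.parameters())
weights_gib = n_params * 4 / 2**30  # fp32 weights
# Gradients add ~1x and Adam state ~2x on top of the weights,
# so roughly 4x the weight footprint before any activations.
print(f"{n_params / 1e6:.0f}M params -> ~{weights_gib:.1f} GiB weights, "
      f"~{4 * weights_gib:.1f} GiB with grads + optimizer state")

For the 0.6B model that works out to around 2.2 GiB of weights and roughly 9 GiB including gradients and Adam state, before any activations, CUDA context, or dataloader overhead, on a 15.77 GiB card.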
