"""Back-of-the-envelope GPU memory calculator for transformer training.

The ``vocab``/``transformer*``/``attention_layer``/``ffn`` helpers count
*elements* (parameters and stored activations).  ``calculate_memory`` turns
those counts into bytes according to the selected precision level and
optimizer, applies the ZeRO / offload adjustments and reports gibibytes.
"""
import argparse
import math

try:
    from models import models
except ModuleNotFoundError as exc:
    # Only tolerate the preset module itself being absent; any other import
    # failure (e.g. a broken dependency inside it) must still surface.
    if exc.name != 'models':
        raise
    # Without presets the calculator still works from explicit CLI flags.
    models = {}


def get_GB(nbytes):
    """Convert a byte count to gibibytes (1024**3 bytes)."""
    return nbytes/(1024**3)


def vocab(bsz, seqlen, dmodel, vocab_dim):
    """Count elements for the embedding / output-projection part of the model.

    Args:
        bsz: Batch size.
        seqlen: Sequence length.
        dmodel: Model (embedding) width.
        vocab_dim: Vocabulary size.

    Returns:
        ``(model, grad)``: number of weight elements and number of stored
        activation elements.
    """
    # assumes tied embeddings
    tokens = seqlen*bsz
    w = vocab_dim*dmodel
    emb = tokens*dmodel
    emb_norm = tokens*dmodel
    pos_emb = tokens*dmodel
    out_emb = tokens*vocab_dim
    softmax_emb = tokens*vocab_dim

    # NOTE(review): the extra ``dmodel`` is presumably the embedding norm
    # weight -- confirm.
    model = w + dmodel
    grad = emb + emb_norm + pos_emb + out_emb + softmax_emb
    return model, grad


def transformer(bsz, seqlen, dmodel, nlayers, vocab_type, dhid=None,
                checkpoint=False, shared_groups=None):
    """Count elements for a full transformer: ``nlayers`` layers plus vocab.

    Args:
        bsz: Batch size.
        seqlen: Sequence length.
        dmodel: Model width.
        nlayers: Number of transformer layers.
        vocab_type: Vocabulary size (forwarded to ``vocab`` as ``vocab_dim``).
        dhid: FFN hidden width; defaults to ``4*dmodel`` when ``None``.
        checkpoint: Whether gradient checkpointing is used.
        shared_groups: If given, layer weights are counted as if only this
            many layer groups had distinct parameters.

    Returns:
        ``(model, grad)`` element counts for weights and stored activations.
    """
    if dhid is None:
        dhid = 4*dmodel

    # Every layer has the same cost, so compute it once and scale.
    layer_model, layer_grad = transformer_layer(
        bsz, seqlen, dmodel, dhid, checkpoint=checkpoint)
    layer_count = max(nlayers, 0)  # a non-positive layer count adds nothing
    model = layer_count*layer_model
    grad = layer_count*layer_grad

    if shared_groups is not None:
        model = model / nlayers * shared_groups

    m, g = vocab(bsz, seqlen, dmodel, vocab_type)
    model += m
    grad += g

    return model, grad


def layer_norm(bsz, seqlen, dmodel):
    """Return ``(weights, stored activations)`` element counts for a layer norm."""
    w = dmodel
    x_grad = bsz*seqlen*dmodel
    return w, x_grad


def transformer_layer(bsz, seqlen, dmodel, dhid, checkpoint=False):
    """Count elements for one transformer layer (FFN + attention + layer norm).

    With ``checkpoint=True`` the stored activations collapse to a single
    ``bsz*seqlen*dmodel`` tensor per layer.

    Returns:
        ``(model, grad)`` element counts for weights and stored activations.
    """
    ffn_model, ffn_grad = ffn(bsz, seqlen, dmodel, dhid, 'gelu')
    attn_model, attn_grad = attention_layer(bsz, seqlen, dmodel)
    norm_model, norm_grad = layer_norm(bsz, seqlen, dmodel)

    model = ffn_model + attn_model + norm_model
    # NOTE(review): the 3 / 5.0 / 1.0 multipliers are not explained in the
    # code; presumably empirical correction factors -- confirm.
    grad = ffn_grad*3 + attn_grad*5.0 + norm_grad*1.0

    if checkpoint:
        grad = bsz * seqlen * dmodel

    return model, grad


def attention_layer(bsz, seqlen, dmodel):
    """Count elements for one self-attention block.

    Returns:
        ``(model, grad)`` element counts for weights and stored activations.
    """
    tokens = bsz*seqlen

    w_proj = dmodel*3*dmodel
    w_out = dmodel*dmodel

    x_residual = tokens*dmodel
    x_proj = tokens*dmodel*3
    # The contiguous copies are currently counted as free.
    x_proj_contiguous = 0
    x_qscaled = tokens*dmodel
    # we need to store both input sequence directions for gradient computation
    x_qk = tokens*seqlen*2
    x_softmax = tokens*seqlen
    # we need to store both input sequence directions for gradient computation
    x_softmax_v = tokens*dmodel*2
    x_out_contiguous = 0
    x_out = tokens*dmodel

    model = w_proj + w_out
    grad = (x_residual + x_proj + x_proj_contiguous + x_qscaled + x_qk
            + x_softmax + x_softmax_v + x_out_contiguous + x_out)
    return model, grad


def ffn(bsz, seqlen, dmodel, dhid, func='relu'):
    """Count elements for one feed-forward block.

    Args:
        bsz: Batch size.
        seqlen: Sequence length.
        dmodel: Model width.
        dhid: Hidden width of the FFN.
        func: Activation name; anything other than ``'relu'`` doubles the
            hidden activation storage.

    Returns:
        ``(model, grad)`` element counts for weights and stored activations.
    """
    # out = linear(relu(linear(x), inplace=True)) + x
    w1 = dmodel*dhid
    w2 = dhid*dmodel
    model = w1 + w2

    x1 = bsz*seqlen*dhid
    if func != 'relu':
        x1 *= 2  # inplace not possible with most other functions
    x2 = bsz*seqlen*dmodel
    residual = bsz*seqlen*dmodel
    grad = x1 + x2 + residual

    return model, grad


OPTIMIZERS = ['adam', 'adafactor', 'adafactor-fac-only', '8-bit-adam', '16-bit-adam']

# Optimizer state size in bytes per parameter, as used by calculate_memory.
_OPTIMIZER_BYTES_PER_PARAM = {
    'adam': 8,
    '8-bit-adam': 2,
    '16-bit-adam': 4,
    'adafactor': 4,
}

# Settings that must be known (from flags or a preset) before computing.
_REQUIRED_SETTINGS = ('nlayers', 'seqlen', 'dmodel', 'vocab_size')

# Assumed transfer rates in GB/s, used for the per-update overhead estimate.
_INTERCONNECT_GB_PER_S = 25  # PCIe 4.0 infiniband (200 Gbit/s = 25 GB/s)
_OFFLOAD_FEW_GPUS_GB_PER_S = 12  # PCIe 3.0, 1-2x GPU setup (16 lanes, 16 GB/s theoretical)
_OFFLOAD_MANY_GPUS_GB_PER_S = 6  # PCIe 3.0, 4x GPU setup


def parse_args(args=None):
    """Parse command-line options.

    Args:
        args: Optional argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser('Memory calculator')

    parser.add_argument('--nlayers', type=int, help='The number of transformer layers.')
    parser.add_argument('--bsz', type=int, default=1, help='The batch size. Default: 1')
    parser.add_argument('--seqlen', type=int, help='The sequence length.')
    parser.add_argument('--dmodel', type=int, help='The core model size.')
    parser.add_argument('--dhid', type=int, default=None, help='The hidden size of the FFN layer. Default: 4x model size.')
    parser.add_argument('--fp16-level', type=str, default='O1', help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
    parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
    parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
    parser.add_argument('--vocab_size', type=int, default=None, help='The vocabulary to use.')
    parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
    parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
    parser.add_argument('--zero', type=int, default=0, help='The ZeRO level (1 optimizer, 2 optimizer+weights, 3 everything). Default: 0 (disabled)')
    parser.add_argument('--shared_groups', type=int, default=None, help='Number of shared layer groups (as in ALBERT). Defaults to no sharing.')
    parser.add_argument('--checkpoint', action='store_true', help='Use gradient checkpointing.')

    return parser.parse_args(args)


def calculate_memory(args):
    """Estimate training memory for the configuration in ``args``.

    When ``args.model`` names a preset, every attribute of ``args`` that is
    still ``None`` is filled in from that preset (``args`` is mutated).

    Args:
        args: Namespace as produced by ``parse_args``.

    Returns:
        A dict with ``parameters`` (element count), ``model``, ``grad``,
        ``wgrad``, ``optim``, ``total_mem`` and ``cpu_mem`` (all in GB),
        ``overhead`` (seconds per update) and ``args``.

    Raises:
        ValueError: If the preset, optimizer or fp16 level is not supported,
            or a required setting is missing.
    """
    if args.model != '':
        if args.model not in models:
            raise ValueError(f'{args.model} is not supported')
        for key, value in models[args.model].items():
            if getattr(args, key, None) is None:
                setattr(args, key, value)

    missing = [name for name in _REQUIRED_SETTINGS if getattr(args, name, None) is None]
    if missing:
        raise ValueError(
            'Missing required setting(s): {0}. Pass them explicitly or select '
            'a preset with --model.'.format(', '.join(missing)))

    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers,
                              args.vocab_size, args.dhid, args.checkpoint,
                              args.shared_groups)
    parameters = model

    # Optimizer state, in bytes.
    if args.optimizer in _OPTIMIZER_BYTES_PER_PARAM:
        optim = _OPTIMIZER_BYTES_PER_PARAM[args.optimizer]*model
    elif args.optimizer == 'adafactor-fac-only':
        # NOTE(review): log(#params) is the original's stand-in for the
        # factored state size -- confirm this approximation is intended.
        optim = math.log(model)
    else:
        raise ValueError(f'{args.optimizer} is not a supported optimizer')

    # Convert element counts to bytes according to the precision level.
    if args.fp16_level == 'O0':
        # fp32 weights
        wgrad = 4*model
        model = 4*model
        grad = 4*grad  # fp32
    elif args.fp16_level in ['O1', 'O2']:
        # fp16 weights + fp32 master weights
        wgrad = 2*model
        model = 4*model + (2*model)
        grad = 2*grad  # fp16
    elif args.fp16_level == 'O3':
        wgrad = 2*model
        model = 2*model  # fp16
        grad = 2*grad  # fp16
    else:
        raise ValueError(f'{args.fp16_level} is not a supported fp16 level')

    model = get_GB(model)
    grad = get_GB(grad)
    optim = get_GB(optim)
    wgrad = get_GB(wgrad)

    cpu_mem = 0
    overhead = 0

    # ZeRO: shard state across GPUs; without offload the sharded state has to
    # travel over the interconnect on every update.
    if args.zero == 1:
        if not args.offload:
            overhead += optim/_INTERCONNECT_GB_PER_S
        optim = optim / args.ngpus
    elif args.zero == 2:
        if not args.offload:
            overhead += optim/_INTERCONNECT_GB_PER_S
            overhead += wgrad/_INTERCONNECT_GB_PER_S
        optim = optim / args.ngpus
        wgrad = wgrad / args.ngpus
    elif args.zero == 3:
        if not args.offload:
            overhead += optim/_INTERCONNECT_GB_PER_S
            overhead += model/_INTERCONNECT_GB_PER_S
            overhead += wgrad/_INTERCONNECT_GB_PER_S
        optim = optim / args.ngpus
        model = model / args.ngpus
        wgrad = wgrad / args.ngpus

    if args.offload:
        # Optimizer state and weight gradients move to host memory.
        cpu_mem = optim + wgrad
        optim = 0
        wgrad = 0
        if args.ngpus <= 2:
            overhead = cpu_mem/_OFFLOAD_FEW_GPUS_GB_PER_S
        else:
            overhead = cpu_mem/_OFFLOAD_MANY_GPUS_GB_PER_S

    total_mem = model + grad + optim + wgrad

    return {
        'args': args,
        'parameters': parameters,
        'model': model,
        'grad': grad,
        'optim': optim,
        'wgrad': wgrad,
        'cpu_mem': cpu_mem,
        'overhead': overhead,
        'total_mem': total_mem,
    }


def main():
    """Parse the command line and print the memory breakdown."""
    args = parse_args()
    mem = calculate_memory(args)
    print('')
    print(f'Model: {args.model} with batch size {args.bsz} and sequence length {args.seqlen} and a total of {mem["parameters"]/1e9:.4f}B parameters.')
    print('='*80)
    print('Weight memory: {0:.2f} GB ({1:.2f}%)'.format(mem['model'], 100*mem['model']/mem['total_mem']))
    print('Weight gradient memory: {0:.2f} GB ({1:.2f}%)'.format(mem['wgrad'], 100*mem['wgrad']/mem['total_mem']))
    print('Input gradient memory: {0:.2f} GB ({1:.2f}%)'.format(mem['grad'], 100*mem['grad']/mem['total_mem']))
    print('Optimizer memory: {0:.2f} GB ({1:.2f}%)'.format(mem['optim'], 100*mem['optim']/mem['total_mem']))
    print('Total GPU memory: {0:.2f} GB'.format(mem['total_mem']))
    if mem['cpu_mem'] > 0:
        print('Total CPU memory: {0:.2f} GB'.format(mem['cpu_mem']))
    if mem['overhead'] > 0:
        print('Overhead: {0:.2f} seconds per update (can be partially overlapped with compute)'.format(mem['overhead']))


if __name__ == '__main__':
    main()