import argparse
import math

from models import models


def get_GB(nbytes):
    return nbytes / (1024**3)
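
# Note: the sizing functions below return raw element counts. The "model"
# values count weight elements and the "grad" values count stored activation
# elements; calculate_memory() later scales both by bytes per element
# (fp16 vs. fp32) and converts the result to GB via get_GB().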


def vocab(bsz, seqlen, dmodel, vocab_dim):
    # assumes tied input/output embeddings
    w = vocab_dim * dmodel
    emb = seqlen * bsz * dmodel
    emb_norm = seqlen * bsz * dmodel
    pos_emb = seqlen * bsz * dmodel
    out_emb = seqlen * bsz * vocab_dim
    softmax_emb = seqlen * bsz * vocab_dim
    model = w + dmodel
    grad = emb + emb_norm + pos_emb + out_emb + softmax_emb
    return model, grad
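
# Rough sanity check with made-up example values: for bsz=1, seqlen=1024,
# dmodel=1024 and vocab_dim=50257, the tied embedding matrix alone is
# 50257 * 1024 ≈ 51.5M weight elements, and the output/softmax activations
# (out_emb + softmax_emb) dominate the stored activations at
# 2 * 1024 * 50257 ≈ 102.9M elements.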


def transformer(bsz, seqlen, dmodel, nlayers, vocab_dim, dhid=None,
                checkpoint=False, shared_groups=None):
    if dhid is None:
        dhid = 4 * dmodel
    model = 0
    grad = 0
    for i in range(nlayers):
        m, g = transformer_layer(bsz, seqlen, dmodel, dhid, checkpoint=checkpoint)
        model += m
        grad += g
    if shared_groups is not None:
        # ALBERT-style parameter sharing: only one set of layer weights is
        # stored per shared group, while activations are unaffected
        model = model / nlayers * shared_groups
    m, g = vocab(bsz, seqlen, dmodel, vocab_dim)
    model += m
    grad += g
    return model, grad
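
# For example (made-up values), a 12-layer model with shared_groups=1 stores
# only 1/12 of the per-layer weights, while the activation ("grad") term is
# unchanged because every layer still runs in the forward pass.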


def layer_norm(bsz, seqlen, dmodel):
    w = dmodel
    x_grad = bsz * seqlen * dmodel
    return w, x_grad


def transformer_layer(bsz, seqlen, dmodel, dhid, checkpoint=False):
    model = 0
    grad = 0
    m, g = ffn(bsz, seqlen, dmodel, dhid, 'gelu')
    model += m
    grad += g * 3
    m, g = attention_layer(bsz, seqlen, dmodel)
    model += m
    grad += g * 5.0
    m, g = layer_norm(bsz, seqlen, dmodel)
    model += m
    grad += g * 1.0
    if checkpoint:
        # with gradient checkpointing, only the layer input is stored;
        # everything else is recomputed during the backward pass
        grad = bsz * seqlen * dmodel
    return model, grad
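
# With made-up values bsz=1, seqlen=1024, dmodel=1024, checkpointing shrinks
# the stored activations per layer to 1024 * 1024 ≈ 1M elements, at the cost
# of one extra forward computation during backpropagation.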


def attention_layer(bsz, seqlen, dmodel):
    w_proj = dmodel * 3 * dmodel
    w_out = dmodel * dmodel
    x_residual = bsz * seqlen * dmodel
    x_proj = bsz * seqlen * dmodel * 3
    #x_proj_contiguous = bsz*seqlen*dmodel*3
    x_proj_contiguous = 0
    x_qscaled = bsz * seqlen * dmodel
    x_qk = bsz * seqlen * seqlen * 2  # we need to store both input sequence directions for gradient computation
    x_softmax = bsz * seqlen * seqlen
    x_softmax_v = bsz * seqlen * dmodel * 2  # we need to store both input sequence directions for gradient computation
    #x_out_contiguous = bsz*seqlen*dmodel
    x_out_contiguous = 0
    x_out = bsz * seqlen * dmodel
    model = w_proj + w_out
    grad = (x_residual + x_proj + x_proj_contiguous + x_qscaled + x_qk
            + x_softmax + x_softmax_v + x_out_contiguous + x_out)
    return model, grad
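
# Note that x_qk and x_softmax grow quadratically with sequence length: at
# seqlen=2048 and bsz=1 (made-up values) the attention maps alone account for
# 3 * 2048 * 2048 ≈ 12.6M stored elements per layer, so long sequences are
# dominated by attention activations rather than weights.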


def ffn(bsz, seqlen, dmodel, dhid, func='relu'):
    # out = linear(relu(linear(x), inplace=True)) + x
    w1 = dmodel * dhid
    w2 = dhid * dmodel
    model = w1 + w2
    x1 = bsz * seqlen * dhid
    if func != 'relu':
        x1 *= 2  # in-place application is not possible for most non-ReLU activation functions
    x2 = bsz * seqlen * dmodel
    residual = bsz * seqlen * dmodel
    grad = x1 + x2 + residual
    return model, grad
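
# With the default dhid = 4*dmodel and a made-up dmodel=1024, the FFN weights
# come to 2 * 1024 * 4096 ≈ 8.4M elements; because GELU is not applied
# in-place, the hidden activation x1 is counted twice.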


OPTIMIZERS = ['adam', 'adafactor', 'adafactor-fac-only', '8-bit-adam', '16-bit-adam']


def parse_args(args=None):
    parser = argparse.ArgumentParser('Memory calculator')
    parser.add_argument('--nlayers', type=int, help='The number of transformer layers.')
    parser.add_argument('--bsz', type=int, default=1, help='The batch size. Default: 1')
    parser.add_argument('--seqlen', type=int, help='The sequence length.')
    parser.add_argument('--dmodel', type=int, help='The core model size.')
    parser.add_argument('--dhid', type=int, default=None,
                        help='The hidden size of the FFN layer. Default: 4x model size.')
    parser.add_argument('--fp16-level', type=str, default='O1',
                        help='FP16 level to use. O0 = FP32; O1/O2 = mixed precision (16+32); O3 = FP16. Default: O1.')
    parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models.')
    parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
    parser.add_argument('--vocab_size', type=int, default=None, help='The vocabulary size to use.')
    parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
    parser.add_argument('--ngpus', type=int, default=1, help='The number of GPUs. Default: 1')
    parser.add_argument('--zero', type=int, default=0,
                        help='The ZeRO level (1 = optimizer, 2 = optimizer+gradients, 3 = optimizer+gradients+weights). Default: 0')
    parser.add_argument('--shared_groups', type=int, default=None,
                        help='Number of shared layer groups (as in ALBERT). Defaults to no sharing.')
    parser.add_argument('--checkpoint', action='store_true', help='Use gradient checkpointing.')
    return parser.parse_args(args)
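
# Since parse_args() accepts an explicit argument list, the calculator can
# also be driven programmatically, e.g. from a test or a notebook
# (the values below are made-up examples):
#
#   args = parse_args(['--nlayers', '24', '--seqlen', '1024', '--dmodel', '1024',
#                      '--vocab_size', '50257'])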


def calculate_memory(args):
    if args.model != '':
        if args.model not in models:
            raise ValueError(f'{args.model} is not supported')
        else:
            for key, value in models[args.model].items():
                if getattr(args, key, None) is None:
                    setattr(args, key, value)

    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers,
                              args.vocab_size, args.dhid, args.checkpoint, args.shared_groups)
    parameters = model

    # optimizer state, in bytes per parameter
    if args.optimizer == 'adam':
        optim = 8 * model  # fp32 first and second moments
    elif args.optimizer == '8-bit-adam':
        optim = 2 * model  # 8-bit first and second moments
    elif args.optimizer in ['16-bit-adam', 'adafactor']:
        optim = 4 * model  # 4 bytes of state per parameter
    elif args.optimizer in ['adafactor-fac-only']:
        optim = math.log(model)

    # scale element counts by bytes per element
    if args.fp16_level == 'O0':
        # fp32 weights
        wgrad = 4 * model
        model = 4 * model
        grad = 4 * grad  # fp32
    elif args.fp16_level in ['O1', 'O2']:
        # fp16 weights + fp32 master weights
        wgrad = 2 * model
        model = 4 * model + (2 * model)
        grad = 2 * grad  # fp16
    elif args.fp16_level == 'O3':
        wgrad = 2 * model
        model = 2 * model  # fp16
        grad = 2 * grad  # fp16

    model = get_GB(model)
    grad = get_GB(grad)
    optim = get_GB(optim)
    wgrad = get_GB(wgrad)

    cpu_mem = 0
    overhead = 0
    if args.zero == 1:
        if not args.offload:
            # assumes a 200 Gbit/s (25 GB/s) interconnect, e.g. InfiniBand HDR
            overhead += optim/25
        optim = optim / args.ngpus
    elif args.zero == 2:
        if not args.offload:
            # assumes a 200 Gbit/s (25 GB/s) interconnect, e.g. InfiniBand HDR
            overhead += optim/25
            overhead += wgrad/25
        optim = optim / args.ngpus
        wgrad = wgrad / args.ngpus
    elif args.zero == 3:
        if not args.offload:
            # assumes a 200 Gbit/s (25 GB/s) interconnect, e.g. InfiniBand HDR
            overhead += optim/25
            overhead += model/25
            overhead += wgrad/25
        optim = optim / args.ngpus
        model = model / args.ngpus
        wgrad = wgrad / args.ngpus

    if args.offload:
        # optimizer state and weight gradients live in CPU memory
        cpu_mem = optim + wgrad
        optim = 0
        wgrad = 0
        if args.ngpus <= 2:
            # 12 GB/s for PCIe 3.0 with a 1-2 GPU setup (16 lanes, 16 GB/s theoretical)
            overhead = cpu_mem/12
        else:
            # 6 GB/s for PCIe 3.0 with a 4x GPU setup
            overhead = cpu_mem/6

    total_mem = model + grad + optim + wgrad
    return locals()
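

# Sketch: because calculate_memory() returns locals(), the result dict can be
# used to sweep settings, e.g. to find the largest batch size that fits a
# 24 GB GPU (all values below are made-up examples):
#
#   for bsz in [1, 2, 4, 8, 16, 32]:
#       mem = calculate_memory(parse_args(['--nlayers', '24', '--seqlen', '1024',
#                                          '--dmodel', '1024', '--vocab_size', '50257',
#                                          '--bsz', str(bsz)]))
#       if mem['total_mem'] > 24:
#           print(f'batch size {bsz} no longer fits')
#           break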


if __name__ == '__main__':
    args = parse_args()
    mem = calculate_memory(args)
    print('')
    print(f'Model: {args.model} with batch size {args.bsz}, sequence length {args.seqlen}, '
          f'and a total of {mem["parameters"]/1e9:.4f}B parameters.')
    print('='*80)
    print('Weight memory: {0:.2f} GB ({1:.2f}%)'.format(mem['model'], 100*mem['model']/mem['total_mem']))
    print('Weight gradient memory: {0:.2f} GB ({1:.2f}%)'.format(mem['wgrad'], 100*mem['wgrad']/mem['total_mem']))
    print('Input gradient memory: {0:.2f} GB ({1:.2f}%)'.format(mem['grad'], 100*mem['grad']/mem['total_mem']))
    print('Optimizer memory: {0:.2f} GB ({1:.2f}%)'.format(mem['optim'], 100*mem['optim']/mem['total_mem']))
    print('Total GPU memory: {0:.2f} GB'.format(mem['total_mem']))
    if mem['cpu_mem'] > 0:
        print('Total CPU memory: {0:.2f} GB'.format(mem['cpu_mem']))
    if mem['overhead'] > 0:
        print('Overhead: {0:.2f} seconds per update (can be partially overlapped with compute)'.format(mem['overhead']))
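
# Example invocation (the script filename here is hypothetical; the flag
# values are made-up):
#
#   python calculator.py --nlayers 24 --bsz 8 --seqlen 1024 --dmodel 1024 \
#       --vocab_size 50257 --optimizer adam --fp16-level O1 --zero 2 --ngpus 8
#
# This prints a per-GPU breakdown of weight, weight-gradient, input-gradient,
# and optimizer memory, plus the communication overhead per update.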

