Commit 2133880 by justheuristic
1 Parent(s): 52aac07

sharing groups
Files changed:
- app.py +5 -3
- mem_calc.py +5 -5
- models.py +13 -7
app.py
CHANGED
@@ -27,15 +27,17 @@ share_params = col2.checkbox("Share parameters", value=False)
 
 with st.expander("More options"):
     batch_size = int(st.number_input('Microbatch size (sequences)', min_value=1, step=1, value=1, format="%i"))
-    seq_len = int(st.number_input('Sequence length (max. tokens)', min_value=1, step=1, value=1024, format="%i"))
     precisions_names = ('Full', 'Mixed ("O1")', 'Pure 16-bit')
     precisions_values = ('O0', 'O1', 'O3')
+    sharing_groups = int(st.number_input('Shared parameter groups (used if Share parameters is checked)',
+                                         min_value=1, step=1, value=1, format="%i"))
     precision = st.selectbox('Precision', precisions_names, index=1)
 
 args = mem_calc.parse_args(f"""
     --model {model} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
-    {'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''}
-    --fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size}
+    {'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''}
+    --fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size}
+    {f'--shared_groups {sharing_groups}' if share_params else ''}
 """.split())
 
 
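The app-side change wires the new widget into the CLI string: the --shared_groups flag is emitted only while the "Share parameters" checkbox is ticked, and because a disabled option renders as an empty string, .split() silently drops it. A minimal self-contained sketch of that round-trip, using a stand-in argparse parser rather than the Space's real mem_calc.parse_args:

import argparse

# Stand-in for mem_calc.parse_args — a sketch, not the Space's full argument list.
parser = argparse.ArgumentParser()
parser.add_argument('--model')
parser.add_argument('--bsz', type=int, default=1)
parser.add_argument('--shared_groups', type=int, default=None)

# Values the Streamlit widgets would supply (hypothetical here).
share_params, sharing_groups = True, 4

# Same pattern as app.py: a disabled option renders as '', and
# str.split() with no separator discards the resulting empty tokens.
argv = f"""
    --model gpt3-s --bsz 1
    {f'--shared_groups {sharing_groups}' if share_params else ''}
""".split()

args = parser.parse_args(argv)
print(args.shared_groups)  # -> 4 when sharing is enabled, None otherwise

Whitespace splitting is what lets the conditional simply produce '' for an absent flag instead of building the argument list piecewise.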
mem_calc.py
CHANGED
@@ -24,7 +24,7 @@ def vocab(bsz, seqlen, dmodel, vocab_dim):
 
 
 def transformer(bsz, seqlen, dmodel, nlayers, vocab_type, dhid=None,
-                checkpoint=False,
+                checkpoint=False, shared_groups=None):
     if dhid is None: dhid = 4*dmodel
     model = 0
     grad = 0
@@ -33,8 +33,8 @@ def transformer(bsz, seqlen, dmodel, nlayers, vocab_type, dhid=None,
     model += m
     grad += g
 
-    if
-        model = model / nlayers
+    if shared_groups is not None:
+        model = model / nlayers * shared_groups
 
     m, g = vocab(bsz, seqlen, dmodel, vocab_type)
     model += m
@@ -128,7 +128,7 @@ def parse_args(args=None):
     parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
     parser.add_argument('--zero', type=int, default=0,
                         help='The ZeRO level (1 optimizer, 2 optimizer+weights, 3 everything. Default: 1')
-    parser.add_argument('--
+    parser.add_argument('--shared_groups', type=int, default=None, help='Number of shared layer groups (as in ALBERT). Defaults to no sharing.')
     parser.add_argument('--checkpoint', action='store_true', help='Use gradient checkpointing.')
 
     return parser.parse_args(args)
@@ -143,7 +143,7 @@ def calculate_memory(args):
     if getattr(args, key, None) is None:
         setattr(args, key, value)
 
-    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers, args.vocab_size, args.dhid, args.checkpoint, args.
+    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers, args.vocab_size, args.dhid, args.checkpoint, args.shared_groups)
     parameters = model
 
     if args.optimizer == 'adam':
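The substance of the commit is the ALBERT-style sharing model in transformer(): weights are still accumulated once per layer, and when shared_groups is set the weight total is rescaled to that many distinct copies; grad is left untouched, since every application of a shared layer still contributes its own gradient and activation cost. A toy version of that arithmetic, with a made-up per-layer size standing in for the real per-layer formula:

def toy_transformer_weights(nlayers, per_layer_params, shared_groups=None):
    # Accumulate weight memory layer by layer, as mem_calc.transformer() does
    # (per_layer_params is a made-up constant, not the real layer math).
    model = 0
    for _ in range(nlayers):
        model += per_layer_params
    # The committed change: with ALBERT-style sharing, only `shared_groups`
    # distinct weight copies exist, so rescale the accumulated total.
    # Gradient/activation totals are deliberately not rescaled in mem_calc.
    if shared_groups is not None:
        model = model / nlayers * shared_groups
    return model

print(toy_transformer_weights(24, 10e6))                   # 240,000,000 - no sharing
print(toy_transformer_weights(24, 10e6, shared_groups=2))  # 20,000,000 - two shared copies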
models.py
CHANGED
@@ -56,13 +56,6 @@ models['gpt2-xl']['dhid'] = 1600*4
 models['gpt2-xl']['nlayers'] = 48
 models['gpt2-xl']['vocab_size'] = 50257
 
-models['gpt-j-6b'] = {}
-models['gpt-j-6b']['seqlen'] = 2048
-models['gpt-j-6b']['dmodel'] = 4096
-models['gpt-j-6b']['dhid'] = 4096 * 4
-models['gpt-j-6b']['nlayers'] = 28
-models['gpt-j-6b']['vocab_size'] = 50400
-
 models['gpt3-s'] = {}
 models['gpt3-s']['seqlen'] = 2048
 models['gpt3-s']['dmodel'] = 768
@@ -118,3 +111,16 @@ models['gpt3-175b']['dmodel'] = 12288
 models['gpt3-175b']['dhid'] = 12288*4
 models['gpt3-175b']['nlayers'] = 96
 models['gpt3-175b']['vocab_size'] = 50257 # from public reimplementations
+
+models['gpt-j-6b'] = {}
+models['gpt-j-6b']['seqlen'] = 2048
+models['gpt-j-6b']['dmodel'] = 4096
+models['gpt-j-6b']['dhid'] = 4096 * 4
+models['gpt-j-6b']['nlayers'] = 28
+models['gpt-j-6b']['vocab_size'] = 50400
+
+models['dalle-12b'] = {}
+models['dalle-12b']['seqlen'] = 1024 + 256
+models['dalle-12b']['dmodel'] = 62 * 64
+models['dalle-12b']['nlayers'] = 64
+models['dalle-12b']['vocab_size'] = 8192 + 16384
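The gpt-j-6b block is moved to the end of the file unchanged; the new dalle-12b preset keeps its hyperparameters in factored form (seqlen = 1024 image + 256 text tokens, dmodel = 62 heads × 64 dims per head, vocab_size = 8192 image codes + 16384 text BPE tokens, per the published 12B DALL-E configuration; with no 'dhid' entry, transformer() falls back to 4*dmodel). A quick sanity check of the entry against the textbook dense-transformer estimate of 12·nlayers·dmodel² plus embeddings — an approximation of mine, not mem_calc's exact accounting:

# Textbook estimate: 12 * nlayers * dmodel^2 weights in the transformer blocks
# (4 attention + 8 FFN matrices per layer) plus vocab_size * dmodel embeddings.
def approx_params(cfg):
    return 12 * cfg['nlayers'] * cfg['dmodel'] ** 2 + cfg['vocab_size'] * cfg['dmodel']

dalle = {'seqlen': 1024 + 256, 'dmodel': 62 * 64, 'nlayers': 64,
         'vocab_size': 8192 + 16384}
print(f"{approx_params(dalle) / 1e9:.1f}B")  # ~12.2B, consistent with the name 'dalle-12b'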