Spaces:
Runtime error
Runtime error
justheuristic
committed on
Commit
•
52aac07
1
Parent(s):
1d1af69
mention our friends
Browse files
- app.py +1 -3
- mem_calc.py +1 -1
- models.py +48 -25
app.py
CHANGED
@@ -32,10 +32,8 @@ with st.expander("More options"):
|
|
32 |
precisions_values = ('O0', 'O1', 'O3')
|
33 |
precision = st.selectbox('Precision', precisions_names, index=1)
|
34 |
|
35 |
-
vocab_size = int(st.number_input('Vocabulary size', min_value=1, step=1, value=50257, format="%i"))
|
36 |
-
|
37 |
args = mem_calc.parse_args(f"""
|
38 |
-
--model {model} --
|
39 |
{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
|
40 |
--fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
|
41 |
""".split())
|
|
|
32 |
precisions_values = ('O0', 'O1', 'O3')
|
33 |
precision = st.selectbox('Precision', precisions_names, index=1)
|
34 |
|
|
|
|
|
35 |
args = mem_calc.parse_args(f"""
|
36 |
+
--model {model} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
|
37 |
{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
|
38 |
--fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
|
39 |
""".split())
|
mem_calc.py
CHANGED
@@ -123,7 +123,7 @@ def parse_args(args=None):
|
|
123 |
help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
|
124 |
parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
|
125 |
parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
|
126 |
-
parser.add_argument('--vocab_size', type=int, default=
|
127 |
parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
|
128 |
parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
|
129 |
parser.add_argument('--zero', type=int, default=0,
|
|
|
123 |
help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
|
124 |
parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
|
125 |
parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
|
126 |
+
parser.add_argument('--vocab_size', type=int, default=None, help='The vocabulary to use.')
|
127 |
parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
|
128 |
parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
|
129 |
parser.add_argument('--zero', type=int, default=0,
|
models.py
CHANGED
@@ -1,97 +1,120 @@
|
|
1 |
models = {}
|
2 |
-
models['bert-
|
3 |
-
models['bert-
|
4 |
-
models['bert-
|
5 |
-
models['bert-
|
6 |
-
models['bert-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
models['bert-
|
11 |
-
models['bert-
|
12 |
-
models['bert-
|
|
|
|
|
|
|
13 |
|
14 |
models['t5-3b'] = {}
|
15 |
models['t5-3b']['seqlen'] = 512
|
16 |
models['t5-3b']['dmodel'] = 1024
|
17 |
-
models['t5-3b']['
|
18 |
models['t5-3b']['nlayers'] = 48
|
|
|
19 |
|
20 |
models['t5-11b'] = {}
|
21 |
models['t5-11b']['seqlen'] = 512
|
22 |
models['t5-11b']['dmodel'] = 1024
|
23 |
-
models['t5-11b']['
|
24 |
models['t5-11b']['nlayers'] = 48
|
|
|
25 |
|
26 |
models['gpt2-s'] = {}
|
27 |
models['gpt2-s']['seqlen'] = 1024
|
28 |
models['gpt2-s']['dmodel'] = 768
|
29 |
-
models['gpt2-s']['
|
30 |
models['gpt2-s']['nlayers'] = 12
|
|
|
31 |
|
32 |
models['gpt2-m'] = {}
|
33 |
models['gpt2-m']['seqlen'] = 1024
|
34 |
models['gpt2-m']['dmodel'] = 1024
|
35 |
-
models['gpt2-m']['
|
36 |
models['gpt2-m']['nlayers'] = 24
|
|
|
37 |
|
38 |
models['gpt2-l'] = {}
|
39 |
models['gpt2-l']['seqlen'] = 1024
|
40 |
models['gpt2-l']['dmodel'] = 1280
|
41 |
-
models['gpt2-l']['
|
42 |
models['gpt2-l']['nlayers'] = 36
|
|
|
43 |
|
44 |
models['gpt2-xl'] = {}
|
45 |
models['gpt2-xl']['seqlen'] = 1024
|
46 |
models['gpt2-xl']['dmodel'] = 1600
|
47 |
-
models['gpt2-xl']['
|
48 |
models['gpt2-xl']['nlayers'] = 48
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
models['gpt3-s'] = {}
|
52 |
models['gpt3-s']['seqlen'] = 2048
|
53 |
models['gpt3-s']['dmodel'] = 768
|
54 |
-
models['gpt3-s']['
|
55 |
models['gpt3-s']['nlayers'] = 12
|
|
|
56 |
|
57 |
models['gpt3-m'] = {}
|
58 |
models['gpt3-m']['seqlen'] = 2048
|
59 |
models['gpt3-m']['dmodel'] = 1024
|
60 |
-
models['gpt3-m']['
|
61 |
models['gpt3-m']['nlayers'] = 24
|
|
|
62 |
|
63 |
models['gpt3-l'] = {}
|
64 |
models['gpt3-l']['seqlen'] = 2048
|
65 |
models['gpt3-l']['dmodel'] = 1536
|
66 |
-
models['gpt3-l']['
|
67 |
models['gpt3-l']['nlayers'] = 24
|
|
|
68 |
|
69 |
models['gpt3-xl'] = {}
|
70 |
models['gpt3-xl']['seqlen'] = 2048
|
71 |
models['gpt3-xl']['dmodel'] = 2560
|
72 |
-
models['gpt3-xl']['
|
73 |
models['gpt3-xl']['nlayers'] = 24
|
|
|
74 |
|
75 |
models['gpt3-3b'] = {}
|
76 |
models['gpt3-3b']['seqlen'] = 2048
|
77 |
models['gpt3-3b']['dmodel'] = 2560
|
78 |
-
models['gpt3-3b']['
|
79 |
models['gpt3-3b']['nlayers'] = 32
|
|
|
80 |
|
81 |
models['gpt3-7b'] = {}
|
82 |
models['gpt3-7b']['seqlen'] = 2048
|
83 |
models['gpt3-7b']['dmodel'] = 4096
|
84 |
-
models['gpt3-7b']['
|
85 |
models['gpt3-7b']['nlayers'] = 32
|
|
|
86 |
|
87 |
models['gpt3-13b'] = {}
|
88 |
models['gpt3-13b']['seqlen'] = 2048
|
89 |
models['gpt3-13b']['dmodel'] = 5120
|
90 |
-
models['gpt3-13b']['
|
91 |
models['gpt3-13b']['nlayers'] = 40
|
|
|
92 |
|
93 |
models['gpt3-175b'] = {}
|
94 |
models['gpt3-175b']['seqlen'] = 2048
|
95 |
models['gpt3-175b']['dmodel'] = 12288
|
96 |
-
models['gpt3-175b']['
|
97 |
models['gpt3-175b']['nlayers'] = 96
|
|
|
|
1 |
models = {}
|
2 |
+
models['bert-base'] = {}
|
3 |
+
models['bert-base']['seqlen'] = 512
|
4 |
+
models['bert-base']['dmodel'] = 768
|
5 |
+
models['bert-base']['dhid'] = 3072
|
6 |
+
models['bert-base']['nlayers'] = 12
|
7 |
+
models['bert-base']['vocab_size'] = 30522
|
8 |
+
|
9 |
+
|
10 |
+
models['bert-large'] = {}
|
11 |
+
models['bert-large']['seqlen'] = 512
|
12 |
+
models['bert-large']['dmodel'] = 1024
|
13 |
+
models['bert-large']['dhid'] = 4096
|
14 |
+
models['bert-large']['nlayers'] = 24
|
15 |
+
models['bert-large']['vocab_size'] = 30522
|
16 |
|
17 |
models['t5-3b'] = {}
|
18 |
models['t5-3b']['seqlen'] = 512
|
19 |
models['t5-3b']['dmodel'] = 1024
|
20 |
+
models['t5-3b']['dhid'] = 16384
|
21 |
models['t5-3b']['nlayers'] = 48
|
22 |
+
models['t5-3b']['vocab_size'] = 32128
|
23 |
|
24 |
models['t5-11b'] = {}
|
25 |
models['t5-11b']['seqlen'] = 512
|
26 |
models['t5-11b']['dmodel'] = 1024
|
27 |
+
models['t5-11b']['dhid'] = 64*1024
|
28 |
models['t5-11b']['nlayers'] = 48
|
29 |
+
models['t5-11b']['vocab_size'] = 32128
|
30 |
|
31 |
models['gpt2-s'] = {}
|
32 |
models['gpt2-s']['seqlen'] = 1024
|
33 |
models['gpt2-s']['dmodel'] = 768
|
34 |
+
models['gpt2-s']['dhid'] = 768*4
|
35 |
models['gpt2-s']['nlayers'] = 12
|
36 |
+
models['gpt2-s']['vocab_size'] = 50257
|
37 |
|
38 |
models['gpt2-m'] = {}
|
39 |
models['gpt2-m']['seqlen'] = 1024
|
40 |
models['gpt2-m']['dmodel'] = 1024
|
41 |
+
models['gpt2-m']['dhid'] = 1024*4
|
42 |
models['gpt2-m']['nlayers'] = 24
|
43 |
+
models['gpt2-m']['vocab_size'] = 50257
|
44 |
|
45 |
models['gpt2-l'] = {}
|
46 |
models['gpt2-l']['seqlen'] = 1024
|
47 |
models['gpt2-l']['dmodel'] = 1280
|
48 |
+
models['gpt2-l']['dhid'] = 1280*4
|
49 |
models['gpt2-l']['nlayers'] = 36
|
50 |
+
models['gpt2-l']['vocab_size'] = 50257
|
51 |
|
52 |
models['gpt2-xl'] = {}
|
53 |
models['gpt2-xl']['seqlen'] = 1024
|
54 |
models['gpt2-xl']['dmodel'] = 1600
|
55 |
+
models['gpt2-xl']['dhid'] = 1600*4
|
56 |
models['gpt2-xl']['nlayers'] = 48
|
57 |
+
models['gpt2-xl']['vocab_size'] = 50257
|
58 |
|
59 |
+
models['gpt-j-6b'] = {}
|
60 |
+
models['gpt-j-6b']['seqlen'] = 2048
|
61 |
+
models['gpt-j-6b']['dmodel'] = 4096
|
62 |
+
models['gpt-j-6b']['dhid'] = 4096 * 4
|
63 |
+
models['gpt-j-6b']['nlayers'] = 28
|
64 |
+
models['gpt-j-6b']['vocab_size'] = 50400
|
65 |
|
66 |
models['gpt3-s'] = {}
|
67 |
models['gpt3-s']['seqlen'] = 2048
|
68 |
models['gpt3-s']['dmodel'] = 768
|
69 |
+
models['gpt3-s']['dhid'] = 768*4
|
70 |
models['gpt3-s']['nlayers'] = 12
|
71 |
+
models['gpt3-s']['vocab_size'] = 50257 # from public reimplementations
|
72 |
|
73 |
models['gpt3-m'] = {}
|
74 |
models['gpt3-m']['seqlen'] = 2048
|
75 |
models['gpt3-m']['dmodel'] = 1024
|
76 |
+
models['gpt3-m']['dhid'] = 1024*4
|
77 |
models['gpt3-m']['nlayers'] = 24
|
78 |
+
models['gpt3-m']['vocab_size'] = 50257 # from public reimplementations
|
79 |
|
80 |
models['gpt3-l'] = {}
|
81 |
models['gpt3-l']['seqlen'] = 2048
|
82 |
models['gpt3-l']['dmodel'] = 1536
|
83 |
+
models['gpt3-l']['dhid'] = 1536*4
|
84 |
models['gpt3-l']['nlayers'] = 24
|
85 |
+
models['gpt3-l']['vocab_size'] = 50257 # from public reimplementations
|
86 |
|
87 |
models['gpt3-xl'] = {}
|
88 |
models['gpt3-xl']['seqlen'] = 2048
|
89 |
models['gpt3-xl']['dmodel'] = 2560
|
90 |
+
models['gpt3-xl']['dhid'] = 2560*4
|
91 |
models['gpt3-xl']['nlayers'] = 24
|
92 |
+
models['gpt3-xl']['vocab_size'] = 50257 # from public reimplementations
|
93 |
|
94 |
models['gpt3-3b'] = {}
|
95 |
models['gpt3-3b']['seqlen'] = 2048
|
96 |
models['gpt3-3b']['dmodel'] = 2560
|
97 |
+
models['gpt3-3b']['dhid'] = 2560*4
|
98 |
models['gpt3-3b']['nlayers'] = 32
|
99 |
+
models['gpt3-3b']['vocab_size'] = 50257 # from public reimplementations
|
100 |
|
101 |
models['gpt3-7b'] = {}
|
102 |
models['gpt3-7b']['seqlen'] = 2048
|
103 |
models['gpt3-7b']['dmodel'] = 4096
|
104 |
+
models['gpt3-7b']['dhid'] = 4096*4
|
105 |
models['gpt3-7b']['nlayers'] = 32
|
106 |
+
models['gpt3-7b']['vocab_size'] = 50257 # from public reimplementations
|
107 |
|
108 |
models['gpt3-13b'] = {}
|
109 |
models['gpt3-13b']['seqlen'] = 2048
|
110 |
models['gpt3-13b']['dmodel'] = 5120
|
111 |
+
models['gpt3-13b']['dhid'] = 5120*4
|
112 |
models['gpt3-13b']['nlayers'] = 40
|
113 |
+
models['gpt3-13b']['vocab_size'] = 50257 # from public reimplementations
|
114 |
|
115 |
models['gpt3-175b'] = {}
|
116 |
models['gpt3-175b']['seqlen'] = 2048
|
117 |
models['gpt3-175b']['dmodel'] = 12288
|
118 |
+
models['gpt3-175b']['dhid'] = 12288*4
|
119 |
models['gpt3-175b']['nlayers'] = 96
|
120 |
+
models['gpt3-175b']['vocab_size'] = 50257 # from public reimplementations
|