justheuristic committed on
Commit
52aac07
1 Parent(s): 1d1af69

mention our friends

Browse files
Files changed (3) hide show
  1. app.py +1 -3
  2. mem_calc.py +1 -1
  3. models.py +48 -25
app.py CHANGED
@@ -32,10 +32,8 @@ with st.expander("More options"):
32
  precisions_values = ('O0', 'O1', 'O3')
33
  precision = st.selectbox('Precision', precisions_names, index=1)
34
 
35
- vocab_size = int(st.number_input('Vocabulary size', min_value=1, step=1, value=50257, format="%i"))
36
-
37
  args = mem_calc.parse_args(f"""
38
- --model {model} --vocab_size {vocab_size} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
39
  {'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
40
  --fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
41
  """.split())
 
32
  precisions_values = ('O0', 'O1', 'O3')
33
  precision = st.selectbox('Precision', precisions_names, index=1)
34
 
 
 
35
  args = mem_calc.parse_args(f"""
36
+ --model {model} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
37
  {'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''} {'--albert' if share_params else ''}
38
  --fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size} --seqlen {seq_len}
39
  """.split())
mem_calc.py CHANGED
@@ -123,7 +123,7 @@ def parse_args(args=None):
123
  help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
124
  parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
125
  parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
126
- parser.add_argument('--vocab_size', type=int, default=50257, help='The vocabulary to use.')
127
  parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
128
  parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
129
  parser.add_argument('--zero', type=int, default=0,
 
123
  help='FP16-level to use. O0 = FP32; O1 = mixed-precision (16+32); O3 = fp16. Default: O1.')
124
  parser.add_argument('--model', default='', choices=list(models.keys()), help='Predefined NLP transformer models')
125
  parser.add_argument('--optimizer', default='adam', choices=OPTIMIZERS, help='The optimizer to use.')
126
+ parser.add_argument('--vocab_size', type=int, default=None, help='The vocabulary to use.')
127
  parser.add_argument('--offload', action='store_true', help='Whether to use optimizer offload.')
128
  parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
129
  parser.add_argument('--zero', type=int, default=0,
models.py CHANGED
@@ -1,97 +1,120 @@
1
  models = {}
2
- models['bert-s'] = {}
3
- models['bert-s']['seqlen'] = 512
4
- models['bert-s']['dmodel'] = 768
5
- models['bert-s']['dhidden'] = 3072
6
- models['bert-s']['nlayers'] = 12
7
-
8
- models['bert-l'] = {}
9
- models['bert-l']['seqlen'] = 512
10
- models['bert-l']['dmodel'] = 1024
11
- models['bert-l']['dhidden'] = 4096
12
- models['bert-l']['nlayers'] = 24
 
 
 
13
 
14
  models['t5-3b'] = {}
15
  models['t5-3b']['seqlen'] = 512
16
  models['t5-3b']['dmodel'] = 1024
17
- models['t5-3b']['dhidden'] = 16384
18
  models['t5-3b']['nlayers'] = 48
 
19
 
20
  models['t5-11b'] = {}
21
  models['t5-11b']['seqlen'] = 512
22
  models['t5-11b']['dmodel'] = 1024
23
- models['t5-11b']['dhidden'] = 64*1024
24
  models['t5-11b']['nlayers'] = 48
 
25
 
26
  models['gpt2-s'] = {}
27
  models['gpt2-s']['seqlen'] = 1024
28
  models['gpt2-s']['dmodel'] = 768
29
- models['gpt2-s']['dhidden'] = 768*4
30
  models['gpt2-s']['nlayers'] = 12
 
31
 
32
  models['gpt2-m'] = {}
33
  models['gpt2-m']['seqlen'] = 1024
34
  models['gpt2-m']['dmodel'] = 1024
35
- models['gpt2-m']['dhidden'] = 1024*4
36
  models['gpt2-m']['nlayers'] = 24
 
37
 
38
  models['gpt2-l'] = {}
39
  models['gpt2-l']['seqlen'] = 1024
40
  models['gpt2-l']['dmodel'] = 1280
41
- models['gpt2-l']['dhidden'] = 1280*4
42
  models['gpt2-l']['nlayers'] = 36
 
43
 
44
  models['gpt2-xl'] = {}
45
  models['gpt2-xl']['seqlen'] = 1024
46
  models['gpt2-xl']['dmodel'] = 1600
47
- models['gpt2-xl']['dhidden'] = 1600*4
48
  models['gpt2-xl']['nlayers'] = 48
 
49
 
 
 
 
 
 
 
50
 
51
  models['gpt3-s'] = {}
52
  models['gpt3-s']['seqlen'] = 2048
53
  models['gpt3-s']['dmodel'] = 768
54
- models['gpt3-s']['dhidden'] = 768*4
55
  models['gpt3-s']['nlayers'] = 12
 
56
 
57
  models['gpt3-m'] = {}
58
  models['gpt3-m']['seqlen'] = 2048
59
  models['gpt3-m']['dmodel'] = 1024
60
- models['gpt3-m']['dhidden'] = 1024*4
61
  models['gpt3-m']['nlayers'] = 24
 
62
 
63
  models['gpt3-l'] = {}
64
  models['gpt3-l']['seqlen'] = 2048
65
  models['gpt3-l']['dmodel'] = 1536
66
- models['gpt3-l']['dhidden'] = 1536*4
67
  models['gpt3-l']['nlayers'] = 24
 
68
 
69
  models['gpt3-xl'] = {}
70
  models['gpt3-xl']['seqlen'] = 2048
71
  models['gpt3-xl']['dmodel'] = 2560
72
- models['gpt3-xl']['dhidden'] = 2560*4
73
  models['gpt3-xl']['nlayers'] = 24
 
74
 
75
  models['gpt3-3b'] = {}
76
  models['gpt3-3b']['seqlen'] = 2048
77
  models['gpt3-3b']['dmodel'] = 2560
78
- models['gpt3-3b']['dhidden'] = 2560*4
79
  models['gpt3-3b']['nlayers'] = 32
 
80
 
81
  models['gpt3-7b'] = {}
82
  models['gpt3-7b']['seqlen'] = 2048
83
  models['gpt3-7b']['dmodel'] = 4096
84
- models['gpt3-7b']['dhidden'] = 4096*4
85
  models['gpt3-7b']['nlayers'] = 32
 
86
 
87
  models['gpt3-13b'] = {}
88
  models['gpt3-13b']['seqlen'] = 2048
89
  models['gpt3-13b']['dmodel'] = 5120
90
- models['gpt3-13b']['dhidden'] = 5120*4
91
  models['gpt3-13b']['nlayers'] = 40
 
92
 
93
  models['gpt3-175b'] = {}
94
  models['gpt3-175b']['seqlen'] = 2048
95
  models['gpt3-175b']['dmodel'] = 12288
96
- models['gpt3-175b']['dhidden'] = 12288*4
97
  models['gpt3-175b']['nlayers'] = 96
 
 
1
  models = {}
2
+ models['bert-base'] = {}
3
+ models['bert-base']['seqlen'] = 512
4
+ models['bert-base']['dmodel'] = 768
5
+ models['bert-base']['dhid'] = 3072
6
+ models['bert-base']['nlayers'] = 12
7
+ models['bert-base']['vocab_size'] = 30522
8
+
9
+
10
+ models['bert-large'] = {}
11
+ models['bert-large']['seqlen'] = 512
12
+ models['bert-large']['dmodel'] = 1024
13
+ models['bert-large']['dhid'] = 4096
14
+ models['bert-large']['nlayers'] = 24
15
+ models['bert-large']['vocab_size'] = 30522
16
 
17
  models['t5-3b'] = {}
18
  models['t5-3b']['seqlen'] = 512
19
  models['t5-3b']['dmodel'] = 1024
20
+ models['t5-3b']['dhid'] = 16384
21
  models['t5-3b']['nlayers'] = 48
22
+ models['t5-3b']['vocab_size'] = 32128
23
 
24
  models['t5-11b'] = {}
25
  models['t5-11b']['seqlen'] = 512
26
  models['t5-11b']['dmodel'] = 1024
27
+ models['t5-11b']['dhid'] = 64*1024
28
  models['t5-11b']['nlayers'] = 48
29
+ models['t5-11b']['vocab_size'] = 32128
30
 
31
  models['gpt2-s'] = {}
32
  models['gpt2-s']['seqlen'] = 1024
33
  models['gpt2-s']['dmodel'] = 768
34
+ models['gpt2-s']['dhid'] = 768*4
35
  models['gpt2-s']['nlayers'] = 12
36
+ models['gpt2-s']['vocab_size'] = 50257
37
 
38
  models['gpt2-m'] = {}
39
  models['gpt2-m']['seqlen'] = 1024
40
  models['gpt2-m']['dmodel'] = 1024
41
+ models['gpt2-m']['dhid'] = 1024*4
42
  models['gpt2-m']['nlayers'] = 24
43
+ models['gpt2-m']['vocab_size'] = 50257
44
 
45
  models['gpt2-l'] = {}
46
  models['gpt2-l']['seqlen'] = 1024
47
  models['gpt2-l']['dmodel'] = 1280
48
+ models['gpt2-l']['dhid'] = 1280*4
49
  models['gpt2-l']['nlayers'] = 36
50
+ models['gpt2-l']['vocab_size'] = 50257
51
 
52
  models['gpt2-xl'] = {}
53
  models['gpt2-xl']['seqlen'] = 1024
54
  models['gpt2-xl']['dmodel'] = 1600
55
+ models['gpt2-xl']['dhid'] = 1600*4
56
  models['gpt2-xl']['nlayers'] = 48
57
+ models['gpt2-xl']['vocab_size'] = 50257
58
 
59
+ models['gpt-j-6b'] = {}
60
+ models['gpt-j-6b']['seqlen'] = 2048
61
+ models['gpt-j-6b']['dmodel'] = 4096
62
+ models['gpt-j-6b']['dhid'] = 4096 * 4
63
+ models['gpt-j-6b']['nlayers'] = 28
64
+ models['gpt-j-6b']['vocab_size'] = 50400
65
 
66
  models['gpt3-s'] = {}
67
  models['gpt3-s']['seqlen'] = 2048
68
  models['gpt3-s']['dmodel'] = 768
69
+ models['gpt3-s']['dhid'] = 768*4
70
  models['gpt3-s']['nlayers'] = 12
71
+ models['gpt3-s']['vocab_size'] = 50257 # from public reimplementations
72
 
73
  models['gpt3-m'] = {}
74
  models['gpt3-m']['seqlen'] = 2048
75
  models['gpt3-m']['dmodel'] = 1024
76
+ models['gpt3-m']['dhid'] = 1024*4
77
  models['gpt3-m']['nlayers'] = 24
78
+ models['gpt3-m']['vocab_size'] = 50257 # from public reimplementations
79
 
80
  models['gpt3-l'] = {}
81
  models['gpt3-l']['seqlen'] = 2048
82
  models['gpt3-l']['dmodel'] = 1536
83
+ models['gpt3-l']['dhid'] = 1536*4
84
  models['gpt3-l']['nlayers'] = 24
85
+ models['gpt3-l']['vocab_size'] = 50257 # from public reimplementations
86
 
87
  models['gpt3-xl'] = {}
88
  models['gpt3-xl']['seqlen'] = 2048
89
  models['gpt3-xl']['dmodel'] = 2560
90
+ models['gpt3-xl']['dhid'] = 2560*4
91
  models['gpt3-xl']['nlayers'] = 24
92
+ models['gpt3-xl']['vocab_size'] = 50257 # from public reimplementations
93
 
94
  models['gpt3-3b'] = {}
95
  models['gpt3-3b']['seqlen'] = 2048
96
  models['gpt3-3b']['dmodel'] = 2560
97
+ models['gpt3-3b']['dhid'] = 2560*4
98
  models['gpt3-3b']['nlayers'] = 32
99
+ models['gpt3-3b']['vocab_size'] = 50257 # from public reimplementations
100
 
101
  models['gpt3-7b'] = {}
102
  models['gpt3-7b']['seqlen'] = 2048
103
  models['gpt3-7b']['dmodel'] = 4096
104
+ models['gpt3-7b']['dhid'] = 4096*4
105
  models['gpt3-7b']['nlayers'] = 32
106
+ models['gpt3-7b']['vocab_size'] = 50257 # from public reimplementations
107
 
108
  models['gpt3-13b'] = {}
109
  models['gpt3-13b']['seqlen'] = 2048
110
  models['gpt3-13b']['dmodel'] = 5120
111
+ models['gpt3-13b']['dhid'] = 5120*4
112
  models['gpt3-13b']['nlayers'] = 40
113
+ models['gpt3-13b']['vocab_size'] = 50257 # from public reimplementations
114
 
115
  models['gpt3-175b'] = {}
116
  models['gpt3-175b']['seqlen'] = 2048
117
  models['gpt3-175b']['dmodel'] = 12288
118
+ models['gpt3-175b']['dhid'] = 12288*4
119
  models['gpt3-175b']['nlayers'] = 96
120
+ models['gpt3-175b']['vocab_size'] = 50257 # from public reimplementations