File size: 3,718 Bytes
7b48c38
52aac07
 
 
 
 
 
 
 
 
 
 
 
 
 
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
52aac07
 
 
 
 
 
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
7b48c38
 
 
 
52aac07
7b48c38
52aac07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Registry of architecture hyperparameters, keyed by model name.
#
# Every entry carries the same five integer fields, in this order:
#   seqlen     - presumably the maximum sequence length (TODO confirm with consumers)
#   dmodel     - presumably the model / embedding width
#   dhid       - presumably the feed-forward hidden width (written as 4 * dmodel
#                for the GPT-style entries)
#   nlayers    - presumably the number of transformer layers
#   vocab_size - presumably the tokenizer vocabulary size
models = {
    # ---- BERT ----
    'bert-base': {
        'seqlen': 512,
        'dmodel': 768,
        'dhid': 3072,
        'nlayers': 12,
        'vocab_size': 30522,
    },
    'bert-large': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 4096,
        'nlayers': 24,
        'vocab_size': 30522,
    },

    # ---- T5 ----
    't5-3b': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 16384,
        'nlayers': 48,
        'vocab_size': 32128,
    },
    't5-11b': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 64 * 1024,
        'nlayers': 48,
        'vocab_size': 32128,
    },

    # ---- GPT-2 ----
    'gpt2-s': {
        'seqlen': 1024,
        'dmodel': 768,
        'dhid': 768 * 4,
        'nlayers': 12,
        'vocab_size': 50257,
    },
    'gpt2-m': {
        'seqlen': 1024,
        'dmodel': 1024,
        'dhid': 1024 * 4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt2-l': {
        'seqlen': 1024,
        'dmodel': 1280,
        'dhid': 1280 * 4,
        'nlayers': 36,
        'vocab_size': 50257,
    },
    'gpt2-xl': {
        'seqlen': 1024,
        'dmodel': 1600,
        'dhid': 1600 * 4,
        'nlayers': 48,
        'vocab_size': 50257,
    },

    # ---- GPT-J ----
    'gpt-j-6b': {
        'seqlen': 2048,
        'dmodel': 4096,
        'dhid': 4096 * 4,
        'nlayers': 28,
        'vocab_size': 50400,
    },

    # ---- GPT-3 family ----
    # vocab_size for all GPT-3 entries is taken from public reimplementations.
    'gpt3-s': {
        'seqlen': 2048,
        'dmodel': 768,
        'dhid': 768 * 4,
        'nlayers': 12,
        'vocab_size': 50257,
    },
    'gpt3-m': {
        'seqlen': 2048,
        'dmodel': 1024,
        'dhid': 1024 * 4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt3-l': {
        'seqlen': 2048,
        'dmodel': 1536,
        'dhid': 1536 * 4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    # GPT-3 XL (1.3B) uses d_model = 2048 per Table 2.1 of the GPT-3 paper.
    # The previous value (2560) duplicated the width of the 2.7B entry below.
    'gpt3-xl': {
        'seqlen': 2048,
        'dmodel': 2048,
        'dhid': 2048 * 4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt3-3b': {
        'seqlen': 2048,
        'dmodel': 2560,
        'dhid': 2560 * 4,
        'nlayers': 32,
        'vocab_size': 50257,
    },
    'gpt3-7b': {
        'seqlen': 2048,
        'dmodel': 4096,
        'dhid': 4096 * 4,
        'nlayers': 32,
        'vocab_size': 50257,
    },
    'gpt3-13b': {
        'seqlen': 2048,
        'dmodel': 5120,
        'dhid': 5120 * 4,
        'nlayers': 40,
        'vocab_size': 50257,
    },
    'gpt3-175b': {
        'seqlen': 2048,
        'dmodel': 12288,
        'dhid': 12288 * 4,
        'nlayers': 96,
        'vocab_size': 50257,
    },
}