File size: 3,911 Bytes
7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 7b48c38 52aac07 2133880 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# Architecture hyperparameters for a collection of transformer models.
#
# `models` maps a model name to a dict of integer settings. The key meanings
# below are inferred from the key names -- confirm against the code that
# consumes this table:
#   seqlen     -- sequence length in tokens
#   dmodel     -- model (embedding) width
#   dhid       -- feed-forward hidden width
#   nlayers    -- number of transformer layers
#   vocab_size -- vocabulary size
models = {
    # --- BERT ---
    'bert-base': {
        'seqlen': 512,
        'dmodel': 768,
        'dhid': 3072,
        'nlayers': 12,
        'vocab_size': 30522,
    },
    'bert-large': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 4096,
        'nlayers': 24,
        'vocab_size': 30522,
    },

    # --- T5 ---
    # NOTE(review): nlayers=48 is presumably encoder + decoder layers
    # combined (24 each) -- confirm with the consumer of this table.
    't5-3b': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 16384,
        'nlayers': 48,
        'vocab_size': 32128,
    },
    't5-11b': {
        'seqlen': 512,
        'dmodel': 1024,
        'dhid': 64*1024,
        'nlayers': 48,
        'vocab_size': 32128,
    },

    # --- GPT-2 --- (dhid is always 4 * dmodel)
    'gpt2-s': {
        'seqlen': 1024,
        'dmodel': 768,
        'dhid': 768*4,
        'nlayers': 12,
        'vocab_size': 50257,
    },
    'gpt2-m': {
        'seqlen': 1024,
        'dmodel': 1024,
        'dhid': 1024*4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt2-l': {
        'seqlen': 1024,
        'dmodel': 1280,
        'dhid': 1280*4,
        'nlayers': 36,
        'vocab_size': 50257,
    },
    'gpt2-xl': {
        'seqlen': 1024,
        'dmodel': 1600,
        'dhid': 1600*4,
        'nlayers': 48,
        'vocab_size': 50257,
    },

    # --- GPT-3 --- (dhid is always 4 * dmodel)
    # vocab_size for every GPT-3 entry is taken from public reimplementations.
    'gpt3-s': {
        'seqlen': 2048,
        'dmodel': 768,
        'dhid': 768*4,
        'nlayers': 12,
        'vocab_size': 50257,
    },
    'gpt3-m': {
        'seqlen': 2048,
        'dmodel': 1024,
        'dhid': 1024*4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt3-l': {
        'seqlen': 2048,
        'dmodel': 1536,
        'dhid': 1536*4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    # GPT-3 XL (1.3B) has d_model = 2048 in the GPT-3 paper (Table 2.1).
    # The previous value, 2560, is the width of the 2.7B model ('gpt3-3b').
    'gpt3-xl': {
        'seqlen': 2048,
        'dmodel': 2048,
        'dhid': 2048*4,
        'nlayers': 24,
        'vocab_size': 50257,
    },
    'gpt3-3b': {
        'seqlen': 2048,
        'dmodel': 2560,
        'dhid': 2560*4,
        'nlayers': 32,
        'vocab_size': 50257,
    },
    'gpt3-7b': {
        'seqlen': 2048,
        'dmodel': 4096,
        'dhid': 4096*4,
        'nlayers': 32,
        'vocab_size': 50257,
    },
    'gpt3-13b': {
        'seqlen': 2048,
        'dmodel': 5120,
        'dhid': 5120*4,
        'nlayers': 40,
        'vocab_size': 50257,
    },
    'gpt3-175b': {
        'seqlen': 2048,
        'dmodel': 12288,
        'dhid': 12288*4,
        'nlayers': 96,
        'vocab_size': 50257,
    },

    # --- GPT-J ---
    'gpt-j-6b': {
        'seqlen': 2048,
        'dmodel': 4096,
        'dhid': 4096 * 4,
        'nlayers': 28,
        'vocab_size': 50400,
    },

    # --- DALL-E ---
    # NOTE(review): the sums in seqlen and vocab_size presumably combine the
    # image and text token counts -- confirm.
    'dalle-12b': {
        'seqlen': 1024 + 256,
        'dmodel': 62 * 64,
        # 'dhid' was missing from this entry only, so any code reading it for
        # every model would raise KeyError. NOTE(review): 4 * dmodel is assumed
        # here to match the GPT-family entries; with it,
        # 12 * nlayers * dmodel**2 is about 12.1e9, in line with the "12b"
        # in the name -- confirm.
        'dhid': 62 * 64 * 4,
        'nlayers': 64,
        'vocab_size': 8192 + 16384,
    },
}