Chua, Vui Seng committed on
Commit
ac8897e
1 Parent(s): ffc90e9

Update readme and model analysis

Browse files
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ eval_nbest_predictions.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -8,13 +8,14 @@ This model is a replication of [block pruning paper](https://arxiv.org/abs/2109.
8
  To reproduce this model, please follow the [documentation here](https://github.com/vuiseng9/nn_pruning/blob/reproduce-evaluation/reproduce-eval/readme.md) until step 2.
9
 
10
  # Eval
11
- The model can be evaluated out-of-the-box with HF QA example. Note that only pruned self-attention heads are discarded where pruned ffn dimension are sparsified instead of removal. Verified in v4.13
12
  ```bash
13
  export CUDA_VISIBLE_DEVICES=0
14
 
15
  OUTDIR=eval-bert-base-squadv1-block-pruning-hybrid
16
  WORKDIR=transformers/examples/pytorch/question-answering
17
  cd $WORKDIR
 
18
 
19
  nohup python run_qa.py \
20
  --model_name_or_path vuiseng9/bert-base-squadv1-block-pruning-hybrid \
@@ -34,17 +35,21 @@ git clone https://github.com/vuiseng9/nncf && cd nncf
34
  git checkout tld-poc
35
  git reset --hard 1dec7afe7a4b567c059fcf287ea2c234980fded2
36
  python setup.py develop
 
 
 
 
 
 
 
37
 
38
  # Huggingface Transformers
39
  git clone https://github.com/vuiseng9/transformers && cd transformers
40
  git checkout tld-poc
41
  git reset --hard 10a1e29d84484e48fd106f58957d9ffc89dc43c5
42
  pip install -e .
 
43
 
44
- # Huggingface nn_pruning
45
- git clone https://github.com/vuiseng9/nn_pruning && cd nn_pruning
46
- git checkout reproduce-evaluation
47
- git reset --hard 2d4e196d694c465e43e5fbce6c3836d0a60e1446
48
  ```
49
  Add ```--optimize_model_before_eval``` during evaluation.
50
  ```bash
@@ -53,6 +58,7 @@ export CUDA_VISIBLE_DEVICES=0
53
  OUTDIR=eval-bert-base-squadv1-block-pruning-hybrid-cropped
54
  WORKDIR=transformers/examples/pytorch/question-answering
55
  cd $WORKDIR
 
56
 
57
  nohup python run_qa.py \
58
  --model_name_or_path vuiseng9/bert-base-squadv1-block-pruning-hybrid \
 
8
  To reproduce this model, please follow the [documentation here](https://github.com/vuiseng9/nn_pruning/blob/reproduce-evaluation/reproduce-eval/readme.md) until step 2.
9
 
10
  # Eval
11
+ The model can be evaluated out-of-the-box with the HF QA example. Note that only pruned self-attention heads are discarded, whereas pruned FFN dimensions are sparsified instead of being removed. Verified in v4.13.0 and v4.9.1.
12
  ```bash
13
  export CUDA_VISIBLE_DEVICES=0
14
 
15
  OUTDIR=eval-bert-base-squadv1-block-pruning-hybrid
16
  WORKDIR=transformers/examples/pytorch/question-answering
17
  cd $WORKDIR
18
+ mkdir $OUTDIR
19
 
20
  nohup python run_qa.py \
21
  --model_name_or_path vuiseng9/bert-base-squadv1-block-pruning-hybrid \
 
35
  git checkout tld-poc
36
  git reset --hard 1dec7afe7a4b567c059fcf287ea2c234980fded2
37
  python setup.py develop
38
+ pip install -r examples/torch/requirements.txt
39
+
40
+ # Huggingface nn_pruning
41
+ git clone https://github.com/vuiseng9/nn_pruning && cd nn_pruning
42
+ git checkout reproduce-evaluation
43
+ git reset --hard 2d4e196d694c465e43e5fbce6c3836d0a60e1446
44
+ pip install -e ".[dev]"
45
 
46
  # Huggingface Transformers
47
  git clone https://github.com/vuiseng9/transformers && cd transformers
48
  git checkout tld-poc
49
  git reset --hard 10a1e29d84484e48fd106f58957d9ffc89dc43c5
50
  pip install -e .
51
+ head -n 1 examples/pytorch/question-answering/requirements.txt | xargs -i pip install {}
52
 
 
 
 
 
53
  ```
54
  Add ```--optimize_model_before_eval``` during evaluation.
55
  ```bash
 
58
  OUTDIR=eval-bert-base-squadv1-block-pruning-hybrid-cropped
59
  WORKDIR=transformers/examples/pytorch/question-answering
60
  cd $WORKDIR
61
+ mkdir $OUTDIR
62
 
63
  nohup python run_qa.py \
64
  --model_name_or_path vuiseng9/bert-base-squadv1-block-pruning-hybrid \
XP_layer_wise_sparsity_global_rate_15.41.csv ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,layer_type,param_type,shape,nparam,nnz,sparsity
2
+ 0,bert.embeddings.word_embeddings,Embedding,weight,"[30522, 768]",23440896,23440896,0.0
3
+ 1,bert.embeddings.position_embeddings,Embedding,weight,"[512, 768]",393216,393216,0.0
4
+ 2,bert.embeddings.token_type_embeddings,Embedding,weight,"[2, 768]",1536,1536,0.0
5
+ 3,bert.embeddings.LayerNorm,LayerNorm,weight,[768],768,768,0.0
6
+ 4,bert.embeddings.LayerNorm,LayerNorm,bias,[768],768,768,0.0
7
+ 5,bert.encoder.layer.0.attention.self.query,Linear,weight,"[320, 768]",245760,135168,0.44999998807907104
8
+ 6,bert.encoder.layer.0.attention.self.query,Linear,bias,[320],320,256,0.19999998807907104
9
+ 7,bert.encoder.layer.0.attention.self.key,Linear,weight,"[320, 768]",245760,149504,0.3916666507720947
10
+ 8,bert.encoder.layer.0.attention.self.key,Linear,bias,[320],320,256,0.19999998807907104
11
+ 9,bert.encoder.layer.0.attention.self.value,Linear,weight,"[320, 768]",245760,173056,0.2958332896232605
12
+ 10,bert.encoder.layer.0.attention.self.value,Linear,bias,[320],320,256,0.19999998807907104
13
+ 11,bert.encoder.layer.0.attention.output.dense,Linear,weight,"[768, 320]",245760,181248,0.26249998807907104
14
+ 12,bert.encoder.layer.0.attention.output.dense,Linear,bias,[768],768,768,0.0
15
+ 13,bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
16
+ 14,bert.encoder.layer.0.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
17
+ 15,bert.encoder.layer.0.intermediate.dense,Linear,weight,"[185, 768]",142080,142080,0.0
18
+ 16,bert.encoder.layer.0.intermediate.dense,Linear,bias,[185],185,185,0.0
19
+ 17,bert.encoder.layer.0.output.dense,Linear,weight,"[768, 185]",142080,142080,0.0
20
+ 18,bert.encoder.layer.0.output.dense,Linear,bias,[768],768,768,0.0
21
+ 19,bert.encoder.layer.0.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
22
+ 20,bert.encoder.layer.0.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
23
+ 21,bert.encoder.layer.1.attention.self.query,Linear,weight,"[320, 768]",245760,175104,0.28749996423721313
24
+ 22,bert.encoder.layer.1.attention.self.query,Linear,bias,[320],320,288,0.09999996423721313
25
+ 23,bert.encoder.layer.1.attention.self.key,Linear,weight,"[320, 768]",245760,177152,0.27916663885116577
26
+ 24,bert.encoder.layer.1.attention.self.key,Linear,bias,[320],320,288,0.09999996423721313
27
+ 25,bert.encoder.layer.1.attention.self.value,Linear,weight,"[320, 768]",245760,166912,0.32083332538604736
28
+ 26,bert.encoder.layer.1.attention.self.value,Linear,bias,[320],320,288,0.09999996423721313
29
+ 27,bert.encoder.layer.1.attention.output.dense,Linear,weight,"[768, 320]",245760,167936,0.3166666030883789
30
+ 28,bert.encoder.layer.1.attention.output.dense,Linear,bias,[768],768,768,0.0
31
+ 29,bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
32
+ 30,bert.encoder.layer.1.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
33
+ 31,bert.encoder.layer.1.intermediate.dense,Linear,weight,"[315, 768]",241920,241920,0.0
34
+ 32,bert.encoder.layer.1.intermediate.dense,Linear,bias,[315],315,315,0.0
35
+ 33,bert.encoder.layer.1.output.dense,Linear,weight,"[768, 315]",241920,241920,0.0
36
+ 34,bert.encoder.layer.1.output.dense,Linear,bias,[768],768,768,0.0
37
+ 35,bert.encoder.layer.1.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
38
+ 36,bert.encoder.layer.1.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
39
+ 37,bert.encoder.layer.2.attention.self.query,Linear,weight,"[576, 768]",442368,285696,0.3541666865348816
40
+ 38,bert.encoder.layer.2.attention.self.query,Linear,bias,[576],576,480,0.1666666865348816
41
+ 39,bert.encoder.layer.2.attention.self.key,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
42
+ 40,bert.encoder.layer.2.attention.self.key,Linear,bias,[576],576,480,0.1666666865348816
43
+ 41,bert.encoder.layer.2.attention.self.value,Linear,weight,"[576, 768]",442368,226304,0.4884259104728699
44
+ 42,bert.encoder.layer.2.attention.self.value,Linear,bias,[576],576,384,0.3333333134651184
45
+ 43,bert.encoder.layer.2.attention.output.dense,Linear,weight,"[768, 576]",442368,237568,0.4629629850387573
46
+ 44,bert.encoder.layer.2.attention.output.dense,Linear,bias,[768],768,768,0.0
47
+ 45,bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
48
+ 46,bert.encoder.layer.2.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
49
+ 47,bert.encoder.layer.2.intermediate.dense,Linear,weight,"[339, 768]",260352,260352,0.0
50
+ 48,bert.encoder.layer.2.intermediate.dense,Linear,bias,[339],339,339,0.0
51
+ 49,bert.encoder.layer.2.output.dense,Linear,weight,"[768, 339]",260352,260352,0.0
52
+ 50,bert.encoder.layer.2.output.dense,Linear,bias,[768],768,768,0.0
53
+ 51,bert.encoder.layer.2.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
54
+ 52,bert.encoder.layer.2.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
55
+ 53,bert.encoder.layer.3.attention.self.query,Linear,weight,"[576, 768]",442368,277504,0.37268519401550293
56
+ 54,bert.encoder.layer.3.attention.self.query,Linear,bias,[576],576,512,0.1111111044883728
57
+ 55,bert.encoder.layer.3.attention.self.key,Linear,weight,"[576, 768]",442368,303104,0.31481480598449707
58
+ 56,bert.encoder.layer.3.attention.self.key,Linear,bias,[576],576,512,0.1111111044883728
59
+ 57,bert.encoder.layer.3.attention.self.value,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
60
+ 58,bert.encoder.layer.3.attention.self.value,Linear,bias,[576],576,512,0.1111111044883728
61
+ 59,bert.encoder.layer.3.attention.output.dense,Linear,weight,"[768, 576]",442368,308224,0.30324071645736694
62
+ 60,bert.encoder.layer.3.attention.output.dense,Linear,bias,[768],768,768,0.0
63
+ 61,bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
64
+ 62,bert.encoder.layer.3.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
65
+ 63,bert.encoder.layer.3.intermediate.dense,Linear,weight,"[368, 768]",282624,282624,0.0
66
+ 64,bert.encoder.layer.3.intermediate.dense,Linear,bias,[368],368,368,0.0
67
+ 65,bert.encoder.layer.3.output.dense,Linear,weight,"[768, 368]",282624,282624,0.0
68
+ 66,bert.encoder.layer.3.output.dense,Linear,bias,[768],768,768,0.0
69
+ 67,bert.encoder.layer.3.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
70
+ 68,bert.encoder.layer.3.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
71
+ 69,bert.encoder.layer.4.attention.self.query,Linear,weight,"[576, 768]",442368,291840,0.3402777910232544
72
+ 70,bert.encoder.layer.4.attention.self.query,Linear,bias,[576],576,544,0.055555522441864014
73
+ 71,bert.encoder.layer.4.attention.self.key,Linear,weight,"[576, 768]",442368,310272,0.2986111044883728
74
+ 72,bert.encoder.layer.4.attention.self.key,Linear,bias,[576],576,544,0.055555522441864014
75
+ 73,bert.encoder.layer.4.attention.self.value,Linear,weight,"[576, 768]",442368,272384,0.38425928354263306
76
+ 74,bert.encoder.layer.4.attention.self.value,Linear,bias,[576],576,480,0.1666666865348816
77
+ 75,bert.encoder.layer.4.attention.output.dense,Linear,weight,"[768, 576]",442368,263168,0.40509259700775146
78
+ 76,bert.encoder.layer.4.attention.output.dense,Linear,bias,[768],768,768,0.0
79
+ 77,bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
80
+ 78,bert.encoder.layer.4.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
81
+ 79,bert.encoder.layer.4.intermediate.dense,Linear,weight,"[386, 768]",296448,296448,0.0
82
+ 80,bert.encoder.layer.4.intermediate.dense,Linear,bias,[386],386,386,0.0
83
+ 81,bert.encoder.layer.4.output.dense,Linear,weight,"[768, 386]",296448,296448,0.0
84
+ 82,bert.encoder.layer.4.output.dense,Linear,bias,[768],768,768,0.0
85
+ 83,bert.encoder.layer.4.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
86
+ 84,bert.encoder.layer.4.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
87
+ 85,bert.encoder.layer.5.attention.self.query,Linear,weight,"[384, 768]",294912,171008,0.4201388955116272
88
+ 86,bert.encoder.layer.5.attention.self.query,Linear,bias,[384],384,352,0.08333331346511841
89
+ 87,bert.encoder.layer.5.attention.self.key,Linear,weight,"[384, 768]",294912,205824,0.3020833134651184
90
+ 88,bert.encoder.layer.5.attention.self.key,Linear,bias,[384],384,352,0.08333331346511841
91
+ 89,bert.encoder.layer.5.attention.self.value,Linear,weight,"[384, 768]",294912,217088,0.2638888955116272
92
+ 90,bert.encoder.layer.5.attention.self.value,Linear,bias,[384],384,384,0.0
93
+ 91,bert.encoder.layer.5.attention.output.dense,Linear,weight,"[768, 384]",294912,223232,0.243055522441864
94
+ 92,bert.encoder.layer.5.attention.output.dense,Linear,bias,[768],768,768,0.0
95
+ 93,bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
96
+ 94,bert.encoder.layer.5.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
97
+ 95,bert.encoder.layer.5.intermediate.dense,Linear,weight,"[336, 768]",258048,258048,0.0
98
+ 96,bert.encoder.layer.5.intermediate.dense,Linear,bias,[336],336,336,0.0
99
+ 97,bert.encoder.layer.5.output.dense,Linear,weight,"[768, 336]",258048,258048,0.0
100
+ 98,bert.encoder.layer.5.output.dense,Linear,bias,[768],768,768,0.0
101
+ 99,bert.encoder.layer.5.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
102
+ 100,bert.encoder.layer.5.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
103
+ 101,bert.encoder.layer.6.attention.self.query,Linear,weight,"[448, 768]",344064,192512,0.4404761791229248
104
+ 102,bert.encoder.layer.6.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
105
+ 103,bert.encoder.layer.6.attention.self.key,Linear,weight,"[448, 768]",344064,224256,0.3482142686843872
106
+ 104,bert.encoder.layer.6.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
107
+ 105,bert.encoder.layer.6.attention.self.value,Linear,weight,"[448, 768]",344064,209920,0.3898809552192688
108
+ 106,bert.encoder.layer.6.attention.self.value,Linear,bias,[448],448,352,0.21428567171096802
109
+ 107,bert.encoder.layer.6.attention.output.dense,Linear,weight,"[768, 448]",344064,199680,0.4196428656578064
110
+ 108,bert.encoder.layer.6.attention.output.dense,Linear,bias,[768],768,768,0.0
111
+ 109,bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
112
+ 110,bert.encoder.layer.6.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
113
+ 111,bert.encoder.layer.6.intermediate.dense,Linear,weight,"[280, 768]",215040,215040,0.0
114
+ 112,bert.encoder.layer.6.intermediate.dense,Linear,bias,[280],280,280,0.0
115
+ 113,bert.encoder.layer.6.output.dense,Linear,weight,"[768, 280]",215040,215040,0.0
116
+ 114,bert.encoder.layer.6.output.dense,Linear,bias,[768],768,768,0.0
117
+ 115,bert.encoder.layer.6.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
118
+ 116,bert.encoder.layer.6.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
119
+ 117,bert.encoder.layer.7.attention.self.query,Linear,weight,"[448, 768]",344064,201728,0.413690447807312
120
+ 118,bert.encoder.layer.7.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
121
+ 119,bert.encoder.layer.7.attention.self.key,Linear,weight,"[448, 768]",344064,237568,0.3095238208770752
122
+ 120,bert.encoder.layer.7.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
123
+ 121,bert.encoder.layer.7.attention.self.value,Linear,weight,"[448, 768]",344064,218112,0.3660714030265808
124
+ 122,bert.encoder.layer.7.attention.self.value,Linear,bias,[448],448,352,0.21428567171096802
125
+ 123,bert.encoder.layer.7.attention.output.dense,Linear,weight,"[768, 448]",344064,202752,0.4107142686843872
126
+ 124,bert.encoder.layer.7.attention.output.dense,Linear,bias,[768],768,768,0.0
127
+ 125,bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
128
+ 126,bert.encoder.layer.7.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
129
+ 127,bert.encoder.layer.7.intermediate.dense,Linear,weight,"[211, 768]",162048,162048,0.0
130
+ 128,bert.encoder.layer.7.intermediate.dense,Linear,bias,[211],211,211,0.0
131
+ 129,bert.encoder.layer.7.output.dense,Linear,weight,"[768, 211]",162048,162048,0.0
132
+ 130,bert.encoder.layer.7.output.dense,Linear,bias,[768],768,768,0.0
133
+ 131,bert.encoder.layer.7.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
134
+ 132,bert.encoder.layer.7.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
135
+ 133,bert.encoder.layer.8.attention.self.query,Linear,weight,"[448, 768]",344064,186368,0.4583333134651184
136
+ 134,bert.encoder.layer.8.attention.self.query,Linear,bias,[448],448,416,0.07142853736877441
137
+ 135,bert.encoder.layer.8.attention.self.key,Linear,weight,"[448, 768]",344064,197632,0.425595223903656
138
+ 136,bert.encoder.layer.8.attention.self.key,Linear,bias,[448],448,416,0.07142853736877441
139
+ 137,bert.encoder.layer.8.attention.self.value,Linear,weight,"[448, 768]",344064,154624,0.550595223903656
140
+ 138,bert.encoder.layer.8.attention.self.value,Linear,bias,[448],448,288,0.3571428060531616
141
+ 139,bert.encoder.layer.8.attention.output.dense,Linear,weight,"[768, 448]",344064,148480,0.5684523582458496
142
+ 140,bert.encoder.layer.8.attention.output.dense,Linear,bias,[768],768,768,0.0
143
+ 141,bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
144
+ 142,bert.encoder.layer.8.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
145
+ 143,bert.encoder.layer.8.intermediate.dense,Linear,weight,"[108, 768]",82944,82944,0.0
146
+ 144,bert.encoder.layer.8.intermediate.dense,Linear,bias,[108],108,108,0.0
147
+ 145,bert.encoder.layer.8.output.dense,Linear,weight,"[768, 108]",82944,82944,0.0
148
+ 146,bert.encoder.layer.8.output.dense,Linear,bias,[768],768,768,0.0
149
+ 147,bert.encoder.layer.8.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
150
+ 148,bert.encoder.layer.8.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
151
+ 149,bert.encoder.layer.9.attention.self.query,Linear,weight,"[320, 768]",245760,144384,0.41249996423721313
152
+ 150,bert.encoder.layer.9.attention.self.query,Linear,bias,[320],320,288,0.09999996423721313
153
+ 151,bert.encoder.layer.9.attention.self.key,Linear,weight,"[320, 768]",245760,155648,0.36666661500930786
154
+ 152,bert.encoder.layer.9.attention.self.key,Linear,bias,[320],320,288,0.09999996423721313
155
+ 153,bert.encoder.layer.9.attention.self.value,Linear,weight,"[320, 768]",245760,63488,0.7416666746139526
156
+ 154,bert.encoder.layer.9.attention.self.value,Linear,bias,[320],320,160,0.5
157
+ 155,bert.encoder.layer.9.attention.output.dense,Linear,weight,"[768, 320]",245760,65536,0.7333333492279053
158
+ 156,bert.encoder.layer.9.attention.output.dense,Linear,bias,[768],768,704,0.08333331346511841
159
+ 157,bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
160
+ 158,bert.encoder.layer.9.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
161
+ 159,bert.encoder.layer.9.intermediate.dense,Linear,weight,"[53, 768]",40704,40704,5.960464477539063e-08
162
+ 160,bert.encoder.layer.9.intermediate.dense,Linear,bias,[53],53,53,0.0
163
+ 161,bert.encoder.layer.9.output.dense,Linear,weight,"[768, 53]",40704,40704,5.960464477539063e-08
164
+ 162,bert.encoder.layer.9.output.dense,Linear,bias,[768],768,768,0.0
165
+ 163,bert.encoder.layer.9.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
166
+ 164,bert.encoder.layer.9.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
167
+ 165,bert.encoder.layer.10.attention.self.query,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
168
+ 166,bert.encoder.layer.10.attention.self.query,Linear,bias,[384],384,320,0.16666662693023682
169
+ 167,bert.encoder.layer.10.attention.self.key,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
170
+ 168,bert.encoder.layer.10.attention.self.key,Linear,bias,[384],384,320,0.16666662693023682
171
+ 169,bert.encoder.layer.10.attention.self.value,Linear,weight,"[384, 768]",294912,77824,0.7361111044883728
172
+ 170,bert.encoder.layer.10.attention.self.value,Linear,bias,[384],384,192,0.5
173
+ 171,bert.encoder.layer.10.attention.output.dense,Linear,weight,"[768, 384]",294912,78848,0.7326388955116272
174
+ 172,bert.encoder.layer.10.attention.output.dense,Linear,bias,[768],768,736,0.041666626930236816
175
+ 173,bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
176
+ 174,bert.encoder.layer.10.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
177
+ 175,bert.encoder.layer.10.intermediate.dense,Linear,weight,"[86, 768]",66048,66048,0.0
178
+ 176,bert.encoder.layer.10.intermediate.dense,Linear,bias,[86],86,86,0.0
179
+ 177,bert.encoder.layer.10.output.dense,Linear,weight,"[768, 86]",66048,66048,0.0
180
+ 178,bert.encoder.layer.10.output.dense,Linear,bias,[768],768,768,0.0
181
+ 179,bert.encoder.layer.10.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
182
+ 180,bert.encoder.layer.10.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
183
+ 181,bert.encoder.layer.11.attention.self.query,Linear,weight,"[384, 768]",294912,107520,0.6354166269302368
184
+ 182,bert.encoder.layer.11.attention.self.query,Linear,bias,[384],384,256,0.3333333134651184
185
+ 183,bert.encoder.layer.11.attention.self.key,Linear,weight,"[384, 768]",294912,118784,0.5972222089767456
186
+ 184,bert.encoder.layer.11.attention.self.key,Linear,bias,[384],384,256,0.3333333134651184
187
+ 185,bert.encoder.layer.11.attention.self.value,Linear,weight,"[384, 768]",294912,62464,0.7881944179534912
188
+ 186,bert.encoder.layer.11.attention.self.value,Linear,bias,[384],384,192,0.5
189
+ 187,bert.encoder.layer.11.attention.output.dense,Linear,weight,"[768, 384]",294912,54272,0.8159722089767456
190
+ 188,bert.encoder.layer.11.attention.output.dense,Linear,bias,[768],768,672,0.125
191
+ 189,bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
192
+ 190,bert.encoder.layer.11.attention.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
193
+ 191,bert.encoder.layer.11.intermediate.dense,Linear,weight,"[105, 768]",80640,80640,0.0
194
+ 192,bert.encoder.layer.11.intermediate.dense,Linear,bias,[105],105,105,0.0
195
+ 193,bert.encoder.layer.11.output.dense,Linear,weight,"[768, 105]",80640,80640,0.0
196
+ 194,bert.encoder.layer.11.output.dense,Linear,bias,[768],768,768,0.0
197
+ 195,bert.encoder.layer.11.output.LayerNorm,LayerNorm,weight,[768],768,768,0.0
198
+ 196,bert.encoder.layer.11.output.LayerNorm,LayerNorm,bias,[768],768,768,0.0
199
+ 197,qa_outputs,Linear,weight,"[2, 768]",1536,1536,0.0
200
+ 198,qa_outputs,Linear,bias,[2],2,2,0.0
XP_layer_wise_sparsity_global_rate_15.41.md ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | layer_type | param_type | shape | nparam | nnz | sparsity |
2
+ |----:|:-------------------------------------------------|:-------------|:-------------|:-------------|---------:|---------:|------------:|
3
+ | 0 | bert.embeddings.word_embeddings | Embedding | weight | [30522, 768] | 23440896 | 23440896 | 0 |
4
+ | 1 | bert.embeddings.position_embeddings | Embedding | weight | [512, 768] | 393216 | 393216 | 0 |
5
+ | 2 | bert.embeddings.token_type_embeddings | Embedding | weight | [2, 768] | 1536 | 1536 | 0 |
6
+ | 3 | bert.embeddings.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
7
+ | 4 | bert.embeddings.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
8
+ | 5 | bert.encoder.layer.0.attention.self.query | Linear | weight | [320, 768] | 245760 | 135168 | 0.45 |
9
+ | 6 | bert.encoder.layer.0.attention.self.query | Linear | bias | [320] | 320 | 256 | 0.2 |
10
+ | 7 | bert.encoder.layer.0.attention.self.key | Linear | weight | [320, 768] | 245760 | 149504 | 0.391667 |
11
+ | 8 | bert.encoder.layer.0.attention.self.key | Linear | bias | [320] | 320 | 256 | 0.2 |
12
+ | 9 | bert.encoder.layer.0.attention.self.value | Linear | weight | [320, 768] | 245760 | 173056 | 0.295833 |
13
+ | 10 | bert.encoder.layer.0.attention.self.value | Linear | bias | [320] | 320 | 256 | 0.2 |
14
+ | 11 | bert.encoder.layer.0.attention.output.dense | Linear | weight | [768, 320] | 245760 | 181248 | 0.2625 |
15
+ | 12 | bert.encoder.layer.0.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
16
+ | 13 | bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
17
+ | 14 | bert.encoder.layer.0.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
18
+ | 15 | bert.encoder.layer.0.intermediate.dense | Linear | weight | [185, 768] | 142080 | 142080 | 0 |
19
+ | 16 | bert.encoder.layer.0.intermediate.dense | Linear | bias | [185] | 185 | 185 | 0 |
20
+ | 17 | bert.encoder.layer.0.output.dense | Linear | weight | [768, 185] | 142080 | 142080 | 0 |
21
+ | 18 | bert.encoder.layer.0.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
22
+ | 19 | bert.encoder.layer.0.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
23
+ | 20 | bert.encoder.layer.0.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
24
+ | 21 | bert.encoder.layer.1.attention.self.query | Linear | weight | [320, 768] | 245760 | 175104 | 0.2875 |
25
+ | 22 | bert.encoder.layer.1.attention.self.query | Linear | bias | [320] | 320 | 288 | 0.1 |
26
+ | 23 | bert.encoder.layer.1.attention.self.key | Linear | weight | [320, 768] | 245760 | 177152 | 0.279167 |
27
+ | 24 | bert.encoder.layer.1.attention.self.key | Linear | bias | [320] | 320 | 288 | 0.1 |
28
+ | 25 | bert.encoder.layer.1.attention.self.value | Linear | weight | [320, 768] | 245760 | 166912 | 0.320833 |
29
+ | 26 | bert.encoder.layer.1.attention.self.value | Linear | bias | [320] | 320 | 288 | 0.1 |
30
+ | 27 | bert.encoder.layer.1.attention.output.dense | Linear | weight | [768, 320] | 245760 | 167936 | 0.316667 |
31
+ | 28 | bert.encoder.layer.1.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
32
+ | 29 | bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
33
+ | 30 | bert.encoder.layer.1.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
34
+ | 31 | bert.encoder.layer.1.intermediate.dense | Linear | weight | [315, 768] | 241920 | 241920 | 0 |
35
+ | 32 | bert.encoder.layer.1.intermediate.dense | Linear | bias | [315] | 315 | 315 | 0 |
36
+ | 33 | bert.encoder.layer.1.output.dense | Linear | weight | [768, 315] | 241920 | 241920 | 0 |
37
+ | 34 | bert.encoder.layer.1.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
38
+ | 35 | bert.encoder.layer.1.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
39
+ | 36 | bert.encoder.layer.1.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
40
+ | 37 | bert.encoder.layer.2.attention.self.query | Linear | weight | [576, 768] | 442368 | 285696 | 0.354167 |
41
+ | 38 | bert.encoder.layer.2.attention.self.query | Linear | bias | [576] | 576 | 480 | 0.166667 |
42
+ | 39 | bert.encoder.layer.2.attention.self.key | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
43
+ | 40 | bert.encoder.layer.2.attention.self.key | Linear | bias | [576] | 576 | 480 | 0.166667 |
44
+ | 41 | bert.encoder.layer.2.attention.self.value | Linear | weight | [576, 768] | 442368 | 226304 | 0.488426 |
45
+ | 42 | bert.encoder.layer.2.attention.self.value | Linear | bias | [576] | 576 | 384 | 0.333333 |
46
+ | 43 | bert.encoder.layer.2.attention.output.dense | Linear | weight | [768, 576] | 442368 | 237568 | 0.462963 |
47
+ | 44 | bert.encoder.layer.2.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
48
+ | 45 | bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
49
+ | 46 | bert.encoder.layer.2.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
50
+ | 47 | bert.encoder.layer.2.intermediate.dense | Linear | weight | [339, 768] | 260352 | 260352 | 0 |
51
+ | 48 | bert.encoder.layer.2.intermediate.dense | Linear | bias | [339] | 339 | 339 | 0 |
52
+ | 49 | bert.encoder.layer.2.output.dense | Linear | weight | [768, 339] | 260352 | 260352 | 0 |
53
+ | 50 | bert.encoder.layer.2.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
54
+ | 51 | bert.encoder.layer.2.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
55
+ | 52 | bert.encoder.layer.2.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
56
+ | 53 | bert.encoder.layer.3.attention.self.query | Linear | weight | [576, 768] | 442368 | 277504 | 0.372685 |
57
+ | 54 | bert.encoder.layer.3.attention.self.query | Linear | bias | [576] | 576 | 512 | 0.111111 |
58
+ | 55 | bert.encoder.layer.3.attention.self.key | Linear | weight | [576, 768] | 442368 | 303104 | 0.314815 |
59
+ | 56 | bert.encoder.layer.3.attention.self.key | Linear | bias | [576] | 576 | 512 | 0.111111 |
60
+ | 57 | bert.encoder.layer.3.attention.self.value | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
61
+ | 58 | bert.encoder.layer.3.attention.self.value | Linear | bias | [576] | 576 | 512 | 0.111111 |
62
+ | 59 | bert.encoder.layer.3.attention.output.dense | Linear | weight | [768, 576] | 442368 | 308224 | 0.303241 |
63
+ | 60 | bert.encoder.layer.3.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
64
+ | 61 | bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
65
+ | 62 | bert.encoder.layer.3.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
66
+ | 63 | bert.encoder.layer.3.intermediate.dense | Linear | weight | [368, 768] | 282624 | 282624 | 0 |
67
+ | 64 | bert.encoder.layer.3.intermediate.dense | Linear | bias | [368] | 368 | 368 | 0 |
68
+ | 65 | bert.encoder.layer.3.output.dense | Linear | weight | [768, 368] | 282624 | 282624 | 0 |
69
+ | 66 | bert.encoder.layer.3.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
70
+ | 67 | bert.encoder.layer.3.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
71
+ | 68 | bert.encoder.layer.3.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
72
+ | 69 | bert.encoder.layer.4.attention.self.query | Linear | weight | [576, 768] | 442368 | 291840 | 0.340278 |
73
+ | 70 | bert.encoder.layer.4.attention.self.query | Linear | bias | [576] | 576 | 544 | 0.0555555 |
74
+ | 71 | bert.encoder.layer.4.attention.self.key | Linear | weight | [576, 768] | 442368 | 310272 | 0.298611 |
75
+ | 72 | bert.encoder.layer.4.attention.self.key | Linear | bias | [576] | 576 | 544 | 0.0555555 |
76
+ | 73 | bert.encoder.layer.4.attention.self.value | Linear | weight | [576, 768] | 442368 | 272384 | 0.384259 |
77
+ | 74 | bert.encoder.layer.4.attention.self.value | Linear | bias | [576] | 576 | 480 | 0.166667 |
78
+ | 75 | bert.encoder.layer.4.attention.output.dense | Linear | weight | [768, 576] | 442368 | 263168 | 0.405093 |
79
+ | 76 | bert.encoder.layer.4.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
80
+ | 77 | bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
81
+ | 78 | bert.encoder.layer.4.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
82
+ | 79 | bert.encoder.layer.4.intermediate.dense | Linear | weight | [386, 768] | 296448 | 296448 | 0 |
83
+ | 80 | bert.encoder.layer.4.intermediate.dense | Linear | bias | [386] | 386 | 386 | 0 |
84
+ | 81 | bert.encoder.layer.4.output.dense | Linear | weight | [768, 386] | 296448 | 296448 | 0 |
85
+ | 82 | bert.encoder.layer.4.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
86
+ | 83 | bert.encoder.layer.4.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
87
+ | 84 | bert.encoder.layer.4.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
88
+ | 85 | bert.encoder.layer.5.attention.self.query | Linear | weight | [384, 768] | 294912 | 171008 | 0.420139 |
89
+ | 86 | bert.encoder.layer.5.attention.self.query | Linear | bias | [384] | 384 | 352 | 0.0833333 |
90
+ | 87 | bert.encoder.layer.5.attention.self.key | Linear | weight | [384, 768] | 294912 | 205824 | 0.302083 |
91
+ | 88 | bert.encoder.layer.5.attention.self.key | Linear | bias | [384] | 384 | 352 | 0.0833333 |
92
+ | 89 | bert.encoder.layer.5.attention.self.value | Linear | weight | [384, 768] | 294912 | 217088 | 0.263889 |
93
+ | 90 | bert.encoder.layer.5.attention.self.value | Linear | bias | [384] | 384 | 384 | 0 |
94
+ | 91 | bert.encoder.layer.5.attention.output.dense | Linear | weight | [768, 384] | 294912 | 223232 | 0.243056 |
95
+ | 92 | bert.encoder.layer.5.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
96
+ | 93 | bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
97
+ | 94 | bert.encoder.layer.5.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
98
+ | 95 | bert.encoder.layer.5.intermediate.dense | Linear | weight | [336, 768] | 258048 | 258048 | 0 |
99
+ | 96 | bert.encoder.layer.5.intermediate.dense | Linear | bias | [336] | 336 | 336 | 0 |
100
+ | 97 | bert.encoder.layer.5.output.dense | Linear | weight | [768, 336] | 258048 | 258048 | 0 |
101
+ | 98 | bert.encoder.layer.5.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
102
+ | 99 | bert.encoder.layer.5.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
103
+ | 100 | bert.encoder.layer.5.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
104
+ | 101 | bert.encoder.layer.6.attention.self.query | Linear | weight | [448, 768] | 344064 | 192512 | 0.440476 |
105
+ | 102 | bert.encoder.layer.6.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
106
+ | 103 | bert.encoder.layer.6.attention.self.key | Linear | weight | [448, 768] | 344064 | 224256 | 0.348214 |
107
+ | 104 | bert.encoder.layer.6.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
108
+ | 105 | bert.encoder.layer.6.attention.self.value | Linear | weight | [448, 768] | 344064 | 209920 | 0.389881 |
109
+ | 106 | bert.encoder.layer.6.attention.self.value | Linear | bias | [448] | 448 | 352 | 0.214286 |
110
+ | 107 | bert.encoder.layer.6.attention.output.dense | Linear | weight | [768, 448] | 344064 | 199680 | 0.419643 |
111
+ | 108 | bert.encoder.layer.6.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
112
+ | 109 | bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
113
+ | 110 | bert.encoder.layer.6.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
114
+ | 111 | bert.encoder.layer.6.intermediate.dense | Linear | weight | [280, 768] | 215040 | 215040 | 0 |
115
+ | 112 | bert.encoder.layer.6.intermediate.dense | Linear | bias | [280] | 280 | 280 | 0 |
116
+ | 113 | bert.encoder.layer.6.output.dense | Linear | weight | [768, 280] | 215040 | 215040 | 0 |
117
+ | 114 | bert.encoder.layer.6.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
118
+ | 115 | bert.encoder.layer.6.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
119
+ | 116 | bert.encoder.layer.6.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
120
+ | 117 | bert.encoder.layer.7.attention.self.query | Linear | weight | [448, 768] | 344064 | 201728 | 0.41369 |
121
+ | 118 | bert.encoder.layer.7.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
122
+ | 119 | bert.encoder.layer.7.attention.self.key | Linear | weight | [448, 768] | 344064 | 237568 | 0.309524 |
123
+ | 120 | bert.encoder.layer.7.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
124
+ | 121 | bert.encoder.layer.7.attention.self.value | Linear | weight | [448, 768] | 344064 | 218112 | 0.366071 |
125
+ | 122 | bert.encoder.layer.7.attention.self.value | Linear | bias | [448] | 448 | 352 | 0.214286 |
126
+ | 123 | bert.encoder.layer.7.attention.output.dense | Linear | weight | [768, 448] | 344064 | 202752 | 0.410714 |
127
+ | 124 | bert.encoder.layer.7.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
128
+ | 125 | bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
129
+ | 126 | bert.encoder.layer.7.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
130
+ | 127 | bert.encoder.layer.7.intermediate.dense | Linear | weight | [211, 768] | 162048 | 162048 | 0 |
131
+ | 128 | bert.encoder.layer.7.intermediate.dense | Linear | bias | [211] | 211 | 211 | 0 |
132
+ | 129 | bert.encoder.layer.7.output.dense | Linear | weight | [768, 211] | 162048 | 162048 | 0 |
133
+ | 130 | bert.encoder.layer.7.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
134
+ | 131 | bert.encoder.layer.7.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
135
+ | 132 | bert.encoder.layer.7.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
136
+ | 133 | bert.encoder.layer.8.attention.self.query | Linear | weight | [448, 768] | 344064 | 186368 | 0.458333 |
137
+ | 134 | bert.encoder.layer.8.attention.self.query | Linear | bias | [448] | 448 | 416 | 0.0714285 |
138
+ | 135 | bert.encoder.layer.8.attention.self.key | Linear | weight | [448, 768] | 344064 | 197632 | 0.425595 |
139
+ | 136 | bert.encoder.layer.8.attention.self.key | Linear | bias | [448] | 448 | 416 | 0.0714285 |
140
+ | 137 | bert.encoder.layer.8.attention.self.value | Linear | weight | [448, 768] | 344064 | 154624 | 0.550595 |
141
+ | 138 | bert.encoder.layer.8.attention.self.value | Linear | bias | [448] | 448 | 288 | 0.357143 |
142
+ | 139 | bert.encoder.layer.8.attention.output.dense | Linear | weight | [768, 448] | 344064 | 148480 | 0.568452 |
143
+ | 140 | bert.encoder.layer.8.attention.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
144
+ | 141 | bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
145
+ | 142 | bert.encoder.layer.8.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
146
+ | 143 | bert.encoder.layer.8.intermediate.dense | Linear | weight | [108, 768] | 82944 | 82944 | 0 |
147
+ | 144 | bert.encoder.layer.8.intermediate.dense | Linear | bias | [108] | 108 | 108 | 0 |
148
+ | 145 | bert.encoder.layer.8.output.dense | Linear | weight | [768, 108] | 82944 | 82944 | 0 |
149
+ | 146 | bert.encoder.layer.8.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
150
+ | 147 | bert.encoder.layer.8.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
151
+ | 148 | bert.encoder.layer.8.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
152
+ | 149 | bert.encoder.layer.9.attention.self.query | Linear | weight | [320, 768] | 245760 | 144384 | 0.4125 |
153
+ | 150 | bert.encoder.layer.9.attention.self.query | Linear | bias | [320] | 320 | 288 | 0.1 |
154
+ | 151 | bert.encoder.layer.9.attention.self.key | Linear | weight | [320, 768] | 245760 | 155648 | 0.366667 |
155
+ | 152 | bert.encoder.layer.9.attention.self.key | Linear | bias | [320] | 320 | 288 | 0.1 |
156
+ | 153 | bert.encoder.layer.9.attention.self.value | Linear | weight | [320, 768] | 245760 | 63488 | 0.741667 |
157
+ | 154 | bert.encoder.layer.9.attention.self.value | Linear | bias | [320] | 320 | 160 | 0.5 |
158
+ | 155 | bert.encoder.layer.9.attention.output.dense | Linear | weight | [768, 320] | 245760 | 65536 | 0.733333 |
159
+ | 156 | bert.encoder.layer.9.attention.output.dense | Linear | bias | [768] | 768 | 704 | 0.0833333 |
160
+ | 157 | bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
161
+ | 158 | bert.encoder.layer.9.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
162
+ | 159 | bert.encoder.layer.9.intermediate.dense | Linear | weight | [53, 768] | 40704 | 40704 | 5.96046e-08 |
163
+ | 160 | bert.encoder.layer.9.intermediate.dense | Linear | bias | [53] | 53 | 53 | 0 |
164
+ | 161 | bert.encoder.layer.9.output.dense | Linear | weight | [768, 53] | 40704 | 40704 | 5.96046e-08 |
165
+ | 162 | bert.encoder.layer.9.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
166
+ | 163 | bert.encoder.layer.9.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
167
+ | 164 | bert.encoder.layer.9.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
168
+ | 165 | bert.encoder.layer.10.attention.self.query | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
169
+ | 166 | bert.encoder.layer.10.attention.self.query | Linear | bias | [384] | 384 | 320 | 0.166667 |
170
+ | 167 | bert.encoder.layer.10.attention.self.key | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
171
+ | 168 | bert.encoder.layer.10.attention.self.key | Linear | bias | [384] | 384 | 320 | 0.166667 |
172
+ | 169 | bert.encoder.layer.10.attention.self.value | Linear | weight | [384, 768] | 294912 | 77824 | 0.736111 |
173
+ | 170 | bert.encoder.layer.10.attention.self.value | Linear | bias | [384] | 384 | 192 | 0.5 |
174
+ | 171 | bert.encoder.layer.10.attention.output.dense | Linear | weight | [768, 384] | 294912 | 78848 | 0.732639 |
175
+ | 172 | bert.encoder.layer.10.attention.output.dense | Linear | bias | [768] | 768 | 736 | 0.0416666 |
176
+ | 173 | bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
177
+ | 174 | bert.encoder.layer.10.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
178
+ | 175 | bert.encoder.layer.10.intermediate.dense | Linear | weight | [86, 768] | 66048 | 66048 | 0 |
179
+ | 176 | bert.encoder.layer.10.intermediate.dense | Linear | bias | [86] | 86 | 86 | 0 |
180
+ | 177 | bert.encoder.layer.10.output.dense | Linear | weight | [768, 86] | 66048 | 66048 | 0 |
181
+ | 178 | bert.encoder.layer.10.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
182
+ | 179 | bert.encoder.layer.10.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
183
+ | 180 | bert.encoder.layer.10.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
184
+ | 181 | bert.encoder.layer.11.attention.self.query | Linear | weight | [384, 768] | 294912 | 107520 | 0.635417 |
185
+ | 182 | bert.encoder.layer.11.attention.self.query | Linear | bias | [384] | 384 | 256 | 0.333333 |
186
+ | 183 | bert.encoder.layer.11.attention.self.key | Linear | weight | [384, 768] | 294912 | 118784 | 0.597222 |
187
+ | 184 | bert.encoder.layer.11.attention.self.key | Linear | bias | [384] | 384 | 256 | 0.333333 |
188
+ | 185 | bert.encoder.layer.11.attention.self.value | Linear | weight | [384, 768] | 294912 | 62464 | 0.788194 |
189
+ | 186 | bert.encoder.layer.11.attention.self.value | Linear | bias | [384] | 384 | 192 | 0.5 |
190
+ | 187 | bert.encoder.layer.11.attention.output.dense | Linear | weight | [768, 384] | 294912 | 54272 | 0.815972 |
191
+ | 188 | bert.encoder.layer.11.attention.output.dense | Linear | bias | [768] | 768 | 672 | 0.125 |
192
+ | 189 | bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
193
+ | 190 | bert.encoder.layer.11.attention.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
194
+ | 191 | bert.encoder.layer.11.intermediate.dense | Linear | weight | [105, 768] | 80640 | 80640 | 0 |
195
+ | 192 | bert.encoder.layer.11.intermediate.dense | Linear | bias | [105] | 105 | 105 | 0 |
196
+ | 193 | bert.encoder.layer.11.output.dense | Linear | weight | [768, 105] | 80640 | 80640 | 0 |
197
+ | 194 | bert.encoder.layer.11.output.dense | Linear | bias | [768] | 768 | 768 | 0 |
198
+ | 195 | bert.encoder.layer.11.output.LayerNorm | LayerNorm | weight | [768] | 768 | 768 | 0 |
199
+ | 196 | bert.encoder.layer.11.output.LayerNorm | LayerNorm | bias | [768] | 768 | 768 | 0 |
200
+ | 197 | qa_outputs | Linear | weight | [2, 768] | 1536 | 1536 | 0 |
201
+ | 198 | qa_outputs | Linear | bias | [2] | 2 | 2 | 0 |
XP_linear_layer_sparsity_20M_params_33.64_sparsity.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer_id,layer_type,param_type,shape,nparam,nnz,sparsity
2
+ 5,bert.encoder.layer.0.attention.self.query,Linear,weight,"[320, 768]",245760,135168,0.44999998807907104
3
+ 7,bert.encoder.layer.0.attention.self.key,Linear,weight,"[320, 768]",245760,149504,0.3916666507720947
4
+ 9,bert.encoder.layer.0.attention.self.value,Linear,weight,"[320, 768]",245760,173056,0.2958332896232605
5
+ 11,bert.encoder.layer.0.attention.output.dense,Linear,weight,"[768, 320]",245760,181248,0.26249998807907104
6
+ 15,bert.encoder.layer.0.intermediate.dense,Linear,weight,"[185, 768]",142080,142080,0.0
7
+ 17,bert.encoder.layer.0.output.dense,Linear,weight,"[768, 185]",142080,142080,0.0
8
+ 21,bert.encoder.layer.1.attention.self.query,Linear,weight,"[320, 768]",245760,175104,0.28749996423721313
9
+ 23,bert.encoder.layer.1.attention.self.key,Linear,weight,"[320, 768]",245760,177152,0.27916663885116577
10
+ 25,bert.encoder.layer.1.attention.self.value,Linear,weight,"[320, 768]",245760,166912,0.32083332538604736
11
+ 27,bert.encoder.layer.1.attention.output.dense,Linear,weight,"[768, 320]",245760,167936,0.3166666030883789
12
+ 31,bert.encoder.layer.1.intermediate.dense,Linear,weight,"[315, 768]",241920,241920,0.0
13
+ 33,bert.encoder.layer.1.output.dense,Linear,weight,"[768, 315]",241920,241920,0.0
14
+ 37,bert.encoder.layer.2.attention.self.query,Linear,weight,"[576, 768]",442368,285696,0.3541666865348816
15
+ 39,bert.encoder.layer.2.attention.self.key,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
16
+ 41,bert.encoder.layer.2.attention.self.value,Linear,weight,"[576, 768]",442368,226304,0.4884259104728699
17
+ 43,bert.encoder.layer.2.attention.output.dense,Linear,weight,"[768, 576]",442368,237568,0.4629629850387573
18
+ 47,bert.encoder.layer.2.intermediate.dense,Linear,weight,"[339, 768]",260352,260352,0.0
19
+ 49,bert.encoder.layer.2.output.dense,Linear,weight,"[768, 339]",260352,260352,0.0
20
+ 53,bert.encoder.layer.3.attention.self.query,Linear,weight,"[576, 768]",442368,277504,0.37268519401550293
21
+ 55,bert.encoder.layer.3.attention.self.key,Linear,weight,"[576, 768]",442368,303104,0.31481480598449707
22
+ 57,bert.encoder.layer.3.attention.self.value,Linear,weight,"[576, 768]",442368,297984,0.3263888955116272
23
+ 59,bert.encoder.layer.3.attention.output.dense,Linear,weight,"[768, 576]",442368,308224,0.30324071645736694
24
+ 63,bert.encoder.layer.3.intermediate.dense,Linear,weight,"[368, 768]",282624,282624,0.0
25
+ 65,bert.encoder.layer.3.output.dense,Linear,weight,"[768, 368]",282624,282624,0.0
26
+ 69,bert.encoder.layer.4.attention.self.query,Linear,weight,"[576, 768]",442368,291840,0.3402777910232544
27
+ 71,bert.encoder.layer.4.attention.self.key,Linear,weight,"[576, 768]",442368,310272,0.2986111044883728
28
+ 73,bert.encoder.layer.4.attention.self.value,Linear,weight,"[576, 768]",442368,272384,0.38425928354263306
29
+ 75,bert.encoder.layer.4.attention.output.dense,Linear,weight,"[768, 576]",442368,263168,0.40509259700775146
30
+ 79,bert.encoder.layer.4.intermediate.dense,Linear,weight,"[386, 768]",296448,296448,0.0
31
+ 81,bert.encoder.layer.4.output.dense,Linear,weight,"[768, 386]",296448,296448,0.0
32
+ 85,bert.encoder.layer.5.attention.self.query,Linear,weight,"[384, 768]",294912,171008,0.4201388955116272
33
+ 87,bert.encoder.layer.5.attention.self.key,Linear,weight,"[384, 768]",294912,205824,0.3020833134651184
34
+ 89,bert.encoder.layer.5.attention.self.value,Linear,weight,"[384, 768]",294912,217088,0.2638888955116272
35
+ 91,bert.encoder.layer.5.attention.output.dense,Linear,weight,"[768, 384]",294912,223232,0.243055522441864
36
+ 95,bert.encoder.layer.5.intermediate.dense,Linear,weight,"[336, 768]",258048,258048,0.0
37
+ 97,bert.encoder.layer.5.output.dense,Linear,weight,"[768, 336]",258048,258048,0.0
38
+ 101,bert.encoder.layer.6.attention.self.query,Linear,weight,"[448, 768]",344064,192512,0.4404761791229248
39
+ 103,bert.encoder.layer.6.attention.self.key,Linear,weight,"[448, 768]",344064,224256,0.3482142686843872
40
+ 105,bert.encoder.layer.6.attention.self.value,Linear,weight,"[448, 768]",344064,209920,0.3898809552192688
41
+ 107,bert.encoder.layer.6.attention.output.dense,Linear,weight,"[768, 448]",344064,199680,0.4196428656578064
42
+ 111,bert.encoder.layer.6.intermediate.dense,Linear,weight,"[280, 768]",215040,215040,0.0
43
+ 113,bert.encoder.layer.6.output.dense,Linear,weight,"[768, 280]",215040,215040,0.0
44
+ 117,bert.encoder.layer.7.attention.self.query,Linear,weight,"[448, 768]",344064,201728,0.413690447807312
45
+ 119,bert.encoder.layer.7.attention.self.key,Linear,weight,"[448, 768]",344064,237568,0.3095238208770752
46
+ 121,bert.encoder.layer.7.attention.self.value,Linear,weight,"[448, 768]",344064,218112,0.3660714030265808
47
+ 123,bert.encoder.layer.7.attention.output.dense,Linear,weight,"[768, 448]",344064,202752,0.4107142686843872
48
+ 127,bert.encoder.layer.7.intermediate.dense,Linear,weight,"[211, 768]",162048,162048,0.0
49
+ 129,bert.encoder.layer.7.output.dense,Linear,weight,"[768, 211]",162048,162048,0.0
50
+ 133,bert.encoder.layer.8.attention.self.query,Linear,weight,"[448, 768]",344064,186368,0.4583333134651184
51
+ 135,bert.encoder.layer.8.attention.self.key,Linear,weight,"[448, 768]",344064,197632,0.425595223903656
52
+ 137,bert.encoder.layer.8.attention.self.value,Linear,weight,"[448, 768]",344064,154624,0.550595223903656
53
+ 139,bert.encoder.layer.8.attention.output.dense,Linear,weight,"[768, 448]",344064,148480,0.5684523582458496
54
+ 143,bert.encoder.layer.8.intermediate.dense,Linear,weight,"[108, 768]",82944,82944,0.0
55
+ 145,bert.encoder.layer.8.output.dense,Linear,weight,"[768, 108]",82944,82944,0.0
56
+ 149,bert.encoder.layer.9.attention.self.query,Linear,weight,"[320, 768]",245760,144384,0.41249996423721313
57
+ 151,bert.encoder.layer.9.attention.self.key,Linear,weight,"[320, 768]",245760,155648,0.36666661500930786
58
+ 153,bert.encoder.layer.9.attention.self.value,Linear,weight,"[320, 768]",245760,63488,0.7416666746139526
59
+ 155,bert.encoder.layer.9.attention.output.dense,Linear,weight,"[768, 320]",245760,65536,0.7333333492279053
60
+ 159,bert.encoder.layer.9.intermediate.dense,Linear,weight,"[53, 768]",40704,40704,5.960464477539063e-08
61
+ 161,bert.encoder.layer.9.output.dense,Linear,weight,"[768, 53]",40704,40704,5.960464477539063e-08
62
+ 165,bert.encoder.layer.10.attention.self.query,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
63
+ 167,bert.encoder.layer.10.attention.self.key,Linear,weight,"[384, 768]",294912,158720,0.461805522441864
64
+ 169,bert.encoder.layer.10.attention.self.value,Linear,weight,"[384, 768]",294912,77824,0.7361111044883728
65
+ 171,bert.encoder.layer.10.attention.output.dense,Linear,weight,"[768, 384]",294912,78848,0.7326388955116272
66
+ 175,bert.encoder.layer.10.intermediate.dense,Linear,weight,"[86, 768]",66048,66048,0.0
67
+ 177,bert.encoder.layer.10.output.dense,Linear,weight,"[768, 86]",66048,66048,0.0
68
+ 181,bert.encoder.layer.11.attention.self.query,Linear,weight,"[384, 768]",294912,107520,0.6354166269302368
69
+ 183,bert.encoder.layer.11.attention.self.key,Linear,weight,"[384, 768]",294912,118784,0.5972222089767456
70
+ 185,bert.encoder.layer.11.attention.self.value,Linear,weight,"[384, 768]",294912,62464,0.7881944179534912
71
+ 187,bert.encoder.layer.11.attention.output.dense,Linear,weight,"[768, 384]",294912,54272,0.8159722089767456
72
+ 191,bert.encoder.layer.11.intermediate.dense,Linear,weight,"[105, 768]",80640,80640,0.0
73
+ 193,bert.encoder.layer.11.output.dense,Linear,weight,"[768, 105]",80640,80640,0.0
XP_linear_layer_sparsity_20M_params_33.64_sparsity.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | | layer_id | layer_type | param_type | shape | nparam | nnz | sparsity |
2
+ |----:|:---------------------------------------------|:-------------|:-------------|:-----------|---------:|-------:|------------:|
3
+ | 5 | bert.encoder.layer.0.attention.self.query | Linear | weight | [320, 768] | 245760 | 135168 | 0.45 |
4
+ | 7 | bert.encoder.layer.0.attention.self.key | Linear | weight | [320, 768] | 245760 | 149504 | 0.391667 |
5
+ | 9 | bert.encoder.layer.0.attention.self.value | Linear | weight | [320, 768] | 245760 | 173056 | 0.295833 |
6
+ | 11 | bert.encoder.layer.0.attention.output.dense | Linear | weight | [768, 320] | 245760 | 181248 | 0.2625 |
7
+ | 15 | bert.encoder.layer.0.intermediate.dense | Linear | weight | [185, 768] | 142080 | 142080 | 0 |
8
+ | 17 | bert.encoder.layer.0.output.dense | Linear | weight | [768, 185] | 142080 | 142080 | 0 |
9
+ | 21 | bert.encoder.layer.1.attention.self.query | Linear | weight | [320, 768] | 245760 | 175104 | 0.2875 |
10
+ | 23 | bert.encoder.layer.1.attention.self.key | Linear | weight | [320, 768] | 245760 | 177152 | 0.279167 |
11
+ | 25 | bert.encoder.layer.1.attention.self.value | Linear | weight | [320, 768] | 245760 | 166912 | 0.320833 |
12
+ | 27 | bert.encoder.layer.1.attention.output.dense | Linear | weight | [768, 320] | 245760 | 167936 | 0.316667 |
13
+ | 31 | bert.encoder.layer.1.intermediate.dense | Linear | weight | [315, 768] | 241920 | 241920 | 0 |
14
+ | 33 | bert.encoder.layer.1.output.dense | Linear | weight | [768, 315] | 241920 | 241920 | 0 |
15
+ | 37 | bert.encoder.layer.2.attention.self.query | Linear | weight | [576, 768] | 442368 | 285696 | 0.354167 |
16
+ | 39 | bert.encoder.layer.2.attention.self.key | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
17
+ | 41 | bert.encoder.layer.2.attention.self.value | Linear | weight | [576, 768] | 442368 | 226304 | 0.488426 |
18
+ | 43 | bert.encoder.layer.2.attention.output.dense | Linear | weight | [768, 576] | 442368 | 237568 | 0.462963 |
19
+ | 47 | bert.encoder.layer.2.intermediate.dense | Linear | weight | [339, 768] | 260352 | 260352 | 0 |
20
+ | 49 | bert.encoder.layer.2.output.dense | Linear | weight | [768, 339] | 260352 | 260352 | 0 |
21
+ | 53 | bert.encoder.layer.3.attention.self.query | Linear | weight | [576, 768] | 442368 | 277504 | 0.372685 |
22
+ | 55 | bert.encoder.layer.3.attention.self.key | Linear | weight | [576, 768] | 442368 | 303104 | 0.314815 |
23
+ | 57 | bert.encoder.layer.3.attention.self.value | Linear | weight | [576, 768] | 442368 | 297984 | 0.326389 |
24
+ | 59 | bert.encoder.layer.3.attention.output.dense | Linear | weight | [768, 576] | 442368 | 308224 | 0.303241 |
25
+ | 63 | bert.encoder.layer.3.intermediate.dense | Linear | weight | [368, 768] | 282624 | 282624 | 0 |
26
+ | 65 | bert.encoder.layer.3.output.dense | Linear | weight | [768, 368] | 282624 | 282624 | 0 |
27
+ | 69 | bert.encoder.layer.4.attention.self.query | Linear | weight | [576, 768] | 442368 | 291840 | 0.340278 |
28
+ | 71 | bert.encoder.layer.4.attention.self.key | Linear | weight | [576, 768] | 442368 | 310272 | 0.298611 |
29
+ | 73 | bert.encoder.layer.4.attention.self.value | Linear | weight | [576, 768] | 442368 | 272384 | 0.384259 |
30
+ | 75 | bert.encoder.layer.4.attention.output.dense | Linear | weight | [768, 576] | 442368 | 263168 | 0.405093 |
31
+ | 79 | bert.encoder.layer.4.intermediate.dense | Linear | weight | [386, 768] | 296448 | 296448 | 0 |
32
+ | 81 | bert.encoder.layer.4.output.dense | Linear | weight | [768, 386] | 296448 | 296448 | 0 |
33
+ | 85 | bert.encoder.layer.5.attention.self.query | Linear | weight | [384, 768] | 294912 | 171008 | 0.420139 |
34
+ | 87 | bert.encoder.layer.5.attention.self.key | Linear | weight | [384, 768] | 294912 | 205824 | 0.302083 |
35
+ | 89 | bert.encoder.layer.5.attention.self.value | Linear | weight | [384, 768] | 294912 | 217088 | 0.263889 |
36
+ | 91 | bert.encoder.layer.5.attention.output.dense | Linear | weight | [768, 384] | 294912 | 223232 | 0.243056 |
37
+ | 95 | bert.encoder.layer.5.intermediate.dense | Linear | weight | [336, 768] | 258048 | 258048 | 0 |
38
+ | 97 | bert.encoder.layer.5.output.dense | Linear | weight | [768, 336] | 258048 | 258048 | 0 |
39
+ | 101 | bert.encoder.layer.6.attention.self.query | Linear | weight | [448, 768] | 344064 | 192512 | 0.440476 |
40
+ | 103 | bert.encoder.layer.6.attention.self.key | Linear | weight | [448, 768] | 344064 | 224256 | 0.348214 |
41
+ | 105 | bert.encoder.layer.6.attention.self.value | Linear | weight | [448, 768] | 344064 | 209920 | 0.389881 |
42
+ | 107 | bert.encoder.layer.6.attention.output.dense | Linear | weight | [768, 448] | 344064 | 199680 | 0.419643 |
43
+ | 111 | bert.encoder.layer.6.intermediate.dense | Linear | weight | [280, 768] | 215040 | 215040 | 0 |
44
+ | 113 | bert.encoder.layer.6.output.dense | Linear | weight | [768, 280] | 215040 | 215040 | 0 |
45
+ | 117 | bert.encoder.layer.7.attention.self.query | Linear | weight | [448, 768] | 344064 | 201728 | 0.41369 |
46
+ | 119 | bert.encoder.layer.7.attention.self.key | Linear | weight | [448, 768] | 344064 | 237568 | 0.309524 |
47
+ | 121 | bert.encoder.layer.7.attention.self.value | Linear | weight | [448, 768] | 344064 | 218112 | 0.366071 |
48
+ | 123 | bert.encoder.layer.7.attention.output.dense | Linear | weight | [768, 448] | 344064 | 202752 | 0.410714 |
49
+ | 127 | bert.encoder.layer.7.intermediate.dense | Linear | weight | [211, 768] | 162048 | 162048 | 0 |
50
+ | 129 | bert.encoder.layer.7.output.dense | Linear | weight | [768, 211] | 162048 | 162048 | 0 |
51
+ | 133 | bert.encoder.layer.8.attention.self.query | Linear | weight | [448, 768] | 344064 | 186368 | 0.458333 |
52
+ | 135 | bert.encoder.layer.8.attention.self.key | Linear | weight | [448, 768] | 344064 | 197632 | 0.425595 |
53
+ | 137 | bert.encoder.layer.8.attention.self.value | Linear | weight | [448, 768] | 344064 | 154624 | 0.550595 |
54
+ | 139 | bert.encoder.layer.8.attention.output.dense | Linear | weight | [768, 448] | 344064 | 148480 | 0.568452 |
55
+ | 143 | bert.encoder.layer.8.intermediate.dense | Linear | weight | [108, 768] | 82944 | 82944 | 0 |
56
+ | 145 | bert.encoder.layer.8.output.dense | Linear | weight | [768, 108] | 82944 | 82944 | 0 |
57
+ | 149 | bert.encoder.layer.9.attention.self.query | Linear | weight | [320, 768] | 245760 | 144384 | 0.4125 |
58
+ | 151 | bert.encoder.layer.9.attention.self.key | Linear | weight | [320, 768] | 245760 | 155648 | 0.366667 |
59
+ | 153 | bert.encoder.layer.9.attention.self.value | Linear | weight | [320, 768] | 245760 | 63488 | 0.741667 |
60
+ | 155 | bert.encoder.layer.9.attention.output.dense | Linear | weight | [768, 320] | 245760 | 65536 | 0.733333 |
61
+ | 159 | bert.encoder.layer.9.intermediate.dense | Linear | weight | [53, 768] | 40704 | 40704 | 5.96046e-08 |
62
+ | 161 | bert.encoder.layer.9.output.dense | Linear | weight | [768, 53] | 40704 | 40704 | 5.96046e-08 |
63
+ | 165 | bert.encoder.layer.10.attention.self.query | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
64
+ | 167 | bert.encoder.layer.10.attention.self.key | Linear | weight | [384, 768] | 294912 | 158720 | 0.461806 |
65
+ | 169 | bert.encoder.layer.10.attention.self.value | Linear | weight | [384, 768] | 294912 | 77824 | 0.736111 |
66
+ | 171 | bert.encoder.layer.10.attention.output.dense | Linear | weight | [768, 384] | 294912 | 78848 | 0.732639 |
67
+ | 175 | bert.encoder.layer.10.intermediate.dense | Linear | weight | [86, 768] | 66048 | 66048 | 0 |
68
+ | 177 | bert.encoder.layer.10.output.dense | Linear | weight | [768, 86] | 66048 | 66048 | 0 |
69
+ | 181 | bert.encoder.layer.11.attention.self.query | Linear | weight | [384, 768] | 294912 | 107520 | 0.635417 |
70
+ | 183 | bert.encoder.layer.11.attention.self.key | Linear | weight | [384, 768] | 294912 | 118784 | 0.597222 |
71
+ | 185 | bert.encoder.layer.11.attention.self.value | Linear | weight | [384, 768] | 294912 | 62464 | 0.788194 |
72
+ | 187 | bert.encoder.layer.11.attention.output.dense | Linear | weight | [768, 384] | 294912 | 54272 | 0.815972 |
73
+ | 191 | bert.encoder.layer.11.intermediate.dense | Linear | weight | [105, 768] | 80640 | 80640 | 0 |
74
+ | 193 | bert.encoder.layer.11.output.dense | Linear | weight | [768, 105] | 80640 | 80640 | 0 |
all_results.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eval_exact_match": 78.52412488174078,
3
+ "eval_f1": 86.41375972267356,
4
+ "eval_samples": 10784
5
+ }
eval_XP_results.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eval_exact_match": 78.52412488174078,
3
+ "eval_f1": 86.41375972267356,
4
+ "eval_samples": 10784
5
+ }
eval_nbest_predictions.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c20dd1cd87aef1f85546fe8ca84d29cc07b2c058d81cc83b86fb02cee38c166b
3
+ size 48940269
eval_predictions.json ADDED
The diff for this file is too large to render. See raw diff