diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a4352a20389838a178e8ad7ec30f479cae1b09d8 --- /dev/null +++ b/README.md @@ -0,0 +1,92 @@ +--- +language: en +thumbnail: +license: mit +tags: +- question-answering +- bert +- bert-base +datasets: +- squad +metrics: +- squad +widget: +- text: "Where is the Eiffel Tower located?" + context: "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower." +- text: "Who is Frederic Chopin?" + context: "Frédéric François Chopin, born Fryderyk Franciszek Chopin (1 March 1810 – 17 October 1849), was a Polish composer and virtuoso pianist of the Romantic era who wrote primarily for solo piano." +--- + +## BERT-base uncased model fine-tuned on SQuAD v1 + +This model is block sparse: the **linear** layers contain **7.5%** of the original weights. + + +The model contains **28.2%** of the original weights **overall**. + +The training uses a modified version of Victor Sanh's [Movement Pruning](https://arxiv.org/abs/2005.07683) method. + +That means that with the [block-sparse](https://github.com/huggingface/pytorch_block_sparse) runtime it ran **1.92x** faster than a dense network on the evaluation, at the price of some impact on the accuracy (see below). + + + +This model was fine-tuned from the HuggingFace [BERT](https://www.aclweb.org/anthology/N19-1423/) base uncased checkpoint on [SQuAD1.1](https://rajpurkar.github.io/SQuAD-explorer), and distilled from the equivalent model [csarron/bert-base-uncased-squad-v1](https://huggingface.co/csarron/bert-base-uncased-squad-v1). +This model is case-insensitive: it does not make a difference between english and English. + +## Pruning details +A side-effect of the block pruning is that some of the attention heads are completely removed: 106 heads were removed out of a total of 144 (73.6%). 
+ +Here is a detailed view of how the remaining heads are distributed in the network after pruning. + +![Pruning details](https://huggingface.co/madlag/bert-base-uncased-squad1.1-block-sparse-0.07-v1/raw/main/model_card/pruning.svg) + +## Density plot + + + +## Details + +| Dataset | Split | # samples | +| -------- | ----- | --------- | +| SQuAD1.1 | train | 90.6K | +| SQuAD1.1 | eval | 11.1K | + +### Fine-tuning +- Python: `3.8.5` + +- Machine specs: + +```CPU: Intel(R) Core(TM) i7-6700K CPU +Memory: 64 GiB +GPUs: 1 GeForce RTX 3090, with 24GiB memory +GPU driver: 455.23.05, CUDA: 11.1 +``` + + +### Results + +**PyTorch model file size**: `335M` (original BERT: `438M`) + +| Metric | # Value | # Original ([Table 2](https://www.aclweb.org/anthology/N19-1423.pdf))| +| ------ | --------- | --------- | +| **EM** | **71.88** | **80.8** | +| **F1** | **81.36** | **88.5** | + +## Example Usage + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="madlag/bert-base-uncased-squad1.1-block-sparse-0.07-v1", + tokenizer="madlag/bert-base-uncased-squad1.1-block-sparse-0.07-v1" +) + +predictions = qa_pipeline({ + 'context': "Frédéric François Chopin, born Fryderyk Franciszek Chopin (1 March 1810 – 17 October 1849), was a Polish composer and virtuoso pianist of the Romantic era who wrote primarily for solo piano.", + 'question': "Who is Frederic Chopin?", +}) + +print(predictions) +``` \ No newline at end of file diff --git a/model_card/layer_images/layer_0_attention_output_dense.png b/model_card/layer_images/layer_0_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..5542d2abb6b634b586d8c423ca6d8a730edbcb35 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_0_attention_self_key.png b/model_card/layer_images/layer_0_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..083d82e2261c15136e930a1eddcb3d308760393b Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_key.png differ diff --git a/model_card/layer_images/layer_0_attention_self_query.png b/model_card/layer_images/layer_0_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..ef98a72b371f1370fba836882f260289c5ddeda8 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_query.png differ diff --git a/model_card/layer_images/layer_0_attention_self_value.png b/model_card/layer_images/layer_0_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..7a0d52dddc228ad3ccda0be5df777dd3a2950b65 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_value.png differ diff --git a/model_card/layer_images/layer_0_intermediate_dense.png b/model_card/layer_images/layer_0_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d2cc4105c914e6806033d1a7d43f9dc9d46ed920 Binary files /dev/null and b/model_card/layer_images/layer_0_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_0_output_dense.png b/model_card/layer_images/layer_0_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..cadf13e767f7b37252f8df06724b4e6918b58517 Binary files /dev/null and b/model_card/layer_images/layer_0_output_dense.png differ diff --git a/model_card/layer_images/layer_10_attention_output_dense.png b/model_card/layer_images/layer_10_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..2beb1e6a675ad28e61a68d65cb38deb974def2b7 Binary files /dev/null and b/model_card/layer_images/layer_10_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_10_attention_self_key.png b/model_card/layer_images/layer_10_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..6719d2c06ba5630d302ae27c6f5c0eabea37aff3 Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_key.png differ diff --git a/model_card/layer_images/layer_10_attention_self_query.png b/model_card/layer_images/layer_10_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..a899eeb66f58419e0c74bf54a03352dfe1eff92d Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_query.png differ diff --git a/model_card/layer_images/layer_10_attention_self_value.png b/model_card/layer_images/layer_10_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..fbb371249d798290f1a22fe67db7831eb1a804cb Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_value.png differ diff --git a/model_card/layer_images/layer_10_intermediate_dense.png b/model_card/layer_images/layer_10_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f750c2c3086a60deb876c72d1218452c4e1d2cd1 Binary files /dev/null and b/model_card/layer_images/layer_10_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_10_output_dense.png b/model_card/layer_images/layer_10_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..742ca056ca3a7fa3d3429f8a0a8accdf9c578de8 Binary files /dev/null and b/model_card/layer_images/layer_10_output_dense.png differ diff --git a/model_card/layer_images/layer_11_attention_output_dense.png b/model_card/layer_images/layer_11_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d940c82133dfcb0f4f77edefad9c7fa75d919f65 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_11_attention_self_key.png b/model_card/layer_images/layer_11_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..e20616ea539eae802fdb2668fbcd3debab99914b Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_key.png differ diff --git a/model_card/layer_images/layer_11_attention_self_query.png b/model_card/layer_images/layer_11_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..f05277824fe8b9d2e778552243f19805d36c6b16 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_query.png differ diff --git a/model_card/layer_images/layer_11_attention_self_value.png b/model_card/layer_images/layer_11_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..05023ca12f32c3c6f84e9a6cf68c68643ad56620 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_value.png differ diff --git a/model_card/layer_images/layer_11_intermediate_dense.png b/model_card/layer_images/layer_11_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..12824cb636d1164cd6c994239f0e4fbcf90fc904 Binary files /dev/null and b/model_card/layer_images/layer_11_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_11_output_dense.png b/model_card/layer_images/layer_11_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..cb3cf5077d208b1573215d3c206b4a55c1fe9719 Binary files /dev/null and b/model_card/layer_images/layer_11_output_dense.png differ diff --git a/model_card/layer_images/layer_1_attention_output_dense.png b/model_card/layer_images/layer_1_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..07f29296c4c7342e54e42313fe09b7d8f5b36268 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_1_attention_self_key.png b/model_card/layer_images/layer_1_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..11a9d9341669b1deda3370396253b875f545330b Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_key.png differ diff --git a/model_card/layer_images/layer_1_attention_self_query.png b/model_card/layer_images/layer_1_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..742020a025d74f7f5afe70463079d6c2af97abe0 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_query.png differ diff --git a/model_card/layer_images/layer_1_attention_self_value.png b/model_card/layer_images/layer_1_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..7a03f304dc9333ace6275415c9613b61c3039554 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_value.png differ diff --git a/model_card/layer_images/layer_1_intermediate_dense.png b/model_card/layer_images/layer_1_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..4a34a6bd7af70297587501a2d12f5dd609edd858 Binary files /dev/null and b/model_card/layer_images/layer_1_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_1_output_dense.png b/model_card/layer_images/layer_1_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..1c9610ea242bf2a2cd6310ee7a5ff9827c852f62 Binary files /dev/null and b/model_card/layer_images/layer_1_output_dense.png differ diff --git a/model_card/layer_images/layer_2_attention_output_dense.png b/model_card/layer_images/layer_2_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f047b23c7cb80511ee15a4f83886565680275127 Binary files /dev/null and b/model_card/layer_images/layer_2_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_2_attention_self_key.png b/model_card/layer_images/layer_2_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..053ed60f5876d020afa9963ec12d12b3e410cf6c Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_key.png differ diff --git a/model_card/layer_images/layer_2_attention_self_query.png b/model_card/layer_images/layer_2_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..b69f2acf4840bf50d650c13060d4bd09e3d44279 Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_query.png differ diff --git a/model_card/layer_images/layer_2_attention_self_value.png b/model_card/layer_images/layer_2_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..896fe9e2a84f25eae11638d240070d2f9c8ba3ae Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_value.png differ diff --git a/model_card/layer_images/layer_2_intermediate_dense.png b/model_card/layer_images/layer_2_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..25eb54c0bffaca16ca29919802a25e701dcd7772 Binary files /dev/null and b/model_card/layer_images/layer_2_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_2_output_dense.png b/model_card/layer_images/layer_2_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..a0f2d18bf4343950e8bb82317988796cd89dceb1 Binary files /dev/null and b/model_card/layer_images/layer_2_output_dense.png differ diff --git a/model_card/layer_images/layer_3_attention_output_dense.png b/model_card/layer_images/layer_3_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..880c018483e6d018da26afccbc2fbe8c342b7f28 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_3_attention_self_key.png b/model_card/layer_images/layer_3_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..3d5b3afa2afb77f80c4314fcf66ddc87fecc81c5 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_key.png differ diff --git a/model_card/layer_images/layer_3_attention_self_query.png b/model_card/layer_images/layer_3_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..12e8826d8e3f2853424a3a7d0e100877cf13f4f9 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_query.png differ diff --git a/model_card/layer_images/layer_3_attention_self_value.png b/model_card/layer_images/layer_3_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..f5398fc1737ca8285c8fd61447eb18a9b2929668 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_value.png differ diff --git a/model_card/layer_images/layer_3_intermediate_dense.png b/model_card/layer_images/layer_3_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..fda4707ccbd89f10094ac428c972709294ec6d0d Binary files /dev/null and b/model_card/layer_images/layer_3_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_3_output_dense.png b/model_card/layer_images/layer_3_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..640fdcf95a74de34185e28646863bbb9a07b379a Binary files /dev/null and b/model_card/layer_images/layer_3_output_dense.png differ diff --git a/model_card/layer_images/layer_4_attention_output_dense.png b/model_card/layer_images/layer_4_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..b3352a845957c1f60b921a51dedef4041666a908 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_4_attention_self_key.png b/model_card/layer_images/layer_4_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..35d08f3d6a17dec3e62d4635716673d4182f9b79 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_key.png differ diff --git a/model_card/layer_images/layer_4_attention_self_query.png b/model_card/layer_images/layer_4_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..7bc934db00decec909f88c11c962292ff85dd149 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_query.png differ diff --git a/model_card/layer_images/layer_4_attention_self_value.png b/model_card/layer_images/layer_4_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..afd2252f0419791c97f4b4298de49b1a15990f33 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_value.png differ diff --git a/model_card/layer_images/layer_4_intermediate_dense.png b/model_card/layer_images/layer_4_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..ff1f9a7f97fa367241ab72582d1813adcd625ad3 Binary files /dev/null and b/model_card/layer_images/layer_4_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_4_output_dense.png b/model_card/layer_images/layer_4_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..510b70e53b1f42b0cb73736dcf94358efba95194 Binary files /dev/null and b/model_card/layer_images/layer_4_output_dense.png differ diff --git a/model_card/layer_images/layer_5_attention_output_dense.png b/model_card/layer_images/layer_5_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..e2fb63949fcb1d11da0950c866ae8c6dd8dc770c Binary files /dev/null and b/model_card/layer_images/layer_5_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_5_attention_self_key.png b/model_card/layer_images/layer_5_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..6eca4a533ea8e1e4dbee7367b3ea415a9c9e75b0 Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_key.png differ diff --git a/model_card/layer_images/layer_5_attention_self_query.png b/model_card/layer_images/layer_5_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..3ea8cdc8ce64887146d00f61551a7cac111a3ffd Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_query.png differ diff --git a/model_card/layer_images/layer_5_attention_self_value.png b/model_card/layer_images/layer_5_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..c700403adfa766eb02d4ef362bbab38a21be90b0 Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_value.png differ diff --git a/model_card/layer_images/layer_5_intermediate_dense.png b/model_card/layer_images/layer_5_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f3efe38e22669260b5df8abd00b5321d27468e25 Binary files /dev/null and b/model_card/layer_images/layer_5_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_5_output_dense.png b/model_card/layer_images/layer_5_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..4b30b1c7eec1c0fe5224f0d9dfd593f2648f7f2a Binary files /dev/null and b/model_card/layer_images/layer_5_output_dense.png differ diff --git a/model_card/layer_images/layer_6_attention_output_dense.png b/model_card/layer_images/layer_6_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..eab13b453ac474fe0012467f4fc78f6717948ab5 Binary files /dev/null and b/model_card/layer_images/layer_6_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_6_attention_self_key.png b/model_card/layer_images/layer_6_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..3fc744f0b10c33dc7202bef2decaf17fa257dbff Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_key.png differ diff --git a/model_card/layer_images/layer_6_attention_self_query.png b/model_card/layer_images/layer_6_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..5b6c4156034298a62b4faa7f3313d40068ba54e5 Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_query.png differ diff --git a/model_card/layer_images/layer_6_attention_self_value.png b/model_card/layer_images/layer_6_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..dd92aa0a8949c9621dd0f66d00e957b53132d3c7 Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_value.png differ diff --git a/model_card/layer_images/layer_6_intermediate_dense.png b/model_card/layer_images/layer_6_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..739c971eec8893f442789bae3262db641910c356 Binary files /dev/null and b/model_card/layer_images/layer_6_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_6_output_dense.png b/model_card/layer_images/layer_6_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..accf104e79589e3e30b6850b557651ed9499e749 Binary files /dev/null and b/model_card/layer_images/layer_6_output_dense.png differ diff --git a/model_card/layer_images/layer_7_attention_output_dense.png b/model_card/layer_images/layer_7_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..23978e0ce478694b9be209dabe1e8c436f80389d Binary files /dev/null and b/model_card/layer_images/layer_7_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_7_attention_self_key.png b/model_card/layer_images/layer_7_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..ca38b023489b9f7fb93ebea926991d2f8d670510 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_key.png differ diff --git a/model_card/layer_images/layer_7_attention_self_query.png b/model_card/layer_images/layer_7_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..dc0d573c07d5a54944d90c335be4a4307e3df692 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_query.png differ diff --git a/model_card/layer_images/layer_7_attention_self_value.png b/model_card/layer_images/layer_7_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..5158a308c93c2fcc95608696cf1efe7da163ce33 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_value.png differ diff --git a/model_card/layer_images/layer_7_intermediate_dense.png b/model_card/layer_images/layer_7_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..0970e6d6444a7502f6a36afca8f24e7fe68c5cae Binary files /dev/null and b/model_card/layer_images/layer_7_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_7_output_dense.png b/model_card/layer_images/layer_7_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..8ed55aa33e9488551658545b8bb50b8d43420263 Binary files /dev/null and b/model_card/layer_images/layer_7_output_dense.png differ diff --git a/model_card/layer_images/layer_8_attention_output_dense.png b/model_card/layer_images/layer_8_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d1b9d8a95776c62331992e91795e86cf0abceb51 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_8_attention_self_key.png b/model_card/layer_images/layer_8_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..4582d00b6850b295c6a4bc36c99dfae5a32de2b8 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_key.png differ diff --git a/model_card/layer_images/layer_8_attention_self_query.png b/model_card/layer_images/layer_8_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..a2c437788c571107b938e2954d6d0728c1338359 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_query.png differ diff --git a/model_card/layer_images/layer_8_attention_self_value.png b/model_card/layer_images/layer_8_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..0b3c55845f22eb1b1cb73cedb542cb81370e9407 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_value.png differ diff --git a/model_card/layer_images/layer_8_intermediate_dense.png b/model_card/layer_images/layer_8_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f63185bdbd8685fb331a117b63a172d0980d30b2 Binary files /dev/null and b/model_card/layer_images/layer_8_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_8_output_dense.png b/model_card/layer_images/layer_8_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..78bab33d84125868d848f53e15f6884f54be621a Binary files /dev/null and b/model_card/layer_images/layer_8_output_dense.png differ diff --git a/model_card/layer_images/layer_9_attention_output_dense.png b/model_card/layer_images/layer_9_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..3488ff14566004b8e5f69bda95b95fd759a70225 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_9_attention_self_key.png b/model_card/layer_images/layer_9_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..2a6fbe2cdcd09c51f4b2fbea2cf42038b7a4e5d7 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_key.png differ diff --git a/model_card/layer_images/layer_9_attention_self_query.png b/model_card/layer_images/layer_9_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..48cd9153045b0cf6a0093aae3a3d886654bbfc77 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_query.png differ diff --git a/model_card/layer_images/layer_9_attention_self_value.png b/model_card/layer_images/layer_9_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..9a1ecfd5a3044904ad807d66f8634c2213cd36fc Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_value.png differ diff --git a/model_card/layer_images/layer_9_intermediate_dense.png b/model_card/layer_images/layer_9_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d2aed47501093f18f3e2797d7b34305f58737c83 Binary files /dev/null and b/model_card/layer_images/layer_9_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_9_output_dense.png b/model_card/layer_images/layer_9_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f25956ec64aa33a34c5b225686439e9cdc023ff4 Binary files /dev/null and b/model_card/layer_images/layer_9_output_dense.png differ diff --git a/model_card/pruning.svg b/model_card/pruning.svg new file mode 100644 index 0000000000000000000000000000000000000000..dfd0e23f16827bf8be80d966f500efffdc3f0f6e --- /dev/null +++ b/model_card/pruning.svg @@ -0,0 +1 @@ +32342425343391098108107989901234567891011024681012prunedactivePruned Transformer HeadsLayer indexHeads count \ No newline at end of file diff --git a/model_meta.json b/model_meta.json new file mode 100644 index 0000000000000000000000000000000000000000..304f0a25d887d368d5d46b4ea329579ba63d53e6 --- 
/dev/null +++ b/model_meta.json @@ -0,0 +1,160 @@ +{ + "args": { + "adam_epsilon": 1e-08, + "alpha_ce": 0.1, + "alpha_distil": 0.9, + "ampere_learning_rate": 0.01, + "ampere_mask_init": "constant", + "ampere_mask_scale": 0.0, + "ampere_pruning_method": "disabled", + "cache_dir": "", + "config_name": "", + "data_dir": "squad_data", + "do_eval": true, + "do_lower_case": true, + "do_train": true, + "doc_stride": 128, + "eval_all_checkpoints": true, + "eval_batch_size": 16, + "evaluate_during_training": false, + "final_ampere_temperature": 20, + "final_lambda": 200, + "final_shuffling_temperature": 20, + "final_threshold": 0.1, + "final_warmup": 10, + "fp16": false, + "fp16_opt_level": "O1", + "global_topk": false, + "global_topk_frequency_compute": 25, + "gradient_accumulation_steps": 1, + "in_shuffling_group": 4, + "initial_ampere_temperature": 0.0, + "initial_shuffling_temperature": 0.1, + "initial_threshold": 0.0, + "initial_warmup": 1, + "lang_id": 0, + "learning_rate": 3e-05, + "local_rank": -1, + "logging_steps": 500, + "mask_block_cols": 32, + "mask_block_rows": 32, + "mask_init": "constant", + "mask_scale": 0.0, + "mask_scores_learning_rate": 0.01, + "max_answer_length": 30, + "max_grad_norm": 1.0, + "max_query_length": 64, + "max_seq_length": 384, + "max_steps": -1, + "model_name_or_path": "bert-base-uncased", + "model_type": "masked_bert", + "n_best_size": 20, + "n_gpu": 1, + "no_cuda": false, + "null_score_diff_threshold": 0.0, + "num_train_epochs": 20.0, + "out_shuffling_group": 4, + "overwrite_cache": false, + "overwrite_output_dir": true, + "per_gpu_eval_batch_size": 16, + "per_gpu_train_batch_size": 16, + "predict_file": "dev-v1.1.json", + "pruning_method": "sigmoied_threshold", + "pruning_submethod": "default", + "regularization": "l1", + "save_steps": 5000, + "seed": 42, + "server_ip": "", + "server_port": "", + "shuffling_learning_rate": 0.001, + "shuffling_method": "disabled", + "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1", + 
"teacher_type": "bert", + "temperature": 2.0, + "threads": 8, + "tokenizer_name": "", + "train_batch_size": 16, + "train_file": "train-v1.1.json", + "truncate_train_examples": -1, + "verbose_logging": false, + "version_2_with_negative": false, + "warmup_steps": 5400, + "weight_decay": 0.0 + }, + "config": { + "_name_or_path": "bert-base-uncased", + "ampere_mask_init": "constant", + "ampere_mask_scale": 0.0, + "ampere_pruning_method": "disabled", + "architectures": ["MaskedBertForQuestionAnswering"], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "in_shuffling_group": 4, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "mask_block_cols": 32, + "mask_block_rows": 32, + "mask_init": "constant", + "mask_scale": 0.0, + "max_position_embeddings": 512, + "model_type": "masked_bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "out_shuffling_group": 4, + "pad_token_id": 0, + "pruning_method": "sigmoied_threshold", + "pruning_submethod": "default", + "shuffling_method": "disabled", + "type_vocab_size": 2, + "vocab_size": 30522 + }, + "packaging": { + "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.07-v1", + "model_owner": "madlag", + "pytorch_final_file_size": 352215223 + }, + "performance": { + "dense": { + "eval_elapsed_time": 43.41997216496384 + }, + "pytorch_block_sparse": { + "eval_elapsed_time": 22.587281233048998 + }, + "speedup": 1.922319544214693 + }, + "precision": { + "exact": 71.8826904296875, + "f1": 81.3593978881836 + }, + "sparsity": { + "ampere": false, + "block_size": [32, 32], + "block_sparse": true, + "block_sparse_density": 0.07493007330246913, + "block_sparse_nnz": 6215, + "block_sparse_total": 82944, + "global_density": 0.2823549074092054, + "is_block_sparse_valid": true, + "nnz_parameters": 30913282, + "parameters": 109483778, + "pruned_heads": { + "0": [0, 1, 2, 4, 5, 6, 7, 9, 11], + "1": [0, 1, 2, 3, 5, 6, 7, 8, 
9, 10], + "2": [1, 2, 3, 4, 5, 7, 8, 10, 11], + "3": [2, 3, 4, 6, 7, 9, 10, 11], + "4": [0, 1, 2, 4, 6, 7, 8, 9, 10, 11], + "5": [0, 1, 2, 4, 5, 6, 7, 11], + "6": [0, 1, 2, 3, 4, 5, 6, 7, 10, 11], + "7": [1, 2, 3, 5, 6, 7, 11], + "8": [0, 1, 2, 3, 4, 5, 6, 7, 8], + "9": [1, 3, 4, 5, 7, 9, 10, 11], + "10": [0, 1, 2, 4, 5, 6, 7, 8, 9], + "11": [0, 2, 3, 5, 7, 8, 9, 10, 11] + }, + "total_attention_heads": 144, + "total_pruned_attention_heads": 106 + } +} \ No newline at end of file