diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa97131ddb3bf2125e445c9e472d600c587fc689 --- /dev/null +++ b/README.md @@ -0,0 +1,92 @@ +--- +language: en +thumbnail: +license: mit +tags: +- question-answering +- bert +- bert-base +datasets: +- squad +metrics: +- squad +widget: +- text: "Where is the Eiffel Tower located?" + context: "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower." +- text: "Who is Frederic Chopin?" + context: "Frédéric François Chopin, born Fryderyk Franciszek Chopin (1 March 1810 – 17 October 1849), was a Polish composer and virtuoso pianist of the Romantic era who wrote primarily for solo piano." +--- + +## BERT-base uncased model fine-tuned on SQuAD v1 + +This model is block sparse: the **linear** layers contain **31.7%** of the original weights. + + +The model contains **47.0%** of the original weights **overall**. + +The training uses a modified version of Victor Sanh's [Movement Pruning](https://arxiv.org/abs/2005.07683) method. + +That means that with the [block-sparse](https://github.com/huggingface/pytorch_block_sparse) runtime it ran **1.12x** faster than a dense network on the evaluation, at the price of some impact on the accuracy (see below). + + + +This model was fine-tuned from the HuggingFace [BERT](https://www.aclweb.org/anthology/N19-1423/) base uncased checkpoint on [SQuAD1.1](https://rajpurkar.github.io/SQuAD-explorer), and distilled from the equivalent model [csarron/bert-base-uncased-squad-v1](https://huggingface.co/csarron/bert-base-uncased-squad-v1). +This model is case-insensitive: it does not make a difference between english and English. + +## Pruning details +A side-effect of the block pruning is that some of the attention heads are completely removed: 80 heads were removed out of a total of 144 (55.6%). 
+ +Here is a detailed view of how the remaining heads are distributed in the network after pruning. + +![Pruning details](https://huggingface.co/madlag/bert-base-uncased-squad1.1-block-sparse-0.32-v1/raw/main/model_card/pruning.svg) + +## Density plot + + + +## Details + +| Dataset | Split | # samples | +| -------- | ----- | --------- | +| SQuAD1.1 | train | 90.6K | +| SQuAD1.1 | eval | 11.1K | + +### Fine-tuning +- Python: `3.8.5` + +- Machine specs: + +```CPU: Intel(R) Core(TM) i7-6700K CPU +Memory: 64 GiB +GPUs: 1 GeForce RTX 3090, with 24GiB memory +GPU driver: 455.23.05, CUDA: 11.1 +``` + + +### Results + +**PyTorch model file size**: `355M` (original BERT: `438M`) + +| Metric | # Value | # Original ([Table 2](https://www.aclweb.org/anthology/N19-1423.pdf))| +| ------ | --------- | --------- | +| **EM** | **79.04** | **80.8** | +| **F1** | **86.70** | **88.5** | + +## Example Usage + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="madlag/bert-base-uncased-squad1.1-block-sparse-0.32-v1", + tokenizer="madlag/bert-base-uncased-squad1.1-block-sparse-0.32-v1" +) + +predictions = qa_pipeline({ + 'context': "Frédéric François Chopin, born Fryderyk Franciszek Chopin (1 March 1810 – 17 October 1849), was a Polish composer and virtuoso pianist of the Romantic era who wrote primarily for solo piano.", + 'question': "Who is Frederic Chopin?", +}) + +print(predictions) +``` \ No newline at end of file diff --git a/model_card/layer_images/layer_0_attention_output_dense.png b/model_card/layer_images/layer_0_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d629d9efe14f461694a431e9dea01c6c0062c686 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_0_attention_self_key.png b/model_card/layer_images/layer_0_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..661325ce204a20d8c221d561fcffbb122ce5c811 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_key.png differ diff --git a/model_card/layer_images/layer_0_attention_self_query.png b/model_card/layer_images/layer_0_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..6b135b176eea7bd24ea0febd54f44fd2c0463c96 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_query.png differ diff --git a/model_card/layer_images/layer_0_attention_self_value.png b/model_card/layer_images/layer_0_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..39f8b13a29b10b94d3d86cafcb726cad02a81b67 Binary files /dev/null and b/model_card/layer_images/layer_0_attention_self_value.png differ diff --git a/model_card/layer_images/layer_0_intermediate_dense.png b/model_card/layer_images/layer_0_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..9fe1c2a28ada959c2dceefb6e79be03d9e83b4f1 Binary files /dev/null and b/model_card/layer_images/layer_0_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_0_output_dense.png b/model_card/layer_images/layer_0_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..48ae97e78df7fb9f107ddd7497d2a5b70ae1ef41 Binary files /dev/null and b/model_card/layer_images/layer_0_output_dense.png differ diff --git a/model_card/layer_images/layer_10_attention_output_dense.png b/model_card/layer_images/layer_10_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..6c4db5f8741ea8563f3efe3a2b02b16b4a287f62 Binary files /dev/null and b/model_card/layer_images/layer_10_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_10_attention_self_key.png b/model_card/layer_images/layer_10_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..fa786742ed998ac9924f18eac97af1849875e64d Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_key.png differ diff --git a/model_card/layer_images/layer_10_attention_self_query.png b/model_card/layer_images/layer_10_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..ed83e7995d7b5a5fab3241174b17ec941cd7e900 Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_query.png differ diff --git a/model_card/layer_images/layer_10_attention_self_value.png b/model_card/layer_images/layer_10_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..42b8d12f7adf43daf4ecabe14aa89828c0169710 Binary files /dev/null and b/model_card/layer_images/layer_10_attention_self_value.png differ diff --git a/model_card/layer_images/layer_10_intermediate_dense.png b/model_card/layer_images/layer_10_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..8f3b85dfb401d61c8dc6725f9e1e56b8dfdd6165 Binary files /dev/null and b/model_card/layer_images/layer_10_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_10_output_dense.png b/model_card/layer_images/layer_10_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..a58215f8f9307e46d6c3b4e4bcbe90d19b2e9ac4 Binary files /dev/null and b/model_card/layer_images/layer_10_output_dense.png differ diff --git a/model_card/layer_images/layer_11_attention_output_dense.png b/model_card/layer_images/layer_11_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..5a172e615014df1f5ccba0278b64f7730be7ec52 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_11_attention_self_key.png b/model_card/layer_images/layer_11_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..fdebca896ff68c81bfacdecf4be9a87fcc793817 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_key.png differ diff --git a/model_card/layer_images/layer_11_attention_self_query.png b/model_card/layer_images/layer_11_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..da8ad67c2fe8a405d183f895ddae5d4323474586 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_query.png differ diff --git a/model_card/layer_images/layer_11_attention_self_value.png b/model_card/layer_images/layer_11_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..8c0b19494d464cb7d0fc30d1fd977a81e11298a1 Binary files /dev/null and b/model_card/layer_images/layer_11_attention_self_value.png differ diff --git a/model_card/layer_images/layer_11_intermediate_dense.png b/model_card/layer_images/layer_11_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..725d80ab2b9f4d8d4c736036f4846619317d91c6 Binary files /dev/null and b/model_card/layer_images/layer_11_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_11_output_dense.png b/model_card/layer_images/layer_11_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..4bfd77dc27eb1fc39528e73afd1a4b42edb4b7ab Binary files /dev/null and b/model_card/layer_images/layer_11_output_dense.png differ diff --git a/model_card/layer_images/layer_1_attention_output_dense.png b/model_card/layer_images/layer_1_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..b7a987140ecb227e55aec6b19824b7cea8f2d4b4 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_1_attention_self_key.png b/model_card/layer_images/layer_1_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..d10dca8aa24008caf5faefd09cddcfb61f956614 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_key.png differ diff --git a/model_card/layer_images/layer_1_attention_self_query.png b/model_card/layer_images/layer_1_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..470d8e48b72058f77c8b9972718ef68cab4b9076 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_query.png differ diff --git a/model_card/layer_images/layer_1_attention_self_value.png b/model_card/layer_images/layer_1_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..c06ef97919af13d61d23d5ba028e1183feb8d7e5 Binary files /dev/null and b/model_card/layer_images/layer_1_attention_self_value.png differ diff --git a/model_card/layer_images/layer_1_intermediate_dense.png b/model_card/layer_images/layer_1_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..100401072322cb27cd1ff8374746ad954e9bbaae Binary files /dev/null and b/model_card/layer_images/layer_1_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_1_output_dense.png b/model_card/layer_images/layer_1_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..f07c85a79dc468413a0fd80f7261d7336b158b4a Binary files /dev/null and b/model_card/layer_images/layer_1_output_dense.png differ diff --git a/model_card/layer_images/layer_2_attention_output_dense.png b/model_card/layer_images/layer_2_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..cf048bbe1ad6c23f628fbd80ed09b5869d4f043c Binary files /dev/null and b/model_card/layer_images/layer_2_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_2_attention_self_key.png b/model_card/layer_images/layer_2_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..fb6e112f9b4bd8a7f71d57cec403fddbe6a2e240 Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_key.png differ diff --git a/model_card/layer_images/layer_2_attention_self_query.png b/model_card/layer_images/layer_2_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..0d07732b9ded9dde5f2bd5d1d87fa073e92d73a9 Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_query.png differ diff --git a/model_card/layer_images/layer_2_attention_self_value.png b/model_card/layer_images/layer_2_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..185ed50f291eb14219a6cf5e63cd7f40b15993e6 Binary files /dev/null and b/model_card/layer_images/layer_2_attention_self_value.png differ diff --git a/model_card/layer_images/layer_2_intermediate_dense.png b/model_card/layer_images/layer_2_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..89ad508daa9bc4211f50320233c1769bf8b9c1eb Binary files /dev/null and b/model_card/layer_images/layer_2_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_2_output_dense.png b/model_card/layer_images/layer_2_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..aa88e7443d612fd290d2e542dbc58e29b903aee0 Binary files /dev/null and b/model_card/layer_images/layer_2_output_dense.png differ diff --git a/model_card/layer_images/layer_3_attention_output_dense.png b/model_card/layer_images/layer_3_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..87c60c231bb4e0f151bec80d999516f3e47fb0af Binary files /dev/null and b/model_card/layer_images/layer_3_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_3_attention_self_key.png b/model_card/layer_images/layer_3_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..b89269c47fdec06a279768aee2416c82171626c2 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_key.png differ diff --git a/model_card/layer_images/layer_3_attention_self_query.png b/model_card/layer_images/layer_3_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..0dcdc1d2e1bcdea0d5872b254d494589856e83f4 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_query.png differ diff --git a/model_card/layer_images/layer_3_attention_self_value.png b/model_card/layer_images/layer_3_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..e6712648aaf36479a7b2c700fd25d8a0069b8da8 Binary files /dev/null and b/model_card/layer_images/layer_3_attention_self_value.png differ diff --git a/model_card/layer_images/layer_3_intermediate_dense.png b/model_card/layer_images/layer_3_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..647290ba452ac9d00a8610f6f238b21ea55f9ef1 Binary files /dev/null and b/model_card/layer_images/layer_3_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_3_output_dense.png b/model_card/layer_images/layer_3_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..76f009f1ec5ec2a134df5938fce3af1a88869c3e Binary files /dev/null and b/model_card/layer_images/layer_3_output_dense.png differ diff --git a/model_card/layer_images/layer_4_attention_output_dense.png b/model_card/layer_images/layer_4_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..6f42ebc8afc8ef2dfaa7f874c14f3645202484fc Binary files /dev/null and b/model_card/layer_images/layer_4_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_4_attention_self_key.png b/model_card/layer_images/layer_4_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..6f7c77b171b84524f6979637fdec6c8a56f0f33f Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_key.png differ diff --git a/model_card/layer_images/layer_4_attention_self_query.png b/model_card/layer_images/layer_4_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..662d774452fcc62e7403101713834b69125cbfc1 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_query.png differ diff --git a/model_card/layer_images/layer_4_attention_self_value.png b/model_card/layer_images/layer_4_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..27012edfdda4010b1030277556db308853aef522 Binary files /dev/null and b/model_card/layer_images/layer_4_attention_self_value.png differ diff --git a/model_card/layer_images/layer_4_intermediate_dense.png b/model_card/layer_images/layer_4_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..73cec9c2bdcc02265e7991dca80f0384ee5167c8 Binary files /dev/null and b/model_card/layer_images/layer_4_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_4_output_dense.png b/model_card/layer_images/layer_4_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..602164e776df996ed7bdee451f6295529229d69f Binary files /dev/null and b/model_card/layer_images/layer_4_output_dense.png differ diff --git a/model_card/layer_images/layer_5_attention_output_dense.png b/model_card/layer_images/layer_5_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..0574dc29b068a97a533152f7fccee5e713a49486 Binary files /dev/null and b/model_card/layer_images/layer_5_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_5_attention_self_key.png b/model_card/layer_images/layer_5_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..c46df74e69837d40aab53f33b71da420eedcc86d Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_key.png differ diff --git a/model_card/layer_images/layer_5_attention_self_query.png b/model_card/layer_images/layer_5_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..d62981c04169d60310529c67940ae7b319c72c9b Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_query.png differ diff --git a/model_card/layer_images/layer_5_attention_self_value.png b/model_card/layer_images/layer_5_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..d4053b6fbba46656cfdd5e6bc3775521ce8f5895 Binary files /dev/null and b/model_card/layer_images/layer_5_attention_self_value.png differ diff --git a/model_card/layer_images/layer_5_intermediate_dense.png b/model_card/layer_images/layer_5_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..b90dbb89e26b24582a591b47d3b584aabdd3d4f9 Binary files /dev/null and b/model_card/layer_images/layer_5_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_5_output_dense.png b/model_card/layer_images/layer_5_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..6f86d0f272d54643b524830dde317bff4b2b7ab1 Binary files /dev/null and b/model_card/layer_images/layer_5_output_dense.png differ diff --git a/model_card/layer_images/layer_6_attention_output_dense.png b/model_card/layer_images/layer_6_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..81a30a4de27e100b5a84d09f8097d1a40dba632e Binary files /dev/null and b/model_card/layer_images/layer_6_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_6_attention_self_key.png b/model_card/layer_images/layer_6_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..73df0ba46f8f59708f17777b3df8971d8c310523 Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_key.png differ diff --git a/model_card/layer_images/layer_6_attention_self_query.png b/model_card/layer_images/layer_6_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..68f427b212c1f9ad36f2fe1139139b78f35fd7ab Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_query.png differ diff --git a/model_card/layer_images/layer_6_attention_self_value.png b/model_card/layer_images/layer_6_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..5667c97b763ca4b5aa995600157b0901f5e35bb5 Binary files /dev/null and b/model_card/layer_images/layer_6_attention_self_value.png differ diff --git a/model_card/layer_images/layer_6_intermediate_dense.png b/model_card/layer_images/layer_6_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d3ec8587ec9f0414cb55fe9e569f3b9e8c441bff Binary files /dev/null and b/model_card/layer_images/layer_6_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_6_output_dense.png b/model_card/layer_images/layer_6_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..a09e09a32970e6c94eb4168c76e29f00ea061b3e Binary files /dev/null and b/model_card/layer_images/layer_6_output_dense.png differ diff --git a/model_card/layer_images/layer_7_attention_output_dense.png b/model_card/layer_images/layer_7_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d47e79868392cd2ffdc229dfc8d164073cafb2fb Binary files /dev/null and b/model_card/layer_images/layer_7_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_7_attention_self_key.png b/model_card/layer_images/layer_7_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..bfd7c6c54273991918620d273fe54b7f85ca1469 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_key.png differ diff --git a/model_card/layer_images/layer_7_attention_self_query.png b/model_card/layer_images/layer_7_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..22a42632e47116e1ff19fea6983dacfdf6947141 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_query.png differ diff --git a/model_card/layer_images/layer_7_attention_self_value.png b/model_card/layer_images/layer_7_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..c7af55056cf3d31c8894db6b756b90ac588ad018 Binary files /dev/null and b/model_card/layer_images/layer_7_attention_self_value.png differ diff --git a/model_card/layer_images/layer_7_intermediate_dense.png b/model_card/layer_images/layer_7_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..891a3a844667406cbaaaa5651dcc4df0689b748e Binary files /dev/null and b/model_card/layer_images/layer_7_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_7_output_dense.png b/model_card/layer_images/layer_7_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..4c6d8c25dfef592267b1a7ac0e21b3850dc96492 Binary files /dev/null and b/model_card/layer_images/layer_7_output_dense.png differ diff --git a/model_card/layer_images/layer_8_attention_output_dense.png b/model_card/layer_images/layer_8_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d28645fa68a664bb891912aa0ddd9482984258e5 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_8_attention_self_key.png b/model_card/layer_images/layer_8_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..63e6fa1cf9f87cb6055280eecc891b6cf01d29eb Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_key.png differ diff --git a/model_card/layer_images/layer_8_attention_self_query.png b/model_card/layer_images/layer_8_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..cc41e806f69b4451e1ca74b4fe423213e845eda1 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_query.png differ diff --git a/model_card/layer_images/layer_8_attention_self_value.png b/model_card/layer_images/layer_8_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..2f07cac518ee74ab45566ef4ebca7e760f6a8023 Binary files /dev/null and b/model_card/layer_images/layer_8_attention_self_value.png differ diff --git a/model_card/layer_images/layer_8_intermediate_dense.png b/model_card/layer_images/layer_8_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..ffc78caf3ff65541f36c908b4940d521f5d46d34 Binary files /dev/null and b/model_card/layer_images/layer_8_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_8_output_dense.png b/model_card/layer_images/layer_8_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..b7a1a636e1472861b56edfedee6159e4fbe68dd2 Binary files /dev/null and b/model_card/layer_images/layer_8_output_dense.png differ diff --git a/model_card/layer_images/layer_9_attention_output_dense.png b/model_card/layer_images/layer_9_attention_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..96d4f96ec2a9e2b67972c9d8a5d94c2dc8101c77 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_output_dense.png differ diff --git a/model_card/layer_images/layer_9_attention_self_key.png b/model_card/layer_images/layer_9_attention_self_key.png new file mode 100644 index 
0000000000000000000000000000000000000000..b3b6fc6a0e89dc949ab53ec50799ecf654b79327 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_key.png differ diff --git a/model_card/layer_images/layer_9_attention_self_query.png b/model_card/layer_images/layer_9_attention_self_query.png new file mode 100644 index 0000000000000000000000000000000000000000..47dcba6c90fc46fd79a9e8ea87e4e5777aa7c93d Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_query.png differ diff --git a/model_card/layer_images/layer_9_attention_self_value.png b/model_card/layer_images/layer_9_attention_self_value.png new file mode 100644 index 0000000000000000000000000000000000000000..50115e237bcc95356f0404d73a5b8a6f192c5ea8 Binary files /dev/null and b/model_card/layer_images/layer_9_attention_self_value.png differ diff --git a/model_card/layer_images/layer_9_intermediate_dense.png b/model_card/layer_images/layer_9_intermediate_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..06dddd07aa83a2b342448840517a83945bc54095 Binary files /dev/null and b/model_card/layer_images/layer_9_intermediate_dense.png differ diff --git a/model_card/layer_images/layer_9_output_dense.png b/model_card/layer_images/layer_9_output_dense.png new file mode 100644 index 0000000000000000000000000000000000000000..d52cd1e5fa292ef698a1427f809fecb4690c0dd6 Binary files /dev/null and b/model_card/layer_images/layer_9_output_dense.png differ diff --git a/model_card/pruning.svg b/model_card/pruning.svg new file mode 100644 index 0000000000000000000000000000000000000000..82960e22c15f62559791124835624d6d0d3fdd81 --- /dev/null +++ b/model_card/pruning.svg @@ -0,0 +1 @@ +44577567555488755765777801234567891011024681012prunedactivePruned Transformer HeadsLayer indexHeads count \ No newline at end of file diff --git a/model_meta.json b/model_meta.json new file mode 100644 index 0000000000000000000000000000000000000000..06302f2f500214cee02893d2d68db1e3b59b3caf --- 
/dev/null +++ b/model_meta.json @@ -0,0 +1,160 @@ +{ + "args": { + "adam_epsilon": 1e-08, + "alpha_ce": 0.1, + "alpha_distil": 0.9, + "ampere_learning_rate": 0.01, + "ampere_mask_init": "constant", + "ampere_mask_scale": 0.0, + "ampere_pruning_method": "disabled", + "cache_dir": "", + "config_name": "", + "data_dir": "squad_data", + "do_eval": true, + "do_lower_case": true, + "do_train": true, + "doc_stride": 128, + "eval_all_checkpoints": true, + "eval_batch_size": 16, + "evaluate_during_training": false, + "final_ampere_temperature": 20, + "final_lambda": 25, + "final_shuffling_temperature": 20, + "final_threshold": 0.1, + "final_warmup": 10, + "fp16": false, + "fp16_opt_level": "O1", + "global_topk": false, + "global_topk_frequency_compute": 25, + "gradient_accumulation_steps": 1, + "in_shuffling_group": 4, + "initial_ampere_temperature": 0.0, + "initial_shuffling_temperature": 0.1, + "initial_threshold": 0.0, + "initial_warmup": 1, + "lang_id": 0, + "learning_rate": 3e-05, + "local_rank": -1, + "logging_steps": 500, + "mask_block_cols": 32, + "mask_block_rows": 32, + "mask_init": "constant", + "mask_scale": 0.0, + "mask_scores_learning_rate": 0.01, + "max_answer_length": 30, + "max_grad_norm": 1.0, + "max_query_length": 64, + "max_seq_length": 384, + "max_steps": -1, + "model_name_or_path": "bert-base-uncased", + "model_type": "masked_bert", + "n_best_size": 20, + "n_gpu": 1, + "no_cuda": false, + "null_score_diff_threshold": 0.0, + "num_train_epochs": 20.0, + "out_shuffling_group": 4, + "overwrite_cache": false, + "overwrite_output_dir": true, + "per_gpu_eval_batch_size": 16, + "per_gpu_train_batch_size": 16, + "predict_file": "dev-v1.1.json", + "pruning_method": "sigmoied_threshold", + "pruning_submethod": "default", + "regularization": "l1", + "save_steps": 5000, + "seed": 42, + "server_ip": "", + "server_port": "", + "shuffling_learning_rate": 0.001, + "shuffling_method": "disabled", + "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1", + 
"teacher_type": "bert", + "temperature": 2.0, + "threads": 8, + "tokenizer_name": "", + "train_batch_size": 16, + "train_file": "train-v1.1.json", + "truncate_train_examples": -1, + "verbose_logging": false, + "version_2_with_negative": false, + "warmup_steps": 5400, + "weight_decay": 0.0 + }, + "config": { + "_name_or_path": "bert-base-uncased", + "ampere_mask_init": "constant", + "ampere_mask_scale": 0.0, + "ampere_pruning_method": "disabled", + "architectures": ["MaskedBertForQuestionAnswering"], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "in_shuffling_group": 4, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "mask_block_cols": 32, + "mask_block_rows": 32, + "mask_init": "constant", + "mask_scale": 0.0, + "max_position_embeddings": 512, + "model_type": "masked_bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "out_shuffling_group": 4, + "pad_token_id": 0, + "pruning_method": "sigmoied_threshold", + "pruning_submethod": "default", + "shuffling_method": "disabled", + "type_vocab_size": 2, + "vocab_size": 30522 + }, + "packaging": { + "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.32-v1", + "model_owner": "madlag", + "pytorch_final_file_size": 372682487 + }, + "performance": { + "dense": { + "eval_elapsed_time": 43.74243663600646 + }, + "pytorch_block_sparse": { + "eval_elapsed_time": 38.91637165797874 + }, + "speedup": 1.1240111750510087 + }, + "precision": { + "exact": 79.03500366210938, + "f1": 86.70228576660156 + }, + "sparsity": { + "ampere": false, + "block_size": [32, 32], + "block_sparse": true, + "block_sparse_density": 0.3171537422839506, + "block_sparse_nnz": 26306, + "block_sparse_total": 82944, + "global_density": 0.47026570456858, + "is_block_sparse_valid": true, + "nnz_parameters": 51486466, + "parameters": 109483778, + "pruned_heads": { + "0": [0, 2, 4, 5, 6, 7, 9, 11], + "1": [0, 2, 3, 5, 6, 7, 8, 9], + 
"2": [1, 2, 3, 4, 7, 8, 11], + "3": [2, 4, 6, 7, 10], + "4": [0, 1, 2, 6, 11], + "5": [0, 1, 2, 5, 6, 7, 11], + "6": [0, 2, 3, 4, 7, 10], + "7": [1, 3, 6, 7, 11], + "8": [0, 2, 3, 4, 5, 6, 8], + "9": [1, 3, 4, 5, 7, 9, 10], + "10": [1, 4, 5, 6, 7, 8, 9], + "11": [0, 2, 5, 6, 7, 8, 10, 11] + }, + "total_attention_heads": 144, + "total_pruned_attention_heads": 80 + } +} \ No newline at end of file