Muennighoff committed on
Commit 424904b
1 Parent(s): 496ff8d
.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "word_embedding_dimension": 4096,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": true,
+   "pooling_mode_lasttoken": false
+ }
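The key flag above is `pooling_mode_weightedmean_tokens`: token embeddings are averaged with position-proportional weights rather than uniformly, as in SGPT. A minimal sketch of the idea (not the library's exact implementation), assuming hidden states of shape `(batch, seq_len, 4096)`:

```python
# Sketch of position-weighted mean pooling (not the library's exact code):
# token i is weighted by (i + 1), so later tokens contribute more, and padded
# positions are zeroed out via the attention mask.
import torch

def weighted_mean_pooling(token_embeddings: torch.Tensor,
                          attention_mask: torch.Tensor) -> torch.Tensor:
    seq_len = token_embeddings.shape[1]
    weights = torch.arange(1, seq_len + 1, device=token_embeddings.device)
    weights = weights.view(1, seq_len, 1) * attention_mask.unsqueeze(-1)  # (batch, seq, 1)
    summed = (token_embeddings * weights).sum(dim=1)                      # (batch, dim)
    return summed / weights.sum(dim=1)                                    # divide by total weight

emb = weighted_mean_pooling(torch.randn(2, 5, 4096), torch.ones(2, 5))
print(emb.shape)  # torch.Size([2, 4096])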
README.md ADDED
@@ -0,0 +1,86 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ ---
+
+ # {MODEL_NAME}
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 4096-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ <!--- Describe your model here -->
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is easy once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('{MODEL_NAME}')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Evaluation Results
+
+ <!--- Describe how your model was evaluated -->
+
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+ ## Training
+ The model was trained with the parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 15600 with parameters:
+ ```
+ {'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.MultipleNegativesRankingLoss.MNRLGradCache`
+
+ Parameters of the fit() method:
+ ```
+ {
+     "epochs": 10,
+     "evaluation_steps": 0,
+     "evaluator": "NoneType",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'transformers.optimization.AdamW'>",
+     "optimizer_params": {
+         "lr": 0.0004
+     },
+     "scheduler": "WarmupLinear",
+     "steps_per_epoch": null,
+     "warmup_steps": 1000,
+     "weight_decay": 0.01
+ }
+ ```
+
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 300, 'do_lower_case': False}) with Transformer model: BloomModel
+   (1): Pooling({'word_embedding_dimension': 4096, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': True, 'pooling_mode_lasttoken': False})
+ )
+ ```
+
+ ## Citing & Authors
+
+ <!--- Describe where people can find more information -->
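The README's snippet prints raw embeddings; a typical next step is to score them with cosine similarity. A sketch building on that snippet (`{MODEL_NAME}` remains the README's placeholder for this repo's id):

```python
# Sketch: scoring the embeddings from the README snippet with cosine similarity
# for a small semantic-search query. '{MODEL_NAME}' is the README's placeholder.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('{MODEL_NAME}')
docs = ["BLOOM is a multilingual language model",
        "Paris is the capital of France"]
query = "What is the capital of France?"

doc_emb = model.encode(docs, convert_to_tensor=True)
query_emb = model.encode(query, convert_to_tensor=True)

scores = util.cos_sim(query_emb, doc_emb)[0]  # one cosine score per document
best = int(scores.argmax())
print(docs[best], float(scores[best]))
```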
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
+   "apply_residual_connection_post_layernorm": false,
+   "architectures": [
+     "BloomModel"
+   ],
+   "attention_dropout": 0.0,
+   "attention_softmax_in_fp32": true,
+   "bias_dropout_fusion": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_dropout": 0.0,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "masked_softmax_fusion": true,
+   "model_type": "bloom",
+   "n_embed": 4096,
+   "n_inner": null,
+   "n_layer": 30,
+   "num_attention_heads": 32,
+   "offset_alibi": 100,
+   "pad_token_id": 3,
+   "pretraining_tp": 4,
+   "seq_length": 2048,
+   "skip_bias_add": true,
+   "skip_bias_add_qkv": false,
+   "slow_but_exact": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.1",
+   "unk_token_id": 0,
+   "use_cache": true,
+   "vocab_size": 250682
+ }
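This is the BLOOM-7B1 backbone config: 30 layers, 32 heads, hidden size 4096 (stored under the legacy `n_embed` key), and a 250,682-entry vocabulary. A hedged sanity check, assuming a recent transformers release that maps `n_embed` onto `hidden_size` and with `{MODEL_NAME}` again the README's placeholder:

```python
# Sketch: load this config with transformers and sanity-check it against the
# pooling module. Recent transformers releases map the legacy "n_embed" key
# onto hidden_size when loading BLOOM configs.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('{MODEL_NAME}')
assert config.model_type == "bloom"
assert config.hidden_size == 4096  # must match word_embedding_dimension in 1_Pooling/config.json
print(config.n_layer, config.num_attention_heads, config.vocab_size)  # 30 32 250682
```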
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.1.0",
+     "transformers": "4.20.1",
+     "pytorch": "1.12.0"
+   }
+ }
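These are the library versions recorded at save time. A small sketch comparing them against the local environment, using only the standard library (`importlib.metadata`, Python 3.8+):

```python
# Sketch: compare the versions recorded at save time with the local install.
import json
from importlib.metadata import version

with open("config_sentence_transformers.json") as f:
    saved = json.load(f)["__version__"]

for key, dist in [("sentence_transformers", "sentence-transformers"),
                  ("transformers", "transformers")]:
    print(f"{key}: saved {saved[key]}, installed {version(dist)}")
```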
evaluation/beir.json ADDED
@@ -0,0 +1 @@
+ {"ndcgs": {"sgpt-bloom-7b1-msmarco": {"scifact": {"NDCG@1": 0.59, "NDCG@3": 0.66868, "NDCG@5": 0.69178, "NDCG@10": 0.71824, "NDCG@100": 0.74152, "NDCG@1000": 0.74616}, "nfcorpus": {"NDCG@1": 0.45975, "NDCG@3": 0.41542, "NDCG@5": 0.39154, "NDCG@10": 0.35748, "NDCG@100": 0.32859, "NDCG@1000": 0.41791}, "arguana": {"NDCG@1": 0.23542, "NDCG@3": 0.36629, "NDCG@5": 0.41642, "NDCG@10": 0.47281, "NDCG@100": 0.52192, "NDCG@1000": 0.52529}, "cqadupstack_webmasters": {"NDCG@1": 0.27668, "NDCG@3": 0.32185, "NDCG@5": 0.33953, "NDCG@10": 0.36713, "NDCG@100": 0.42275, "NDCG@1000": 0.45387}, "cqadupstack_android": {"NDCG@1": 0.33476, "NDCG@3": 0.3679, "NDCG@5": 0.39528, "NDCG@10": 0.42525, "NDCG@100": 0.47769, "NDCG@1000": 0.50024}, "cqadupstack_mathematica": {"NDCG@1": 0.15796, "NDCG@3": 0.20281, "NDCG@5": 0.22875, "NDCG@10": 0.25536, "NDCG@100": 0.31178, "NDCG@1000": 0.34362}, "cqadupstack_english": {"NDCG@1": 0.37325, "NDCG@3": 0.40509, "NDCG@5": 0.42261, "NDCG@10": 0.44531, "NDCG@100": 0.48826, "NDCG@1000": 0.50904}, "cqadupstack_gaming": {"NDCG@1": 0.40313, "NDCG@3": 0.46762, "NDCG@5": 0.4959, "NDCG@10": 0.52259, "NDCG@100": 0.56564, "NDCG@1000": 0.5797}, "scidocs": {"NDCG@1": 0.208, "NDCG@3": 0.17601, "NDCG@5": 0.15464, "NDCG@10": 0.18435, "NDCG@100": 0.25829, "NDCG@1000": 0.30881}, "cqadupstack_programmers": {"NDCG@1": 0.2968, "NDCG@3": 0.33463, "NDCG@5": 0.35794, "NDCG@10": 0.38225, "NDCG@100": 0.44025, "NDCG@1000": 0.46704}, "cqadupstack_physics": {"NDCG@1": 0.31954, "NDCG@3": 0.36422, "NDCG@5": 0.39347, "NDCG@10": 0.41971, "NDCG@100": 0.47164, "NDCG@1000": 0.49612}, "cqadupstack_gis": {"NDCG@1": 0.25424, "NDCG@3": 0.31326, "NDCG@5": 0.33256, "NDCG@10": 0.35723, "NDCG@100": 0.41311, "NDCG@1000": 0.43261}, "cqadupstack_unix": {"NDCG@1": 0.27985, "NDCG@3": 0.3171, "NDCG@5": 0.33422, "NDCG@10": 0.35778, "NDCG@100": 0.41138, "NDCG@1000": 0.43812}, "cqadupstack_stats": {"NDCG@1": 0.23773, "NDCG@3": 0.28818, "NDCG@5": 0.30341, "NDCG@10": 0.32666, "NDCG@100": 0.36923, "NDCG@1000": 0.39845}, "fiqa": {"NDCG@1": 0.34105, "NDCG@3": 0.31934, "NDCG@5": 0.33612, "NDCG@10": 0.35736, "NDCG@100": 0.42409, "NDCG@1000": 0.45831}, "cqadupstack_wordpress": {"NDCG@1": 0.21627, "NDCG@3": 0.26489, "NDCG@5": 0.29248, "NDCG@10": 0.31721, "NDCG@100": 0.36279, "NDCG@1000": 0.39041}, "cqadupstack_tex": {"NDCG@1": 0.19718, "NDCG@3": 0.23505, "NDCG@5": 0.25141, "NDCG@10": 0.27375, "NDCG@100": 0.32004, "NDCG@1000": 0.35069}, "cqadupstack": {"NDCG@1": 0.2789491666666667, "NDCG@3": 0.32354999999999995, "NDCG@5": 0.3456300000000001, "NDCG@10": 0.3708525, "NDCG@100": 0.42121333333333333, "NDCG@1000": 0.4466591666666666}, "quora": {"NDCG@1": 0.6307, "NDCG@3": 0.69659, "NDCG@5": 0.72302, "NDCG@10": 0.74655, "NDCG@100": 0.77546, "NDCG@1000": 0.77864}, "trec-covid": {"NDCG@1": 0.88, "NDCG@3": 0.86877, "NDCG@5": 0.84621, "NDCG@10": 0.82731, "NDCG@100": 0.61742, "NDCG@1000": 0.52225}, "webis-touche2020": {"NDCG@1": 0.26531, "NDCG@3": 0.26197, "NDCG@5": 0.24561, "NDCG@10": 0.2365, "NDCG@100": 0.35251, "NDCG@1000": 0.46791}}}, "maps": {"sgpt-bloom-7b1-msmarco": {"scifact": {"MAP@1": 0.55661, "MAP@3": 0.6387, "MAP@5": 0.65383, "MAP@10": 0.66782, "MAP@100": 0.67334, "MAP@1000": 0.67348}, "nfcorpus": {"MAP@1": 0.0569, "MAP@3": 0.09625, "MAP@5": 0.11132, "MAP@10": 0.1309, "MAP@100": 0.16717, "MAP@1000": 0.18237}, "arguana": {"MAP@1": 0.23542, "MAP@3": 0.33345, "MAP@5": 0.36112, "MAP@10": 0.3848, "MAP@100": 0.3964, "MAP@1000": 0.39655}, "cqadupstack_webmasters": {"MAP@1": 0.23058, "MAP@3": 0.28613, "MAP@5": 0.29925, "MAP@10": 0.31308, 
"MAP@100": 0.32818, "MAP@1000": 0.33044}, "cqadupstack_android": {"MAP@1": 0.2696, "MAP@3": 0.32762, "MAP@5": 0.34832, "MAP@10": 0.36428, "MAP@100": 0.37719, "MAP@1000": 0.37843}, "cqadupstack_mathematica": {"MAP@1": 0.12403, "MAP@3": 0.17655, "MAP@5": 0.1919, "MAP@10": 0.20359, "MAP@100": 0.21506, "MAP@1000": 0.21639}, "cqadupstack_english": {"MAP@1": 0.29571, "MAP@3": 0.36207, "MAP@5": 0.37697, "MAP@10": 0.39027, "MAP@100": 0.40193, "MAP@1000": 0.40318}, "cqadupstack_gaming": {"MAP@1": 0.3551, "MAP@3": 0.43422, "MAP@5": 0.45209, "MAP@10": 0.4655, "MAP@100": 0.47634, "MAP@1000": 0.47699}, "scidocs": {"MAP@1": 0.04223, "MAP@3": 0.07854, "MAP@5": 0.09393, "MAP@10": 0.10847, "MAP@100": 0.12704, "MAP@1000": 0.12977}, "cqadupstack_programmers": {"MAP@1": 0.23765, "MAP@3": 0.2984, "MAP@5": 0.31543, "MAP@10": 0.32737, "MAP@100": 0.34121, "MAP@1000": 0.34252}, "cqadupstack_physics": {"MAP@1": 0.2679, "MAP@3": 0.32851, "MAP@5": 0.34813, "MAP@10": 0.36139, "MAP@100": 0.37356, "MAP@1000": 0.3749}, "cqadupstack_gis": {"MAP@1": 0.234, "MAP@3": 0.28982, "MAP@5": 0.30107, "MAP@10": 0.3118, "MAP@100": 0.3229, "MAP@1000": 0.32358}, "cqadupstack_unix": {"MAP@1": 0.24157, "MAP@3": 0.29199, "MAP@5": 0.30319, "MAP@10": 0.31382, "MAP@100": 0.32501, "MAP@1000": 0.32611}, "cqadupstack_stats": {"MAP@1": 0.21433, "MAP@3": 0.26445, "MAP@5": 0.27429, "MAP@10": 0.28455, "MAP@100": 0.29284, "MAP@1000": 0.29398}, "fiqa": {"MAP@1": 0.17039, "MAP@3": 0.24454, "MAP@5": 0.26793, "MAP@10": 0.28384, "MAP@100": 0.30107, "MAP@1000": 0.303}, "cqadupstack_wordpress": {"MAP@1": 0.20123, "MAP@3": 0.24563, "MAP@5": 0.26177, "MAP@10": 0.27227, "MAP@100": 0.28093, "MAP@1000": 0.28198}, "cqadupstack_tex": {"MAP@1": 0.16314, "MAP@3": 0.20877, "MAP@5": 0.21981, "MAP@10": 0.23003, "MAP@100": 0.23921, "MAP@1000": 0.24047}, "quora": {"MAP@1": 0.54941, "MAP@3": 0.65382, "MAP@5": 0.67572, "MAP@10": 0.69008, "MAP@100": 0.70003, "MAP@1000": 0.70036}, "trec-covid": {"MAP@1": 0.00251, "MAP@3": 0.00716, "MAP@5": 0.01121, "MAP@10": 0.02093, "MAP@100": 0.12252, "MAP@1000": 0.27831}, "webis-touche2020": {"MAP@1": 0.02322, "MAP@3": 0.04791, "MAP@5": 0.06072, "MAP@10": 0.08799, "MAP@100": 0.15053, "MAP@1000": 0.16628}}}, "recalls": {"sgpt-bloom-7b1-msmarco": {"scifact": {"Recall@1": 0.55661, "Recall@3": 0.72433, "Recall@5": 0.78361, "Recall@10": 0.858, "Recall@100": 0.96167, "Recall@1000": 1.0}, "nfcorpus": {"Recall@1": 0.0569, "Recall@3": 0.10864, "Recall@5": 0.13256, "Recall@10": 0.17061, "Recall@100": 0.33703, "Recall@1000": 0.66706}, "arguana": {"Recall@1": 0.23542, "Recall@3": 0.46159, "Recall@5": 0.58393, "Recall@10": 0.75605, "Recall@100": 0.97013, "Recall@1000": 0.99573}, "cqadupstack_webmasters": {"Recall@1": 0.23058, "Recall@3": 0.34204, "Recall@5": 0.38728, "Recall@10": 0.47347, "Recall@100": 0.72013, "Recall@1000": 0.92319}, "cqadupstack_android": {"Recall@1": 0.2696, "Recall@3": 0.38594, "Recall@5": 0.45881, "Recall@10": 0.54882, "Recall@100": 0.77388, "Recall@1000": 0.92183}, "cqadupstack_mathematica": {"Recall@1": 0.12403, "Recall@3": 0.23551, "Recall@5": 0.2988, "Recall@10": 0.3762, "Recall@100": 0.62595, "Recall@1000": 0.85285}, "cqadupstack_english": {"Recall@1": 0.29571, "Recall@3": 0.41956, "Recall@5": 0.46965, "Recall@10": 0.53965, "Recall@100": 0.7261, "Recall@1000": 0.85665}, "cqadupstack_gaming": {"Recall@1": 0.3551, "Recall@3": 0.51142, "Recall@5": 0.58008, "Recall@10": 0.65914, "Recall@100": 0.84275, "Recall@1000": 0.94272}, "scidocs": {"Recall@1": 0.04223, "Recall@3": 0.10038, "Recall@5": 0.13857, "Recall@10": 0.19287, 
"Recall@100": 0.40922, "Recall@1000": 0.65665}, "cqadupstack_programmers": {"Recall@1": 0.23765, "Recall@3": 0.35897, "Recall@5": 0.42129, "Recall@10": 0.49281, "Recall@100": 0.74021, "Recall@1000": 0.92297}, "cqadupstack_physics": {"Recall@1": 0.2679, "Recall@3": 0.39176, "Recall@5": 0.4631, "Recall@10": 0.54045, "Recall@100": 0.76192, "Recall@1000": 0.92081}, "cqadupstack_gis": {"Recall@1": 0.234, "Recall@3": 0.356, "Recall@5": 0.40299, "Recall@10": 0.47713, "Recall@100": 0.73541, "Recall@1000": 0.88709}, "cqadupstack_unix": {"Recall@1": 0.24157, "Recall@3": 0.34361, "Recall@5": 0.38646, "Recall@10": 0.45523, "Recall@100": 0.69103, "Recall@1000": 0.88251}, "cqadupstack_stats": {"Recall@1": 0.21433, "Recall@3": 0.32467, "Recall@5": 0.36117, "Recall@10": 0.42994, "Recall@100": 0.62737, "Recall@1000": 0.84587}, "fiqa": {"Recall@1": 0.17039, "Recall@3": 0.29267, "Recall@5": 0.35873, "Recall@10": 0.42707, "Recall@100": 0.67557, "Recall@1000": 0.88364}, "cqadupstack_wordpress": {"Recall@1": 0.20123, "Recall@3": 0.29901, "Recall@5": 0.36537, "Recall@10": 0.43971, "Recall@100": 0.65135, "Recall@1000": 0.86095}, "cqadupstack_tex": {"Recall@1": 0.16314, "Recall@3": 0.26079, "Recall@5": 0.30313, "Recall@10": 0.36994, "Recall@100": 0.58138, "Recall@1000": 0.79975}, "quora": {"Recall@1": 0.54941, "Recall@3": 0.73605, "Recall@5": 0.8037, "Recall@10": 0.87129, "Recall@100": 0.98129, "Recall@1000": 0.99825}, "trec-covid": {"Recall@1": 0.00251, "Recall@3": 0.00738, "Recall@5": 0.01163, "Recall@10": 0.02236, "Recall@100": 0.15228, "Recall@1000": 0.48675}, "webis-touche2020": {"Recall@1": 0.02322, "Recall@3": 0.06065, "Recall@5": 0.08608, "Recall@10": 0.15224, "Recall@100": 0.45549, "Recall@1000": 0.80094}}}, "precisions": {"sgpt-bloom-7b1-msmarco": {"scifact": {"P@1": 0.59, "P@3": 0.26444, "P@5": 0.174, "P@10": 0.09733, "P@100": 0.0109, "P@1000": 0.00113}, "nfcorpus": {"P@1": 0.47988, "P@3": 0.38803, "P@5": 0.33994, "P@10": 0.26749, "P@100": 0.08514, "P@1000": 0.0213}, "arguana": {"P@1": 0.23542, "P@3": 0.15386, "P@5": 0.11679, "P@10": 0.0756, "P@100": 0.0097, "P@1000": 0.001}, "cqadupstack_webmasters": {"P@1": 0.27668, "P@3": 0.14822, "P@5": 0.10909, "P@10": 0.07016, "P@100": 0.01437, "P@1000": 0.0023}, "cqadupstack_android": {"P@1": 0.33476, "P@3": 0.17263, "P@5": 0.1279, "P@10": 0.08083, "P@100": 0.01303, "P@1000": 0.00176}, "cqadupstack_mathematica": {"P@1": 0.15796, "P@3": 0.10075, "P@5": 0.07786, "P@10": 0.0505, "P@100": 0.00917, "P@1000": 0.00133}, "cqadupstack_english": {"P@1": 0.37325, "P@3": 0.19427, "P@5": 0.13682, "P@10": 0.08363, "P@100": 0.01347, "P@1000": 0.00182}, "cqadupstack_gaming": {"P@1": 0.40313, "P@3": 0.20564, "P@5": 0.14408, "P@10": 0.08401, "P@100": 0.01147, "P@1000": 0.00132}, "scidocs": {"P@1": 0.208, "P@3": 0.165, "P@5": 0.1362, "P@10": 0.095, "P@100": 0.02015, "P@1000": 0.00323}, "cqadupstack_programmers": {"P@1": 0.2968, "P@3": 0.16096, "P@5": 0.11598, "P@10": 0.07043, "P@100": 0.01162, "P@1000": 0.00158}, "cqadupstack_physics": {"P@1": 0.31954, "P@3": 0.17036, "P@5": 0.12589, "P@10": 0.07719, "P@100": 0.01207, "P@1000": 0.0016}, "cqadupstack_gis": {"P@1": 0.25424, "P@3": 0.13371, "P@5": 0.09175, "P@10": 0.05469, "P@100": 0.00877, "P@1000": 0.00108}, "cqadupstack_unix": {"P@1": 0.27985, "P@3": 0.13993, "P@5": 0.0972, "P@10": 0.05802, "P@100": 0.0096, "P@1000": 0.0013}, "cqadupstack_stats": {"P@1": 0.23773, "P@3": 0.12423, "P@5": 0.08528, "P@10": 0.0523, "P@100": 0.00802, "P@1000": 0.00112}, "fiqa": {"P@1": 0.34105, "P@3": 0.21399, "P@5": 0.16296, "P@10": 0.09985, "P@100": 
0.01694, "P@1000": 0.00229}, "cqadupstack_wordpress": {"P@1": 0.21627, "P@3": 0.11337, "P@5": 0.08392, "P@10": 0.05046, "P@100": 0.00793, "P@1000": 0.00111}, "cqadupstack_tex": {"P@1": 0.19718, "P@3": 0.11023, "P@5": 0.07915, "P@10": 0.04917, "P@100": 0.00837, "P@1000": 0.00127}, "quora": {"P@1": 0.6307, "P@3": 0.30727, "P@5": 0.20858, "P@10": 0.11841, "P@100": 0.01481, "P@1000": 0.00156}, "trec-covid": {"P@1": 0.92, "P@3": 0.91333, "P@5": 0.884, "P@10": 0.864, "P@100": 0.6364, "P@1000": 0.23266}, "webis-touche2020": {"P@1": 0.28571, "P@3": 0.27891, "P@5": 0.24898, "P@10": 0.21633, "P@100": 0.07327, "P@1000": 0.01496}}}}
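The evaluation file nests metric → model → dataset → cutoff. A sketch that averages NDCG@10 across the BEIR datasets, skipping the per-forum `cqadupstack_*` entries because the aggregated `cqadupstack` score is already present:

```python
# Sketch: read evaluation/beir.json and average NDCG@10 across datasets.
import json

with open("evaluation/beir.json") as f:
    ndcgs = json.load(f)["ndcgs"]["sgpt-bloom-7b1-msmarco"]

scores = {name: metrics["NDCG@10"]
          for name, metrics in ndcgs.items()
          if not name.startswith("cqadupstack_")}   # keep only the aggregate
print(f"mean NDCG@10 over {len(scores)} datasets: {sum(scores.values()) / len(scores):.4f}")
```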
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
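`modules.json` chains two modules: a Transformer loaded from the repo root and the Pooling module from `1_Pooling/`. A rough manual equivalent (a sketch; the `pooling_mode_weightedmean_tokens` argument needs a sentence-transformers release newer than the 2.1.0 recorded above, and `{MODEL_NAME}` is the README's placeholder):

```python
# Sketch: roughly what modules.json wires together, assembled by hand.
# The Pooling arguments mirror 1_Pooling/config.json.
from sentence_transformers import SentenceTransformer, models

word = models.Transformer('{MODEL_NAME}', max_seq_length=300)  # module "0", path ""
pooling = models.Pooling(
    word.get_word_embedding_dimension(),      # 4096
    pooling_mode_mean_tokens=False,
    pooling_mode_weightedmean_tokens=True,    # module "1", path "1_Pooling"
)
model = SentenceTransformer(modules=[word, pooling])
```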
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7aa0368f5635d9558fff1652236f8e16dccc2f65aaa7320d3decc7d559eda631
+ size 9947280444
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2350a568a6491d756f271357ba5147c86d4e0ebe5d83b97156218d9d5366c835
+ size 9733438429
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75f5c706e045b84e7b54c554eae2da0f02fea5870a19e943997cc567849a1e9d
+ size 8592219901
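Each shard above is stored as a Git LFS pointer (a version line, a sha256 oid, and the byte size); the roughly 28 GB of actual weights live in LFS storage. A sketch parsing such a pointer, assuming the repo was cloned with `GIT_LFS_SKIP_SMUDGE=1` so the `.bin` files are still pointers rather than downloaded weights:

```python
# Sketch: parse a Git LFS pointer file like the three above into a dict.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])
    return fields

ptr = parse_lfs_pointer("pytorch_model-00001-of-00003.bin")
print(ptr["oid"], ptr["size"])  # sha256:7aa0... 9947280444
```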
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,372 @@
+ {
+   "metadata": {
+     "total_size": 28272820224
+   },
+   "weight_map": {
+     "h.0.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.0.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.0.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.0.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.0.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.0.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.0.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.1.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.1.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.10.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.10.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.10.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.10.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.10.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.10.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.10.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.11.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.11.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.12.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.12.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.13.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.13.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.14.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.14.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.15.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.15.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.16.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.16.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.17.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.17.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.18.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.18.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.19.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.19.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.19.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.19.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.19.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.19.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.19.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.19.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.19.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.19.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.2.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.2.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.2.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.2.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.2.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.2.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.2.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.20.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.20.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.20.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.20.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.20.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.20.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.20.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.21.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.21.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.22.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.22.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.23.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.23.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.24.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.24.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.25.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.25.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.26.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.26.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.27.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.27.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.28.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.28.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.input_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.mlp.dense_4h_to_h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.mlp.dense_h_to_4h.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.post_attention_layernorm.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.self_attention.dense.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.self_attention.dense.weight": "pytorch_model-00003-of-00003.bin",
+     "h.29.self_attention.query_key_value.bias": "pytorch_model-00003-of-00003.bin",
+     "h.29.self_attention.query_key_value.weight": "pytorch_model-00003-of-00003.bin",
+     "h.3.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.3.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.3.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.3.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.3.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.3.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.3.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.4.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.4.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.5.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.5.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.mlp.dense_4h_to_h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.mlp.dense_h_to_4h.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.post_attention_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.self_attention.dense.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.self_attention.dense.weight": "pytorch_model-00001-of-00003.bin",
+     "h.6.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.6.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.7.input_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "h.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
+     "h.7.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.7.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.7.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.7.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.7.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.7.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.7.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.7.self_attention.query_key_value.bias": "pytorch_model-00001-of-00003.bin",
+     "h.7.self_attention.query_key_value.weight": "pytorch_model-00001-of-00003.bin",
+     "h.8.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.8.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.8.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.8.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.8.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.8.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.8.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.input_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.mlp.dense_4h_to_h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.mlp.dense_h_to_4h.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.post_attention_layernorm.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.self_attention.dense.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.self_attention.dense.weight": "pytorch_model-00002-of-00003.bin",
+     "h.9.self_attention.query_key_value.bias": "pytorch_model-00002-of-00003.bin",
+     "h.9.self_attention.query_key_value.weight": "pytorch_model-00002-of-00003.bin",
+     "ln_f.bias": "pytorch_model-00003-of-00003.bin",
+     "ln_f.weight": "pytorch_model-00003-of-00003.bin",
+     "word_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+     "word_embeddings_layernorm.bias": "pytorch_model-00001-of-00003.bin",
+     "word_embeddings_layernorm.weight": "pytorch_model-00001-of-00003.bin"
+   }
+ }
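The index maps every tensor name to one of the three shards so loaders can open only the files they need. A sketch summarizing how the weight map distributes tensors:

```python
# Sketch: summarize how the weight map distributes tensors over the three shards
# and echo the declared total size (~28.3 GB of float32 weights).
import json
from collections import Counter

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

for shard, n in sorted(Counter(index["weight_map"].values()).items()):
    print(shard, n, "tensors")
print("total_size:", index["metadata"]["total_size"], "bytes")
```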
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 300,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53b546fa3c6b32936b8e411c6b0c98c66fa272b0e69d1bb025b0026b1908e930
+ size 14500905
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "name_or_path": "/gpfsscratch/rech/six/commun/commun/experiments/muennighoff/bloomckpt/6b3/bloom-7b1",
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "special_tokens_map_file": null,
+   "tokenizer_class": "BloomTokenizer",
+   "unk_token": "<unk>"
+ }
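The `"padding_side": "left"` setting is deliberate for a causal model with position-weighted pooling: padding goes before the text, so the real tokens occupy the final (highest-weight) positions. A sketch showing the effect (`{MODEL_NAME}` is again the README's placeholder):

```python
# Sketch: left padding in action. padding_side comes from the tokenizer_config
# above, and the pad token id is 3 per config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('{MODEL_NAME}')
batch = tok(["short", "a somewhat longer input"], padding=True, return_tensors="pt")
print(batch["input_ids"][0])       # pad ids appear at the start, not the end
print(batch["attention_mask"][0])  # zeros on the left mark the padding
```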