Muennighoff committed on
Commit
328c780
1 Parent(s): abd000a

Add SGPT-125M-scratchmean-nli

Browse files
1_Pooling/config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false
9
+ }
README.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: sentence-similarity
3
+ tags:
4
+ - sentence-transformers
5
+ - feature-extraction
6
+ - sentence-similarity
7
+ ---
8
+
9
+ # SGPT-125M-scratchmean-nli
10
+
11
+ **Trained from scratch only on NLI with reinitialized GPT-Neo weights**
12
+
13
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
14
+
15
+ <!--- Describe your model here -->
16
+
17
+ ## Usage (Sentence-Transformers)
18
+
19
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
20
+
21
+ ```
22
+ pip install -U sentence-transformers
23
+ ```
24
+
25
+ Then you can use the model like this:
26
+
27
+ ```python
28
+ from sentence_transformers import SentenceTransformer
29
+ sentences = ["This is an example sentence", "Each sentence is converted"]
30
+
31
+ model = SentenceTransformer('{MODEL_NAME}')
32
+ embeddings = model.encode(sentences)
33
+ print(embeddings)
34
+ ```
35
+
36
+
37
+
38
+ ## Evaluation Results
39
+
40
+ <!--- Describe how your model was evaluated -->
41
+
42
+ For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
43
+
44
+
45
+ ## Training
46
+ The model was trained with the parameters:
47
+
48
+ **DataLoader**:
49
+
50
+ `sentence_transformers.datasets.NoDuplicatesDataLoader.NoDuplicatesDataLoader` of length 8807 with parameters:
51
+ ```
52
+ {'batch_size': 64}
53
+ ```
54
+
55
+ **Loss**:
56
+
57
+ `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
58
+ ```
59
+ {'scale': 20.0, 'similarity_fct': 'cos_sim'}
60
+ ```
61
+
62
+ Parameters of the fit()-Method:
63
+ ```
64
+ {
65
+ "epochs": 1,
66
+ "evaluation_steps": 880,
67
+ "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator",
68
+ "max_grad_norm": 1,
69
+ "optimizer_class": "<class 'transformers.optimization.AdamW'>",
70
+ "optimizer_params": {
71
+ "lr": 2e-05
72
+ },
73
+ "scheduler": "WarmupLinear",
74
+ "steps_per_epoch": null,
75
+ "warmup_steps": 881,
76
+ "weight_decay": 0.01
77
+ }
78
+ ```
79
+
80
+
81
+ ## Full Model Architecture
82
+ ```
83
+ SentenceTransformer(
84
+ (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: GPTNeoModel
85
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
86
+ )
87
+ ```
88
+
89
+ ## Citing & Authors
90
+
91
+ <!--- Describe where people can find more information -->
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "EleutherAI/gpt-neo-125M",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPTNeoModel"
6
+ ],
7
+ "attention_dropout": 0,
8
+ "attention_layers": [
9
+ "global",
10
+ "local",
11
+ "global",
12
+ "local",
13
+ "global",
14
+ "local",
15
+ "global",
16
+ "local",
17
+ "global",
18
+ "local",
19
+ "global",
20
+ "local"
21
+ ],
22
+ "attention_types": [
23
+ [
24
+ [
25
+ "global",
26
+ "local"
27
+ ],
28
+ 6
29
+ ]
30
+ ],
31
+ "bos_token_id": 50256,
32
+ "embed_dropout": 0,
33
+ "eos_token_id": 50256,
34
+ "gradient_checkpointing": false,
35
+ "hidden_size": 768,
36
+ "initializer_range": 0.02,
37
+ "intermediate_size": null,
38
+ "layer_norm_epsilon": 1e-05,
39
+ "max_position_embeddings": 2048,
40
+ "model_type": "gpt_neo",
41
+ "num_heads": 12,
42
+ "num_layers": 12,
43
+ "resid_dropout": 0,
44
+ "summary_activation": null,
45
+ "summary_first_dropout": 0.1,
46
+ "summary_proj_to_labels": true,
47
+ "summary_type": "cls_index",
48
+ "summary_use_proj": true,
49
+ "torch_dtype": "float32",
50
+ "transformers_version": "4.12.3",
51
+ "use_cache": true,
52
+ "vocab_size": 50257,
53
+ "window_size": 256
54
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.1.0",
4
+ "transformers": "4.12.3",
5
+ "pytorch": "1.10.0+cu113"
6
+ }
7
+ }
eval/similarity_evaluation_sts-dev_results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
2
+ 0,880,0.5298895446169344,0.5312240374218833,0.5117367201753408,0.5245744420855956,0.5104904261012467,0.5231746284173534,0.4585831281872321,0.4757636486142692
3
+ 0,1760,0.6414522822064204,0.6466598626369797,0.6315813393117423,0.6351470966847795,0.6299802345481992,0.633638220198648,0.5564450205278492,0.5860890895197167
4
+ 0,2640,0.6651305710363661,0.6758417801736336,0.6722952033313845,0.6762575147352333,0.6732778021726432,0.6770590704485377,0.5615850866939909,0.5915882175060602
5
+ 0,3520,0.6936786606162212,0.7000569579383393,0.6870984944729166,0.6926032039515108,0.6852739285305858,0.6900644263987616,0.5915931944462255,0.6139743454184338
6
+ 0,4400,0.700899713888771,0.7047739810339679,0.6920555918002221,0.6958745033620944,0.6916965374691982,0.6954146507315779,0.6066928354981058,0.6273693546624726
7
+ 0,5280,0.7141938628219852,0.7201963457696333,0.6962514159036581,0.6984396706127707,0.6952247227203017,0.6971711750754467,0.6170316020557418,0.6385997236226345
8
+ 0,6160,0.7118754507353484,0.7179873371277351,0.6992091001590869,0.7010723806146854,0.6987773965936882,0.7003257377506648,0.619611553839931,0.6403750629261068
9
+ 0,7040,0.7189095591545901,0.7236743303762333,0.7004906826331114,0.7015026404319671,0.6996276311227727,0.7006774675307638,0.623697167086132,0.6521847383358765
10
+ 0,7920,0.725586194362776,0.7296134020905084,0.7079407270210554,0.7094840376652114,0.707021328265756,0.7083845297889915,0.6309189030417285,0.655483391857864
11
+ 0,8800,0.7267545474582126,0.7307983815559623,0.7085992003709655,0.7102699385634418,0.707714153882588,0.709112834562065,0.6312806322363824,0.6558650202351088
12
+ 0,-1,0.7267570775457864,0.7308159723154326,0.7085975081646948,0.7102818638385754,0.7077130942522993,0.7091036181041682,0.631292016689388,0.6558531305471883
merges.txt ADDED
The diff for this file is too large to render. See raw diff
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "pooling.Pooling"
13
+ }
14
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3ebb02ad389e1b23b364681bb5eccf5e94be41fef6c5ea6a9c2ce1dc2e88080
3
+ size 551190545
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ {
2
+ "max_seq_length": 75,
3
+ "do_lower_case": false
4
+ }
similarity_evaluation_sts-test_results.csv ADDED
@@ -0,0 +1,2 @@
 
 
1
+ epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
2
+ -1,-1,0.6779194087604842,0.659069787792863,0.6490090020325722,0.637928681167657,0.6482993637695468,0.6366318312354695,0.5731378793587686,0.5578949630024792
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "EleutherAI/gpt-neo-125M", "tokenizer_class": "GPT2Tokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff