HSR-HF committed
Commit: 00cb73f
1 parent: ac67afa

Add new SentenceTransformer model.
README.md CHANGED
@@ -5,6 +5,7 @@ tags:
 - sentence-transformers
 - feature-extraction
 - sentence-similarity
+- transformers
 
 ---
 
@@ -35,6 +36,44 @@ print(embeddings)
 
 
 
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+
+
+# Mean Pooling - take the attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('HSR-HF/sts-rf-bc-contrastive')
+model = AutoModel.from_pretrained('HSR-HF/sts-rf-bc-contrastive')
+
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+
+# Perform pooling. In this case, mean pooling.
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+
+
+
 ## Evaluation Results
 
 <!--- Describe how your model was evaluated -->
@@ -81,9 +120,8 @@ Parameters of the fit()-Method:
 ## Full Model Architecture
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel
+  (0): Transformer({'max_seq_length': 100, 'do_lower_case': False}) with Transformer model: AlbertModel
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
-  (2): Normalize()
 )
 ```
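Note: this commit swaps the encoder from MPNet to ALBERT and drops the trailing `(2): Normalize()` module (see `modules.json` below), so embeddings returned by the new revision are no longer unit-length. A minimal sketch of loading the updated model with sentence-transformers and normalizing explicitly when unit vectors are assumed downstream; only the repo id from the usage snippet above is taken as given, the rest is illustrative:

```python
from sentence_transformers import SentenceTransformer, util
import torch.nn.functional as F

# Load the updated (ALBERT-based) revision from the Hub
model = SentenceTransformer('HSR-HF/sts-rf-bc-contrastive')

sentences = ['This is an example sentence', 'Each sentence is converted']
embeddings = model.encode(sentences, convert_to_tensor=True)

# The Normalize module was removed in this commit, so normalize explicitly
# if downstream code expects unit-length vectors.
embeddings = F.normalize(embeddings, p=2, dim=1)

# Cosine similarity between the two example sentences
print(util.cos_sim(embeddings[0], embeddings[1]))
```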
 
config.json CHANGED
@@ -1,24 +1,33 @@
 {
   "_name_or_path": "/content/output_train",
   "architectures": [
-    "MPNetModel"
+    "AlbertModel"
   ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 2,
+  "classifier_dropout_prob": 0.1,
+  "down_scale_factor": 1,
+  "embedding_size": 128,
+  "eos_token_id": 3,
+  "gap_size": 0,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0,
   "hidden_size": 768,
   "initializer_range": 0.02,
+  "inner_group_num": 1,
   "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 514,
-  "model_type": "mpnet",
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "albert",
+  "net_structure_type": 0,
   "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "relative_attention_num_buckets": 32,
+  "num_hidden_groups": 1,
+  "num_hidden_layers": 6,
+  "num_memory_blocks": 0,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.40.2",
-  "vocab_size": 30527
+  "type_vocab_size": 2,
+  "vocab_size": 30000
 }
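A quick sanity check of what the new `config.json` describes, once the commit is on the Hub (a sketch; the repo id is assumed from the README usage example):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained('HSR-HF/sts-rf-bc-contrastive')

# Expected per this commit: a 6-layer ALBERT encoder with factorized
# 128-dim embeddings, 768 hidden size, and a 30000-token vocabulary.
print(config.model_type)         # 'albert'
print(config.num_hidden_layers)  # 6
print(config.embedding_size)     # 128
print(config.hidden_size)        # 768
print(config.vocab_size)         # 30000
```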
config_sentence_transformers.json CHANGED
@@ -1,8 +1,8 @@
 {
   "__version__": {
     "sentence_transformers": "2.0.0",
-    "transformers": "4.6.1",
-    "pytorch": "1.8.1"
+    "transformers": "4.7.0",
+    "pytorch": "1.9.0+cu102"
   },
   "prompts": {},
   "default_prompt_name": null
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:29be9e4e101225bdda169646851a0c0cc63b5554cb49f223bbf8a7dd104007d1
-size 437967672
+oid sha256:946b5b069f6fd0f14632b0e71ee6fcff00ac3c53524a1c8b00f357ee23400c39
+size 46737416
modules.json CHANGED
@@ -10,11 +10,5 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
-  },
-  {
-    "idx": 2,
-    "name": "2",
-    "path": "2_Normalize",
-    "type": "sentence_transformers.models.Normalize"
   }
 ]
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 384,
+  "max_seq_length": 100,
   "do_lower_case": false
 }
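With `max_seq_length` reduced from 384 to 100, sentence-transformers truncates inputs to 100 tokens by default. A short sketch of checking and, if needed, raising the limit at runtime (bounded by the encoder's 512 position embeddings); the repo id is assumed from the README:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('HSR-HF/sts-rf-bc-contrastive')
print(model.max_seq_length)  # 100, as set in sentence_bert_config.json

# Longer inputs are silently truncated; the limit can be raised per session,
# up to max_position_embeddings (512) in the ALBERT config.
model.max_seq_length = 256
```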
special_tokens_map.json CHANGED
@@ -1,27 +1,27 @@
 {
   "bos_token": {
-    "content": "<s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "cls_token": {
-    "content": "<s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "eos_token": {
-    "content": "</s>",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "<mask>",
+    "content": "[MASK]",
     "lstrip": true,
     "normalized": false,
     "rstrip": false,
@@ -35,14 +35,14 @@
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
-    "content": "[UNK]",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
spiece.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336
+size 760289
tokenizer.json CHANGED
The diff for this file is too large to render.
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<s>",
+      "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "<pad>",
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,7 +17,7 @@
       "special": true
     },
     "2": {
-      "content": "</s>",
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -25,23 +25,15 @@
       "special": true
     },
     "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "[UNK]",
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "30526": {
-      "content": "<mask>",
+    "4": {
+      "content": "[MASK]",
       "lstrip": true,
       "normalized": false,
       "rstrip": false,
@@ -49,24 +41,24 @@
       "special": true
     }
   },
-  "bos_token": "<s>",
+  "bos_token": "[CLS]",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "<s>",
+  "cls_token": "[CLS]",
   "do_lower_case": true,
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "max_length": 128,
+  "eos_token": "[SEP]",
+  "keep_accents": false,
+  "mask_token": "[MASK]",
+  "max_length": 100,
   "model_max_length": 512,
   "pad_to_multiple_of": null,
   "pad_token": "<pad>",
   "pad_token_type_id": 0,
   "padding_side": "right",
-  "sep_token": "</s>",
+  "remove_space": true,
+  "sep_token": "[SEP]",
   "stride": 0,
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "MPNetTokenizer",
+  "tokenizer_class": "AlbertTokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
-  "unk_token": "[UNK]"
+  "unk_token": "<unk>"
 }
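The tokenizer changes from `MPNetTokenizer` to the SentencePiece-based `AlbertTokenizer` (hence the new `spiece.model`), with `[CLS]`/`[SEP]`/`<unk>` replacing `<s>`/`</s>`/`[UNK]` as special tokens. A sketch for verifying the mapping after loading from the Hub; the repo id is assumed from the README:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('HSR-HF/sts-rf-bc-contrastive')

# Should reflect special_tokens_map.json from this commit
print(type(tokenizer).__name__)  # AlbertTokenizerFast (or AlbertTokenizer)
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.unk_token, tokenizer.pad_token)
# [CLS] [SEP] <unk> <pad>

# SentencePiece ids for a short example sentence
print(tokenizer('This is an example sentence')['input_ids'])
```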