shuttie committed
Commit 0b16e3b (0 parents)

initial commit

.gitattributes ADDED
@@ -0,0 +1,4 @@
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .venv
+ venv
README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+
+ ---
+
+ # metarank/esci-MiniLM-L6-v2
+
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences and paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ A [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model fine-tuned on the
+ [Amazon ESCI dataset](https://github.com/amazon-science/esci-data).
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is easy once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('metarank/esci-MiniLM-L6-v2')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Training
+
+ The model was trained with the following parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 769 with parameters:
+ ```
+ {'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
+ ```
+ {'scale': 20.0, 'similarity_fct': 'cos_sim'}
+ ```
+
+ Parameters of the `fit()` method:
+ ```
+ {
+     "epochs": 1,
+     "evaluation_steps": 0,
+     "evaluator": "NoneType",
+     "max_grad_norm": 1,
+     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+     "optimizer_params": {
+         "lr": 2e-05
+     },
+     "scheduler": "WarmupLinear",
+     "steps_per_epoch": null,
+     "warmup_steps": 1000,
+     "weight_decay": 0.01
+ }
+ ```
+
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+   (2): Normalize()
+ )
+ ```
+
+ ## Citing & Authors
+
+ * Roman Grebennikov
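The README mentions semantic search but stops at `model.encode`. A minimal sketch of scoring product titles against a query with this model; the query and titles below are invented examples, and `util.cos_sim` is the stock sentence-transformers helper:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('metarank/esci-MiniLM-L6-v2')

query_emb = model.encode("usb c charger", convert_to_tensor=True)
title_embs = model.encode(["Anker 30W USB-C Fast Charger",
                           "HDMI to VGA Adapter Cable"], convert_to_tensor=True)

# Embeddings are L2-normalized, so cosine similarity ranks products by relevance
scores = util.cos_sim(query_emb, title_embs)
print(scores)  # higher score = better match for the query
```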
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "/home/shutty/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.0.0",
+     "transformers": "4.6.1",
+     "pytorch": "1.8.1"
+   }
+ }
finetune.py ADDED
@@ -0,0 +1,82 @@
+ import gzip
+ import json
+ import random
+
+ from torch.utils.data import DataLoader, Dataset
+ from sentence_transformers import SentenceTransformer, InputExample, losses
+ from sentence_transformers.evaluation import RerankingEvaluator
+
+ model_name = 'sentence-transformers/all-MiniLM-L6-v2'
+
+ train_batch_size = 128
+ max_seq_length = 128
+ num_epochs = 1
+ warmup_steps = 1000
+ model_save_path = '.'
+ lr = 2e-5
+
+
+ class ESCIDataset(Dataset):
+     """Builds (query, positive, negative) triplets from a gzipped JSONL file.
+
+     Each line holds a query with its exact ('e') and irrelevant ('i') matches;
+     nine random positive/negative pairs are sampled per query."""
+
+     def __init__(self, input):
+         self.queries = []
+         with gzip.open(input) as jsonfile:
+             for line in jsonfile.readlines():
+                 query = json.loads(line)
+                 for i in range(1, 10):
+                     if len(query['e']) > 0 and len(query['i']) > 0:
+                         positive = random.choice(query['e'])['title']
+                         negative = random.choice(query['i'])['title']
+                         self.queries.append(InputExample(texts=[query['query'], positive, negative]))
+
+     def __getitem__(self, item):
+         return self.queries[item]
+
+     def __len__(self):
+         return len(self.queries)
+
+
+ model = SentenceTransformer(model_name)
+ model.max_seq_length = max_seq_length
+
+ train_dataset = ESCIDataset(input='train-small.json.gz')
+ eval_dataset = ESCIDataset(input='test-small.json.gz')
+ train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
+ train_loss = losses.MultipleNegativesRankingLoss(model=model)
+
+ # Group the eval triplets by query into {'query', 'positive', 'negative'} samples
+ samples = {}
+ for query in eval_dataset.queries:
+     qstr = query.texts[0]
+     sample = samples.get(qstr, {'query': qstr})
+     sample.setdefault('positive', []).append(query.texts[1])
+     sample.setdefault('negative', []).append(query.texts[2])
+     samples[qstr] = sample
+
+ evaluator = RerankingEvaluator(samples=samples, name='esci')
+
+ # Train the model
+ model.fit(train_objectives=[(train_dataloader, train_loss)],
+           epochs=num_epochs,
+           warmup_steps=warmup_steps,
+           use_amp=True,
+           checkpoint_path=model_save_path,
+           checkpoint_save_steps=len(train_dataloader),
+           optimizer_params={'lr': lr},
+           # evaluator=evaluator,
+           # evaluation_steps=300,
+           output_path=model_save_path)
+
+ # Save the model
+ model.save(model_save_path)
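The commit doesn't document the schema of `train-small.json.gz` / `test-small.json.gz`. Judging by the field accesses in `ESCIDataset`, each line appears to be a JSON object shaped like the one below; the field names are inferred from the code and the values are invented:

```python
import gzip
import json

# Hypothetical record: 'e' holds Exact (positive) matches, 'i' Irrelevant (negative) ones
record = {
    "query": "usb c charger",
    "e": [{"title": "Anker 30W USB-C Fast Charger"}],
    "i": [{"title": "HDMI to VGA Adapter Cable"}],
}

# One JSON object per line, gzip-compressed
with gzip.open("example.json.gz", "wt") as f:
    f.write(json.dumps(record) + "\n")
```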
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
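`modules.json` chains three modules: Transformer → Pooling → Normalize, and per the README architecture dump the pooling is mean-over-tokens. A minimal sketch of what modules (1) and (2) compute, useful when consuming raw transformer outputs such as the ONNX export below (the helper name is mine):

```python
import torch
import torch.nn.functional as F

def mean_pool_and_normalize(last_hidden_state, attention_mask):
    # (1) Pooling: average token embeddings, ignoring padded positions
    mask = attention_mask.unsqueeze(-1).float()    # (batch, seq, 1)
    pooled = (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    # (2) Normalize: unit L2 norm, so dot product equals cosine similarity
    return F.normalize(pooled, p=2, dim=1)         # (batch, 384)
```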
onnx_convert.py ADDED
@@ -0,0 +1,18 @@
+ from transformers import AutoModel
+ import torch
+
+ max_seq_length = 128
+
+ model = AutoModel.from_pretrained(".")
+ model.eval()
+
+ # Dummy inputs fix the input layout; the dynamic axes declared below
+ # allow arbitrary batch size and sequence length at inference time
+ inputs = {"input_ids": torch.ones(1, max_seq_length, dtype=torch.int64),
+           "attention_mask": torch.ones(1, max_seq_length, dtype=torch.int64),
+           "token_type_ids": torch.ones(1, max_seq_length, dtype=torch.int64)}
+
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+
+ torch.onnx.export(model, args=tuple(inputs.values()), f='pytorch_model.onnx', export_params=True,
+                   input_names=['input_ids', 'attention_mask', 'token_type_ids'], output_names=['last_hidden_state'],
+                   dynamic_axes={'input_ids': symbolic_names, 'attention_mask': symbolic_names, 'token_type_ids': symbolic_names})
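To sanity-check the export, a sketch of running `pytorch_model.onnx` with onnxruntime (an assumption: onnxruntime is not pinned in `requirements.txt`), applying the mean pooling and normalization from `modules.json` by hand since the graph only emits `last_hidden_state`:

```python
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")
session = ort.InferenceSession("pytorch_model.onnx")

# Tokenize one query; the graph expects int64 input_ids/attention_mask/token_type_ids
enc = tokenizer("usb c charger", return_tensors="np")
(last_hidden_state,) = session.run(None, {k: v.astype(np.int64) for k, v in enc.items()})

# Mean pooling + L2 normalization, mirroring the Pooling/Normalize modules
mask = enc["attention_mask"][..., None].astype(np.float32)
embedding = (last_hidden_state * mask).sum(axis=1) / mask.sum(axis=1)
embedding /= np.linalg.norm(embedding, axis=1, keepdims=True)
print(embedding.shape)  # (1, 384)
```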
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8fd77ee0f10a3e91e0a3798f2b7b0ad1cf92538d062d089ea1fe83594dcdf5ab
+ size 90891565
pytorch_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9a477737434014d347f63a17bd90d092e3f0916e7b728e55ac887fd1b8b064c
+ size 90984263
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ sentence-transformers==2.2.2
+ torch==2.0.0
+ onnx==1.13.1
+ huggingface_hub==0.13.3
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 128,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
test-small.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9542358cbd27f3d8ebcb41757384eb23e7c54a92ea2d8dbdc9e70d22ae5a0743
+ size 69955958
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": "/home/shutty/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/special_tokens_map.json",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
train-small.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c7c4dd1391b2203b4d1c3d5618790ba3c01d9dcdaa1548b15a36ea4ee2d8840
+ size 161159190
vocab.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
+ size 231508