shuttie committed
Commit 103bfab
0 Parent(s):

initial commit
.gitattributes ADDED
@@ -0,0 +1,4 @@
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .venv
+ venv
README.md ADDED
@@ -0,0 +1,83 @@
+ ---
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+
+ ---
+
+ # metarank/ce-esci-MiniLM-L6-v2
+
+ This is a [sentence-transformers](https://www.SBERT.net) cross-encoder model: it takes a (query, document title) pair and outputs a single relevance score, which makes it suitable for reranking candidates in semantic search.
+
+ A [cross-encoder/ms-marco-MiniLM-L-12-v2](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-12-v2) model fine-tuned on the
+ [Amazon ESCI dataset](https://github.com/amazon-science/esci-data).
+
+ ## Usage (Sentence-Transformers)
+
+ Using this model is straightforward once you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can score query-document pairs like this:
+
+ ```python
+ from sentence_transformers import CrossEncoder
+
+ model = CrossEncoder('metarank/ce-esci-MiniLM-L6-v2')
+ scores = model.predict([
+     ['red dress', 'Long Sleeve Casual Loose Red Dress'],
+     ['red dress', 'USB-C charging cable, 2m'],
+ ])
+ print(scores)
+ ```
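+
+ ## Usage (HuggingFace Transformers)
+
+ A minimal sketch with plain `transformers`, loading the `BertForSequenceClassification` head declared in `config.json` (the query/title pairs are illustrative):
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained('metarank/ce-esci-MiniLM-L6-v2')
+ model = AutoModelForSequenceClassification.from_pretrained('metarank/ce-esci-MiniLM-L6-v2')
+
+ # Tokenize (query, title) pairs together; the model scores each pair.
+ features = tokenizer(['red dress', 'red dress'],
+                      ['Long Sleeve Casual Loose Red Dress', 'USB-C charging cable, 2m'],
+                      padding=True, truncation=True, return_tensors='pt')
+ with torch.no_grad():
+     scores = model(**features).logits
+ print(scores)
+ ```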
+
+ ## Training
+ The model was trained with the parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 769 with parameters:
+ ```
+ {'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
+ ```
+ {'scale': 20.0, 'similarity_fct': 'cos_sim'}
+ ```
+
+ Parameters of the `fit()` method:
+ ```
+ {
+   "epochs": 1,
+   "evaluation_steps": 0,
+   "evaluator": "NoneType",
+   "max_grad_norm": 1,
+   "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+   "optimizer_params": {
+     "lr": 2e-05
+   },
+   "scheduler": "WarmupLinear",
+   "steps_per_epoch": null,
+   "warmup_steps": 1000,
+   "weight_decay": 0.01
+ }
+ ```
+
+ ## Full Model Architecture
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+   (2): Normalize()
+ )
+ ```
+
+ ## Citing & Authors
+
+ * Roman Grebennikov
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "cross-encoder/ms-marco-MiniLM-L-12-v2",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "sbert_ce_default_activation_function": "torch.nn.modules.linear.Identity",
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
finetune.py ADDED
@@ -0,0 +1,99 @@
+ from sentence_transformers import InputExample, CrossEncoder
+ from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
+ from torch.utils.data import DataLoader, Dataset
+ import gzip
+ import json
+
+ model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
+
+ # Hyperparameters
+ train_batch_size = 32
+ max_seq_length = 128
+ num_epochs = 1
+ warmup_steps = 1000
+ model_save_path = '.'
+ lr = 2e-5
+
+ class ESCIDataset(Dataset):
+     """Training pairs (query, product title) with graded labels derived from the
+     ESCI judgement: e=Exact, s=Substitute, c=Complement, i=Irrelevant."""
+     def __init__(self, path):
+         self.queries = []
+         with gzip.open(path) as jsonfile:
+             for line in jsonfile:
+                 query = json.loads(line)
+                 for doc in query['e']:
+                     self.queries.append(InputExample(texts=[query['query'], doc['title']], label=1.0))
+                 for doc in query['s']:
+                     self.queries.append(InputExample(texts=[query['query'], doc['title']], label=0.1))
+                 for doc in query['c']:
+                     self.queries.append(InputExample(texts=[query['query'], doc['title']], label=0.01))
+                 for doc in query['i']:
+                     self.queries.append(InputExample(texts=[query['query'], doc['title']], label=0.0))
+
+     def __getitem__(self, item):
+         return self.queries[item]
+
+     def __len__(self):
+         return len(self.queries)
+
+ class ESCIEvalDataset(Dataset):
+     """Evaluation triplets (query, positive title, negative title), built from the
+     cross product of Exact (positive) and Irrelevant (negative) documents."""
+     def __init__(self, path):
+         self.queries = []
+         with gzip.open(path) as jsonfile:
+             for line in jsonfile:
+                 query = json.loads(line)
+                 if len(query['e']) > 0 and len(query['i']) > 0:
+                     for p in query['e']:
+                         positive = p['title']
+                         for n in query['i']:
+                             negative = n['title']
+                             self.queries.append(InputExample(texts=[query['query'], positive, negative]))
+
+     def __getitem__(self, item):
+         return self.queries[item]
+
+     def __len__(self):
+         return len(self.queries)
+
+ # num_labels=1 yields a single relevance score; max_length bounds tokenized input length.
+ model = CrossEncoder(model_name, num_labels=1, max_length=max_seq_length)
+
+ train_dataset = ESCIDataset(path='train-small.json.gz')
+ eval_dataset = ESCIEvalDataset(path='test-small.json.gz')
+ train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
+
+ # Regroup the eval triplets per query into the {'query', 'positive', 'negative'}
+ # samples expected by CERerankingEvaluator.
+ samples = {}
+ for query in eval_dataset.queries:
+     qstr = query.texts[0]
+     sample = samples.get(qstr, {'query': qstr})
+     positive = sample.get('positive', [])
+     positive.append(query.texts[1])
+     sample['positive'] = positive
+     negative = sample.get('negative', [])
+     negative.append(query.texts[2])
+     sample['negative'] = negative
+     samples[qstr] = sample
+
+ evaluator = CERerankingEvaluator(samples=samples, name='esci')
+
+ # Train the model
+ model.fit(train_dataloader=train_dataloader,
+           epochs=num_epochs,
+           warmup_steps=warmup_steps,
+           use_amp=True,
+           optimizer_params={'lr': lr},
+           evaluator=evaluator,
+           # evaluation_steps=1000,
+           output_path=model_save_path)
+
+ # Save the model
+ model.save(model_save_path)
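For reference, a minimal sketch of the gzipped JSON-lines layout that `ESCIDataset` and `ESCIEvalDataset` appear to expect, inferred from the field accesses above (one object per query with per-grade document lists `e`/`s`/`c`/`i`; the real `train-small.json.gz` records may carry more fields):

```python
import gzip
import json

# One JSON object per line: a query plus judged products bucketed by ESCI grade.
record = {
    "query": "red dress",
    "e": [{"title": "Long Sleeve Casual Loose Red Dress"}],  # Exact
    "s": [{"title": "Burgundy Evening Gown"}],               # Substitute
    "c": [{"title": "Red Dress Belt"}],                      # Complement
    "i": [{"title": "USB-C charging cable, 2m"}],            # Irrelevant
}
with gzip.open("sample.json.gz", "wt") as out:
    out.write(json.dumps(record) + "\n")
```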
onnx_convert.py ADDED
@@ -0,0 +1,18 @@
+ from transformers import AutoModel
+ import torch
+
+ max_seq_length = 128
+
+ # AutoModel loads the base BertModel from the local checkpoint; the
+ # sequence-classification head is not part of this export.
+ model = AutoModel.from_pretrained(".")
+ model.eval()
+
+ # Dummy inputs used only to trace the graph during export.
+ inputs = {"input_ids": torch.ones(1, max_seq_length, dtype=torch.int64),
+           "attention_mask": torch.ones(1, max_seq_length, dtype=torch.int64),
+           "token_type_ids": torch.ones(1, max_seq_length, dtype=torch.int64)}
+
+ # Mark batch and sequence dimensions as dynamic in the exported graph.
+ symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
+
+ torch.onnx.export(model, args=tuple(inputs.values()), f='pytorch_model.onnx', export_params=True,
+                   input_names=['input_ids', 'attention_mask', 'token_type_ids'], output_names=['last_hidden_state'],
+                   dynamic_axes={'input_ids': symbolic_names, 'attention_mask': symbolic_names, 'token_type_ids': symbolic_names})
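A quick sanity check of the exported graph, as a sketch assuming `onnxruntime` is installed (it is not listed in requirements.txt):

```python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("pytorch_model.onnx", providers=["CPUExecutionProvider"])
batch = {
    "input_ids": np.ones((1, 128), dtype=np.int64),
    "attention_mask": np.ones((1, 128), dtype=np.int64),
    "token_type_ids": np.ones((1, 128), dtype=np.int64),
}
# The export declares a single output, the encoder's last_hidden_state,
# of shape (batch_size, seq_len, 384).
(last_hidden_state,) = session.run(["last_hidden_state"], batch)
print(last_hidden_state.shape)
```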
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dcfb2efa8e9be4d55c8353e38f61ccfd7223e0bfc2f24ab8af495b2cbbc8bc3
+ size 133514357
pytorch_model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb0312525f025d18e7013477ed8c389ad104591fdbfda838599762dad8608acb
+ size 133694712
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ sentence-transformers==2.2.2
+ torch==2.0.0
+ onnx==1.13.1
+ huggingface_hub==0.13.3
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
test-small.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb557251b12addb55d94af30120d121dfa6391e58bcc4a9aee0f1d35cc2ea1c8
+ size 8522018
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": "/home/shutty/.cache/huggingface/hub/models--cross-encoder--ms-marco-MiniLM-L-12-v2/snapshots/97f7dcbdd6ab58fe7f44368c795fc5200b48fcbe/special_tokens_map.json",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
train-small.json.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c7c14a8910a3a6c09421a08a84cfc0e74fd198d0aaf43ab2c39250a8ae4e4dd
+ size 19430577
vocab.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
+ size 231508