96abhishekarora committed
Commit 37f5aa1 • 1 Parent(s): 4e7e74b
Modified validation and training for linktransformer model

Files changed:
- .gitattributes +1 -0
- 1_Pooling/config.json +1 -1
- LT_training_config.json +7 -7
- README.md +11 -10
- config.json +22 -14
- config_sentence_transformers.json +3 -3
- model.safetensors +2 -2
- modules.json +6 -0
- sentence_bert_config.json +1 -1
- special_tokens_map.json +5 -19
- tokenizer.json +0 -0
- tokenizer_config.json +18 -26
- vocab.txt +0 -5
.gitattributes CHANGED
@@ -37,3 +37,4 @@ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/21/f6/21f62a2f5d51a4a54c4b42e01fb10b45fc505cae8c7ad33ea62a89790a951532 filter=lfs diff=lfs merge=lfs -text
 model.safetensors filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/3a/24/3a24ab799bbbdf5df9869468abcb2eb4c97436c5eb810627cdd569b2e926572b filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/4d/88/4d88dbbf624b0416b72827f3c8d16f8e21f1a59f8047e737eece87fb5a10b424 filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json CHANGED
@@ -1,5 +1,5 @@
 {
-    "word_embedding_dimension": …,
+    "word_embedding_dimension": 1024,
     "pooling_mode_cls_token": true,
     "pooling_mode_mean_tokens": false,
     "pooling_mode_max_tokens": false,
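The pooling change simply tracks the new 1024-dimensional backbone while keeping CLS pooling. As a minimal illustration (not code from this repo), `pooling_mode_cls_token` means the sentence vector is just the first token's hidden state:

```python
# Illustration only: CLS pooling keeps the first token's hidden state as the sentence embedding.
import torch

token_embeddings = torch.randn(2, 16, 1024)      # (batch, seq_len, hidden) from a BERT-large-sized encoder
sentence_embeddings = token_embeddings[:, 0, :]  # pooling_mode_cls_token: take position 0 ([CLS])
print(sentence_embeddings.shape)                 # torch.Size([2, 1024])
```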
LT_training_config.json CHANGED
@@ -1,10 +1,10 @@
 {
     "model_save_dir": "models",
-    "model_save_name": "…",
-    "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for …",
+    "model_save_name": "linkage_en_aliases_large",
+    "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
     "opt_model_lang": "en",
-    "train_batch_size": …,
-    "num_epochs": …,
+    "train_batch_size": 256,
+    "num_epochs": 30,
     "warm_up_perc": 1,
     "learning_rate": 2e-05,
     "loss_type": "supcon",
@@ -12,7 +12,7 @@
     "wandb_names": {
         "project": "linkage",
         "id": "econabhishek",
-        "run": "…",
+        "run": "linkage_en_aliases_large",
         "entity": "econabhishek"
     },
     "add_pooling_layer": false,
@@ -24,6 +24,6 @@
     "loss_params": {},
     "eval_type": "retrieval",
     "training_dataset": "dataframe",
-    "base_model_path": "…",
-    "best_model_path": "models/…"
+    "base_model_path": "BAAI/bge-large-en-v1.5",
+    "best_model_path": "models/linkage_en_aliases_large"
 }
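Configs like this are normally emitted by LinkTransformer's training entry point rather than written by hand. The sketch below is an assumption-heavy illustration of such a call: `train_model` is the package's training function, but the exact parameter names, the dataset file, and the column names here are placeholders and should be checked against the installed LinkTransformer version.

```python
# Hypothetical sketch of a training call that could produce an LT_training_config.json like the one above.
# Parameter names, the CSV file, and column names are assumptions, not taken from this repo.
import linktransformer as lt

saved_path = lt.train_model(
    model_path="BAAI/bge-large-en-v1.5",       # matches base_model_path in the config above
    dataset_path="company_aliases.csv",        # placeholder: company-alias pairs from Wikidata
    left_col_names=["alias"],                  # placeholder column names
    right_col_names=["company_name"],
    training_args={"num_epochs": 30, "train_batch_size": 256, "learning_rate": 2e-5},
)
print(saved_path)  # e.g. models/linkage_en_aliases_large
```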
README.md CHANGED
@@ -15,15 +15,15 @@ tags:
 This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core this model this is a sentence transformer model [sentence-transformers](https://www.SBERT.net) model- it just wraps around the class.
 It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
 Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
-It maps sentences & paragraphs to a …
+It maps sentences & paragraphs to a 1024 dimensional dense vector space and can be used for tasks like clustering or semantic search.
 Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
 
 
-This model has been fine-tuned on the model : …
+This model has been fine-tuned on the model : BAAI/bge-large-en-v1.5. It is pretrained for the language : - en.
 
 
 This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework.
-It was trained for …
+It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
 
 
 ## Usage (LinkTransformer)
@@ -97,9 +97,9 @@ The model was trained with the parameters:
 
 **DataLoader**:
 
-`torch.utils.data.dataloader.DataLoader` of length … with parameters:
+`torch.utils.data.dataloader.DataLoader` of length 522 with parameters:
 ```
-{'batch_size': …}
+{'batch_size': 256, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
 
 **Loss**:
@@ -109,8 +109,8 @@
 Parameters of the fit()-Method:
 ```
 {
-    "epochs": …,
-    "evaluation_steps": …,
+    "epochs": 30,
+    "evaluation_steps": 261,
     "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
     "max_grad_norm": 1,
     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
@@ -119,7 +119,7 @@
     },
     "scheduler": "WarmupLinear",
     "steps_per_epoch": null,
-    "warmup_steps": …,
+    "warmup_steps": 15660,
     "weight_decay": 0.01
 }
 ```
@@ -128,8 +128,9 @@
 
 
 LinkTransformer(
-  (0): Transformer({'max_seq_length': 512, 'do_lower_case': …
-  (1): Pooling({'word_embedding_dimension': …
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+  (2): Normalize()
 )
 ```
 
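Since the card notes the model is a plain sentence-transformers model under the hood, a minimal usage sketch outside the LinkTransformer API could look like the following. The local path comes from LT_training_config.json in this commit; in practice you would substitute the model's Hub ID.

```python
# Minimal sketch: use the model directly with sentence-transformers for alias similarity.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("models/linkage_en_aliases_large")   # or the model's Hub ID
aliases = ["International Business Machines", "IBM Corp.", "Apple Inc."]
embeddings = model.encode(aliases, convert_to_tensor=True)       # 1024-d, L2-normalized vectors
print(util.cos_sim(embeddings[0], embeddings[1:]))               # matching aliases should score higher
```

The updated training numbers are also internally consistent: 522 batches per epoch × 30 epochs = 15,660 optimizer steps, which matches `warmup_steps` under a `warm_up_perc` of 1, and `evaluation_steps` of 261 corresponds to one evaluation every half epoch.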
config.json CHANGED
@@ -1,24 +1,32 @@
 {
-    "_name_or_path": "models/…",
+    "_name_or_path": "models/linkage_en_aliases_large",
     "architectures": [
-        "…"
+        "BertModel"
     ],
     "attention_probs_dropout_prob": 0.1,
-    "…",
-    "…",
+    "classifier_dropout": null,
+    "gradient_checkpointing": false,
     "hidden_act": "gelu",
     "hidden_dropout_prob": 0.1,
-    "hidden_size": …,
+    "hidden_size": 1024,
+    "id2label": {
+        "0": "LABEL_0"
+    },
     "initializer_range": 0.02,
-    "intermediate_size": …,
-    "…",
-    "…",
-    "…",
-    "…",
-    "…",
-    "…",
-    "…",
+    "intermediate_size": 4096,
+    "label2id": {
+        "LABEL_0": 0
+    },
+    "layer_norm_eps": 1e-12,
+    "max_position_embeddings": 512,
+    "model_type": "bert",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "pad_token_id": 0,
+    "position_embedding_type": "absolute",
     "torch_dtype": "float32",
     "transformers_version": "4.35.1",
-    "…"
+    "type_vocab_size": 2,
+    "use_cache": true,
+    "vocab_size": 30522
 }
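The new encoder geometry (hidden size 1024, 24 layers, 16 heads, 30,522-token vocabulary) is the standard BERT-large shape used by BAAI/bge-large-en-v1.5. A quick sanity check with transformers, assuming the exported files are available at the local path used in this commit:

```python
# Sketch: confirm the exported encoder matches the BERT-large geometry shown above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("models/linkage_en_aliases_large")  # or the model's Hub ID
assert cfg.model_type == "bert"
assert (cfg.hidden_size, cfg.num_hidden_layers, cfg.num_attention_heads) == (1024, 24, 16)
assert cfg.vocab_size == 30522
```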
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
 {
     "__version__": {
-        "sentence_transformers": "2.…",
-        "transformers": "4.…",
-        "pytorch": "1.…"
+        "sentence_transformers": "2.2.2",
+        "transformers": "4.28.1",
+        "pytorch": "1.13.0+cu117"
     }
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:f9fd3575f11cffb2c51379c89222cf7d37d1de659dd27639a0a118eaa88e96df
+size 1340612432
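For scale, 1,340,612,432 bytes at 4 bytes per float32 parameter works out to roughly 335M parameters, consistent with a BERT-large-sized encoder.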
modules.json CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
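The newly added `2_Normalize` module means embeddings leave the model unit-length, so dot product and cosine similarity coincide. Below is a sketch of the equivalent module stack built by hand with the sentence-transformers API; the base checkpoint name is taken from LT_training_config.json, whereas loading the fine-tuned weights would of course use the saved model path instead.

```python
# Sketch: the three modules listed above, assembled explicitly with sentence-transformers.
from sentence_transformers import SentenceTransformer, models

word = models.Transformer("BAAI/bge-large-en-v1.5", max_seq_length=512)             # module 0
pooling = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="cls")   # module 1 (1_Pooling)
normalize = models.Normalize()                                                      # module 2 (2_Normalize, new)
model = SentenceTransformer(modules=[word, pooling, normalize])

emb = model.encode(["Acme Corporation"])[0]
print((emb ** 2).sum())  # ≈ 1.0: vectors are L2-normalized by the final module
```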
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
     "max_seq_length": 512,
-    "do_lower_case": …
+    "do_lower_case": true
 }
special_tokens_map.json CHANGED
@@ -1,41 +1,27 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "…",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "…",
-    "lstrip": …,
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "…",
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "…",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json CHANGED
@@ -1,71 +1,63 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "…",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…": {
-      "content": "…",
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…": {
-      "content": "…",
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…": {
-      "content": "…",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "[UNK]",
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…": {
-      "content": "…",
-      "lstrip": …,
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "…",
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
   "do_lower_case": true,
-  "…",
-  "…",
-  "max_length": 250,
+  "mask_token": "[MASK]",
+  "max_length": 512,
   "model_max_length": 512,
+  "never_split": null,
   "pad_to_multiple_of": null,
-  "pad_token": "…",
+  "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
-  "sep_token": "…",
+  "sep_token": "[SEP]",
   "stride": 0,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "…",
+  "tokenizer_class": "BertTokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
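Together with special_tokens_map.json and vocab.txt, this switches the tokenizer from the previous RoBERTa-style `<s>`/`</s>` tokens to the standard BERT WordPiece setup of the new base model. A quick check, assuming the files are loaded from the local path used in this commit:

```python
# Sketch: verify the special tokens and their IDs match the config above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("models/linkage_en_aliases_large")  # or the model's Hub ID
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.mask_token)      # [CLS] [SEP] [PAD] [MASK]
print(tok.convert_tokens_to_ids(["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]))  # [0, 100, 101, 102, 103]
```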
vocab.txt CHANGED
@@ -1,7 +1,3 @@
-<s>
-<pad>
-</s>
-<unk>
 [PAD]
 [unused0]
 [unused1]
@@ -30524,4 +30520,3 @@ necessitated
 ##:
 ##?
 ##~
-<mask>
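With the five RoBERTa-style entries (`<s>`, `<pad>`, `</s>`, `<unk>`, `<mask>`) removed, vocab.txt is back to the standard 30,522-line BERT WordPiece vocabulary, matching `vocab_size` in config.json.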