Commit 37f5aa1 by abhishekarora
Parent(s): 4e7e74b

Modified validation and training for linktransformer model

In summary: the base model changes from multi-qa-mpnet-base-dot-v1 (MPNet, 768-dim embeddings) to BAAI/bge-large-en-v1.5 (BERT-large, 1024-dim), a Normalize module is appended to the sentence-transformers pipeline, and training was rerun for 30 epochs at batch size 256 instead of 100 epochs at batch size 64.
.gitattributes CHANGED
@@ -37,3 +37,4 @@ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/21/f6/21f62a2f5d51a4a54c4b42e01fb10b45fc505cae8c7ad33ea62a89790a951532 filter=lfs diff=lfs merge=lfs -text
 model.safetensors filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/3a/24/3a24ab799bbbdf5df9869468abcb2eb4c97436c5eb810627cdd569b2e926572b filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/4d/88/4d88dbbf624b0416b72827f3c8d16f8e21f1a59f8047e737eece87fb5a10b424 filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "word_embedding_dimension": 768,
+  "word_embedding_dimension": 1024,
   "pooling_mode_cls_token": true,
   "pooling_mode_mean_tokens": false,
   "pooling_mode_max_tokens": false,
LT_training_config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "model_save_dir": "models",
-  "model_save_name": "linkage_en_aliases",
-  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
+  "model_save_name": "linkage_en_aliases_large",
+  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
   "opt_model_lang": "en",
-  "train_batch_size": 64,
-  "num_epochs": 100,
+  "train_batch_size": 256,
+  "num_epochs": 30,
   "warm_up_perc": 1,
   "learning_rate": 2e-05,
   "loss_type": "supcon",
@@ -12,7 +12,7 @@
   "wandb_names": {
     "project": "linkage",
     "id": "econabhishek",
-    "run": "linkage_en_aliases",
+    "run": "linkage_en_aliases_large",
     "entity": "econabhishek"
   },
   "add_pooling_layer": false,
@@ -24,6 +24,6 @@
   "loss_params": {},
   "eval_type": "retrieval",
   "training_dataset": "dataframe",
-  "base_model_path": "multi-qa-mpnet-base-dot-v1",
-  "best_model_path": "models/linkage_en_aliases"
+  "base_model_path": "BAAI/bge-large-en-v1.5",
+  "best_model_path": "models/linkage_en_aliases_large"
 }
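For readers who want to reproduce a comparable run without the LinkTransformer wrapper, here is a hedged sketch of the equivalent sentence-transformers fit() call. The "supcon" loss is LinkTransformer's own implementation; MultipleNegativesRankingLoss stands in for it below, and the two InputExample pairs are illustrative. Note that warm_up_perc = 1 means warmup spans the whole run (30 epochs x 522 batches = 15660 steps, matching the README's warmup_steps).

```python
# Hedged sketch of the training run implied by the config above, in plain
# sentence-transformers. The stand-in loss and toy data are assumptions;
# LinkTransformer's actual "supcon" loss lives in its own package.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("BAAI/bge-large-en-v1.5")      # "base_model_path"

train_examples = [  # illustrative alias pairs; the real data comes from Wikidata
    InputExample(texts=["International Business Machines", "IBM"]),
    InputExample(texts=["Apple Computer Inc.", "Apple Inc."]),
]
loader = DataLoader(train_examples, shuffle=True, batch_size=256)  # "train_batch_size"
loss = losses.MultipleNegativesRankingLoss(model)  # stand-in for "supcon"

model.fit(
    train_objectives=[(loader, loss)],
    epochs=30,                            # "num_epochs"
    warmup_steps=30 * len(loader),        # "warm_up_perc": 1 -> warmup over all steps
    optimizer_params={"lr": 2e-05},       # "learning_rate"
    weight_decay=0.01,
    max_grad_norm=1,
    output_path="models/linkage_en_aliases_large",
)
```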
README.md CHANGED
@@ -15,15 +15,15 @@ tags:
 This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core, it is a [sentence-transformers](https://www.SBERT.net) model; LinkTransformer simply wraps around that class.
 It is designed for quick and easy record linkage (entity matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
 Notwithstanding that, it can be used for any sentence-similarity task within the sentence-transformers framework as well.
-It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
 Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
 
 
-This model has been fine-tuned from the base model multi-qa-mpnet-base-dot-v1. It is pretrained for the language: en.
+This model has been fine-tuned from the base model BAAI/bge-large-en-v1.5. It is pretrained for the language: en.
 
 
 This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework.
-It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
+It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
 
 
 ## Usage (LinkTransformer)
@@ -97,9 +97,9 @@ The model was trained with the parameters:
 
 **DataLoader**:
 
-`torch.utils.data.dataloader.DataLoader` of length 2087 with parameters:
+`torch.utils.data.dataloader.DataLoader` of length 522 with parameters:
 ```
-{'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+{'batch_size': 256, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
 
 **Loss**:
@@ -109,8 +109,8 @@
 Parameters of the fit()-Method:
 ```
 {
-    "epochs": 100,
-    "evaluation_steps": 1044,
+    "epochs": 30,
+    "evaluation_steps": 261,
     "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
     "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
@@ -119,7 +119,7 @@ Parameters of the fit()-Method:
     },
     "scheduler": "WarmupLinear",
     "steps_per_epoch": null,
-    "warmup_steps": 208700,
+    "warmup_steps": 15660,
     "weight_decay": 0.01
 }
 ```
@@ -128,8 +128,9 @@ Parameters of the fit()-Method:
 
 
 LinkTransformer(
-  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel
-  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+  (2): Normalize()
 )
 ```
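The body of the "## Usage (LinkTransformer)" section falls outside the diff hunks above. For context, a hedged sketch of how a checkpoint like this is typically used through the LinkTransformer package (lt.merge is the package's record-linkage entry point; the tiny DataFrames are illustrative, and the model path is the local "best_model_path" from the training config):

```python
# Hedged usage sketch; the data is illustrative and the model path is the
# local "best_model_path" from LT_training_config.json (swap in this repo's
# hub id when loading remotely).
import pandas as pd
import linktransformer as lt

df1 = pd.DataFrame({"CompanyName": ["International Business Machines", "Apple Inc."]})
df2 = pd.DataFrame({"CompanyName": ["IBM", "Apple Computer"]})

# Fuzzy-join df2 onto df1 by embedding similarity of the "CompanyName" column
df_matched = lt.merge(df1, df2, merge_type="1:m", on="CompanyName",
                      model="models/linkage_en_aliases_large")
print(df_matched.head())
```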
config.json CHANGED
@@ -1,24 +1,32 @@
 {
-  "_name_or_path": "models/linkage_en_aliases",
+  "_name_or_path": "models/linkage_en_aliases_large",
   "architectures": [
-    "MPNetModel"
+    "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0"
+  },
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 514,
-  "model_type": "mpnet",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "relative_attention_num_buckets": 32,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.35.1",
-  "vocab_size": 30527
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
 }
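The new values are the stock BERT-large shape of the replacement backbone. A quick, hedged sanity check against the upstream config:

```python
# Sanity-check the new backbone's shape against the diff above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("BAAI/bge-large-en-v1.5")
assert cfg.model_type == "bert"
assert cfg.hidden_size == 1024 and cfg.intermediate_size == 4096
assert cfg.num_hidden_layers == 24 and cfg.num_attention_heads == 16
assert cfg.vocab_size == 30522
```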
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
 {
   "__version__": {
-    "sentence_transformers": "2.0.0",
-    "transformers": "4.6.1",
-    "pytorch": "1.8.1"
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.28.1",
+    "pytorch": "1.13.0+cu117"
   }
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d88dbbf624b0416b72827f3c8d16f8e21f1a59f8047e737eece87fb5a10b424
-size 437967672
+oid sha256:f9fd3575f11cffb2c51379c89222cf7d37d1de659dd27639a0a118eaa88e96df
+size 1340612432
modules.json CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
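With the appended Normalize module, encode() now returns unit-length vectors, so a dot product equals cosine similarity. A hedged check (the local model path is assumed from the training config; use the hub id when loading remotely):

```python
# With module (2): Normalize, embeddings come out L2-normalized.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("models/linkage_en_aliases_large")  # local save dir
emb = model.encode(["Apple Inc.", "Apple Computer"])
print(emb.shape)                    # (2, 1024)
print(np.linalg.norm(emb, axis=1))  # ~[1. 1.], so dot product == cosine similarity
```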
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
   "max_seq_length": 512,
-  "do_lower_case": false
+  "do_lower_case": true
 }
special_tokens_map.json CHANGED
@@ -1,41 +1,27 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "<mask>",
-    "lstrip": true,
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<pad>",
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,71 +1,63 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<s>",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1": {
-      "content": "<pad>",
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "2": {
-      "content": "</s>",
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "[UNK]",
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "30526": {
-      "content": "<mask>",
-      "lstrip": true,
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "<s>",
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
   "do_lower_case": true,
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "max_length": 250,
+  "mask_token": "[MASK]",
+  "max_length": 512,
   "model_max_length": 512,
+  "never_split": null,
   "pad_to_multiple_of": null,
-  "pad_token": "<pad>",
+  "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
-  "sep_token": "</s>",
+  "sep_token": "[SEP]",
   "stride": 0,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "MPNetTokenizer",
+  "tokenizer_class": "BertTokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
vocab.txt CHANGED
@@ -1,7 +1,3 @@
-<s>
-<pad>
-</s>
-<unk>
 [PAD]
 [unused0]
 [unused1]
@@ -30524,4 +30520,3 @@ necessitated
 ##:
 ##?
 ##~
-<mask>
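The MPNet-specific tokens are dropped, shrinking the vocabulary from 30527 to 30522 entries. A one-liner to confirm the file agrees with config.json:

```python
# Count vocab entries; should equal "vocab_size" (30522) in config.json.
with open("vocab.txt", encoding="utf-8") as f:
    print(sum(1 for _ in f))  # expected: 30522
```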