Commit 37f5aa1 by abhishekarora
Parent(s): 4e7e74b

Modified validation and training for linktransformer model

In summary: the base model changes from multi-qa-mpnet-base-dot-v1 (MPNet, 768-dim embeddings) to BAAI/bge-large-en-v1.5 (BERT-large, 1024-dim), a Normalize module is appended to the sentence-transformers pipeline, and training was rerun for 30 epochs at batch size 256 instead of 100 epochs at batch size 64.
.gitattributes CHANGED
@@ -37,3 +37,4 @@ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/21/f6/21f62a2f5d51a4a54c4b42e01fb10b45fc505cae8c7ad33ea62a89790a951532 filter=lfs diff=lfs merge=lfs -text
 model.safetensors filter=lfs diff=lfs merge=lfs -text
 .git/lfs/objects/3a/24/3a24ab799bbbdf5df9869468abcb2eb4c97436c5eb810627cdd569b2e926572b filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/4d/88/4d88dbbf624b0416b72827f3c8d16f8e21f1a59f8047e737eece87fb5a10b424 filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "word_embedding_dimension": 768,
+  "word_embedding_dimension": 1024,
   "pooling_mode_cls_token": true,
   "pooling_mode_mean_tokens": false,
   "pooling_mode_max_tokens": false,
LT_training_config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "model_save_dir": "models",
-  "model_save_name": "linkage_en_aliases",
-  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
+  "model_save_name": "linkage_en_aliases_large",
+  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
   "opt_model_lang": "en",
-  "train_batch_size": 64,
-  "num_epochs": 100,
+  "train_batch_size": 256,
+  "num_epochs": 30,
   "warm_up_perc": 1,
   "learning_rate": 2e-05,
   "loss_type": "supcon",
@@ -12,7 +12,7 @@
   "wandb_names": {
     "project": "linkage",
     "id": "econabhishek",
-    "run": "linkage_en_aliases",
+    "run": "linkage_en_aliases_large",
     "entity": "econabhishek"
   },
   "add_pooling_layer": false,
@@ -24,6 +24,6 @@
   "loss_params": {},
   "eval_type": "retrieval",
   "training_dataset": "dataframe",
-  "base_model_path": "multi-qa-mpnet-base-dot-v1",
-  "best_model_path": "models/linkage_en_aliases"
+  "base_model_path": "BAAI/bge-large-en-v1.5",
+  "best_model_path": "models/linkage_en_aliases_large"
 }
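For readers who want to reproduce a comparable run without the LinkTransformer wrapper, here is a hedged sketch of the equivalent sentence-transformers fit() call. The "supcon" loss is LinkTransformer's own implementation; MultipleNegativesRankingLoss stands in for it below, and the two InputExample pairs are illustrative. Note that warm_up_perc = 1 means warmup spans the whole run (30 epochs x 522 batches = 15660 steps, matching the README's warmup_steps).

```python
# Hedged sketch of the training run implied by the config above, in plain
# sentence-transformers. The stand-in loss and toy data are assumptions;
# LinkTransformer's actual "supcon" loss lives in its own package.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("BAAI/bge-large-en-v1.5")      # "base_model_path"

train_examples = [  # illustrative alias pairs; the real data comes from Wikidata
    InputExample(texts=["International Business Machines", "IBM"]),
    InputExample(texts=["Apple Computer Inc.", "Apple Inc."]),
]
loader = DataLoader(train_examples, shuffle=True, batch_size=256)  # "train_batch_size"
loss = losses.MultipleNegativesRankingLoss(model)  # stand-in for "supcon"

model.fit(
    train_objectives=[(loader, loss)],
    epochs=30,                            # "num_epochs"
    warmup_steps=30 * len(loader),        # "warm_up_perc": 1 -> warmup over all steps
    optimizer_params={"lr": 2e-05},       # "learning_rate"
    weight_decay=0.01,
    max_grad_norm=1,
    output_path="models/linkage_en_aliases_large",
)
```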
README.md CHANGED
@@ -15,15 +15,15 @@ tags:
 This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core, it is a [sentence-transformers](https://www.SBERT.net) model; LinkTransformer simply wraps around that class.
 It is designed for quick and easy record linkage (entity matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
 Notwithstanding that, it can be used for any sentence-similarity task within the sentence-transformers framework as well.
-It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
 Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
 
 
-This model has been fine-tuned from the base model multi-qa-mpnet-base-dot-v1. It is pretrained for the language: en.
+This model has been fine-tuned from the base model BAAI/bge-large-en-v1.5. It is pretrained for the language: en.
 
 
 This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework.
-It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
+It was trained for 30 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
 
 
 ## Usage (LinkTransformer)
@@ -97,9 +97,9 @@ The model was trained with the parameters:
 
 **DataLoader**:
 
-`torch.utils.data.dataloader.DataLoader` of length 2087 with parameters:
+`torch.utils.data.dataloader.DataLoader` of length 522 with parameters:
 ```
-{'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+{'batch_size': 256, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
 
 **Loss**:
@@ -109,8 +109,8 @@
 Parameters of the fit()-Method:
 ```
 {
-    "epochs": 100,
-    "evaluation_steps": 1044,
+    "epochs": 30,
+    "evaluation_steps": 261,
     "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
     "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
@@ -119,7 +119,7 @@ Parameters of the fit()-Method:
     },
     "scheduler": "WarmupLinear",
     "steps_per_epoch": null,
-    "warmup_steps": 208700,
+    "warmup_steps": 15660,
     "weight_decay": 0.01
 }
 ```
@@ -128,8 +128,9 @@ Parameters of the fit()-Method:
 
 
 LinkTransformer(
-  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel
-  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+  (2): Normalize()
 )
 ```
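The body of the "## Usage (LinkTransformer)" section falls outside the diff hunks above. For context, a hedged sketch of how a checkpoint like this is typically used through the LinkTransformer package (lt.merge is the package's record-linkage entry point; the tiny DataFrames are illustrative, and the model path is the local "best_model_path" from the training config):

```python
# Hedged usage sketch; the data is illustrative and the model path is the
# local "best_model_path" from LT_training_config.json (swap in this repo's
# hub id when loading remotely).
import pandas as pd
import linktransformer as lt

df1 = pd.DataFrame({"CompanyName": ["International Business Machines", "Apple Inc."]})
df2 = pd.DataFrame({"CompanyName": ["IBM", "Apple Computer"]})

# Fuzzy-join df2 onto df1 by embedding similarity of the "CompanyName" column
df_matched = lt.merge(df1, df2, merge_type="1:m", on="CompanyName",
                      model="models/linkage_en_aliases_large")
print(df_matched.head())
```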
config.json CHANGED
@@ -1,24 +1,32 @@
 {
-  "_name_or_path": "models/linkage_en_aliases",
+  "_name_or_path": "models/linkage_en_aliases_large",
   "architectures": [
-    "MPNetModel"
+    "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0"
+  },
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 514,
-  "model_type": "mpnet",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "relative_attention_num_buckets": 32,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.35.1",
-  "vocab_size": 30527
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
 }
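The new values are the stock BERT-large shape of the replacement backbone. A quick, hedged sanity check against the upstream config:

```python
# Sanity-check the new backbone's shape against the diff above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("BAAI/bge-large-en-v1.5")
assert cfg.model_type == "bert"
assert cfg.hidden_size == 1024 and cfg.intermediate_size == 4096
assert cfg.num_hidden_layers == 24 and cfg.num_attention_heads == 16
assert cfg.vocab_size == 30522
```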
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
 {
   "__version__": {
-    "sentence_transformers": "2.0.0",
-    "transformers": "4.6.1",
-    "pytorch": "1.8.1"
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.28.1",
+    "pytorch": "1.13.0+cu117"
   }
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d88dbbf624b0416b72827f3c8d16f8e21f1a59f8047e737eece87fb5a10b424
-size 437967672
+oid sha256:f9fd3575f11cffb2c51379c89222cf7d37d1de659dd27639a0a118eaa88e96df
+size 1340612432
modules.json CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
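With the appended Normalize module, encode() now returns unit-length vectors, so a dot product equals cosine similarity. A hedged check (the local model path is assumed from the training config; use the hub id when loading remotely):

```python
# With module (2): Normalize, embeddings come out L2-normalized.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("models/linkage_en_aliases_large")  # local save dir
emb = model.encode(["Apple Inc.", "Apple Computer"])
print(emb.shape)                    # (2, 1024)
print(np.linalg.norm(emb, axis=1))  # ~[1. 1.], so dot product == cosine similarity
```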
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
   "max_seq_length": 512,
-  "do_lower_case": false
+  "do_lower_case": true
 }
special_tokens_map.json CHANGED
@@ -1,41 +1,27 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "cls_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
+    "content": "[CLS]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "mask_token": {
-    "content": "<mask>",
-    "lstrip": true,
+    "content": "[MASK]",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<pad>",
+    "content": "[PAD]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
+    "content": "[SEP]",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,71 +1,63 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<s>",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "1": {
-      "content": "<pad>",
+    "100": {
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "2": {
-      "content": "</s>",
+    "101": {
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "104": {
-      "content": "[UNK]",
+    "102": {
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "30526": {
-      "content": "<mask>",
-      "lstrip": true,
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
-  "cls_token": "<s>",
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
   "do_lower_case": true,
-  "eos_token": "</s>",
-  "mask_token": "<mask>",
-  "max_length": 250,
+  "mask_token": "[MASK]",
+  "max_length": 512,
   "model_max_length": 512,
+  "never_split": null,
   "pad_to_multiple_of": null,
-  "pad_token": "<pad>",
+  "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
-  "sep_token": "</s>",
+  "sep_token": "[SEP]",
   "stride": 0,
   "strip_accents": null,
   "tokenize_chinese_chars": true,
-  "tokenizer_class": "MPNetTokenizer",
+  "tokenizer_class": "BertTokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
   "unk_token": "[UNK]"
vocab.txt CHANGED
@@ -1,7 +1,3 @@
-<s>
-<pad>
-</s>
-<unk>
 [PAD]
 [unused0]
 [unused1]
@@ -30524,4 +30520,3 @@ necessitated
 ##:
 ##?
 ##~
-<mask>
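The MPNet-specific tokens are dropped, shrinking the vocabulary from 30527 to 30522 entries. A one-liner to confirm the file agrees with config.json:

```python
# Count vocab entries; should equal "vocab_size" (30522) in config.json.
with open("vocab.txt", encoding="utf-8") as f:
    print(sum(1 for _ in f))  # expected: 30522
```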