96abhishekarora committed
Commit: 6fa1fd1
Parent(s): ee998a0

Modified validation and training for linktransformer model

.gitattributes CHANGED
@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/bf/e9/bfe950b415bc9506b72a2f73eed59afd4840841deae18772931c069d89d51f23 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/b6/0b/b60b6b43406a48bf3638526314f3d232d97058bc93472ff2de930d43686fa441 filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json CHANGED
@@ -3,5 +3,7 @@
   "pooling_mode_cls_token": false,
   "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
-  "pooling_mode_mean_sqrt_len_tokens": false
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false
 }
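
Note: this hunk only adds pooling flags that newer sentence-transformers releases serialize explicitly; the active strategy is still mean pooling. As a rough sketch of what `pooling_mode_mean_tokens` computes (illustrative code, not from this repo):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over real (non-padding) positions."""
    mask = attention_mask.unsqueeze(-1).float()    # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)  # sum embeddings of real tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)       # number of real tokens, no div-by-zero
    return summed / counts                         # (batch, hidden)

# Toy check: batch of 2, second sequence padded after 2 tokens.
emb = torch.randn(2, 4, 768)
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])
print(mean_pool(emb, mask).shape)  # torch.Size([2, 768])
```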
LT_training_config.json CHANGED
@@ -1,19 +1,26 @@
 {
   "model_save_dir": "models",
   "model_save_name": "linkage_multi_aliases",
-  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
+  "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 70 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
   "opt_model_lang": [
+    "de",
     "en",
+    "zh",
+    "ja",
+    "hi",
+    "ar",
+    "bn",
+    "pt",
+    "ru",
     "es",
     "fr",
-    "de",
-    "ja",
-    "zh"
+    "ko"
   ],
   "train_batch_size": 64,
-  "num_epochs": 100,
+  "num_epochs": 70,
   "warm_up_perc": 1,
-  "learning_rate": 2e-06,
+  "learning_rate": 2e-05,
+  "loss_type": "supcon",
   "val_perc": 0.2,
   "wandb_names": {
     "project": "linkage",
@@ -23,10 +30,11 @@
   },
   "add_pooling_layer": false,
   "large_val": true,
-  "eval_steps_perc": 0.1,
+  "eval_steps_perc": 0.5,
   "test_at_end": true,
   "save_val_test_pickles": true,
   "val_query_prop": 0.5,
+  "loss_params": {},
   "eval_type": "retrieval",
   "training_dataset": "dataframe",
   "base_model_path": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
README.md CHANGED
@@ -1,12 +1,18 @@
 ---
 pipeline_tag: sentence-similarity
 language:
+- de
 - en
+- zh
+- ja
+- hi
+- ar
+- bn
+- pt
+- ru
 - es
 - fr
-- de
-- ja
-- zh
+- ko
 tags:
 - linktransformer
 - sentence-transformers
@@ -17,23 +23,29 @@ tags:
 
 # dell-research-harvard/lt-wikidata-comp-multi
 
-This is a [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) model. At its core this model this is a sentence transformer model [sentence-transformers](https://www.SBERT.net) model- it just wraps around the class.
+This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core this model this is a sentence transformer model [sentence-transformers](https://www.SBERT.net) model- it just wraps around the class.
 It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
 Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
 It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
 Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
 
 
-This model has been fine-tuned on the model : sentence-transformers/paraphrase-multilingual-mpnet-base-v2. It is pretrained for the language : - en
+This model has been fine-tuned on the model : sentence-transformers/paraphrase-multilingual-mpnet-base-v2. It is pretrained for the language : - de
+- en
+- zh
+- ja
+- hi
+- ar
+- bn
+- pt
+- ru
 - es
 - fr
-- de
-- ja
-- zh.
+- ko.
 
 
 This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework.
-It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
+It was trained for 70 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
 
 
 ## Usage (LinkTransformer)
@@ -107,7 +119,7 @@ The model was trained with the parameters:
 
 **DataLoader**:
 
-`torch.utils.data.dataloader.DataLoader` of length 2268 with parameters:
+`torch.utils.data.dataloader.DataLoader` of length 5966 with parameters:
 ```
 {'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
@@ -119,17 +131,17 @@
 Parameters of the fit()-Method:
 ```
 {
-    "epochs": 100,
-    "evaluation_steps": 22680,
+    "epochs": 70,
+    "evaluation_steps": 2983,
     "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
     "max_grad_norm": 1,
     "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
     "optimizer_params": {
-        "lr": 2e-06
+        "lr": 2e-05
     },
     "scheduler": "WarmupLinear",
     "steps_per_epoch": null,
-    "warmup_steps": 226800,
+    "warmup_steps": 417620,
     "weight_decay": 0.01
 }
 ```
@@ -139,10 +151,20 @@ Parameters of the fit()-Method:
 
 LinkTransformer(
   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
-  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
 )
 ```
 
 ## Citing & Authors
 
-<!--- Describe where people can find more information -->
+```
+@misc{arora2023linktransformer,
+      title={LinkTransformer: A Unified Package for Record Linkage with Transformer Language Models},
+      author={Abhishek Arora and Melissa Dell},
+      year={2023},
+      eprint={2309.00789},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+
+```
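
Note: the README's usage section is unchanged by this commit, so the diff elides it. For reference, linkage with this model goes through `lt.merge`; a toy sketch (the dataframes and column name are hypothetical):

```python
import pandas as pd
import linktransformer as lt

# Two toy tables of company names to be linked on semantic similarity.
df1 = pd.DataFrame({"CompanyName": ["International Business Machines", "Appel Incorporated"]})
df2 = pd.DataFrame({"CompanyName": ["IBM", "Apple Inc."]})

# Embeds both columns with this model and joins each row to its nearest match.
df_matched = lt.merge(
    df1,
    df2,
    merge_type="1:1",
    on="CompanyName",
    model="dell-research-harvard/lt-wikidata-comp-multi",
)
print(df_matched.head())
```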
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "models/linkage_multi_aliases/",
+  "_name_or_path": "models/linkage_multi_aliases",
   "architectures": [
     "XLMRobertaModel"
   ],
@@ -22,7 +22,7 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
-  "transformers_version": "4.31.0",
+  "transformers_version": "4.35.1",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 250002
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3d23a8c3194fddc392984009fb03fbfd1ce072746d1ced463d0feb407bd0059
+size 1112197096
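
Note: the weights now ship in both pytorch_model.bin and safetensors form; recent transformers releases pick the safetensors file automatically. A minimal loading sketch:

```python
from transformers import AutoModel, AutoTokenizer

# use_safetensors=True makes the (otherwise automatic) choice of the
# model.safetensors file added in this commit explicit.
model = AutoModel.from_pretrained(
    "dell-research-harvard/lt-wikidata-comp-multi",
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained("dell-research-harvard/lt-wikidata-comp-multi")
print(model.config.architectures)  # ['XLMRobertaModel']
```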
special_tokens_map.json CHANGED
@@ -1,7 +1,25 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "mask_token": {
     "content": "<mask>",
     "lstrip": true,
@@ -9,7 +27,25 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "unk_token": "<unk>"
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
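
Note: this change only expands the bare token strings into full AddedToken records, as newer tokenizers versions serialize them; the resulting special tokens are identical. A quick way to confirm after loading from the Hub:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("dell-research-harvard/lt-wikidata-comp-multi")
# The expanded records round-trip to the same plain special tokens.
print(tok.special_tokens_map)
# {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>',
#  'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>',
#  'mask_token': '<mask>'}
```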
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b60b6b43406a48bf3638526314f3d232d97058bc93472ff2de930d43686fa441
-size 17082913
+oid sha256:c835b069d7b8cd02b400e6247b83bc1840ab12bb1628d5b2e03c8d728de75558
+size 17082941
tokenizer_config.json CHANGED
@@ -1,19 +1,61 @@
 {
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",
   "eos_token": "</s>",
-  "mask_token": {
-    "__type": "AddedToken",
-    "content": "<mask>",
-    "lstrip": true,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "mask_token": "<mask>",
+  "max_length": 128,
   "model_max_length": 512,
+  "pad_to_multiple_of": null,
   "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
   "sep_token": "</s>",
+  "stride": 0,
   "tokenizer_class": "XLMRobertaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
   "unk_token": "<unk>"
 }
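
Note: the new keys record the padding/truncation state at save time; `max_length: 128` matches the model's `max_seq_length`. A sketch of encoding under those limits (the input strings are toy examples):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("dell-research-harvard/lt-wikidata-comp-multi")

# Sequences longer than 128 tokens are truncated ("longest_first", right side),
# matching the settings recorded in tokenizer_config.json.
batch = tok(
    ["Siemens Aktiengesellschaft", "International Business Machines Corporation"],
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # (2, padded_len)
```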