96abhishekarora committed on
Commit
1c9c7eb
1 Parent(s): e143560

Add new LinkTransformer model.

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
LT_training_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_save_dir": "models",
3
+ "model_save_name": "linkage_ja_aliases",
4
+ "opt_model_description": "This model was trained on a dataset consisting of company aliases from wiki data using the LinkTransformer framework. \n It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json \n ",
5
+ "opt_model_lang": "ja",
6
+ "train_batch_size": 64,
7
+ "num_epochs": 100,
8
+ "warm_up_perc": 1,
9
+ "learning_rate": 2e-06,
10
+ "val_perc": 0.2,
11
+ "wandb_names": {
12
+ "project": "linkage",
13
+ "id": "econabhishek",
14
+ "run": "linkage_ja_aliases",
15
+ "entity": "econabhishek"
16
+ },
17
+ "add_pooling_layer": false,
18
+ "large_val": true,
19
+ "eval_steps_perc": 0.1,
20
+ "test_at_end": true,
21
+ "save_val_test_pickles": true,
22
+ "eval_type": "retrieval",
23
+ "training_dataset": "dataframe",
24
+ "base_model_path": "oshizo/sbert-jsnli-luke-japanese-base-lite",
25
+ "best_model_path": "models/linkage_ja_aliases"
26
+ }
README.md ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: sentence-similarity
3
+ language:
4
+ - ja
5
+ tags:
6
+ - linktransformer
7
+ - sentence-transformers
8
+ - sentence-similarity
9
+ - tabular-classification
10
+
11
+ ---
12
+
13
+ # dell-research-harvard/lt-wikidata-comp-ja
14
+
15
+ This is a [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) model. At its core, this model is a [sentence-transformers](https://www.SBERT.net) model - LinkTransformer just wraps around the class.
16
+ It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
17
+ Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
18
+ It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.
19
+ Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
20
+
21
+
22
+ This model has been fine-tuned from the base model oshizo/sbert-jsnli-luke-japanese-base-lite. It is pretrained for the language: ja.
23
+
24
+
25
+ This model was trained on a dataset consisting of company aliases from Wikidata using the LinkTransformer framework.
26
+ It was trained for 100 epochs using other defaults that can be found in the repo's LinkTransformer config file - LT_training_config.json
27
+
28
+
29
+ ## Usage (LinkTransformer)
30
+
31
+ Using this model becomes easy when you have [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) installed:
32
+
33
+ ```
34
+ pip install -U linktransformer
35
+ ```
36
+
37
+ Then you can use the model like this:
38
+
39
+ ```python
40
+ import linktransformer as lt
41
+ import pandas as pd
42
+
43
+ ##Load the two dataframes that you want to link. For example, 2 dataframes with company names that are written differently
44
+ df1=pd.read_csv("data/df1.csv") ###This is the left dataframe with key CompanyName for instance
45
+ df2=pd.read_csv("data/df2.csv") ###This is the right dataframe with key CompanyName for instance
46
+
47
+ ###Merge the two dataframes on the key column!
48
+ df_merged = lt.merge(df1, df2, on="CompanyName", how="inner")
49
+
50
+ ##Done! The merged dataframe has a column called "score" that contains the similarity score between the two company names
51
+
52
+ ```
53
+
54
+
55
+ ## Training your own LinkTransformer model
56
+ Any Sentence Transformers model can be used as a backbone by simply adding a pooling layer. Any other transformer on HuggingFace can also be used by setting the option `add_pooling_layer` to True.
57
+ The model was trained using SupCon loss.
58
+ Usage can be found in the package docs.
59
+ The training config can be found in the repo with the name LT_training_config.json
60
+ To replicate the training, you can download the file and specify the path in the config_path argument of the training function. You can also override the config by specifying the training_args argument.
61
+ Here is an example.
62
+
63
+
64
+ ```python
65
+
66
+ ##Consider the example in the paper that has a dataset of Mexican products and their tariff codes from 1947 and 1948, where we want to train a model to link the two tariff codes.
67
+ saved_model_path = train_model(
68
+ model_path="hiiamsid/sentence_similarity_spanish_es",
69
+ dataset_path=dataset_path,
70
+ left_col_names=["description47"],
71
+ right_col_names=['description48'],
72
+ left_id_name=['tariffcode47'],
73
+ right_id_name=['tariffcode48'],
74
+ log_wandb=False,
75
+ config_path=LINKAGE_CONFIG_PATH,
76
+ training_args={"num_epochs": 1}
77
+ )
78
+
79
+ ```
80
+
81
+
82
+ You can also use this package for deduplication (clusters a df on the supplied key column). Merging a fine class (like product) to a coarse class (like HS code) is also possible.
83
+ Read our paper and the documentation for more!
84
+
85
+
86
+
87
+ ## Evaluation Results
88
+
89
+ <!--- Describe how your model was evaluated -->
90
+
91
+ You can evaluate the model using the [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) package's inference functions.
92
+ We have provided a few datasets in the package for you to try out. We plan to host more datasets on Huggingface and our website (Coming soon) that you can take a look at.
93
+
94
+
95
+ ## Training
96
+ The model was trained with the parameters:
97
+
98
+ **DataLoader**:
99
+
100
+ `torch.utils.data.dataloader.DataLoader` of length 444 with parameters:
101
+ ```
102
+ {'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
103
+ ```
104
+
105
+ **Loss**:
106
+
107
+ `linktransformer.modified_sbert.losses.SupConLoss_wandb`
108
+
109
+ Parameters of the fit()-Method:
110
+ ```
111
+ {
112
+ "epochs": 100,
113
+ "evaluation_steps": 4440,
114
+ "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
115
+ "max_grad_norm": 1,
116
+ "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
117
+ "optimizer_params": {
118
+ "lr": 2e-06
119
+ },
120
+ "scheduler": "WarmupLinear",
121
+ "steps_per_epoch": null,
122
+ "warmup_steps": 44400,
123
+ "weight_decay": 0.01
124
+ }
125
+ ```
126
+
127
+
128
+
129
+ ```
130
+ LinkTransformer(
131
+ (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: LukeModel
132
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
133
+ )
134
+ ```
135
+
136
+ ## Citing & Authors
137
+
138
+ <!--- Describe where people can find more information -->
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ent2>": 32771,
3
+ "<ent>": 32770
4
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models/linkage_ja_aliases/",
3
+ "architectures": [
4
+ "LukeModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_model_name": "models/luke-japanese/hf_xlm_roberta",
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": null,
10
+ "cls_entity_prediction": false,
11
+ "entity_emb_size": 256,
12
+ "entity_vocab_size": 4,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "gelu",
15
+ "hidden_dropout_prob": 0.1,
16
+ "hidden_size": 768,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "layer_norm_eps": 1e-05,
20
+ "max_position_embeddings": 514,
21
+ "model_type": "luke",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 1,
25
+ "position_embedding_type": "absolute",
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.31.0",
28
+ "type_vocab_size": 1,
29
+ "use_cache": true,
30
+ "use_entity_aware_attention": true,
31
+ "vocab_size": 32772
32
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.2.2",
4
+ "transformers": "4.25.1",
5
+ "pytorch": "1.13.0+cu116"
6
+ }
7
+ }
entity_vocab.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[MASK2]": 3,
3
+ "[MASK]": 0,
4
+ "[PAD]": 2,
5
+ "[UNK]": 1
6
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451d07b21cc069b7c2c7e8f6624f52da26f0e246f90abaedc8749782b54b0adf
3
+ size 532357729
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 128,
3
+ "do_lower_case": false
4
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b73a5e054936c920cf5b7d1ec21ce9c281977078269963beb821c6c86fbff7
3
+ size 841889
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<ent>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<ent2>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<s>",
19
+ "cls_token": "<s>",
20
+ "eos_token": "</s>",
21
+ "mask_token": {
22
+ "content": "<mask>",
23
+ "lstrip": true,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "pad_token": "<pad>",
29
+ "sep_token": "</s>",
30
+ "unk_token": "<unk>"
31
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "__type": "AddedToken",
5
+ "content": "<ent>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ {
12
+ "__type": "AddedToken",
13
+ "content": "<ent2>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ {
20
+ "__type": "AddedToken",
21
+ "content": "<ent>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ {
28
+ "__type": "AddedToken",
29
+ "content": "<ent2>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ {
36
+ "__type": "AddedToken",
37
+ "content": "<ent>",
38
+ "lstrip": false,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false
42
+ },
43
+ {
44
+ "__type": "AddedToken",
45
+ "content": "<ent2>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ {
52
+ "__type": "AddedToken",
53
+ "content": "<ent>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "__type": "AddedToken",
61
+ "content": "<ent2>",
62
+ "lstrip": false,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false
66
+ }
67
+ ],
68
+ "bos_token": "<s>",
69
+ "clean_up_tokenization_spaces": true,
70
+ "cls_token": "<s>",
71
+ "entity_mask2_token": "[MASK2]",
72
+ "entity_mask_token": "[MASK]",
73
+ "entity_pad_token": "[PAD]",
74
+ "entity_token_1": {
75
+ "__type": "AddedToken",
76
+ "content": "<ent>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false
81
+ },
82
+ "entity_token_2": {
83
+ "__type": "AddedToken",
84
+ "content": "<ent2>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false
89
+ },
90
+ "entity_unk_token": "[UNK]",
91
+ "eos_token": "</s>",
92
+ "mask_token": {
93
+ "__type": "AddedToken",
94
+ "content": "<mask>",
95
+ "lstrip": true,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false
99
+ },
100
+ "max_entity_length": 32,
101
+ "max_mention_length": 30,
102
+ "model_max_length": 512,
103
+ "pad_token": "<pad>",
104
+ "sep_token": "</s>",
105
+ "sp_model_kwargs": {},
106
+ "task": null,
107
+ "tokenizer_class": "MLukeTokenizer",
108
+ "tokenizer_file": "models/luke-japanese/hf_luke_japanese_lite_epoch20/tokenizer.json",
109
+ "unk_token": "<unk>"
110
+ }