96abhishekarora committed on
Commit 0675367
1 Parent(s): f0a355a

Updated model with better training and evaluation. Test and val data included as pickle files.

.gitattributes CHANGED
@@ -1,37 +1,3 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
- .git/lfs/objects/2c/54/2c547cd0200d5e2941a0df1be6f08c9c58aa7909c52edc93b34fd74a26360708 filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
+ .git/lfs/objects/38/03/38038b2d482f03da65b16b695cca791699e9d40235edd0dbe368b855c05ca162 filter=lfs diff=lfs merge=lfs -text
+ sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text

1_Pooling/config.json CHANGED
@@ -3,5 +3,7 @@
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
- "pooling_mode_mean_sqrt_len_tokens": false
+ "pooling_mode_mean_sqrt_len_tokens": false,
+ "pooling_mode_weightedmean_tokens": false,
+ "pooling_mode_lasttoken": false
  }
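
Not part of the commit: a minimal sketch of the mean pooling that `pooling_mode_mean_tokens: true` selects, i.e. a masked average of token embeddings (assuming a standard sentence-transformers-style forward pass that yields per-token embeddings and an attention mask).

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).float()       # broadcast the mask over the hidden dimension
    summed = (token_embeddings * mask).sum(dim=1)     # sum embeddings of real (non-padding) tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)          # number of real tokens per sentence
    return summed / counts                            # (batch, hidden) sentence embeddings
```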
LT_training_config.json CHANGED
@@ -1,27 +1,29 @@
  {
  "model_save_dir": "models",
- "model_save_name": "check2",
- "opt_model_description": null,
- "opt_model_lang": null,
+ "model_save_name": "check",
+ "opt_model_description": "test",
+ "opt_model_lang": "jp",
  "train_batch_size": 64,
  "num_epochs": 1,
  "warm_up_perc": 1,
- "learning_rate": 2e-06,
+ "learning_rate": 2e-05,
+ "loss_type": "supcon",
  "val_perc": 0.2,
  "wandb_names": {
- "project": "linkage",
- "id": "econabhishek",
- "run": "paraphrase-xlm-r-multilingual-v1-es",
- "entity": "econabhishek"
+ "project": "linktransformer",
+ "id": "your-id",
+ "run": "run-name",
+ "entity": "your-id"
  },
  "add_pooling_layer": false,
  "large_val": true,
- "eval_steps_perc": 0.1,
+ "eval_steps_perc": 0.5,
  "test_at_end": true,
  "save_val_test_pickles": true,
  "val_query_prop": 0.5,
- "eval_type": "retrieval",
- "training_dataset": "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/deeprecordlinkage/linktransformer/src/linktransformer/data/es_mexican_products.xlsx",
- "base_model_path": "hiiamsid/sentence_similarity_spanish_es",
- "best_model_path": "models/check2"
+ "loss_params": {},
+ "eval_type": "classification",
+ "training_dataset": "dataframe",
+ "base_model_path": "oshizo/sbert-jsnli-luke-japanese-base-lite",
+ "best_model_path": "models/check"
  }
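
Not part of the commit: a short sketch of pulling this training config from the Hub to inspect or reuse the hyperparameters above (the repo id is taken from the model card in this repo; `hf_hub_download` comes from the `huggingface_hub` package).

```python
import json
from huggingface_hub import hf_hub_download

# Download LT_training_config.json from this repo and load it.
cfg_path = hf_hub_download(
    repo_id="dell-research-harvard/linktransformer-models-test",
    filename="LT_training_config.json",
)
with open(cfg_path) as f:
    cfg = json.load(f)

print(cfg["learning_rate"], cfg["loss_type"])  # 2e-05 supcon
```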
README.md CHANGED
@@ -1,67 +1,84 @@
  ---
- pipeline_tag: text-classification
  language:
- - en
  tags:
  - linktransformer
- - transformers
- - text-classification
  - tabular-classification

  ---

  # dell-research-harvard/linktransformer-models-test

- This model is part of the [LinkTransformer](https://linktransformer.github.io/) ecosystem. While rooted in the a standard HuggingFace Transformer, this specific instance is tailored for text classification tasks. It classifies input sentences or paragraphs into specific categories or labels, leveraging the power of transformer architectures.

- The base model for this classifier is: roberta. It is pretrained for the language: - en.

- Labels are mapped to integers as follows:

- - Neither: 0
- - Protest: 1
- - Riot: 2

- This is a LinkTransformer model for classification of text into 'Protest', 'Riot' or 'Neither' classes. It was trained on annotated newspaper articles.

- ## Usage with LinkTransformer

- After installing [LinkTransformer](https://linktransformer.github.io/):
-
- ```python
  pip install -U linktransformer
  ```

- Employ the model for text classification tasks:

  ```python
  import linktransformer as lt
- df_clf_output = lt.classify_rows(df, on=["col_of_interest"], model="dell-research-harvard/linktransformer-models-test")
- ```

- ## Training

- ### Training your own LinkTransformer Classification Model

- With the provided tools, you can train a custom classification model:

  ```python
- from linktransformer import train_clf_model
-
- best_model_path, best_metric, label_map = train_clf_model(
- data="path_to_dataset.csv",
- model="you-model-path-or-name",
- on=["col_of_interest"],
- label_col_name="label_column_name",
- lr=5e-5,
- batch_size=16,
- epochs=3
- )
  ```

@@ -69,7 +86,50 @@ best_model_path, best_metric, label_map = train_clf_model(

  <!--- Describe how your model was evaluated -->

- Evaluation is typically based on metrics like accuracy, F1-score, precision, and recall.

  ## Citing & Authors

@@ -81,5 +141,6 @@ Evaluation is typically based on metrics like accuracy, F1-score, precision, and
  eprint={2309.00789},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
- }
  ```

  ---
+ pipeline_tag: sentence-similarity
  language:
+ - jp
  tags:
  - linktransformer
+ - sentence-transformers
+ - sentence-similarity
  - tabular-classification

  ---

  # dell-research-harvard/linktransformer-models-test

+ This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core it is a [sentence-transformers](https://www.SBERT.net) model; the LinkTransformer class simply wraps around it.
+ It is designed for quick and easy record linkage (entity-matching) through the LinkTransformer package. The tasks include clustering, deduplication, linking, aggregation and more.
+ Notwithstanding that, it can be used for any sentence similarity task within the sentence-transformers framework as well.
+ It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+ Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
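
As a minimal sketch of that direct sentence-transformers usage (not part of the card itself; the model id is taken from the heading above and the Japanese strings are placeholder examples):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("dell-research-harvard/linktransformer-models-test")
# Encode two variants of the same company name and compare them.
embeddings = model.encode(["株式会社日立製作所", "日立製作所"], convert_to_tensor=True)
print(util.cos_sim(embeddings[0], embeddings[1]))  # higher score = more similar
```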
 
 
+ This model has been fine-tuned from the base model oshizo/sbert-jsnli-luke-japanese-base-lite. It is pretrained for the language: jp.

+ test

+ ## Usage (LinkTransformer)

+ Using this model becomes easy when you have [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) installed:

+ ```
  pip install -U linktransformer
  ```

+ Then you can use the model like this:

  ```python
  import linktransformer as lt
+ import pandas as pd

+ ## Load the two dataframes that you want to link. For example, two dataframes with company names that are written differently.
+ df1 = pd.read_csv("data/df1.csv")  # the left dataframe, with key column CompanyName for instance
+ df2 = pd.read_csv("data/df2.csv")  # the right dataframe, with key column CompanyName for instance
+
+ ## Merge the two dataframes on the key column!
+ df_merged = lt.merge(df1, df2, on="CompanyName", how="inner")
+
+ ## Done! The merged dataframe has a column called "score" that contains the similarity score between the two company names.
+ ```

+ ## Training your own LinkTransformer model
+ Any Sentence Transformers model can be used as a backbone by simply adding a pooling layer. Any other transformer on HuggingFace can also be used by setting the option add_pooling_layer=True.
+ The model was trained using SupCon loss.
+ Usage can be found in the package docs.
+ The training config can be found in the repo with the name LT_training_config.json.
+ To replicate the training, you can download the file and specify the path in the config_path argument of the training function. You can also override the config by specifying the training_args argument.
+ Here is an example.

  ```python
+ ## Consider the example in the paper: a dataset of Mexican products and their tariff codes from 1947 and 1948, where we want to train a model to link the two sets of tariff codes.
+ from linktransformer import train_model  # import path per the package docs; adjust if it differs in your version
+
+ saved_model_path = train_model(
+     model_path="hiiamsid/sentence_similarity_spanish_es",
+     dataset_path=dataset_path,
+     left_col_names=["description47"],
+     right_col_names=['description48'],
+     left_id_name=['tariffcode47'],
+     right_id_name=['tariffcode48'],
+     log_wandb=False,
+     config_path=LINKAGE_CONFIG_PATH,
+     training_args={"num_epochs": 1}
+ )
  ```

+ You can also use this package for deduplication (it clusters a dataframe on the supplied key column). Merging a fine class (like product) to a coarse class (like HS code) is also possible.
+ Read our paper and the documentation for more!
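
A hedged sketch of that deduplication use case (not part of the card; `lt.dedup_rows`, its arguments, and the file/column names are assumptions based on the LinkTransformer docs and may differ between versions):

```python
import linktransformer as lt
import pandas as pd

df = pd.read_csv("data/companies.csv")  # hypothetical dataframe with a CompanyName column

# Cluster near-duplicate rows on the key column using this model's embeddings.
df_deduped = lt.dedup_rows(df, on="CompanyName", model="dell-research-harvard/linktransformer-models-test")
```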
 


  <!--- Describe how your model was evaluated -->

+ You can evaluate the model using the [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) package's inference functions.
+ We have provided a few datasets in the package for you to try out. We plan to host more datasets on Hugging Face and on our website (coming soon) for you to take a look at.
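
One way to run such an evaluation with the inference call shown earlier (a sketch, not part of the card; the files, column names, and the `model=` argument are illustrative assumptions):

```python
import linktransformer as lt
import pandas as pd

queries = pd.read_csv("data/val_queries.csv")        # hypothetical: columns CompanyName, true_id
candidates = pd.read_csv("data/val_candidates.csv")  # hypothetical: columns CompanyName, id

# Link each query to its best-matching candidate, then check how often the top match is the labelled one.
linked = lt.merge(queries, candidates, on="CompanyName", how="inner",
                  model="dell-research-harvard/linktransformer-models-test")
top1_accuracy = (linked["true_id"] == linked["id"]).mean()
print(f"top-1 accuracy: {top1_accuracy:.3f}")
```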
+
+ ## Training
+ The model was trained with the parameters:
+
+ **DataLoader**:
+
+ `torch.utils.data.dataloader.DataLoader` of length 10 with parameters:
+ ```
+ {'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+ ```
+
+ **Loss**:
+
+ `linktransformer.modified_sbert.losses.SupConLoss_wandb`
+
+ Parameters of the fit()-Method:
+ ```
+ {
+ "epochs": 1,
+ "evaluation_steps": 5,
+ "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
+ "max_grad_norm": 1,
+ "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+ "optimizer_params": {
+ "lr": 2e-05
+ },
+ "scheduler": "WarmupLinear",
+ "steps_per_epoch": null,
+ "warmup_steps": 10,
+ "weight_decay": 0.01
+ }
+ ```
+
+ ```
+ LinkTransformer(
+ (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: LukeModel
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+ )
+ ```
 
  ## Citing & Authors


  eprint={2309.00789},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
+ }
+
  ```
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "<ent2>": 32771,
+ "<ent>": 32770
+ }
config.json CHANGED
@@ -1,38 +1,32 @@
  {
- "_name_or_path": "test_lt_clf/checkpoint-90",
  "architectures": [
- "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
- "id2label": {
- "0": "LABEL_0",
- "1": "LABEL_1",
- "2": "LABEL_2"
- },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
- "label2id": {
- "LABEL_0": 0,
- "LABEL_1": 1,
- "LABEL_2": 2
- },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
- "model_type": "roberta",
  "num_attention_heads": 12,
- "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
- "problem_type": "single_label_classification",
  "torch_dtype": "float32",
- "transformers_version": "4.33.2",
  "type_vocab_size": 1,
  "use_cache": true,
- "vocab_size": 50265
  }

  {
+ "_name_or_path": "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/deeprecordlinkage/linktransformer/models/check",
  "architectures": [
+ "LukeModel"
  ],
  "attention_probs_dropout_prob": 0.1,
+ "bert_model_name": "models/luke-japanese/hf_xlm_roberta",
  "bos_token_id": 0,
  "classifier_dropout": null,
+ "cls_entity_prediction": false,
+ "entity_emb_size": 256,
+ "entity_vocab_size": 4,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
+ "model_type": "luke",
  "num_attention_heads": 12,
+ "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
+ "transformers_version": "4.35.1",
  "type_vocab_size": 1,
  "use_cache": true,
+ "use_entity_aware_attention": true,
+ "vocab_size": 32772
  }
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
  {
  "__version__": {
- "sentence_transformers": "2.0.0",
- "transformers": "4.10.2",
- "pytorch": "1.9.0+cu102"
+ "sentence_transformers": "2.2.2",
+ "transformers": "4.25.1",
+ "pytorch": "1.13.0+cu116"
  }
  }
entity_vocab.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "[MASK2]": 3,
+ "[MASK]": 0,
+ "[PAD]": 2,
+ "[UNK]": 1
+ }
merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:38038b2d482f03da65b16b695cca791699e9d40235edd0dbe368b855c05ca162
- size 328517105
+ oid sha256:ffff8b1e1118917383c8481ca5631e9b5a05616039a6f0039ad5d0fae975d7ed
+ size 532299592
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
  {
- "max_seq_length": 512,
+ "max_seq_length": 128,
  "do_lower_case": false
  }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8b73a5e054936c920cf5b7d1ec21ce9c281977078269963beb821c6c86fbff7
+ size 841889
special_tokens_map.json CHANGED
@@ -1,15 +1,75 @@
  {
- "bos_token": "<s>",
- "cls_token": "<s>",
- "eos_token": "</s>",
  "mask_token": {
  "content": "<mask>",
  "lstrip": true,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
- "pad_token": "<pad>",
- "sep_token": "</s>",
- "unk_token": "<unk>"
  }

  {
+ "additional_special_tokens": [
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ {
+ "content": "<ent>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ {
+ "content": "<ent2>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ ],
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
  "mask_token": {
  "content": "<mask>",
  "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
+ "sep_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
  }
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,15 +1,108 @@
  {
- "add_prefix_space": false,
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
- "errors": "replace",
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
- "tokenizer_class": "RobertaTokenizer",
- "trim_offsets": true,
  "unk_token": "<unk>"
  }

  {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32769": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32770": {
+ "content": "<ent>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32771": {
+ "content": "<ent2>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>",
+ "<ent>",
+ "<ent2>"
+ ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
+ "entity_mask2_token": "[MASK2]",
+ "entity_mask_token": "[MASK]",
+ "entity_pad_token": "[PAD]",
+ "entity_token_1": {
+ "__type": "AddedToken",
+ "content": "<ent>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "entity_token_2": {
+ "__type": "AddedToken",
+ "content": "<ent2>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "entity_unk_token": "[UNK]",
  "eos_token": "</s>",
  "mask_token": "<mask>",
+ "max_entity_length": 32,
+ "max_mention_length": 30,
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "task": null,
+ "tokenizer_class": "MLukeTokenizer",
  "unk_token": "<unk>"
  }
vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
vocab.txt DELETED
The diff for this file is too large to render. See raw diff