PuxAI committed on
Commit
bcb23ed
·
verified ·
1 Parent(s): 1f3b236

Upload folder using huggingface_hub

Browse files
gretel-pii-ready/TokenBased-CRF/checkpoint-63/config.json CHANGED
@@ -1,248 +1,40 @@
1
  {
 
2
  "architectures": [
3
  "TransformerCrfForTokenClassification"
4
  ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 50281,
8
- "classifier_activation": "gelu",
9
- "classifier_bias": false,
10
- "classifier_dropout": 0.0,
11
- "classifier_pooling": "mean",
12
- "cls_token_id": 50281,
13
- "decoder_bias": true,
14
- "deterministic_flash_attn": false,
15
  "dtype": "float32",
16
- "embedding_dropout": 0.0,
17
- "eos_token_id": 50282,
18
- "global_attn_every_n_layers": 3,
19
- "gradient_checkpointing": false,
20
- "hidden_activation": "gelu",
21
  "hidden_size": 768,
22
  "id2label": {
23
- "0": "O",
24
- "1": "B-ACCOUNT_NUMBER",
25
- "2": "I-ACCOUNT_NUMBER",
26
- "3": "B-ADDRESS",
27
- "4": "I-ADDRESS",
28
- "5": "B-API_KEY",
29
- "6": "I-API_KEY",
30
- "7": "B-BANK_ROUTING_NUMBER",
31
- "8": "I-BANK_ROUTING_NUMBER",
32
- "9": "B-BIOMETRIC_IDENTIFIER",
33
- "10": "I-BIOMETRIC_IDENTIFIER",
34
- "11": "B-CERTIFICATE_LICENSE_NUMBER",
35
- "12": "I-CERTIFICATE_LICENSE_NUMBER",
36
- "13": "B-CITY",
37
- "14": "I-CITY",
38
- "15": "B-COMPANY_NAME",
39
- "16": "I-COMPANY_NAME",
40
- "17": "B-COORDINATE",
41
- "18": "I-COORDINATE",
42
- "19": "B-COUNTRY",
43
- "20": "I-COUNTRY",
44
- "21": "B-CREDIT_CARD_NUMBER",
45
- "22": "I-CREDIT_CARD_NUMBER",
46
- "23": "B-CUSTOMER_ID",
47
- "24": "I-CUSTOMER_ID",
48
- "25": "B-CVV",
49
- "26": "I-CVV",
50
- "27": "B-DATE",
51
- "28": "I-DATE",
52
- "29": "B-DATE_OF_BIRTH",
53
- "30": "I-DATE_OF_BIRTH",
54
- "31": "B-DATE_TIME",
55
- "32": "I-DATE_TIME",
56
- "33": "B-DEVICE_IDENTIFIER",
57
- "34": "I-DEVICE_IDENTIFIER",
58
- "35": "B-EMAIL",
59
- "36": "I-EMAIL",
60
- "37": "B-EMPLOYEE_ID",
61
- "38": "I-EMPLOYEE_ID",
62
- "39": "B-FIRST_NAME",
63
- "40": "I-FIRST_NAME",
64
- "41": "B-HEALTH_PLAN_BENEFICIARY_NUMBER",
65
- "42": "I-HEALTH_PLAN_BENEFICIARY_NUMBER",
66
- "43": "B-IPV4",
67
- "44": "I-IPV4",
68
- "45": "B-IPV6",
69
- "46": "I-IPV6",
70
- "47": "B-LAST_NAME",
71
- "48": "I-LAST_NAME",
72
- "49": "B-LICENSE_PLATE",
73
- "50": "I-LICENSE_PLATE",
74
- "51": "B-MEDICAL_RECORD_NUMBER",
75
- "52": "I-MEDICAL_RECORD_NUMBER",
76
- "53": "B-NAME",
77
- "54": "I-NAME",
78
- "55": "B-NATIONAL_ID",
79
- "56": "I-NATIONAL_ID",
80
- "57": "B-PASSWORD",
81
- "58": "I-PASSWORD",
82
- "59": "B-PHONE_NUMBER",
83
- "60": "I-PHONE_NUMBER",
84
- "61": "B-POSTCODE",
85
- "62": "I-POSTCODE",
86
- "63": "B-SSN",
87
- "64": "I-SSN",
88
- "65": "B-STATE",
89
- "66": "I-STATE",
90
- "67": "B-STREET_ADDRESS",
91
- "68": "I-STREET_ADDRESS",
92
- "69": "B-SWIFT_BIC",
93
- "70": "I-SWIFT_BIC",
94
- "71": "B-TAX_ID",
95
- "72": "I-TAX_ID",
96
- "73": "B-TIME",
97
- "74": "I-TIME",
98
- "75": "B-UNIQUE_IDENTIFIER",
99
- "76": "I-UNIQUE_IDENTIFIER",
100
- "77": "B-URL",
101
- "78": "I-URL",
102
- "79": "B-USER_NAME",
103
- "80": "I-USER_NAME",
104
- "81": "B-VEHICLE_IDENTIFIER",
105
- "82": "I-VEHICLE_IDENTIFIER"
106
  },
107
- "initializer_cutoff_factor": 2.0,
108
  "initializer_range": 0.02,
109
- "intermediate_size": 1152,
 
110
  "label2id": {
111
- "B-ACCOUNT_NUMBER": 1,
112
- "B-ADDRESS": 3,
113
- "B-API_KEY": 5,
114
- "B-BANK_ROUTING_NUMBER": 7,
115
- "B-BIOMETRIC_IDENTIFIER": 9,
116
- "B-CERTIFICATE_LICENSE_NUMBER": 11,
117
- "B-CITY": 13,
118
- "B-COMPANY_NAME": 15,
119
- "B-COORDINATE": 17,
120
- "B-COUNTRY": 19,
121
- "B-CREDIT_CARD_NUMBER": 21,
122
- "B-CUSTOMER_ID": 23,
123
- "B-CVV": 25,
124
- "B-DATE": 27,
125
- "B-DATE_OF_BIRTH": 29,
126
- "B-DATE_TIME": 31,
127
- "B-DEVICE_IDENTIFIER": 33,
128
- "B-EMAIL": 35,
129
- "B-EMPLOYEE_ID": 37,
130
- "B-FIRST_NAME": 39,
131
- "B-HEALTH_PLAN_BENEFICIARY_NUMBER": 41,
132
- "B-IPV4": 43,
133
- "B-IPV6": 45,
134
- "B-LAST_NAME": 47,
135
- "B-LICENSE_PLATE": 49,
136
- "B-MEDICAL_RECORD_NUMBER": 51,
137
- "B-NAME": 53,
138
- "B-NATIONAL_ID": 55,
139
- "B-PASSWORD": 57,
140
- "B-PHONE_NUMBER": 59,
141
- "B-POSTCODE": 61,
142
- "B-SSN": 63,
143
- "B-STATE": 65,
144
- "B-STREET_ADDRESS": 67,
145
- "B-SWIFT_BIC": 69,
146
- "B-TAX_ID": 71,
147
- "B-TIME": 73,
148
- "B-UNIQUE_IDENTIFIER": 75,
149
- "B-URL": 77,
150
- "B-USER_NAME": 79,
151
- "B-VEHICLE_IDENTIFIER": 81,
152
- "I-ACCOUNT_NUMBER": 2,
153
- "I-ADDRESS": 4,
154
- "I-API_KEY": 6,
155
- "I-BANK_ROUTING_NUMBER": 8,
156
- "I-BIOMETRIC_IDENTIFIER": 10,
157
- "I-CERTIFICATE_LICENSE_NUMBER": 12,
158
- "I-CITY": 14,
159
- "I-COMPANY_NAME": 16,
160
- "I-COORDINATE": 18,
161
- "I-COUNTRY": 20,
162
- "I-CREDIT_CARD_NUMBER": 22,
163
- "I-CUSTOMER_ID": 24,
164
- "I-CVV": 26,
165
- "I-DATE": 28,
166
- "I-DATE_OF_BIRTH": 30,
167
- "I-DATE_TIME": 32,
168
- "I-DEVICE_IDENTIFIER": 34,
169
- "I-EMAIL": 36,
170
- "I-EMPLOYEE_ID": 38,
171
- "I-FIRST_NAME": 40,
172
- "I-HEALTH_PLAN_BENEFICIARY_NUMBER": 42,
173
- "I-IPV4": 44,
174
- "I-IPV6": 46,
175
- "I-LAST_NAME": 48,
176
- "I-LICENSE_PLATE": 50,
177
- "I-MEDICAL_RECORD_NUMBER": 52,
178
- "I-NAME": 54,
179
- "I-NATIONAL_ID": 56,
180
- "I-PASSWORD": 58,
181
- "I-PHONE_NUMBER": 60,
182
- "I-POSTCODE": 62,
183
- "I-SSN": 64,
184
- "I-STATE": 66,
185
- "I-STREET_ADDRESS": 68,
186
- "I-SWIFT_BIC": 70,
187
- "I-TAX_ID": 72,
188
- "I-TIME": 74,
189
- "I-UNIQUE_IDENTIFIER": 76,
190
- "I-URL": 78,
191
- "I-USER_NAME": 80,
192
- "I-VEHICLE_IDENTIFIER": 82,
193
  "O": 0
194
  },
195
- "layer_norm_eps": 1e-05,
196
- "layer_types": [
197
- "full_attention",
198
- "sliding_attention",
199
- "sliding_attention",
200
- "full_attention",
201
- "sliding_attention",
202
- "sliding_attention",
203
- "full_attention",
204
- "sliding_attention",
205
- "sliding_attention",
206
- "full_attention",
207
- "sliding_attention",
208
- "sliding_attention",
209
- "full_attention",
210
- "sliding_attention",
211
- "sliding_attention",
212
- "full_attention",
213
- "sliding_attention",
214
- "sliding_attention",
215
- "full_attention",
216
- "sliding_attention",
217
- "sliding_attention",
218
- "full_attention"
219
- ],
220
- "local_attention": 128,
221
- "max_position_embeddings": 8192,
222
- "mlp_bias": false,
223
- "mlp_dropout": 0.0,
224
- "model_type": "modernbert",
225
- "norm_bias": false,
226
- "norm_eps": 1e-05,
227
  "num_attention_heads": 12,
228
- "num_hidden_layers": 22,
229
- "pad_token_id": 50283,
230
- "position_embedding_type": "absolute",
231
- "rope_parameters": {
232
- "full_attention": {
233
- "rope_theta": 160000.0,
234
- "rope_type": "default"
235
- },
236
- "sliding_attention": {
237
- "rope_theta": 10000.0,
238
- "rope_type": "default"
239
- }
240
- },
241
- "sep_token_id": 50282,
242
- "sparse_pred_ignore_index": -100,
243
- "sparse_prediction": false,
244
  "tie_word_embeddings": true,
245
  "transformers_version": "5.3.0",
 
246
  "use_cache": false,
247
- "vocab_size": 50368
248
  }
 
1
  {
2
+ "add_cross_attention": false,
3
  "architectures": [
4
  "TransformerCrfForTokenClassification"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
 
 
 
 
 
 
10
  "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
 
 
14
  "hidden_size": 768,
15
  "id2label": {
16
+ "0": "O"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
 
18
  "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "is_decoder": false,
21
  "label2id": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "O": 0
23
  },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "pooler_fc_size": 768,
31
+ "pooler_num_attention_heads": 12,
32
+ "pooler_num_fc_layers": 3,
33
+ "pooler_size_per_head": 128,
34
+ "pooler_type": "first_token_transform",
 
 
 
 
 
 
 
 
 
35
  "tie_word_embeddings": true,
36
  "transformers_version": "5.3.0",
37
+ "type_vocab_size": 2,
38
  "use_cache": false,
39
+ "vocab_size": 119547
40
  }
gretel-pii-ready/TokenBased-CRF/checkpoint-63/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3674bfdf8d3174f264c7c764a2ab3ce41d23903a8c0dd1413a55f8f2396a714a
3
- size 596355680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a33fa8f20989e1678a3edc77b7b2fddfda14c9f0fb76b514af560bf54babbac
3
+ size 711441992
gretel-pii-ready/TokenBased-CRF/checkpoint-63/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c8de4b7b92956a8d68436bc4a50682baef47f4fbdd1798715f259315120e013
3
- size 1192800523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79148412b64f6a2290a7249211de11a1587d63b7a027284d03c3ba7d2f16c7e8
3
+ size 1418281163
gretel-pii-ready/TokenBased-CRF/checkpoint-63/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7991aa545aa2a9819a120cc19b48bfbf88dcaec121a00f803b9fbabf38d57d55
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f39eab8ed980549bfffcd8b948e8852bef979d820a1c47a898b2c9f270cc3986
3
  size 14645
gretel-pii-ready/TokenBased-CRF/checkpoint-63/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
gretel-pii-ready/TokenBased-CRF/checkpoint-63/tokenizer_config.json CHANGED
@@ -1,17 +1,15 @@
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
4
- "clean_up_tokenization_spaces": true,
5
  "cls_token": "[CLS]",
 
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
- "model_input_names": [
9
- "input_ids",
10
- "attention_mask"
11
- ],
12
- "model_max_length": 8192,
13
  "pad_token": "[PAD]",
14
  "sep_token": "[SEP]",
15
- "tokenizer_class": "TokenizersBackend",
 
 
16
  "unk_token": "[UNK]"
17
  }
 
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
 
4
  "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
+ "model_max_length": 512,
 
 
 
 
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
  "unk_token": "[UNK]"
15
  }
gretel-pii-ready/TokenBased-CRF/checkpoint-63/trainer_state.json CHANGED
@@ -8,7 +8,16 @@
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
- "log_history": [],
 
 
 
 
 
 
 
 
 
12
  "logging_steps": 500,
13
  "max_steps": 63,
14
  "num_input_tokens_seen": 0,
@@ -26,7 +35,7 @@
26
  "attributes": {}
27
  }
28
  },
29
- "total_flos": 169578286080000.0,
30
  "train_batch_size": 8,
31
  "trial_name": null,
32
  "trial_params": null
 
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 1.0,
14
+ "eval_loss": 0.0,
15
+ "eval_runtime": 1.6661,
16
+ "eval_samples_per_second": 60.021,
17
+ "eval_steps_per_second": 7.803,
18
+ "step": 63
19
+ }
20
+ ],
21
  "logging_steps": 500,
22
  "max_steps": 63,
23
  "num_input_tokens_seen": 0,
 
35
  "attributes": {}
36
  }
37
  },
38
+ "total_flos": 131554351104000.0,
39
  "train_batch_size": 8,
40
  "trial_name": null,
41
  "trial_params": null
gretel-pii-ready/TokenBased-CRF/checkpoint-63/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7adfd43ab830d4bbdcf45c20dc46d1b41e035d973db9875a684066dc6dd0ec9
3
  size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f58e5f8a68b99c8875d6accad13595617eb06c03b2066f839e9d146dca04f088
3
  size 5201
gretel-pii-ready/TokenBased-CRF/config.json CHANGED
@@ -1,248 +1,40 @@
1
  {
 
2
  "architectures": [
3
  "TransformerCrfForTokenClassification"
4
  ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 50281,
8
- "classifier_activation": "gelu",
9
- "classifier_bias": false,
10
- "classifier_dropout": 0.0,
11
- "classifier_pooling": "mean",
12
- "cls_token_id": 50281,
13
- "decoder_bias": true,
14
- "deterministic_flash_attn": false,
15
  "dtype": "float32",
16
- "embedding_dropout": 0.0,
17
- "eos_token_id": 50282,
18
- "global_attn_every_n_layers": 3,
19
- "gradient_checkpointing": false,
20
- "hidden_activation": "gelu",
21
  "hidden_size": 768,
22
  "id2label": {
23
- "0": "O",
24
- "1": "B-ACCOUNT_NUMBER",
25
- "2": "I-ACCOUNT_NUMBER",
26
- "3": "B-ADDRESS",
27
- "4": "I-ADDRESS",
28
- "5": "B-API_KEY",
29
- "6": "I-API_KEY",
30
- "7": "B-BANK_ROUTING_NUMBER",
31
- "8": "I-BANK_ROUTING_NUMBER",
32
- "9": "B-BIOMETRIC_IDENTIFIER",
33
- "10": "I-BIOMETRIC_IDENTIFIER",
34
- "11": "B-CERTIFICATE_LICENSE_NUMBER",
35
- "12": "I-CERTIFICATE_LICENSE_NUMBER",
36
- "13": "B-CITY",
37
- "14": "I-CITY",
38
- "15": "B-COMPANY_NAME",
39
- "16": "I-COMPANY_NAME",
40
- "17": "B-COORDINATE",
41
- "18": "I-COORDINATE",
42
- "19": "B-COUNTRY",
43
- "20": "I-COUNTRY",
44
- "21": "B-CREDIT_CARD_NUMBER",
45
- "22": "I-CREDIT_CARD_NUMBER",
46
- "23": "B-CUSTOMER_ID",
47
- "24": "I-CUSTOMER_ID",
48
- "25": "B-CVV",
49
- "26": "I-CVV",
50
- "27": "B-DATE",
51
- "28": "I-DATE",
52
- "29": "B-DATE_OF_BIRTH",
53
- "30": "I-DATE_OF_BIRTH",
54
- "31": "B-DATE_TIME",
55
- "32": "I-DATE_TIME",
56
- "33": "B-DEVICE_IDENTIFIER",
57
- "34": "I-DEVICE_IDENTIFIER",
58
- "35": "B-EMAIL",
59
- "36": "I-EMAIL",
60
- "37": "B-EMPLOYEE_ID",
61
- "38": "I-EMPLOYEE_ID",
62
- "39": "B-FIRST_NAME",
63
- "40": "I-FIRST_NAME",
64
- "41": "B-HEALTH_PLAN_BENEFICIARY_NUMBER",
65
- "42": "I-HEALTH_PLAN_BENEFICIARY_NUMBER",
66
- "43": "B-IPV4",
67
- "44": "I-IPV4",
68
- "45": "B-IPV6",
69
- "46": "I-IPV6",
70
- "47": "B-LAST_NAME",
71
- "48": "I-LAST_NAME",
72
- "49": "B-LICENSE_PLATE",
73
- "50": "I-LICENSE_PLATE",
74
- "51": "B-MEDICAL_RECORD_NUMBER",
75
- "52": "I-MEDICAL_RECORD_NUMBER",
76
- "53": "B-NAME",
77
- "54": "I-NAME",
78
- "55": "B-NATIONAL_ID",
79
- "56": "I-NATIONAL_ID",
80
- "57": "B-PASSWORD",
81
- "58": "I-PASSWORD",
82
- "59": "B-PHONE_NUMBER",
83
- "60": "I-PHONE_NUMBER",
84
- "61": "B-POSTCODE",
85
- "62": "I-POSTCODE",
86
- "63": "B-SSN",
87
- "64": "I-SSN",
88
- "65": "B-STATE",
89
- "66": "I-STATE",
90
- "67": "B-STREET_ADDRESS",
91
- "68": "I-STREET_ADDRESS",
92
- "69": "B-SWIFT_BIC",
93
- "70": "I-SWIFT_BIC",
94
- "71": "B-TAX_ID",
95
- "72": "I-TAX_ID",
96
- "73": "B-TIME",
97
- "74": "I-TIME",
98
- "75": "B-UNIQUE_IDENTIFIER",
99
- "76": "I-UNIQUE_IDENTIFIER",
100
- "77": "B-URL",
101
- "78": "I-URL",
102
- "79": "B-USER_NAME",
103
- "80": "I-USER_NAME",
104
- "81": "B-VEHICLE_IDENTIFIER",
105
- "82": "I-VEHICLE_IDENTIFIER"
106
  },
107
- "initializer_cutoff_factor": 2.0,
108
  "initializer_range": 0.02,
109
- "intermediate_size": 1152,
 
110
  "label2id": {
111
- "B-ACCOUNT_NUMBER": 1,
112
- "B-ADDRESS": 3,
113
- "B-API_KEY": 5,
114
- "B-BANK_ROUTING_NUMBER": 7,
115
- "B-BIOMETRIC_IDENTIFIER": 9,
116
- "B-CERTIFICATE_LICENSE_NUMBER": 11,
117
- "B-CITY": 13,
118
- "B-COMPANY_NAME": 15,
119
- "B-COORDINATE": 17,
120
- "B-COUNTRY": 19,
121
- "B-CREDIT_CARD_NUMBER": 21,
122
- "B-CUSTOMER_ID": 23,
123
- "B-CVV": 25,
124
- "B-DATE": 27,
125
- "B-DATE_OF_BIRTH": 29,
126
- "B-DATE_TIME": 31,
127
- "B-DEVICE_IDENTIFIER": 33,
128
- "B-EMAIL": 35,
129
- "B-EMPLOYEE_ID": 37,
130
- "B-FIRST_NAME": 39,
131
- "B-HEALTH_PLAN_BENEFICIARY_NUMBER": 41,
132
- "B-IPV4": 43,
133
- "B-IPV6": 45,
134
- "B-LAST_NAME": 47,
135
- "B-LICENSE_PLATE": 49,
136
- "B-MEDICAL_RECORD_NUMBER": 51,
137
- "B-NAME": 53,
138
- "B-NATIONAL_ID": 55,
139
- "B-PASSWORD": 57,
140
- "B-PHONE_NUMBER": 59,
141
- "B-POSTCODE": 61,
142
- "B-SSN": 63,
143
- "B-STATE": 65,
144
- "B-STREET_ADDRESS": 67,
145
- "B-SWIFT_BIC": 69,
146
- "B-TAX_ID": 71,
147
- "B-TIME": 73,
148
- "B-UNIQUE_IDENTIFIER": 75,
149
- "B-URL": 77,
150
- "B-USER_NAME": 79,
151
- "B-VEHICLE_IDENTIFIER": 81,
152
- "I-ACCOUNT_NUMBER": 2,
153
- "I-ADDRESS": 4,
154
- "I-API_KEY": 6,
155
- "I-BANK_ROUTING_NUMBER": 8,
156
- "I-BIOMETRIC_IDENTIFIER": 10,
157
- "I-CERTIFICATE_LICENSE_NUMBER": 12,
158
- "I-CITY": 14,
159
- "I-COMPANY_NAME": 16,
160
- "I-COORDINATE": 18,
161
- "I-COUNTRY": 20,
162
- "I-CREDIT_CARD_NUMBER": 22,
163
- "I-CUSTOMER_ID": 24,
164
- "I-CVV": 26,
165
- "I-DATE": 28,
166
- "I-DATE_OF_BIRTH": 30,
167
- "I-DATE_TIME": 32,
168
- "I-DEVICE_IDENTIFIER": 34,
169
- "I-EMAIL": 36,
170
- "I-EMPLOYEE_ID": 38,
171
- "I-FIRST_NAME": 40,
172
- "I-HEALTH_PLAN_BENEFICIARY_NUMBER": 42,
173
- "I-IPV4": 44,
174
- "I-IPV6": 46,
175
- "I-LAST_NAME": 48,
176
- "I-LICENSE_PLATE": 50,
177
- "I-MEDICAL_RECORD_NUMBER": 52,
178
- "I-NAME": 54,
179
- "I-NATIONAL_ID": 56,
180
- "I-PASSWORD": 58,
181
- "I-PHONE_NUMBER": 60,
182
- "I-POSTCODE": 62,
183
- "I-SSN": 64,
184
- "I-STATE": 66,
185
- "I-STREET_ADDRESS": 68,
186
- "I-SWIFT_BIC": 70,
187
- "I-TAX_ID": 72,
188
- "I-TIME": 74,
189
- "I-UNIQUE_IDENTIFIER": 76,
190
- "I-URL": 78,
191
- "I-USER_NAME": 80,
192
- "I-VEHICLE_IDENTIFIER": 82,
193
  "O": 0
194
  },
195
- "layer_norm_eps": 1e-05,
196
- "layer_types": [
197
- "full_attention",
198
- "sliding_attention",
199
- "sliding_attention",
200
- "full_attention",
201
- "sliding_attention",
202
- "sliding_attention",
203
- "full_attention",
204
- "sliding_attention",
205
- "sliding_attention",
206
- "full_attention",
207
- "sliding_attention",
208
- "sliding_attention",
209
- "full_attention",
210
- "sliding_attention",
211
- "sliding_attention",
212
- "full_attention",
213
- "sliding_attention",
214
- "sliding_attention",
215
- "full_attention",
216
- "sliding_attention",
217
- "sliding_attention",
218
- "full_attention"
219
- ],
220
- "local_attention": 128,
221
- "max_position_embeddings": 8192,
222
- "mlp_bias": false,
223
- "mlp_dropout": 0.0,
224
- "model_type": "modernbert",
225
- "norm_bias": false,
226
- "norm_eps": 1e-05,
227
  "num_attention_heads": 12,
228
- "num_hidden_layers": 22,
229
- "pad_token_id": 50283,
230
- "position_embedding_type": "absolute",
231
- "rope_parameters": {
232
- "full_attention": {
233
- "rope_theta": 160000.0,
234
- "rope_type": "default"
235
- },
236
- "sliding_attention": {
237
- "rope_theta": 10000.0,
238
- "rope_type": "default"
239
- }
240
- },
241
- "sep_token_id": 50282,
242
- "sparse_pred_ignore_index": -100,
243
- "sparse_prediction": false,
244
  "tie_word_embeddings": true,
245
  "transformers_version": "5.3.0",
 
246
  "use_cache": false,
247
- "vocab_size": 50368
248
  }
 
1
  {
2
+ "add_cross_attention": false,
3
  "architectures": [
4
  "TransformerCrfForTokenClassification"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
 
 
 
 
 
 
10
  "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
 
 
14
  "hidden_size": 768,
15
  "id2label": {
16
+ "0": "O"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
 
18
  "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "is_decoder": false,
21
  "label2id": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "O": 0
23
  },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "pooler_fc_size": 768,
31
+ "pooler_num_attention_heads": 12,
32
+ "pooler_num_fc_layers": 3,
33
+ "pooler_size_per_head": 128,
34
+ "pooler_type": "first_token_transform",
 
 
 
 
 
 
 
 
 
35
  "tie_word_embeddings": true,
36
  "transformers_version": "5.3.0",
37
+ "type_vocab_size": 2,
38
  "use_cache": false,
39
+ "vocab_size": 119547
40
  }
gretel-pii-ready/TokenBased-CRF/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:518c0d9525653860285e6b25047e0052a4c6f1896a88959b99cf9b29d2af3acc
3
- size 596401743
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f62b92b941f41460837da75a5cc8e4a3ca9b95fa49700be682eb87c0ff17c349
3
+ size 711504083
gretel-pii-ready/TokenBased-CRF/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
gretel-pii-ready/TokenBased-CRF/tokenizer_config.json CHANGED
@@ -1,17 +1,15 @@
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
4
- "clean_up_tokenization_spaces": true,
5
  "cls_token": "[CLS]",
 
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
- "model_input_names": [
9
- "input_ids",
10
- "attention_mask"
11
- ],
12
- "model_max_length": 8192,
13
  "pad_token": "[PAD]",
14
  "sep_token": "[SEP]",
15
- "tokenizer_class": "TokenizersBackend",
 
 
16
  "unk_token": "[UNK]"
17
  }
 
1
  {
2
  "add_prefix_space": true,
3
  "backend": "tokenizers",
 
4
  "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
  "is_local": false,
7
  "mask_token": "[MASK]",
8
+ "model_max_length": 512,
 
 
 
 
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
  "unk_token": "[UNK]"
15
  }