tomaarsen HF staff committed on
Commit
feb9524
1 Parent(s): ec94028

Upload model

Browse files
README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ license: apache-2.0
4
+ library_name: span-marker
5
+ tags:
6
+ - span-marker
7
+ - token-classification
8
+ - ner
9
+ - named-entity-recognition
10
+ pipeline_tag: token-classification
11
+ ---
12
+
13
+ # SpanMarker for Named Entity Recognition
14
+
15
+ This is a [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) model that can be used for Named Entity Recognition. In particular, this SpanMarker model uses [bert-base-cased](https://huggingface.co/bert-base-cased) as the underlying encoder.
16
+
17
+
18
+ ## Usage
19
+
20
+ To use this model for inference, first install the `span_marker` library:
21
+
22
+ ```bash
23
+ pip install span_marker
24
+ ```
25
+
26
+ You can then run inference with this model like so (replace `span_marker_model_name` with this model's repository ID on the 🤗 Hub):
27
+
28
+ ```python
29
+ from span_marker import SpanMarkerModel
30
+
31
+ # Download from the 🤗 Hub
32
+ model = SpanMarkerModel.from_pretrained("span_marker_model_name")
33
+ # Run inference
34
+ entities = model.predict("Amelia Earhart flew her single engine Lockheed Vega 5B across the Atlantic to Paris.")
35
+ ```
36
+
37
+ See the [SpanMarker](https://github.com/tomaarsen/SpanMarkerNER) repository for documentation and additional information on this library.
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<end>": 28997,
3
+ "<start>": 28996
4
+ }
config.json ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "models\\span_marker_bert_base_cross_ner_P3ps\\checkpoint-final",
3
+ "architectures": [
4
+ "SpanMarkerModel"
5
+ ],
6
+ "encoder": {
7
+ "_name_or_path": "bert-base-cased",
8
+ "add_cross_attention": false,
9
+ "architectures": [
10
+ "BertForMaskedLM"
11
+ ],
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "bad_words_ids": null,
14
+ "begin_suppress_tokens": null,
15
+ "bos_token_id": null,
16
+ "chunk_size_feed_forward": 0,
17
+ "classifier_dropout": null,
18
+ "cross_attention_hidden_size": null,
19
+ "decoder_start_token_id": null,
20
+ "diversity_penalty": 0.0,
21
+ "do_sample": false,
22
+ "early_stopping": false,
23
+ "encoder_no_repeat_ngram_size": 0,
24
+ "eos_token_id": null,
25
+ "exponential_decay_length_penalty": null,
26
+ "finetuning_task": null,
27
+ "forced_bos_token_id": null,
28
+ "forced_eos_token_id": null,
29
+ "gradient_checkpointing": false,
30
+ "hidden_act": "gelu",
31
+ "hidden_dropout_prob": 0.1,
32
+ "hidden_size": 768,
33
+ "id2label": {
34
+ "0": "O",
35
+ "1": "B-academicjournal",
36
+ "2": "I-academicjournal",
37
+ "3": "B-album",
38
+ "4": "I-album",
39
+ "5": "B-algorithm",
40
+ "6": "I-algorithm",
41
+ "7": "B-astronomicalobject",
42
+ "8": "I-astronomicalobject",
43
+ "9": "B-award",
44
+ "10": "I-award",
45
+ "11": "B-band",
46
+ "12": "I-band",
47
+ "13": "B-book",
48
+ "14": "I-book",
49
+ "15": "B-chemicalcompound",
50
+ "16": "I-chemicalcompound",
51
+ "17": "B-chemicalelement",
52
+ "18": "I-chemicalelement",
53
+ "19": "B-conference",
54
+ "20": "I-conference",
55
+ "21": "B-country",
56
+ "22": "I-country",
57
+ "23": "B-discipline",
58
+ "24": "I-discipline",
59
+ "25": "B-election",
60
+ "26": "I-election",
61
+ "27": "B-enzyme",
62
+ "28": "I-enzyme",
63
+ "29": "B-event",
64
+ "30": "I-event",
65
+ "31": "B-field",
66
+ "32": "I-field",
67
+ "33": "B-literarygenre",
68
+ "34": "I-literarygenre",
69
+ "35": "B-location",
70
+ "36": "I-location",
71
+ "37": "B-magazine",
72
+ "38": "I-magazine",
73
+ "39": "B-metrics",
74
+ "40": "I-metrics",
75
+ "41": "B-misc",
76
+ "42": "I-misc",
77
+ "43": "B-musicalartist",
78
+ "44": "I-musicalartist",
79
+ "45": "B-musicalinstrument",
80
+ "46": "I-musicalinstrument",
81
+ "47": "B-musicgenre",
82
+ "48": "I-musicgenre",
83
+ "49": "B-organisation",
84
+ "50": "I-organisation",
85
+ "51": "B-person",
86
+ "52": "I-person",
87
+ "53": "B-poem",
88
+ "54": "I-poem",
89
+ "55": "B-politicalparty",
90
+ "56": "I-politicalparty",
91
+ "57": "B-politician",
92
+ "58": "I-politician",
93
+ "59": "B-product",
94
+ "60": "I-product",
95
+ "61": "B-programlang",
96
+ "62": "I-programlang",
97
+ "63": "B-protein",
98
+ "64": "I-protein",
99
+ "65": "B-researcher",
100
+ "66": "I-researcher",
101
+ "67": "B-scientist",
102
+ "68": "I-scientist",
103
+ "69": "B-song",
104
+ "70": "I-song",
105
+ "71": "B-task",
106
+ "72": "I-task",
107
+ "73": "B-theory",
108
+ "74": "I-theory",
109
+ "75": "B-university",
110
+ "76": "I-university",
111
+ "77": "B-writer",
112
+ "78": "I-writer"
113
+ },
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 3072,
116
+ "is_decoder": false,
117
+ "is_encoder_decoder": false,
118
+ "label2id": {
119
+ "B-academicjournal": 1,
120
+ "B-album": 3,
121
+ "B-algorithm": 5,
122
+ "B-astronomicalobject": 7,
123
+ "B-award": 9,
124
+ "B-band": 11,
125
+ "B-book": 13,
126
+ "B-chemicalcompound": 15,
127
+ "B-chemicalelement": 17,
128
+ "B-conference": 19,
129
+ "B-country": 21,
130
+ "B-discipline": 23,
131
+ "B-election": 25,
132
+ "B-enzyme": 27,
133
+ "B-event": 29,
134
+ "B-field": 31,
135
+ "B-literarygenre": 33,
136
+ "B-location": 35,
137
+ "B-magazine": 37,
138
+ "B-metrics": 39,
139
+ "B-misc": 41,
140
+ "B-musicalartist": 43,
141
+ "B-musicalinstrument": 45,
142
+ "B-musicgenre": 47,
143
+ "B-organisation": 49,
144
+ "B-person": 51,
145
+ "B-poem": 53,
146
+ "B-politicalparty": 55,
147
+ "B-politician": 57,
148
+ "B-product": 59,
149
+ "B-programlang": 61,
150
+ "B-protein": 63,
151
+ "B-researcher": 65,
152
+ "B-scientist": 67,
153
+ "B-song": 69,
154
+ "B-task": 71,
155
+ "B-theory": 73,
156
+ "B-university": 75,
157
+ "B-writer": 77,
158
+ "I-academicjournal": 2,
159
+ "I-album": 4,
160
+ "I-algorithm": 6,
161
+ "I-astronomicalobject": 8,
162
+ "I-award": 10,
163
+ "I-band": 12,
164
+ "I-book": 14,
165
+ "I-chemicalcompound": 16,
166
+ "I-chemicalelement": 18,
167
+ "I-conference": 20,
168
+ "I-country": 22,
169
+ "I-discipline": 24,
170
+ "I-election": 26,
171
+ "I-enzyme": 28,
172
+ "I-event": 30,
173
+ "I-field": 32,
174
+ "I-literarygenre": 34,
175
+ "I-location": 36,
176
+ "I-magazine": 38,
177
+ "I-metrics": 40,
178
+ "I-misc": 42,
179
+ "I-musicalartist": 44,
180
+ "I-musicalinstrument": 46,
181
+ "I-musicgenre": 48,
182
+ "I-organisation": 50,
183
+ "I-person": 52,
184
+ "I-poem": 54,
185
+ "I-politicalparty": 56,
186
+ "I-politician": 58,
187
+ "I-product": 60,
188
+ "I-programlang": 62,
189
+ "I-protein": 64,
190
+ "I-researcher": 66,
191
+ "I-scientist": 68,
192
+ "I-song": 70,
193
+ "I-task": 72,
194
+ "I-theory": 74,
195
+ "I-university": 76,
196
+ "I-writer": 78,
197
+ "O": 0
198
+ },
199
+ "layer_norm_eps": 1e-12,
200
+ "length_penalty": 1.0,
201
+ "max_length": 20,
202
+ "max_position_embeddings": 512,
203
+ "min_length": 0,
204
+ "model_type": "bert",
205
+ "no_repeat_ngram_size": 0,
206
+ "num_attention_heads": 12,
207
+ "num_beam_groups": 1,
208
+ "num_beams": 1,
209
+ "num_hidden_layers": 12,
210
+ "num_return_sequences": 1,
211
+ "output_attentions": false,
212
+ "output_hidden_states": false,
213
+ "output_scores": false,
214
+ "pad_token_id": 0,
215
+ "position_embedding_type": "absolute",
216
+ "prefix": null,
217
+ "problem_type": null,
218
+ "pruned_heads": {},
219
+ "remove_invalid_values": false,
220
+ "repetition_penalty": 1.0,
221
+ "return_dict": true,
222
+ "return_dict_in_generate": false,
223
+ "sep_token_id": null,
224
+ "suppress_tokens": null,
225
+ "task_specific_params": null,
226
+ "temperature": 1.0,
227
+ "tf_legacy_loss": false,
228
+ "tie_encoder_decoder": false,
229
+ "tie_word_embeddings": true,
230
+ "tokenizer_class": null,
231
+ "top_k": 50,
232
+ "top_p": 1.0,
233
+ "torch_dtype": null,
234
+ "torchscript": false,
235
+ "transformers_version": "4.31.0",
236
+ "type_vocab_size": 2,
237
+ "typical_p": 1.0,
238
+ "use_bfloat16": false,
239
+ "use_cache": true,
240
+ "vocab_size": 28998
241
+ },
242
+ "entity_max_length": 8,
243
+ "id2label": {
244
+ "0": "O",
245
+ "1": "academicjournal",
246
+ "2": "album",
247
+ "3": "algorithm",
248
+ "4": "astronomicalobject",
249
+ "5": "award",
250
+ "6": "band",
251
+ "7": "book",
252
+ "8": "chemicalcompound",
253
+ "9": "chemicalelement",
254
+ "10": "conference",
255
+ "11": "country",
256
+ "12": "discipline",
257
+ "13": "election",
258
+ "14": "enzyme",
259
+ "15": "event",
260
+ "16": "field",
261
+ "17": "literarygenre",
262
+ "18": "location",
263
+ "19": "magazine",
264
+ "20": "metrics",
265
+ "21": "misc",
266
+ "22": "musicalartist",
267
+ "23": "musicalinstrument",
268
+ "24": "musicgenre",
269
+ "25": "organisation",
270
+ "26": "person",
271
+ "27": "poem",
272
+ "28": "politicalparty",
273
+ "29": "politician",
274
+ "30": "product",
275
+ "31": "programlang",
276
+ "32": "protein",
277
+ "33": "researcher",
278
+ "34": "scientist",
279
+ "35": "song",
280
+ "36": "task",
281
+ "37": "theory",
282
+ "38": "university",
283
+ "39": "writer"
284
+ },
285
+ "id2reduced_id": {
286
+ "0": 0,
287
+ "1": 1,
288
+ "2": 1,
289
+ "3": 2,
290
+ "4": 2,
291
+ "5": 3,
292
+ "6": 3,
293
+ "7": 4,
294
+ "8": 4,
295
+ "9": 5,
296
+ "10": 5,
297
+ "11": 6,
298
+ "12": 6,
299
+ "13": 7,
300
+ "14": 7,
301
+ "15": 8,
302
+ "16": 8,
303
+ "17": 9,
304
+ "18": 9,
305
+ "19": 10,
306
+ "20": 10,
307
+ "21": 11,
308
+ "22": 11,
309
+ "23": 12,
310
+ "24": 12,
311
+ "25": 13,
312
+ "26": 13,
313
+ "27": 14,
314
+ "28": 14,
315
+ "29": 15,
316
+ "30": 15,
317
+ "31": 16,
318
+ "32": 16,
319
+ "33": 17,
320
+ "34": 17,
321
+ "35": 18,
322
+ "36": 18,
323
+ "37": 19,
324
+ "38": 19,
325
+ "39": 20,
326
+ "40": 20,
327
+ "41": 21,
328
+ "42": 21,
329
+ "43": 22,
330
+ "44": 22,
331
+ "45": 23,
332
+ "46": 23,
333
+ "47": 24,
334
+ "48": 24,
335
+ "49": 25,
336
+ "50": 25,
337
+ "51": 26,
338
+ "52": 26,
339
+ "53": 27,
340
+ "54": 27,
341
+ "55": 28,
342
+ "56": 28,
343
+ "57": 29,
344
+ "58": 29,
345
+ "59": 30,
346
+ "60": 30,
347
+ "61": 31,
348
+ "62": 31,
349
+ "63": 32,
350
+ "64": 32,
351
+ "65": 33,
352
+ "66": 33,
353
+ "67": 34,
354
+ "68": 34,
355
+ "69": 35,
356
+ "70": 35,
357
+ "71": 36,
358
+ "72": 36,
359
+ "73": 37,
360
+ "74": 37,
361
+ "75": 38,
362
+ "76": 38,
363
+ "77": 39,
364
+ "78": 39
365
+ },
366
+ "label2id": {
367
+ "O": 0,
368
+ "academicjournal": 1,
369
+ "album": 2,
370
+ "algorithm": 3,
371
+ "astronomicalobject": 4,
372
+ "award": 5,
373
+ "band": 6,
374
+ "book": 7,
375
+ "chemicalcompound": 8,
376
+ "chemicalelement": 9,
377
+ "conference": 10,
378
+ "country": 11,
379
+ "discipline": 12,
380
+ "election": 13,
381
+ "enzyme": 14,
382
+ "event": 15,
383
+ "field": 16,
384
+ "literarygenre": 17,
385
+ "location": 18,
386
+ "magazine": 19,
387
+ "metrics": 20,
388
+ "misc": 21,
389
+ "musicalartist": 22,
390
+ "musicalinstrument": 23,
391
+ "musicgenre": 24,
392
+ "organisation": 25,
393
+ "person": 26,
394
+ "poem": 27,
395
+ "politicalparty": 28,
396
+ "politician": 29,
397
+ "product": 30,
398
+ "programlang": 31,
399
+ "protein": 32,
400
+ "researcher": 33,
401
+ "scientist": 34,
402
+ "song": 35,
403
+ "task": 36,
404
+ "theory": 37,
405
+ "university": 38,
406
+ "writer": 39
407
+ },
408
+ "marker_max_length": 128,
409
+ "max_next_context": null,
410
+ "max_prev_context": null,
411
+ "model_max_length": 256,
412
+ "model_max_length_default": 512,
413
+ "model_type": "span-marker",
414
+ "span_marker_version": "1.2.5.dev",
415
+ "torch_dtype": "float32",
416
+ "trained_with_document_context": false,
417
+ "transformers_version": "4.31.0",
418
+ "vocab_size": 28998
419
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:897a7386d49089a21fce80675e020b782b7f31ac4e4ae625ca8d35841aff211c
3
+ size 433559281
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "[CLS]",
5
+ "do_lower_case": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff