KoichiYasuoka committed

Commit db20055
1 Parent(s): 278c0de

model improved

Files changed (7)
  1. README.md +1 -1
  2. config.json +77 -157
  3. pytorch_model.bin +2 -2
  4. supar.model +2 -2
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +1 -1
  7. vocab.txt +0 -0
README.md CHANGED
@@ -16,7 +16,7 @@ pipeline_tag: "token-classification"
 
 ## Model Description
 
-This is a BERT model pre-trained with [UD_German-HDT](https://github.com/UniversalDependencies/UD_German-HDT) for POS-tagging and dependency-parsing, derived from [bert-base-german-cased](https://huggingface.co/bert-base-german-cased). Every word is tagged by [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech).
+This is a BERT model pre-trained with [UD_German-HDT](https://github.com/UniversalDependencies/UD_German-HDT) for POS-tagging and dependency-parsing, derived from [gbert-base](https://huggingface.co/deepset/gbert-base). Every word is tagged by [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech).
 
 ## How to Use
 
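The README's "How to Use" section sits outside this hunk, so its snippet is not shown here. As a stand-in, the following is a minimal sketch of UPOS tagging with the generic transformers token-classification pipeline; the repository ID `KoichiYasuoka/bert-base-german-upos` is an assumption (substitute this repository's actual ID), and dependency parsing would additionally use the bundled supar.model.

```python
# Minimal sketch, not the README's own snippet: UPOS tagging via the generic
# transformers token-classification pipeline. The model ID below is assumed;
# replace it with this repository's actual ID.
from transformers import pipeline

tagger = pipeline(
    "token-classification",
    model="KoichiYasuoka/bert-base-german-upos",  # assumed repository ID
    aggregation_strategy="simple",  # merge subword pieces back into words
)

for token in tagger("Das ist ein Beispielsatz."):
    print(token["word"], token["entity_group"])
```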
config.json CHANGED
@@ -21,56 +21,44 @@
     "10": "B-AUX",
     "11": "B-CCONJ",
     "12": "B-DET",
-    "13": "B-DET+NOUN",
-    "14": "B-INTJ",
-    "15": "B-NOUN",
-    "16": "B-NOUN+PROPN",
-    "17": "B-NUM",
-    "18": "B-PRON",
-    "19": "B-PROPN",
-    "20": "B-PROPN+PROPN",
-    "21": "B-PUNCT",
-    "22": "B-SCONJ",
-    "23": "B-VERB",
-    "24": "B-X",
-    "25": "CCONJ",
-    "26": "CCONJ+NOUN",
-    "27": "DET",
-    "28": "DET+NOUN",
-    "29": "I-ADJ",
-    "30": "I-ADP",
-    "31": "I-ADP+DET",
-    "32": "I-ADV",
-    "33": "I-AUX",
-    "34": "I-CCONJ",
-    "35": "I-DET",
-    "36": "I-DET+NOUN",
-    "37": "I-INTJ",
-    "38": "I-NOUN",
-    "39": "I-NOUN+PROPN",
-    "40": "I-NUM",
-    "41": "I-PRON",
-    "42": "I-PROPN",
-    "43": "I-PROPN+PROPN",
-    "44": "I-PUNCT",
-    "45": "I-SCONJ",
-    "46": "I-VERB",
-    "47": "I-X",
-    "48": "INTJ",
-    "49": "NOUN",
-    "50": "NOUN+PROPN",
-    "51": "NOUN+X",
-    "52": "NUM",
-    "53": "PART",
-    "54": "PRON",
-    "55": "PROPN",
-    "56": "PROPN+PROPN",
-    "57": "PROPN+X",
-    "58": "PUNCT",
-    "59": "SCONJ",
-    "60": "SYM",
-    "61": "VERB",
-    "62": "X"
+    "13": "B-INTJ",
+    "14": "B-NOUN",
+    "15": "B-NUM",
+    "16": "B-PRON",
+    "17": "B-PROPN",
+    "18": "B-PUNCT",
+    "19": "B-SCONJ",
+    "20": "B-VERB",
+    "21": "B-X",
+    "22": "CCONJ",
+    "23": "DET",
+    "24": "I-ADJ",
+    "25": "I-ADP",
+    "26": "I-ADP+DET",
+    "27": "I-ADV",
+    "28": "I-AUX",
+    "29": "I-CCONJ",
+    "30": "I-DET",
+    "31": "I-INTJ",
+    "32": "I-NOUN",
+    "33": "I-NUM",
+    "34": "I-PRON",
+    "35": "I-PROPN",
+    "36": "I-PUNCT",
+    "37": "I-SCONJ",
+    "38": "I-VERB",
+    "39": "I-X",
+    "40": "INTJ",
+    "41": "NOUN",
+    "42": "NUM",
+    "43": "PART",
+    "44": "PRON",
+    "45": "PROPN",
+    "46": "PUNCT",
+    "47": "SCONJ",
+    "48": "SYM",
+    "49": "VERB",
+    "50": "X"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
@@ -88,56 +76,44 @@
     "B-AUX": 10,
     "B-CCONJ": 11,
     "B-DET": 12,
-    "B-DET+NOUN": 13,
-    "B-INTJ": 14,
-    "B-NOUN": 15,
-    "B-NOUN+PROPN": 16,
-    "B-NUM": 17,
-    "B-PRON": 18,
-    "B-PROPN": 19,
-    "B-PROPN+PROPN": 20,
-    "B-PUNCT": 21,
-    "B-SCONJ": 22,
-    "B-VERB": 23,
-    "B-X": 24,
-    "CCONJ": 25,
-    "CCONJ+NOUN": 26,
-    "DET": 27,
-    "DET+NOUN": 28,
-    "I-ADJ": 29,
-    "I-ADP": 30,
-    "I-ADP+DET": 31,
-    "I-ADV": 32,
-    "I-AUX": 33,
-    "I-CCONJ": 34,
-    "I-DET": 35,
-    "I-DET+NOUN": 36,
-    "I-INTJ": 37,
-    "I-NOUN": 38,
-    "I-NOUN+PROPN": 39,
-    "I-NUM": 40,
-    "I-PRON": 41,
-    "I-PROPN": 42,
-    "I-PROPN+PROPN": 43,
-    "I-PUNCT": 44,
-    "I-SCONJ": 45,
-    "I-VERB": 46,
-    "I-X": 47,
-    "INTJ": 48,
-    "NOUN": 49,
-    "NOUN+PROPN": 50,
-    "NOUN+X": 51,
-    "NUM": 52,
-    "PART": 53,
-    "PRON": 54,
-    "PROPN": 55,
-    "PROPN+PROPN": 56,
-    "PROPN+X": 57,
-    "PUNCT": 58,
-    "SCONJ": 59,
-    "SYM": 60,
-    "VERB": 61,
-    "X": 62
+    "B-INTJ": 13,
+    "B-NOUN": 14,
+    "B-NUM": 15,
+    "B-PRON": 16,
+    "B-PROPN": 17,
+    "B-PUNCT": 18,
+    "B-SCONJ": 19,
+    "B-VERB": 20,
+    "B-X": 21,
+    "CCONJ": 22,
+    "DET": 23,
+    "I-ADJ": 24,
+    "I-ADP": 25,
+    "I-ADP+DET": 26,
+    "I-ADV": 27,
+    "I-AUX": 28,
+    "I-CCONJ": 29,
+    "I-DET": 30,
+    "I-INTJ": 31,
+    "I-NOUN": 32,
+    "I-NUM": 33,
+    "I-PRON": 34,
+    "I-PROPN": 35,
+    "I-PUNCT": 36,
+    "I-SCONJ": 37,
+    "I-VERB": 38,
+    "I-X": 39,
+    "INTJ": 40,
+    "NOUN": 41,
+    "NUM": 42,
+    "PART": 43,
+    "PRON": 44,
+    "PROPN": 45,
+    "PUNCT": 46,
+    "SCONJ": 47,
+    "SYM": 48,
+    "VERB": 49,
+    "X": 50
   },
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
@@ -152,10 +128,6 @@
         "selbst\u00e4ndigeDenken\u00b8": [
           "selbst\u00e4ndige",
           "Denken\u00b8"
-        ],
-        "\u00f6ffentlichenInternetcaf\u00e9s": [
-          "\u00f6ffentlichen",
-          "Internetcaf\u00e9s"
         ]
       },
       "ADP+DET": {
@@ -291,58 +263,6 @@
           "\u00fcber",
           "das"
         ]
-      },
-      "CCONJ+NOUN": {
-        "sowieInternetcaf\u00e9s": [
-          "sowie",
-          "Internetcaf\u00e9s"
-        ]
-      },
-      "DET+NOUN": {
-        "dieCr\u00e8me": [
-          "die",
-          "Cr\u00e8me"
-        ],
-        "dieFort\u00e9-Familie": [
-          "die",
-          "Fort\u00e9-Familie"
-        ]
-      },
-      "NOUN+PROPN": {
-        "Highend-ModellCli\u00e9": [
-          "Highend-Modell",
-          "Cli\u00e9"
-        ],
-        "McKinsey-BeraterRen\u00e9": [
-          "McKinsey-Berater",
-          "Ren\u00e9"
-        ],
-        "NachfolgemodellCli\u00e9": [
-          "Nachfolgemodell",
-          "Cli\u00e9"
-        ]
-      },
-      "PROPN+PROPN": {
-        "AlexanderArtop\u00e9": [
-          "Alexander",
-          "Artop\u00e9"
-        ],
-        "ClausS\u00f8rensen": [
-          "Claus",
-          "S\u00f8rensen"
-        ],
-        "Jean-LouisGass\u00e9e": [
-          "Jean-Louis",
-          "Gass\u00e9e"
-        ],
-        "JoelleRichardi\u00e9re": [
-          "Joelle",
-          "Richardi\u00e9re"
-        ],
-        "LeD\u00e9aut": [
-          "Le",
-          "D\u00e9aut"
-        ]
       }
     }
   },
@@ -351,5 +271,5 @@
   "transformers_version": "4.17.0",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size": 30000
+  "vocab_size": 31102
 }
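The config.json change above shrinks the tag inventory from 63 to 51 labels (the concatenated tags such as `DET+NOUN` and `PROPN+PROPN` are gone) and raises `vocab_size` from 30000 to 31102, the vocabulary size of deepset/gbert-base. A quick way to verify both after this commit, again using an assumed repository ID:

```python
# Sketch for inspecting the updated configuration; the repository ID is assumed.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("KoichiYasuoka/bert-base-german-upos")
print(len(config.id2label))  # 51 labels after this commit (was 63)
print(config.vocab_size)     # 31102, i.e. the gbert-base vocabulary size
```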
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:22c187760763df4a0ba59d50560fd2464f373b3e771ffd63b4bf2a81430eae04
-size 434244299
+oid sha256:4b142e370aff8d696daa9032fe62347bc15e9a5536b7486ff390b7e949806b8e
+size 437592708
supar.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6767137a559c0cda3f40dacaa05190e965713588b2e3d5aea8a355f830e9cad
-size 513838054
+oid sha256:2209e84685d556258b7ce1e2a62e03fcb4cdb0f651fb32cb37836c74feea9cc9
+size 517221736
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "tokenizer_class": "BertTokenizerFast"}
+{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": false, "model_max_length": 512, "do_basic_tokenize": true, "tokenizer_class": "BertTokenizerFast"}
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff