emfomy committed on
Commit
d5f0a17
1 Parent(s): 0e019d0

Upload model files.

Browse files
Files changed (6) hide show
  1. README.md +47 -0
  2. config.json +152 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +1 -0
  5. tokenizer_config.json +1 -0
  6. vocab.txt +0 -0
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - zh
4
+ thumbnail: https://ckip.iis.sinica.edu.tw/files/ckip_logo.png
5
+ tags:
6
+ - pytorch
7
+ - token-classification
8
+ - bert
9
+ - zh
10
+ license: gpl-3.0
11
+ datasets:
12
+ metrics:
13
+ ---
14
+
15
+ # CKIP BERT Tiny Chinese
16
+
17
+ This project provides Traditional Chinese transformers models (including ALBERT, BERT, GPT2) and NLP tools (including word segmentation, part-of-speech tagging, named entity recognition).
18
+
19
+ 這個專案提供了繁體中文的 transformers 模型(包含 ALBERT、BERT、GPT2)及自然語言處理工具(包含斷詞、詞性標記、實體辨識)。
20
+
21
+ ## Homepage
22
+
23
+ * https://github.com/ckiplab/ckip-transformers
24
+
25
+ ## Contributors
26
+
27
+ * [Mu Yang](https://muyang.pro) at [CKIP](https://ckip.iis.sinica.edu.tw) (Author & Maintainer)
28
+
29
+ ## Usage
30
+
31
+ Please use BertTokenizerFast as the tokenizer instead of AutoTokenizer.
32
+
33
+ 請使用 BertTokenizerFast 而非 AutoTokenizer。
34
+
35
+ ```
36
+ from transformers import (
37
+ BertTokenizerFast,
38
+ AutoModel,
39
+ )
40
+
41
+ tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
42
+ model = AutoModel.from_pretrained('ckiplab/bert-tiny-chinese-pos')
43
+ ```
44
+
45
+ For full usage and more information, please refer to https://github.com/ckiplab/ckip-transformers.
46
+
47
+ 有關完整使用方法及其他資訊,請參見 https://github.com/ckiplab/ckip-transformers 。
config.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../../../model/bert-tiny-scratch-lm",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "directionality": "bidi",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 312,
12
+ "id2label": {
13
+ "0": "A",
14
+ "1": "Caa",
15
+ "2": "Cab",
16
+ "3": "Cba",
17
+ "4": "Cbb",
18
+ "5": "D",
19
+ "6": "Da",
20
+ "7": "Dfa",
21
+ "8": "Dfb",
22
+ "9": "Di",
23
+ "10": "Dk",
24
+ "11": "DM",
25
+ "12": "I",
26
+ "13": "Na",
27
+ "14": "Nb",
28
+ "15": "Nc",
29
+ "16": "Ncd",
30
+ "17": "Nd",
31
+ "18": "Nep",
32
+ "19": "Neqa",
33
+ "20": "Neqb",
34
+ "21": "Nes",
35
+ "22": "Neu",
36
+ "23": "Nf",
37
+ "24": "Ng",
38
+ "25": "Nh",
39
+ "26": "Nv",
40
+ "27": "P",
41
+ "28": "T",
42
+ "29": "VA",
43
+ "30": "VAC",
44
+ "31": "VB",
45
+ "32": "VC",
46
+ "33": "VCL",
47
+ "34": "VD",
48
+ "35": "VF",
49
+ "36": "VE",
50
+ "37": "VG",
51
+ "38": "VH",
52
+ "39": "VHC",
53
+ "40": "VI",
54
+ "41": "VJ",
55
+ "42": "VK",
56
+ "43": "VL",
57
+ "44": "V_2",
58
+ "45": "DE",
59
+ "46": "SHI",
60
+ "47": "FW",
61
+ "48": "COLONCATEGORY",
62
+ "49": "COMMACATEGORY",
63
+ "50": "DASHCATEGORY",
64
+ "51": "DOTCATEGORY",
65
+ "52": "ETCCATEGORY",
66
+ "53": "EXCLAMATIONCATEGORY",
67
+ "54": "PARENTHESISCATEGORY",
68
+ "55": "PAUSECATEGORY",
69
+ "56": "PERIODCATEGORY",
70
+ "57": "QUESTIONCATEGORY",
71
+ "58": "SEMICOLONCATEGORY",
72
+ "59": "SPCHANGECATEGORY"
73
+ },
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 1248,
76
+ "label2id": {
77
+ "A": 0,
78
+ "COLONCATEGORY": 48,
79
+ "COMMACATEGORY": 49,
80
+ "Caa": 1,
81
+ "Cab": 2,
82
+ "Cba": 3,
83
+ "Cbb": 4,
84
+ "D": 5,
85
+ "DASHCATEGORY": 50,
86
+ "DE": 45,
87
+ "DM": 11,
88
+ "DOTCATEGORY": 51,
89
+ "Da": 6,
90
+ "Dfa": 7,
91
+ "Dfb": 8,
92
+ "Di": 9,
93
+ "Dk": 10,
94
+ "ETCCATEGORY": 52,
95
+ "EXCLAMATIONCATEGORY": 53,
96
+ "FW": 47,
97
+ "I": 12,
98
+ "Na": 13,
99
+ "Nb": 14,
100
+ "Nc": 15,
101
+ "Ncd": 16,
102
+ "Nd": 17,
103
+ "Nep": 18,
104
+ "Neqa": 19,
105
+ "Neqb": 20,
106
+ "Nes": 21,
107
+ "Neu": 22,
108
+ "Nf": 23,
109
+ "Ng": 24,
110
+ "Nh": 25,
111
+ "Nv": 26,
112
+ "P": 27,
113
+ "PARENTHESISCATEGORY": 54,
114
+ "PAUSECATEGORY": 55,
115
+ "PERIODCATEGORY": 56,
116
+ "QUESTIONCATEGORY": 57,
117
+ "SEMICOLONCATEGORY": 58,
118
+ "SHI": 46,
119
+ "SPCHANGECATEGORY": 59,
120
+ "T": 28,
121
+ "VA": 29,
122
+ "VAC": 30,
123
+ "VB": 31,
124
+ "VC": 32,
125
+ "VCL": 33,
126
+ "VD": 34,
127
+ "VE": 36,
128
+ "VF": 35,
129
+ "VG": 37,
130
+ "VH": 38,
131
+ "VHC": 39,
132
+ "VI": 40,
133
+ "VJ": 41,
134
+ "VK": 42,
135
+ "VL": 43,
136
+ "V_2": 44
137
+ },
138
+ "layer_norm_eps": 1e-12,
139
+ "max_position_embeddings": 512,
140
+ "model_type": "bert",
141
+ "num_attention_heads": 12,
142
+ "num_hidden_layers": 4,
143
+ "pad_token_id": 0,
144
+ "pooler_fc_size": 312,
145
+ "pooler_num_attention_heads": 12,
146
+ "pooler_num_fc_layers": 3,
147
+ "pooler_size_per_head": 128,
148
+ "pooler_type": "first_token_transform",
149
+ "tokenizer_class": "BertTokenizerFast",
150
+ "type_vocab_size": 2,
151
+ "vocab_size": 21128
152
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35cc03bfebeabbd283208389b29664a7ddf613ffc03cb24f4d266f9f11243e5
3
+ size 45874807
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "bert-base-chinese"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff