Lukekim committed on
Commit 1d78b68
1 Parent(s): 264b9e0

Upload 7 files

config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_or_path": "fairseq-roberta-all-model/checkpoint_last.pt",
+   "architectures": [
+     "XLMRobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.19.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 52001
+ }
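The config above wires a 12-layer, 768-hidden XLM-RoBERTa encoder to a three-class sequence classification head (problem_type single_label_classification). A minimal loading sketch, assuming a local clone of this repo in "./" (the path is an assumption, not part of the upload):

from transformers import XLMRobertaForSequenceClassification

# Assumed local path; the hosted repo id would work the same way.
model = XLMRobertaForSequenceClassification.from_pretrained("./")
print(model.config.num_labels)  # 3, derived from id2label in the config above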
gitattributes.txt ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e837cd62414db71e958b767cf9ef0f1ff98c48afc40300b5826ebb9a6b589934
+ size 503996397
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6628eae3bc0eb53ddfdd1d988cfd6527738fde88e873193055f2fdce468edcc0
+ size 1228505
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenization_roberta_spm.py ADDED
@@ -0,0 +1,200 @@
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team and Gyeongmin Kim
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from transformers.models.xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
+
+ SPIECE_UNDERLINE = "▁"
+
+ VOCAB_FILES_NAMES = {"spm_model": "spm.model", "custom_vocab_file": "dict.txt"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "spm_model": {
+         "fairseq-roberta-spm-normal": "fairseq-roberta-all-model/spm.model",
+     },
+     "custom_vocab_file": {
+         "fairseq-roberta-spm-normal": "fairseq-roberta-all-model/dict.txt",
+     }
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "fairseq-roberta-spm-normal": 512,
+ }
+
+
+ class FairSeqRobertaSentencePieceTokenizer(XLMRobertaTokenizer):
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+     def __init__(
+         self,
+         spm_model,
+         custom_vocab_file,
+         bos_token="[CLS]",
+         eos_token="[SEP]",
+         sep_token="[SEP]",
+         cls_token="[CLS]",
+         unk_token="[UNK]",
+         pad_token="[PAD]",
+         mask_token="[MASK]",
+         **kwargs
+     ):
+         super().__init__(
+             vocab_file=spm_model,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             **kwargs,
+         )
+
+         # FairSeq dictionary: <s>, <pad>, </s>, <unk>, token1, token2, ..., tokenN, <mask>
+         self.symbols = []
+         self.count = []
+         self.spm_id_to_fairseq_id = {}
+         self._add_symbol(self.sp_model.PieceToId(bos_token))
+         self._add_symbol(self.sp_model.PieceToId(pad_token))
+         self._add_symbol(self.sp_model.PieceToId(eos_token))
+         self._add_symbol(self.sp_model.PieceToId(unk_token))
+         self._add_from_file(custom_vocab_file)
+         self._add_symbol(self.sp_model.PieceToId(mask_token))
+
+         self.fairseq_tokens_to_ids = {}
+         self.fairseq_tokens_to_ids = self._build_fairseq_tokens_to_ids()
+         # Vocabulary that maps real tokens to fairseq ids via self.spm_id_to_fairseq_id (the bridge vocab).
+         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+         # Collect some stats like OOV rate.
+         self._num_tokens_converted = 0
+         self._num_tokens_oov = 0
+
+     @property
+     def vocab_size(self):
+         return len(self.symbols)
+
+     @property
+     def pad_token_id(self):
+         return self.fairseq_tokens_to_ids.get(self.pad_token)
+
+     @property
+     def unk_token_id(self):
+         return self.fairseq_tokens_to_ids.get(self.unk_token)
+
+     def reset_stats(self):
+         self._num_tokens_converted = 0
+         self._num_tokens_oov = 0
+
+     def get_stats(self):
+         oov_rate = self._num_tokens_oov / self._num_tokens_converted if self._num_tokens_converted else 0.0
+         result = {
+             "total": self._num_tokens_converted,
+             "oov": self._num_tokens_oov,
+             "oov_rate": oov_rate
+         }
+         return result
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         self._num_tokens_converted += 1
+         if token in self.fairseq_tokens_to_ids:
+             return self.fairseq_tokens_to_ids[token]
+         else:
+             self._num_tokens_oov += 1
+             return self.unk_token_id
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         if index in self.fairseq_ids_to_tokens:
+             return self.fairseq_ids_to_tokens[index]
+         else:
+             return self.unk_token
+
+     def _add_from_file(self, f):
+         """
+         Source: FairSeq Dictionary class.
+         Loads a pre-existing dictionary from a text file and adds its symbols
+         to this instance.
+         """
+         if isinstance(f, str):
+             try:
+                 with open(f, "r", encoding="utf-8") as fd:
+                     self._add_from_file(fd)
+             except FileNotFoundError as fnfe:
+                 raise fnfe
+             except UnicodeError:
+                 raise Exception(
+                     "Incorrect encoding detected in {}, please "
+                     "rebuild the dataset".format(f)
+                 )
+             return
+
+         lines = f.readlines()
+         indices_start_line = 0
+
+         for line in lines[indices_start_line:]:
+             try:
+                 line, field = line.rstrip().rsplit(" ", 1)
+                 if field == "#fairseq:overwrite":
+                     overwrite = True
+                     line, field = line.rsplit(" ", 1)
+                 else:
+                     overwrite = False
+                 count = int(field)
+                 spm_id = line
+                 if spm_id in self.spm_id_to_fairseq_id and not overwrite:
+                     raise RuntimeError(
+                         "Duplicate word found when loading Dictionary: '{}'. "
+                         "Duplicate words can overwrite earlier ones by adding the "
+                         "#fairseq:overwrite flag at the end of the corresponding row "
+                         "in the dictionary file. If using the Camembert model, please "
+                         "download an updated copy of the model file."
+                         .format(spm_id)
+                     )
+                 self._add_symbol(spm_id, n=count, overwrite=overwrite)
+             except ValueError:
+                 raise ValueError(
+                     "Incorrect dictionary format, expected '<token> <cnt> [flags]'"
+                 )
+
+     def _add_symbol(self, spm_id, n=1, overwrite=False):
+         """
+         Source: FairSeq Dictionary class.
+         Adds a word to the dictionary.
+         """
+         if spm_id in self.spm_id_to_fairseq_id and not overwrite:
+             idx = self.spm_id_to_fairseq_id[spm_id]
+             self.count[idx] = self.count[idx] + n
+             return idx
+         else:
+             idx = len(self.symbols)
+             self.spm_id_to_fairseq_id[spm_id] = idx
+             self.symbols.append(spm_id)
+             self.count.append(n)
+             return idx
+
+     def _build_fairseq_tokens_to_ids(self):
+         # Build the vocabulary that maps real tokens to fairseq ids via self.spm_id_to_fairseq_id (the bridge vocab).
+         fairseq_tokens_to_ids = self.fairseq_tokens_to_ids
+         for spm_id, fairseq_id in self.spm_id_to_fairseq_id.items():
+             if isinstance(spm_id, str) and "madeup" in spm_id:
+                 print("[PASS] spm_id: {} | fairseq_id: {}".format(spm_id, fairseq_id))
+                 continue
+             token = self.sp_model.IdToPiece(int(spm_id))
+             # print("token: {} | spm_id: {} | fairseq_id: {}".format(token, spm_id, fairseq_id))
+             fairseq_tokens_to_ids[str(token)] = fairseq_id
+         return fairseq_tokens_to_ids
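The class above subclasses XLMRobertaTokenizer and remaps SentencePiece piece ids to fairseq dictionary ids read from a dict.txt file, tracking OOV counts along the way. A usage sketch, assuming the script sits next to the required files (the dict.txt path is an assumption; that file is not part of this upload):

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer

# sentencepiece.bpe.model is uploaded in this commit; dict.txt is an assumed fairseq dictionary file.
tokenizer = FairSeqRobertaSentencePieceTokenizer(
    spm_model="sentencepiece.bpe.model",
    custom_vocab_file="dict.txt",
)

encoding = tokenizer("an example sentence")
print(encoding["input_ids"])
print(tokenizer.get_stats())  # total tokens converted, OOV count, and OOV rate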
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "cls_token": "[CLS]", "pad_token": "[PAD]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sp_model_kwargs": {}, "special_tokens_map_file": null, "name_or_path": "fairseq-roberta-all-model", "tokenizer_class": "FairSeqRobertaSentencePieceTokenizer"}