IlyaGusev committed
Commit 46f7742
1 Parent(s): dfdd24c

Small files

Files changed (5):
  1. config.json +39 -0
  2. special_tokens_map.json +6 -0
  3. tokenizer.py +121 -0
  4. tokenizer_config.json +7 -0
  5. vocab.txt +71 -0
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "architectures": [
+     "DebertaV2ForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 128,
+   "id2label": {
+     "0": "NO",
+     "1": "PRIMARY",
+     "2": "SECONDARY"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 512,
+   "label2id": {
+     "NO": 0,
+     "PRIMARY": 1,
+     "SECONDARY": 2
+   },
+   "layer_norm_eps": 1e-07,
+   "max_length": 40,
+   "max_position_embeddings": 64,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "num_attention_heads": 4,
+   "num_hidden_layers": 4,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 128,
+   "pos_att_type": null,
+   "position_biased_input": true,
+   "relative_attention": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.1",
+   "type_vocab_size": 0,
+   "vocab_size": 71
+ }
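
The config above describes a compact DeBERTa-v2 token-classification model: 4 layers, hidden size 128, a 71-symbol character vocabulary, and three labels (NO, PRIMARY, SECONDARY). A minimal sketch of loading it, assuming the files from this commit are in the working directory; the commit ships no weights, so the model below is randomly initialized:

    from transformers import DebertaV2Config, DebertaV2ForTokenClassification

    # Build an (untrained) model from the config added in this commit.
    config = DebertaV2Config.from_json_file("config.json")
    model = DebertaV2ForTokenClassification(config)
    print(model.config.id2label)  # {0: 'NO', 1: 'PRIMARY', 2: 'SECONDARY'}
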
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "[BOS]",
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer.py ADDED
@@ -0,0 +1,121 @@
+ import os
+ from typing import Optional, Tuple, List
+ from collections import OrderedDict
+
+ from transformers import PreTrainedTokenizer
+
+
+ def load_vocab(vocab_file):
+     """Load a character vocabulary: one token per line, id = line index."""
+     vocab = OrderedDict()
+     with open(vocab_file, "r", encoding="utf-8") as reader:
+         tokens = reader.readlines()
+     for index, token in enumerate(tokens):
+         token = token.rstrip("\n")
+         vocab[token] = index
+     return vocab
+
+
+ class CharTokenizer(PreTrainedTokenizer):
+     vocab_files_names = {"vocab_file": "vocab.txt"}
+
+     def __init__(
+         self,
+         vocab_file=None,
+         pad_token="[PAD]",
+         unk_token="[UNK]",
+         bos_token="[BOS]",
+         eos_token="[EOS]",
+         *args,
+         **kwargs
+     ):
+         super().__init__(
+             pad_token=pad_token,
+             unk_token=unk_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             **kwargs
+         )
+
+         if not vocab_file or not os.path.isfile(vocab_file):
+             self.vocab = OrderedDict()
+             self.ids_to_tokens = OrderedDict()
+         else:
+             self.vocab = load_vocab(vocab_file)
+             self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+
+     def train(self, file_path):
+         """Build the vocabulary from a plain-text file, one word per line."""
+         vocab = set()
+         with open(file_path) as r:
+             for line in r:
+                 word = line.strip()
+                 vocab |= set(word)
+         vocab = list(vocab)
+         vocab.sort()
+         special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token]
+         vocab = special_tokens + vocab
+
+         for i, ch in enumerate(vocab):
+             self.vocab[ch] = i
+         self.ids_to_tokens = vocab
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return self.vocab
+
+     def _convert_token_to_id(self, token):
+         # Fall back to the [UNK] id for characters outside the vocabulary.
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         return self.ids_to_tokens[index]
+
+     def _tokenize(self, text):
+         # Character-level tokenization: every symbol is its own token.
+         return list(text)
+
+     def convert_tokens_to_string(self, tokens):
+         return "".join(tokens)
+
+     def build_inputs_with_special_tokens(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         bos = [self.bos_token_id]
+         eos = [self.eos_token_id]
+         return bos + token_ids_0 + eos
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         return (len(token_ids_0) + 2) * [0]
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         assert os.path.isdir(save_directory)
+         vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") +
+             self.vocab_files_names["vocab_file"]
+         )
+         index = 0
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                 assert index == token_index
+                 writer.write(token + "\n")
+                 index += 1
+         return (vocab_file,)
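
tokenizer.py implements a character-level tokenizer: _tokenize splits text into individual symbols and build_inputs_with_special_tokens wraps the ids in [BOS]/[EOS]. A minimal usage sketch, assuming tokenizer.py and vocab.txt from this commit sit in the working directory and a transformers version compatible with them (the config was written with 4.20.1):

    from tokenizer import CharTokenizer

    tokenizer = CharTokenizer(vocab_file="vocab.txt")
    enc = tokenizer("привет")   # ids for [BOS] + one id per character + [EOS]
    print(enc["input_ids"])
    print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
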
tokenizer_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "bos_token": "[BOS]",
+   "eos_token": "[EOS]",
+   "pad_token": "[PAD]",
+   "tokenizer_class": "CharTokenizer",
+   "unk_token": "[UNK]"
+ }
vocab.txt ADDED
@@ -0,0 +1,71 @@
+ [PAD]
+ [UNK]
+ [BOS]
+ [EOS]
+ '
+ -
+ `
+ Ё
+ А
+ Б
+ В
+ Г
+ Д
+ Е
+ Ж
+ З
+ И
+ Й
+ К
+ Л
+ М
+ Н
+ О
+ П
+ Р
+ С
+ Т
+ У
+ Ф
+ Х
+ Ц
+ Ч
+ Ш
+ Щ
+ Ы
+ Э
+ Ю
+ Я
+ а
+ б
+ в
+ г
+ д
+ е
+ ж
+ з
+ и
+ й
+ к
+ л
+ м
+ н
+ о
+ п
+ р
+ с
+ т
+ у
+ ф
+ х
+ ц
+ ч
+ ш
+ щ
+ ъ
+ ы
+ ь
+ э
+ ю
+ я
+ ё
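
Taken together, the five files describe a small character-level token-classification setup. The label names (NO/PRIMARY/SECONDARY) suggest per-character labeling such as stress placement, but the commit itself does not state the task, and no weights are included. An end-to-end sketch under those assumptions, with a randomly initialized model:

    import torch
    from transformers import DebertaV2Config, DebertaV2ForTokenClassification
    from tokenizer import CharTokenizer

    tokenizer = CharTokenizer(vocab_file="vocab.txt")
    config = DebertaV2Config.from_json_file("config.json")
    model = DebertaV2ForTokenClassification(config)  # untrained: weights are not part of this commit

    enc = tokenizer("молоко", return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits                 # shape (1, sequence_length, 3)
    ids = enc["input_ids"][0].tolist()
    labels = [config.id2label[i] for i in logits.argmax(dim=-1)[0].tolist()]
    print(list(zip(tokenizer.convert_ids_to_tokens(ids), labels)))
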