floriangardin committed
Commit 53f85bf • Parent: 21e897e

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +146 -0
  3. tokenizer_config.json +15 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
tokenizer.json ADDED
@@ -0,0 +1,146 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "NFKC"
+   },
+   "pre_tokenizer": {
+     "type": "Metaspace",
+     "replacement": "▁",
+     "add_prefix_space": true
+   },
+   "post_processor": null,
+   "decoder": {
+     "type": "Metaspace",
+     "replacement": "▁",
+     "add_prefix_space": true
+   },
+   "model": {
+     "type": "BPE",
+     "dropout": null,
+     "unk_token": "<unk>",
+     "continuing_subword_prefix": null,
+     "end_of_word_suffix": null,
+     "fuse_unk": false,
+     "byte_fallback": false,
+     "vocab": {
+       "<unk>": 0,
+       "'": 1,
+       "(": 2,
+       ")": 3,
+       "*": 4,
+       "+": 5,
+       ".": 6,
+       "1": 7,
+       "6": 8,
+       "@": 9,
+       "C": 10,
+       "D": 11,
+       "F": 12,
+       "S": 13,
+       "U": 14,
+       "Z": 15,
+       "[": 16,
+       "\\": 17,
+       "^": 18,
+       "_": 19,
+       "`": 20,
+       "a": 21,
+       "k": 22,
+       "l": 23,
+       "p": 24,
+       "r": 25,
+       "~": 26,
+       "": 27,
+       "ˆ": 28,
+       "Š": 29,
+       "Đ": 30,
+       "đ": 31,
+       "▁": 32,
+       "'.": 33,
+       "(+": 34,
+       "(+'.": 35,
+       "(+'.*": 36,
+       ")(+'.*": 37,
+       "6(+'.*": 38,
+       "D(+'.*": 39,
+       ")(+'.*6(+'.*": 40,
+       "D(+'.*C": 41,
+       ")(+'.*6(+'.*D(+'.*C": 42,
+       "*)(+'.*6(+'.*D(+'.*C": 43,
+       "▁1": 44,
+       "▁@": 45,
+       "▁F": 46,
+       "▁S": 47,
+       "▁U": 48,
+       "▁Z": 49,
+       "▁[": 50,
+       "▁^": 51,
+       "▁`": 52,
+       "▁k": 53,
+       "▁p": 54,
+       "▁~": 55,
+       "▁ˆ": 56,
+       "▁*)(+'.*6(+'.*D(+'.*C": 57,
+       "(+'.*)(+'.*6(+'.*D(+'.*C": 58,
+       "▁*)(+'.*6(+'.*D(+'.*C(+'.*)(+'.*6(+'.*D(+'.*C": 59,
+       "▁*)(+'.*6(+'.*D(+'.*C(+'.*)(+'.*6(+'.*D(+'.*C(+'.": 60,
+       "▁\\": 61,
+       "▁_": 62,
+       "▁a": 63,
+       "▁l": 64,
+       "▁r": 65,
+       "▁": 66,
+       "▁Š": 67
+     },
+     "merges": [
+       "' .",
+       "( +",
+       "(+ '.",
+       "(+'. *",
+       ") (+'.*",
+       "6 (+'.*",
+       "D (+'.*",
+       ")(+'.* 6(+'.*",
+       "D(+'.* C",
+       ")(+'.*6(+'.* D(+'.*C",
+       "* )(+'.*6(+'.*D(+'.*C",
+       "▁ 1",
+       "▁ @",
+       "▁ F",
+       "▁ S",
+       "▁ U",
+       "▁ Z",
+       "▁ [",
+       "▁ ^",
+       "▁ `",
+       "▁ k",
+       "▁ p",
+       "▁ ~",
+       "▁ ˆ",
+       "▁ *)(+'.*6(+'.*D(+'.*C",
+       "(+'.* )(+'.*6(+'.*D(+'.*C",
+       "▁*)(+'.*6(+'.*D(+'.*C (+'.*)(+'.*6(+'.*D(+'.*C",
+       "▁*)(+'.*6(+'.*D(+'.*C(+'.*)(+'.*6(+'.*D(+'.*C (+'.",
+       "▁ \\",
+       "▁ _",
+       "▁ a",
+       "▁ l",
+       "▁ r",
+       "▁ ",
+       "▁ Š"
+     ]
+   }
+ }
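
Note: the file above is directly loadable with the Hugging Face `tokenizers` library. A minimal sketch follows, assuming a local copy of tokenizer.json; the sample input string is illustrative only, not part of this commit:

    # Minimal sketch: load the BPE tokenizer defined by tokenizer.json and
    # inspect how it splits a string (assumes `pip install tokenizers`).
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")

    # Arbitrary sample built from characters that appear in the vocab above.
    enc = tok.encode("Zkp la")
    print(enc.tokens)  # subword pieces, e.g. entries such as "▁Z" from the vocab
    print(enc.ids)     # their integer ids (0..67)

    # The Metaspace decoder restores the spaces that "▁" replaced during encoding.
    print(tok.decode(enc.ids))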
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 4096,
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
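
Note: a minimal sketch of loading the three uploaded files together through `transformers`, assuming they are stored in a local directory (the directory name is illustrative). Per this config, the result is a PreTrainedTokenizerFast capped at 4096 tokens:

    # Minimal sketch (assumes `pip install transformers` and that
    # special_tokens_map.json, tokenizer.json and tokenizer_config.json
    # sit in ./tokenizer_dir, an illustrative local path).
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./tokenizer_dir")

    print(type(tokenizer).__name__)    # PreTrainedTokenizerFast, per tokenizer_class
    print(tokenizer.model_max_length)  # 4096, from model_max_length above

    # Illustrative sample string; ids come from the BPE vocab in tokenizer.json.
    enc = tokenizer("Zkp la")
    print(enc["input_ids"])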