tuanio committed on
Commit
e11a91f
1 Parent(s): 6015fd8

Upload tokenizer

Browse files
Files changed (2) hide show
  1. added_tokens.json +2 -2
  2. vocab.json +100 -143
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 145,
3
- "<s>": 144
4
  }
 
1
  {
2
+ "</s>": 102,
3
+ "<s>": 101
4
  }
vocab.json CHANGED
@@ -1,146 +1,103 @@
1
  {
2
- "#": 2,
3
- "&": 3,
4
- "'": 4,
5
- "(": 5,
6
- ")": 6,
7
- ",": 7,
8
- "-": 8,
9
- ".": 9,
10
- "/": 10,
11
- "0": 11,
12
- "1": 12,
13
- "2": 13,
14
- "3": 14,
15
- "4": 15,
16
- "5": 16,
17
- "6": 17,
18
- "7": 18,
19
- "8": 19,
20
- "9": 20,
21
- "<": 21,
22
- "=": 22,
23
- ">": 23,
24
- "?": 24,
25
- "@": 25,
26
- "[PAD]": 146,
27
- "[UNK]": 145,
28
- "_": 26,
29
- "a": 27,
30
- "b": 28,
31
- "c": 29,
32
- "d": 30,
33
- "e": 31,
34
- "f": 32,
35
- "g": 33,
36
- "h": 34,
37
- "i": 35,
38
- "j": 36,
39
- "k": 37,
40
- "l": 38,
41
- "m": 39,
42
- "n": 40,
43
- "o": 41,
44
- "p": 42,
45
- "q": 43,
46
- "r": 44,
47
- "s": 45,
48
- "t": 46,
49
- "u": 47,
50
- "v": 48,
51
- "w": 49,
52
- "x": 50,
53
- "y": 51,
54
- "z": 52,
55
  "|": 0,
56
- "£": 53,
57
- "à": 54,
58
- "á": 55,
59
- "â": 56,
60
- "ã": 57,
61
- "è": 58,
62
- "é": 59,
63
- "ê": 60,
64
- "ì": 61,
65
- "í": 62,
66
- "ð": 63,
67
- "ò": 64,
68
- "ó": 65,
69
- "ô": 66,
70
- "õ": 67,
71
- "ö": 68,
72
- "ù": 69,
73
- "ú": 70,
74
- "ý": 71,
75
- "ă": 72,
76
- "ć": 73,
77
- "đ": 74,
78
- "ġ": 75,
79
- "ĩ": 76,
80
- "ĳ": 77,
81
- "ũ": 78,
82
- "ơ": 79,
83
- "ư": 80,
84
- "ǎ": 81,
85
- "ǡ": 82,
86
- "̀": 83,
87
- "́": 84,
88
- "̃": 85,
89
- "̉": 86,
90
- "̣": 87,
91
- "ۃ": 88,
92
- "ۙ": 89,
93
- "۟": 90,
94
- "ۣ": 91,
95
- "ạ": 92,
96
- "ả": 93,
97
- "ấ": 94,
98
- "ầ": 95,
99
- "ẩ": 96,
100
- "ẫ": 97,
101
- "ậ": 98,
102
- "ắ": 99,
103
- "ằ": 100,
104
- "ẳ": 101,
105
- "ẵ": 102,
106
- "ặ": 103,
107
- "ẹ": 104,
108
- "ẻ": 105,
109
- "ẽ": 106,
110
- "ế": 107,
111
- "ề": 108,
112
- "ể": 109,
113
- "ễ": 110,
114
- "ệ": 111,
115
- "ỉ": 112,
116
- "ị": 113,
117
- "ọ": 114,
118
- "ỏ": 115,
119
- "ố": 116,
120
- "ồ": 117,
121
- "ổ": 118,
122
- "ỗ": 119,
123
- "ộ": 120,
124
- "ớ": 121,
125
- "ờ": 122,
126
- "ở": 123,
127
- "ỡ": 124,
128
- "ợ": 125,
129
- "ụ": 126,
130
- "ủ": 127,
131
- "ứ": 128,
132
- "ừ": 129,
133
- "ử": 130,
134
- "ữ": 131,
135
- "ự": 132,
136
- "ỳ": 133,
137
- "ỵ": 134,
138
- "ỷ": 135,
139
- "ỹ": 136,
140
- "–": 138,
141
- "‘": 139,
142
- "’": 140,
143
- "“": 141,
144
- "”": 142,
145
- "…": 143
146
  }
 
1
  {
2
+ " ": 1,
3
+ "0": 2,
4
+ "1": 3,
5
+ "2": 4,
6
+ "3": 5,
7
+ "4": 6,
8
+ "5": 7,
9
+ "6": 8,
10
+ "7": 9,
11
+ "8": 10,
12
+ "9": 11,
13
+ "[PAD]": 100,
14
+ "[UNK]": 99,
15
+ "a": 12,
16
+ "b": 13,
17
+ "c": 14,
18
+ "d": 15,
19
+ "e": 16,
20
+ "f": 17,
21
+ "g": 18,
22
+ "h": 19,
23
+ "i": 20,
24
+ "j": 21,
25
+ "k": 22,
26
+ "l": 23,
27
+ "m": 24,
28
+ "n": 25,
29
+ "o": 26,
30
+ "p": 27,
31
+ "q": 28,
32
+ "r": 29,
33
+ "s": 30,
34
+ "t": 31,
35
+ "u": 32,
36
+ "v": 33,
37
+ "w": 34,
38
+ "x": 35,
39
+ "y": 36,
40
+ "z": 37,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "|": 0,
42
+ "à": 38,
43
+ "á": 39,
44
+ "â": 40,
45
+ "ã": 41,
46
+ "è": 42,
47
+ "é": 43,
48
+ "ê": 44,
49
+ "ì": 45,
50
+ "í": 46,
51
+ "ò": 47,
52
+ "ó": 48,
53
+ "ô": 49,
54
+ "õ": 50,
55
+ "ù": 51,
56
+ "ú": 52,
57
+ "ý": 53,
58
+ "ă": 54,
59
+ "đ": 55,
60
+ "ĩ": 56,
61
+ "ũ": 57,
62
+ "ơ": 58,
63
+ "": 59,
64
+ "": 60,
65
+ "": 61,
66
+ "": 62,
67
+ "": 63,
68
+ "": 64,
69
+ "": 65,
70
+ "": 66,
71
+ "": 67,
72
+ "": 68,
73
+ "": 69,
74
+ "": 70,
75
+ "": 71,
76
+ "": 72,
77
+ "": 73,
78
+ "ế": 74,
79
+ "": 75,
80
+ "": 76,
81
+ "": 77,
82
+ "": 78,
83
+ "": 79,
84
+ "": 80,
85
+ "": 81,
86
+ "": 82,
87
+ "": 83,
88
+ "": 84,
89
+ "": 85,
90
+ "": 86,
91
+ "": 87,
92
+ "": 88,
93
+ "": 89,
94
+ "": 90,
95
+ "": 91,
96
+ "": 92,
97
+ "": 93,
98
+ "": 94,
99
+ "": 95,
100
+ "": 96,
101
+ "": 97,
102
+ "": 98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  }