mrkaesy committed on
Commit
6cd0398
1 Parent(s): 2ade89d

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +3 -3
  2. tokenizer_config.json +4 -4
  3. vocab.json +102 -126
added_tokens.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "</s>": 137,
3
- "<s>": 136,
4
- "[PAD]": 135
5
  }
 
1
  {
2
+ "</s>": 113,
3
+ "<s>": 112,
4
+ "[PAD]": 111
5
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "134": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "135": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "136": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "137": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "110": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "111": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "112": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "113": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
vocab.json CHANGED
@@ -8,131 +8,107 @@
8
  ".": 7,
9
  ":": 8,
10
  "?": 9,
11
- "A": 10,
12
- "B": 11,
13
- "C": 12,
14
- "D": 13,
15
- "E": 14,
16
- "F": 15,
17
- "G": 16,
18
- "H": 17,
19
- "I": 18,
20
- "J": 19,
21
- "K": 20,
22
- "L": 21,
23
- "M": 22,
24
- "N": 23,
25
- "O": 24,
26
- "P": 25,
27
- "R": 26,
28
- "S": 27,
29
- "T": 28,
30
- "V": 29,
31
- "W": 30,
32
- "X": 31,
33
- "Y": 32,
34
- "Z": 33,
35
- "[PAD]": 135,
36
- "[UNK]": 134,
37
- "a": 34,
38
- "b": 35,
39
- "c": 36,
40
- "d": 37,
41
- "e": 38,
42
- "f": 39,
43
- "g": 40,
44
- "h": 41,
45
- "i": 42,
46
- "j": 43,
47
- "k": 44,
48
- "l": 45,
49
- "m": 46,
50
- "n": 47,
51
- "o": 48,
52
- "p": 49,
53
- "q": 50,
54
- "r": 51,
55
- "s": 52,
56
- "t": 53,
57
- "u": 54,
58
- "v": 55,
59
- "w": 56,
60
- "x": 57,
61
- "y": 58,
62
- "z": 59,
63
  "|": 0,
64
- "ँ": 61,
65
- "ं": 62,
66
- "ः": 63,
67
- "अ": 64,
68
- "आ": 65,
69
- "इ": 66,
70
- "ई": 67,
71
- "उ": 68,
72
- "ऊ": 69,
73
- "ऋ": 70,
74
- "ए": 71,
75
- "ऐ": 72,
76
- "ऑ": 73,
77
- "ओ": 74,
78
- "औ": 75,
79
- "क": 76,
80
- "ख": 77,
81
- "ग": 78,
82
- "घ": 79,
83
- "च": 80,
84
- "छ": 81,
85
- "ज": 82,
86
- "झ": 83,
87
- "ञ": 84,
88
- "ट": 85,
89
- "ठ": 86,
90
- "ड": 87,
91
- "ढ": 88,
92
- "ण": 89,
93
- "त": 90,
94
- "थ": 91,
95
- "द": 92,
96
- "ध": 93,
97
- "न": 94,
98
- "प": 95,
99
- "फ": 96,
100
- "ब": 97,
101
- "भ": 98,
102
- "म": 99,
103
- "य": 100,
104
- "र": 101,
105
- "ल": 102,
106
- "व": 103,
107
- "श": 104,
108
- "ष": 105,
109
- "स": 106,
110
- "ह": 107,
111
- "़": 108,
112
- "ा": 109,
113
- "ि": 110,
114
- "ी": 111,
115
- "ु": 112,
116
- "ू": 113,
117
- "ृ": 114,
118
- "ॅ": 115,
119
- "े": 116,
120
- "ै": 117,
121
- "ॉ": 118,
122
- "ो": 119,
123
- "ौ": 120,
124
- "्": 121,
125
- "क़": 122,
126
- "ख़": 123,
127
- "ग़": 124,
128
- "ज़": 125,
129
- "ड़": 126,
130
- "ढ़": 127,
131
- "फ़": 128,
132
- "।": 129,
133
- "–": 130,
134
- "‘": 131,
135
- "’": 132,
136
- "“": 133,
137
- "”": 134
138
  }
 
8
  ".": 7,
9
  ":": 8,
10
  "?": 9,
11
+ "[PAD]": 111,
12
+ "[UNK]": 110,
13
+ "a": 10,
14
+ "b": 11,
15
+ "c": 12,
16
+ "d": 13,
17
+ "e": 14,
18
+ "f": 15,
19
+ "g": 16,
20
+ "h": 17,
21
+ "i": 18,
22
+ "j": 19,
23
+ "k": 20,
24
+ "l": 21,
25
+ "m": 22,
26
+ "n": 23,
27
+ "o": 24,
28
+ "p": 25,
29
+ "q": 26,
30
+ "r": 27,
31
+ "s": 28,
32
+ "t": 29,
33
+ "u": 30,
34
+ "v": 31,
35
+ "w": 32,
36
+ "x": 33,
37
+ "y": 34,
38
+ "z": 35,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  "|": 0,
40
+ "ँ": 37,
41
+ "ं": 38,
42
+ "ः": 39,
43
+ "अ": 40,
44
+ "आ": 41,
45
+ "इ": 42,
46
+ "ई": 43,
47
+ "उ": 44,
48
+ "ऊ": 45,
49
+ "ऋ": 46,
50
+ "ए": 47,
51
+ "ऐ": 48,
52
+ "ऑ": 49,
53
+ "ओ": 50,
54
+ "औ": 51,
55
+ "क": 52,
56
+ "ख": 53,
57
+ "ग": 54,
58
+ "घ": 55,
59
+ "च": 56,
60
+ "छ": 57,
61
+ "ज": 58,
62
+ "झ": 59,
63
+ "ञ": 60,
64
+ "ट": 61,
65
+ "ठ": 62,
66
+ "ड": 63,
67
+ "ढ": 64,
68
+ "ण": 65,
69
+ "त": 66,
70
+ "थ": 67,
71
+ "द": 68,
72
+ "ध": 69,
73
+ "न": 70,
74
+ "प": 71,
75
+ "फ": 72,
76
+ "ब": 73,
77
+ "भ": 74,
78
+ "म": 75,
79
+ "य": 76,
80
+ "र": 77,
81
+ "ल": 78,
82
+ "व": 79,
83
+ "श": 80,
84
+ "ष": 81,
85
+ "स": 82,
86
+ "ह": 83,
87
+ "़": 84,
88
+ "ा": 85,
89
+ "ि": 86,
90
+ "ी": 87,
91
+ "ु": 88,
92
+ "ू": 89,
93
+ "ृ": 90,
94
+ "ॅ": 91,
95
+ "े": 92,
96
+ "ै": 93,
97
+ "ॉ": 94,
98
+ "ो": 95,
99
+ "ौ": 96,
100
+ "्": 97,
101
+ "क़": 98,
102
+ "ख़": 99,
103
+ "ग़": 100,
104
+ "ज़": 101,
105
+ "ड़": 102,
106
+ "ढ़": 103,
107
+ "फ़": 104,
108
+ "।": 105,
109
+ "–": 106,
110
+ "‘": 107,
111
+ "’": 108,
112
+ "“": 109,
113
+ "”": 110
114
  }