kingabzpro committed on
Commit
1b32c1f
1 Parent(s): ec2318b

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +2 -4
  2. special_tokens_map.json +16 -0
  3. tokenizer_config.json +0 -35
  4. vocab.json +70 -71
added_tokens.json CHANGED
@@ -1,6 +1,4 @@
1
  {
2
- "</s>": 75,
3
- "<s>": 74,
4
- "[PAD]": 73,
5
- "[UNK]": 72
6
  }
 
1
  {
2
+ "</s>": 74,
3
+ "<s>": 73
 
 
4
  }
special_tokens_map.json CHANGED
@@ -1,4 +1,20 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
  "pad_token": "[PAD]",
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
  "bos_token": "<s>",
19
  "eos_token": "</s>",
20
  "pad_token": "[PAD]",
tokenizer_config.json CHANGED
@@ -1,39 +1,4 @@
1
  {
2
- "added_tokens_decoder": {
3
- "72": {
4
- "content": "[UNK]",
5
- "lstrip": true,
6
- "normalized": false,
7
- "rstrip": true,
8
- "single_word": false,
9
- "special": false
10
- },
11
- "73": {
12
- "content": "[PAD]",
13
- "lstrip": true,
14
- "normalized": false,
15
- "rstrip": true,
16
- "single_word": false,
17
- "special": false
18
- },
19
- "74": {
20
- "content": "<s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "75": {
28
- "content": "</s>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "additional_special_tokens": [],
37
  "bos_token": "<s>",
38
  "clean_up_tokenization_spaces": true,
39
  "do_lower_case": false,
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "do_lower_case": false,
vocab.json CHANGED
@@ -1,76 +1,75 @@
1
  {
2
- "'": 50,
3
- "[PAD]": 73,
4
- "[UNK]": 72,
5
- "a": 41,
6
- "e": 26,
7
- "f": 61,
8
- "i": 52,
9
- "l": 68,
10
- "m": 67,
11
- "p": 70,
12
- "r": 12,
13
- "u": 38,
14
- "w": 43,
15
- "|": 63,
16
- "": 9,
17
- "": 24,
18
- "": 40,
19
- "": 66,
20
- "": 69,
21
- "": 32,
22
- "": 39,
23
- "": 46,
24
- "": 10,
25
- "": 54,
26
- "": 19,
27
- "": 55,
28
- "": 45,
29
- "": 48,
30
- "": 1,
31
- "": 71,
32
- "": 64,
33
- "": 4,
34
- "": 16,
35
- "": 22,
36
- "": 44,
37
- "": 33,
38
- "": 3,
39
- "": 20,
40
- "ड": 31,
41
  "ढ": 36,
42
- "ण": 59,
43
- "त": 27,
44
- "थ": 60,
45
- "द": 35,
46
- "ध": 11,
47
  "न": 42,
48
- "प": 34,
49
- "फ": 5,
50
- "ब": 30,
51
- "भ": 53,
52
- "म": 8,
53
- "य": 2,
54
- "र": 37,
55
- "ल": 65,
56
- "व": 18,
57
- "श": 15,
58
- "ष": 14,
59
- "स": 47,
60
- "ह": 13,
61
- "़": 7,
62
- "ा": 6,
63
- "ि": 21,
64
- "ी": 23,
65
- "ु": 28,
66
- "ू": 49,
67
  "ृ": 62,
68
- "े": 57,
69
- "ै": 25,
70
- "ॉ": 58,
71
- "ो": 0,
72
- "ौ": 17,
73
- "्": 51,
74
- "ड़": 56,
75
- "।": 29
76
  }
 
1
  {
2
+ "[PAD]": 72,
3
+ "[UNK]": 71,
4
+ "a": 1,
5
+ "e": 2,
6
+ "f": 3,
7
+ "i": 4,
8
+ "l": 5,
9
+ "m": 6,
10
+ "p": 7,
11
+ "r": 8,
12
+ "u": 9,
13
+ "w": 10,
14
+ "|": 0,
15
+ "": 11,
16
+ "": 12,
17
+ "": 13,
18
+ "": 14,
19
+ "": 15,
20
+ "": 16,
21
+ "": 17,
22
+ "": 18,
23
+ "": 19,
24
+ "": 20,
25
+ "": 21,
26
+ "": 22,
27
+ "": 23,
28
+ "": 24,
29
+ "": 25,
30
+ "": 26,
31
+ "": 27,
32
+ "": 28,
33
+ "": 29,
34
+ "": 30,
35
+ "": 31,
36
+ "": 32,
37
+ "": 33,
38
+ "": 34,
39
+ "": 35,
 
40
  "ढ": 36,
41
+ "ण": 37,
42
+ "त": 38,
43
+ "थ": 39,
44
+ "द": 40,
45
+ "ध": 41,
46
  "न": 42,
47
+ "प": 43,
48
+ "फ": 44,
49
+ "ब": 45,
50
+ "भ": 46,
51
+ "म": 47,
52
+ "य": 48,
53
+ "र": 49,
54
+ "ल": 50,
55
+ "व": 51,
56
+ "श": 52,
57
+ "ष": 53,
58
+ "स": 54,
59
+ "ह": 55,
60
+ "़": 56,
61
+ "ा": 57,
62
+ "ि": 58,
63
+ "ी": 59,
64
+ "ु": 60,
65
+ "ू": 61,
66
  "ृ": 62,
67
+ "े": 63,
68
+ "ै": 64,
69
+ "ॉ": 65,
70
+ "ो": 66,
71
+ "ौ": 67,
72
+ "्": 68,
73
+ "ड़": 69,
74
+ "।": 70
75
  }