rashmi035 committed on
Commit
016d2ad
1 Parent(s): da4d4fe

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "</s>": 82,
3
+ "<s>": 81,
4
+ "[PAD]": 83
5
+ }
special_tokens_map.json CHANGED
@@ -1,6 +1,29 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
- "pad_token": "<pad>",
5
- "unk_token": "<unk>"
6
  }
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "[PAD]",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ ],
25
  "bos_token": "<s>",
26
  "eos_token": "</s>",
27
+ "pad_token": "[PAD]",
28
+ "unk_token": "[UNK]"
29
  }
tokenizer_config.json CHANGED
@@ -2,14 +2,12 @@
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "do_lower_case": false,
5
- "do_normalize": true,
6
  "eos_token": "</s>",
7
- "model_max_length": 9223372036854775807,
8
- "pad_token": "<pad>",
9
  "replace_word_delimiter_char": " ",
10
- "return_attention_mask": false,
11
- "target_lang": null,
12
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
13
- "unk_token": "<unk>",
14
  "word_delimiter_token": "|"
15
  }
 
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "do_lower_case": false,
 
5
  "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
  "replace_word_delimiter_char": " ",
9
+ "target_lang": "hin",
 
10
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
11
+ "unk_token": "[UNK]",
12
  "word_delimiter_token": "|"
13
  }
vocab.json CHANGED
@@ -1,34 +1,86 @@
1
  {
2
- "'": 27,
3
- "</s>": 2,
4
- "<pad>": 0,
5
- "<s>": 1,
6
- "<unk>": 3,
7
- "A": 7,
8
- "B": 24,
9
- "C": 19,
10
- "D": 14,
11
- "E": 5,
12
- "F": 20,
13
- "G": 21,
14
- "H": 11,
15
- "I": 10,
16
- "J": 29,
17
- "K": 26,
18
- "L": 15,
19
- "M": 17,
20
- "N": 9,
21
- "O": 8,
22
- "P": 23,
23
- "Q": 30,
24
- "R": 13,
25
- "S": 12,
26
- "T": 6,
27
- "U": 16,
28
- "V": 25,
29
- "W": 18,
30
- "X": 28,
31
- "Y": 22,
32
- "Z": 31,
33
- "|": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  }
 
1
  {
2
+ "hin": {
3
+ "!": 1,
4
+ "\"": 2,
5
+ "'": 3,
6
+ ",": 4,
7
+ "-": 5,
8
+ ".": 6,
9
+ ":": 7,
10
+ "?": 8,
11
+ "F": 9,
12
+ "M": 10,
13
+ "W": 11,
14
+ "[PAD]": 81,
15
+ "[UNK]": 81,
16
+ "a": 12,
17
+ "e": 13,
18
+ "i": 14,
19
+ "l": 15,
20
+ "m": 16,
21
+ "p": 17,
22
+ "r": 18,
23
+ "u": 19,
24
+ "|": 0,
25
+ "ँ": 20,
26
+ "ं": 21,
27
+ "ः": 22,
28
+ "अ": 23,
29
+ "आ": 24,
30
+ "इ": 25,
31
+ "ई": 26,
32
+ "उ": 27,
33
+ "ऊ": 28,
34
+ "ए": 29,
35
+ "ऐ": 30,
36
+ "ऑ": 31,
37
+ "ओ": 32,
38
+ "औ": 33,
39
+ "क": 34,
40
+ "ख": 35,
41
+ "ग": 36,
42
+ "घ": 37,
43
+ "च": 38,
44
+ "छ": 39,
45
+ "ज": 40,
46
+ "झ": 41,
47
+ "ट": 42,
48
+ "ठ": 43,
49
+ "ड": 44,
50
+ "ढ": 45,
51
+ "ण": 46,
52
+ "त": 47,
53
+ "थ": 48,
54
+ "द": 49,
55
+ "ध": 50,
56
+ "न": 51,
57
+ "प": 52,
58
+ "फ": 53,
59
+ "ब": 54,
60
+ "भ": 55,
61
+ "म": 56,
62
+ "य": 57,
63
+ "र": 58,
64
+ "ल": 59,
65
+ "व": 60,
66
+ "श": 61,
67
+ "ष": 62,
68
+ "स": 63,
69
+ "ह": 64,
70
+ "़": 65,
71
+ "ा": 66,
72
+ "ि": 67,
73
+ "ी": 68,
74
+ "ु": 69,
75
+ "ू": 70,
76
+ "ृ": 71,
77
+ "े": 72,
78
+ "ै": 73,
79
+ "ॉ": 74,
80
+ "ो": 75,
81
+ "ौ": 76,
82
+ "्": 77,
83
+ "ड़": 78,
84
+ "।": 79
85
+ }
86
  }