sid330 committed on
Commit 7385727
1 Parent(s): b278b35

Wav2Vec Malayalam with Ngrams

.gitattributes CHANGED
@@ -1,35 +1,18 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ language_model/5gram_correct.arpa filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,182 @@
+ ---
+ language: ml
+ datasets:
+ - Indic TTS Malayalam Speech Corpus
+ - Openslr Malayalam Speech Corpus
+ - SMC Malayalam Speech Corpus
+ - IIIT-H Indic Speech Databases
+ metrics:
+ - wer
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - speech
+ - xlsr-fine-tuning-week
+ license: apache-2.0
+ model-index:
+ - name: Malayalam XLSR Wav2Vec2 Large 53
+   results:
+   - task:
+       name: Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Test split of combined dataset using all datasets mentioned above
+       type: custom
+       args: ml
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 28.43
+ ---
+
+ # Wav2Vec2-Large-XLSR-53-ml
+
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on ml (Malayalam) using the [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/) and [IIIT-H Indic Speech Databases](http://speech.iiit.ac.in/index.php/research-svl/69.html). The notebooks used to train the model are available [here](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/). When using this model, make sure that your speech input is sampled at 16 kHz.
+
+ ## Usage
+
+ The model can be used directly (without a language model) as follows:
+
+ ```python
+ import torch
+ import torchaudio
+ from datasets import load_dataset
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+ test_dataset = <load-test-split-of-combined-dataset> # Details on loading this dataset in the evaluation section
+
+ processor = Wav2Vec2Processor.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+ model = Wav2Vec2ForCTC.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
+ def speech_file_to_array_fn(batch):
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
+
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
+ inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+ with torch.no_grad():
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+
+ print("Prediction:", processor.batch_decode(predicted_ids))
+ print("Reference:", test_dataset["sentence"][:2])
+ ```
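+
+ Since this commit also ships an n-gram language model under `language_model/`, decoding can be LM-boosted via `Wav2Vec2ProcessorWithLM`. The following is a minimal sketch, not part of the original notebooks: it assumes `pyctcdecode` and `kenlm` are installed, reuses the `test_dataset` prepared above, and assumes the repository id below (the one used elsewhere in this card) resolves to this repository:
+
+ ```python
+ import torch
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+
+ # The processor picks up the shipped n-gram files under language_model/
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+ model = Wav2Vec2ForCTC.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+
+ inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+ # With an LM attached, batch_decode runs beam search over the raw logits
+ # (note: it takes the logits as numpy, not the argmax ids used above)
+ print("Prediction:", processor.batch_decode(logits.numpy()).text)
+ ```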
+
+
+ ## Evaluation
+
+ The model can be evaluated as follows on the test split of the combined custom dataset. For more details on dataset preparation, check the notebooks mentioned at the end of this file.
+
+
+ ```python
+ import torch
+ import torchaudio
+ import datasets
+ from datasets import load_dataset, load_metric
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import re
+ from pathlib import Path
+
+ # The custom dataset needs to be created using the notebook mentioned at the end of this file
+ data_dir = Path('<path-to-custom-dataset>')
+
+ dataset_folders = {
+     'iiit': 'iiit_mal_abi',
+     'openslr': 'openslr',
+     'indic-tts': 'indic-tts-ml',
+     'msc-reviewed': 'msc-reviewed-speech-v1.0+20200825',
+ }
+
+ # Set directories for datasets
+ openslr_male_dir = data_dir / dataset_folders['openslr'] / 'male'
+ openslr_female_dir = data_dir / dataset_folders['openslr'] / 'female'
+ iiit_dir = data_dir / dataset_folders['iiit']
+ indic_tts_male_dir = data_dir / dataset_folders['indic-tts'] / 'male'
+ indic_tts_female_dir = data_dir / dataset_folders['indic-tts'] / 'female'
+ msc_reviewed_dir = data_dir / dataset_folders['msc-reviewed']
+
+ # Load the datasets
+ openslr_male = load_dataset("json", data_files=[f"{str(openslr_male_dir.absolute())}/sample_{i}.json" for i in range(2023)], split="train")
+ openslr_female = load_dataset("json", data_files=[f"{str(openslr_female_dir.absolute())}/sample_{i}.json" for i in range(2103)], split="train")
+ iiit = load_dataset("json", data_files=[f"{str(iiit_dir.absolute())}/sample_{i}.json" for i in range(1000)], split="train")
+ indic_tts_male = load_dataset("json", data_files=[f"{str(indic_tts_male_dir.absolute())}/sample_{i}.json" for i in range(5649)], split="train")
+ indic_tts_female = load_dataset("json", data_files=[f"{str(indic_tts_female_dir.absolute())}/sample_{i}.json" for i in range(2950)], split="train")
+ msc_reviewed = load_dataset("json", data_files=[f"{str(msc_reviewed_dir.absolute())}/sample_{i}.json" for i in range(1541)], split="train")
+
+ # Create test split as 20%, set random seed as well.
+ test_size = 0.2
+ random_seed = 1
+ openslr_male_splits = openslr_male.train_test_split(test_size=test_size, seed=random_seed)
+ openslr_female_splits = openslr_female.train_test_split(test_size=test_size, seed=random_seed)
+ iiit_splits = iiit.train_test_split(test_size=test_size, seed=random_seed)
+ indic_tts_male_splits = indic_tts_male.train_test_split(test_size=test_size, seed=random_seed)
+ indic_tts_female_splits = indic_tts_female.train_test_split(test_size=test_size, seed=random_seed)
+ msc_reviewed_splits = msc_reviewed.train_test_split(test_size=test_size, seed=random_seed)
+
+ # Get combined test dataset
+ split_list = [openslr_male_splits, openslr_female_splits, indic_tts_male_splits, indic_tts_female_splits, msc_reviewed_splits, iiit_splits]
+ test_dataset = datasets.concatenate_datasets([split['test'] for split in split_list])
+
+ wer = load_metric("wer")
+
+ processor = Wav2Vec2Processor.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+ model = Wav2Vec2ForCTC.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
+ model.to("cuda")
+
+ resamplers = {
+     48000: torchaudio.transforms.Resample(48_000, 16_000),
+ }
+
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�Utrnle\_]'
+ unicode_ignore_regex = '[\u200e]'
+
+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
+ def speech_file_to_array_fn(batch):
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"])
+     batch["sentence"] = re.sub(unicode_ignore_regex, '', batch["sentence"])
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     # Resample if it's not already 16 kHz
+     if sampling_rate != 16000:
+         batch["speech"] = resamplers[sampling_rate](speech_array).squeeze().numpy()
+     else:
+         batch["speech"] = speech_array.squeeze().numpy()
+     # If more than one dimension is present, pick the first channel
+     if batch["speech"].ndim > 1:
+         batch["speech"] = batch["speech"][0]
+     return batch
+
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
+
+ # Run batched inference and collect the greedy (argmax) predictions
+ def evaluate(batch):
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["pred_strings"] = processor.batch_decode(pred_ids)
+     return batch
+
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+ print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
+ ```
+
+ **Test Result (WER)**: 28.43 %
+
+
+ ## Training
+
+ A combined dataset was created using the [Indic TTS Malayalam Speech Corpus (via Kaggle)](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus), [Openslr Malayalam Speech Corpus](http://openslr.org/63/), [SMC Malayalam Speech Corpus](https://blog.smc.org.in/malayalam-speech-corpus/) and [IIIT-H Indic Speech Databases](http://speech.iiit.ac.in/index.php/research-svl/69.html). The datasets were downloaded and converted to HF Dataset format using [this notebook](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/make_hf_dataset.ipynb).
+
+ The notebook used for training and evaluation can be found [here](https://github.com/gauthamsuresh09/wav2vec2-large-xlsr-53-malayalam/blob/main/fine-tune-xlsr-wav2vec2-on-malayalam-asr-with-transformers_v2.ipynb).
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</s>": 77,
+   "<s>": 76
+ }
alphabet.json ADDED
@@ -0,0 +1 @@
+ {"labels": ["\u0d7c", "\u0d38", "\u0d02", "\u0d0b", "\u0d39", "\u0d27", "\u0d22", "\u0d20", "\u0d1f", "\u0d2a", "\u0d2f", "\u0d40", "\u0d15", " ", "\u0d30", "\u0d0f", "\u0d09", "\u0d43", "\u0d7a", "\u0d23", "\u0d3f", "\u200c", "\u0d7d", "\u0d4a", "\u0d32", "\u0d57", "\u0d18", "\u0d4c", "\u0d26", "\u0d2c", "\u0d1b", "\u0d1d", "\u0d0a", "\u0d28", "\u0d4b", "\u0d71", "\u0d07", "\u0d10", "\u0d48", "\u0d03", "\u0d3e", "\u0d1a", "\u0d47", "\u0d25", "\u0d21", "\u0d0e", "\u0d37", "\u0d06", "\u0d05", "\u0d2e", "\u0d12", "\u0d36", "\u0d16", "\u0d2b", "\u0d41", "\u0d17", "\u0d33", "\u0d13", "\u0d42", "\u0d35", "\u0d1e", "\u0d31", "\u0d2d", "\u0d7b", "\u0d7e", "\u0d08", "\u0d4d", "\u0d46", "\u0d19", "\u0d24", "\u0d1c", "\u200d", "\u0d14", "\u0d34", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,76 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.094,
+   "bos_token_id": 1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.05,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.045,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.082,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 75,
+   "transformers_version": "4.5.0.dev0",
+   "vocab_size": 76
+ }
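
A note on the `conv_stride` values above: their product is the feature extractor's total downsampling factor, which fixes the CTC frame rate. A quick sanity check in Python (plain arithmetic, not part of the repository):

```python
import math

# Product of the conv strides = audio samples consumed per output frame
conv_stride = [5, 2, 2, 2, 2, 2, 2]
downsample = math.prod(conv_stride)
print(downsample)                        # 320 samples per CTC frame
print(1000 * downsample / 16_000, "ms")  # 20.0 ms per frame at 16 kHz
```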
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18381d01f5140589dfb1d13754725c976fd7172be4e92a52445f2a13baf638f6
+ size 1262081874
language_model/5gram_correct.arpa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0bb3d117437dfa2111ac7f68eb4e2a411532515a3c10cb7abff205455ec8852
+ size 15359294
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2ProcessorWithLM",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
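
The `processor_class` key tells `transformers` which processor to instantiate on generic loads. A small sketch of the effect, assuming the repo id used in the model card resolves to this repository:

```python
from transformers import AutoProcessor

# Because processor_class is "Wav2Vec2ProcessorWithLM", AutoProcessor
# resolves to the LM-aware processor rather than plain Wav2Vec2Processor
processor = AutoProcessor.from_pretrained("gvs/wav2vec2-large-xlsr-malayalam")
print(type(processor).__name__)  # expected: Wav2Vec2ProcessorWithLM
```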
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5c39014b08099f35aa8eab263323ff24c61f959eddd9d50dc9716ab0df8afee
+ size 1262245399
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "added_tokens_decoder": {
+     "74": {
+       "content": "[UNK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "75": {
+       "content": "[PAD]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "76": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "77": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "processor_class": "Wav2Vec2ProcessorWithLM",
+   "replace_word_delimiter_char": " ",
+   "target_lang": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "[UNK]",
+   "word_delimiter_token": "|"
+ }
vocab.json ADDED
@@ -0,0 +1,78 @@
+ {
+   "[PAD]": 75,
+   "[UNK]": 74,
+   "|": 13,
+   "ം": 2,
+   "ഃ": 39,
+   "അ": 48,
+   "ആ": 47,
+   "ഇ": 36,
+   "ഈ": 65,
+   "ഉ": 16,
+   "ഊ": 32,
+   "ഋ": 3,
+   "എ": 45,
+   "ഏ": 15,
+   "ഐ": 37,
+   "ഒ": 50,
+   "ഓ": 57,
+   "ഔ": 72,
+   "ക": 12,
+   "ഖ": 52,
+   "ഗ": 55,
+   "ഘ": 26,
+   "ങ": 68,
+   "ച": 41,
+   "ഛ": 30,
+   "ജ": 70,
+   "ഝ": 31,
+   "ഞ": 60,
+   "ട": 8,
+   "ഠ": 7,
+   "ഡ": 44,
+   "ഢ": 6,
+   "ണ": 19,
+   "ത": 69,
+   "ഥ": 43,
+   "ദ": 28,
+   "ധ": 5,
+   "ന": 33,
+   "പ": 9,
+   "ഫ": 53,
+   "ബ": 29,
+   "ഭ": 62,
+   "മ": 49,
+   "യ": 10,
+   "ര": 14,
+   "റ": 61,
+   "ല": 24,
+   "ള": 56,
+   "ഴ": 73,
+   "വ": 59,
+   "ശ": 51,
+   "ഷ": 46,
+   "സ": 1,
+   "ഹ": 4,
+   "ാ": 40,
+   "ി": 20,
+   "ീ": 11,
+   "ു": 54,
+   "ൂ": 58,
+   "ൃ": 17,
+   "െ": 67,
+   "േ": 42,
+   "ൈ": 38,
+   "ൊ": 23,
+   "ോ": 34,
+   "ൌ": 27,
+   "്": 66,
+   "ൗ": 25,
+   "൱": 35,
+   "ൺ": 18,
+   "ൻ": 63,
+   "ർ": 0,
+   "ൽ": 22,
+   "ൾ": 64,
+   "‌": 21,
+   "‍": 71
+ }
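
For reference, this vocabulary plus `special_tokens_map.json` is all `Wav2Vec2CTCTokenizer` needs. A minimal sketch of loading it standalone, assuming `vocab.json` has been downloaded to the working directory:

```python
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer directly from the vocab above;
# special tokens mirror special_tokens_map.json
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(tokenizer.vocab_size)  # 76, matching vocab_size in config.json
```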