add language model

Browse files

Files changed (14) hide show

.gitignore +1 -0
alphabet.json +1 -0
build_lm_processor.ipynb +200 -0
eval.sh +1 -1
inference.ipynb +118 -56
language_model/attrs.json +1 -0
language_model/km_wiki_ngram.arpa +3 -0
language_model/unigrams.txt +0 -0
preprocessor_config.json +1 -0
special_tokens_map.json +1 -1
tokenizer_config.json +1 -1
train_kh.ipynb +45 -45
train_kh_lm.ipynb +0 -0
vocab.json +1 -1

.gitignore CHANGED Viewed

@@ -3,3 +3,4 @@ km_kh*
 .ipynb_checkpoints
 vitouphy
 *checkpoint*

 .ipynb_checkpoints
 vitouphy
 *checkpoint*
+data

alphabet.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"labels": [" ", "\u1780", "\u1781", "\u1782", "\u1783", "\u1784", "\u1785", "\u1786", "\u1787", "\u1788", "\u1789", "\u178a", "\u178b", "\u178c", "\u178d", "\u178e", "\u178f", "\u1790", "\u1791", "\u1792", "\u1793", "\u1794", "\u1795", "\u1796", "\u1797", "\u1798", "\u1799", "\u179a", "\u179b", "\u179c", "\u179f", "\u17a0", "\u17a1", "\u17a2", "\u17a5", "\u17a7", "\u17aa", "\u17ab", "\u17ac", "\u17ad", "\u17ae", "\u17af", "\u17b1", "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5", "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb", "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d2", "\u2047", "", "<s>", "</s>"], "is_bpe": false}

build_lm_processor.ipynb ADDED Viewed

	@@ -0,0 +1,200 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5393aa33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM\n",
+    "from datasets import load_dataset, load_metric, Audio\n",
+    "from pyctcdecode import build_ctcdecoder\n",
+    "from pydub import AudioSegment\n",
+    "from pydub.playback import play\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import kenlm\n",
+    "import pandas as pd\n",
+    "import random\n",
+    "import soundfile as sf\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2d34d3b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
+    "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f0354cb2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading the LM will be faster if you build a binary file.\n",
+      "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
+      "****************************************************************************************************\n"
+     ]
+    }
+   ],
+   "source": [
+    "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "109f28e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Order the tokenizer vocabulary by token id and lower-case every token.\n",
+    "vocab_dict = processor.tokenizer.get_vocab()\n",
+    "sorted_vocab_dict = dict(\n",
+    "    (token.lower(), token_id)\n",
+    "    for token, token_id in sorted(vocab_dict.items(), key=lambda pair: pair[1])\n",
+    ")\n",
+    "print(sorted_vocab_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "300cec39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading the LM will be faster if you build a binary file.\n",
+      "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
+      "****************************************************************************************************\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build the CTC decoder from the KenLM model; labels follow the key order of sorted_vocab_dict (sorted by token id).\n",
+    "decoder = build_ctcdecoder(\n",
+    "    list(sorted_vocab_dict),\n",
+    "    kenlm_model_path=KENLM_MODEL_LOC,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "27dd8427",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bundle the loaded processor's feature extractor and tokenizer with the KenLM-backed decoder.\n",
+    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
+    "    decoder=decoder,\n",
+    "    tokenizer=processor.tokenizer,\n",
+    "    feature_extractor=processor.feature_extractor,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "94eb248e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processor_with_lm.save_pretrained(\".\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f9b3dcc",
+   "metadata": {},
+   "source": [
+    "## Save Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8b584690",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bc5bf68946064e97b869d44b02e7af19",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3712c030",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_pretrained('.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5d8de20",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

eval.sh CHANGED Viewed

@@ -1,5 +1,5 @@
 ./eval.py \
---model_id ./ \
 --dataset openslr \
 --config km \
 --split test \

 ./eval.py \
+--model_id vitouphy/xls-r-300m-km \
 --dataset openslr \
 --config km \
 --split test \

inference.ipynb CHANGED Viewed

@@ -2,32 +2,40 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "id": "2bdeda95",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
     "from datasets import load_dataset, load_metric, Audio\n",
     "import numpy as np\n",
-    "import torch"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "id": "8f840be9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
     "processor = Wav2Vec2Processor.from_pretrained(\".\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 28,
-   "id": "46339a6d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,62 +45,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "2c28d4f3",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Using custom data configuration default-fbad308ab5a03eb2\n",
-      "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
      ]
     }
    ],
    "source": [
-    "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "f14c1cfa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "common_voice_test  = (common_voice_test\n",
     "                      .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
-    "                      .rename_column('text', 'sentence'))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
-   "id": "b60360b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "common_voice_test  = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "64758ba8",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_3799144408.wav',\n",
-       "  'array': array([-1.0600963e-06,  1.2359066e-06, -1.4001107e-06, ...,\n",
-       "         -3.1423504e-05,  4.4914182e-06,  0.0000000e+00], dtype=float32),\n",
        "  'sampling_rate': 16000},\n",
-       " 'sentence': 'ស៊ី ដាច់ ម៉ូតូ នៅ ពេល ដែល ប្រើ ឱ្យ ឌុប សម្ភារៈ គ្រឿង សង្ហារឹម យក ទៅ ឱ្យ ម៉ូយ នៅ ម្ដុំ វត្ដ សំរោងអណ្ដែត'}"
       ]
      },
-     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -103,8 +92,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
-   "id": "93cd7415",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -122,15 +111,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "id": "04751885",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-abf3b661c395248b.arrow\n"
      ]
     }
    ],
@@ -140,8 +129,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "e55d9cc9",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -150,8 +139,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "id": "4f637d1a",
    "metadata": {},
    "outputs": [
     {
@@ -170,8 +199,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "id": "85334ad6",
    "metadata": {},
    "outputs": [
     {
@@ -179,10 +241,10 @@
      "output_type": "stream",
      "text": [
       "Prediction:\n",
-      "ក្រុង ប៉ោយប៉ែត នឹង ក្វាះ ទឹក ស្អាត ប្រើ ចាប់ ពី សប្តាហ ក្រោយ ទៅ\n",
       "\n",
       "Reference:\n",
-      "ក្រុង ប៉ោយប៉ែត នឹង ខ្វះ ទឹក ស្អាត ប្រើ ចាប់ ពី សប្តាហ៍ ក្រោយ ទៅ\n"
      ]
     }
    ],
@@ -199,7 +261,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "be1c8d79",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -207,7 +269,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1f7eaba0",
    "metadata": {},
    "outputs": [],
    "source": []

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
+   "id": "33e4a305",
    "metadata": {},
    "outputs": [],
    "source": [
+    "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor\n",
     "from datasets import load_dataset, load_metric, Audio\n",
+    "from pyctcdecode import build_ctcdecoder\n",
+    "from pydub import AudioSegment\n",
+    "from pydub.playback import play\n",
+    "\n",
     "import numpy as np\n",
+    "import torch\n",
+    "import kenlm\n",
+    "import pandas as pd\n",
+    "import random\n",
+    "import soundfile as sf"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
+   "id": "328d0662",
    "metadata": {},
    "outputs": [],
    "source": [
+    "model = AutoModelForCTC.from_pretrained(\".\")\n",
     "processor = Wav2Vec2Processor.from_pretrained(\".\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 28,
+   "id": "0fea2518",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
+   "id": "9cfef23c",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Using custom data configuration default-36119ec2a15afb82\n",
+      "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
      ]
     }
    ],
    "source": [
+    "common_voice_test  = (load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')\n",
     "                      .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
+    "                      .rename_column('text', 'sentence')\n",
+    "                      .cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio'))"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
+   "id": "29e6bb1a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_3154_2555595821.wav',\n",
+       "  'array': array([ 0.00014737,  0.00016698,  0.00013704, ..., -0.00011244,\n",
+       "         -0.0001059 , -0.00011476], dtype=float32),\n",
        "  'sampling_rate': 16000},\n",
+       " 'sentence': 'ការ ធ្វើ អាជីវកម្ម រ៉ែ ដំបូង នៅ កម្ពុជា'}"
       ]
      },
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
+   "id": "0554b8d8",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
+   "id": "d26a6659",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-081703c0621182da.arrow\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
+   "id": "04a94f74",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
+   "id": "3993d2c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pass the sampling rate explicitly (16 kHz, the rate the audio column is cast to)\n",
+    "# so the feature extractor no longer warns about a missing ``sampling_rate``.\n",
+    "input_dict = processor(common_voice_test[i][\"input_values\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7e3026dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'input_values': tensor([[ 2.8537e-04,  2.5043e-04,  2.7738e-04,  ..., -4.8949e-05,\n",
+       "         -1.1382e-04,  2.7166e-04]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "input_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "adf215c0",
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
+   "id": "e8310629",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([ 1, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
+       "        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 10, 70, 70, 70, 10, 72,\n",
+       "        43, 72, 72, 72, 72, 72, 72,  0,  0, 72, 72, 18, 72, 54, 72, 72, 72, 72,\n",
+       "        72,  0, 72, 21, 72, 49, 72, 72, 72, 72, 72, 72, 23, 70, 70, 27, 72, 46,\n",
+       "        72, 72, 72,  1, 72,  0,  0, 30, 72, 72, 72, 72, 25, 70, 70, 72, 72, 11,\n",
+       "        55, 72, 72, 72, 72,  5, 72,  0, 20, 58, 72, 72, 72,  0,  0, 16, 72, 72,\n",
+       "        72, 20, 70, 70, 72, 72, 16, 70, 27, 72, 72, 72, 72, 72, 45,  0,  0, 30,\n",
+       "        30, 70, 70, 27, 72, 43, 72, 72, 72, 72, 72, 72, 21, 72, 53, 72, 72, 72,\n",
+       "        27, 72,  0,  1, 72, 72, 72, 72, 25, 70, 23, 23, 48, 72, 72, 72, 72, 72,\n",
+       "        72,  8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
+       "        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
+       "        72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
+       "        72, 72, 72, 72, 72, 72, 72, 72, 43], device='cuda:0')"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pred_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "5dd986a0",
    "metadata": {},
    "outputs": [
     {
      "output_type": "stream",
      "text": [
       "Prediction:\n",
+      "កញ្ញា ទេ បូព្រឹក សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា\n",
       "\n",
       "Reference:\n",
+      "កញ្ញា ទេព បូព្រឹក្ស សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា\n"
      ]
     }
    ],
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "8e39b112",
    "metadata": {},
    "outputs": [],
    "source": []
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "562af933",
    "metadata": {},
    "outputs": [],
    "source": []

language_model/attrs.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}

language_model/km_wiki_ngram.arpa ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4eae7d94d04e95668df7306edf35e21f4bbab2a73c736b921e531cd25cde6d0
+size 109085039

language_model/unigrams.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json CHANGED Viewed

@@ -4,6 +4,7 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
   "return_attention_mask": true,
   "sampling_rate": 16000
 }

   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
+  "processor_class": "Wav2Vec2ProcessorWithLM",
   "return_attention_mask": true,
   "sampling_rate": 16000
 }

special_tokens_map.json CHANGED Viewed

@@ -1 +1 @@

- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

tokenizer_config.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "\|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}


1	+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "\|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "vitouphy/xls-r-300m-km", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"}

train_kh.ipynb CHANGED Viewed

@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "a88514f8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -16,7 +16,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2d955148",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -19167,7 +19167,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "54b0e493",
    "metadata": {},
    "source": [
     "### Load KH Data"
@@ -19176,7 +19176,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "1f31e61b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19199,7 +19199,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "63b2d9b0",
    "metadata": {},
    "outputs": [
     {
@@ -19221,7 +19221,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "dbb54220",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19235,7 +19235,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3bb5808a",
    "metadata": {},
    "source": [
     "### Clean Up the Text"
@@ -19244,7 +19244,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "8d407f91",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19260,7 +19260,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "9fb25eaf",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19272,7 +19272,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "84c7300e",
    "metadata": {},
    "outputs": [
     {
@@ -19293,7 +19293,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "66dfb9ff",
    "metadata": {},
    "source": [
     "### Build Character"
@@ -19302,7 +19302,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "64329ebd",
    "metadata": {},
    "outputs": [
     {
@@ -19350,7 +19350,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "78297789",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19361,7 +19361,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "d66aebea",
    "metadata": {},
    "outputs": [
     {
@@ -19379,7 +19379,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "7c085935",
    "metadata": {},
    "outputs": [
     {
@@ -19406,7 +19406,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "fba33316",
    "metadata": {},
    "outputs": [
     {
@@ -19424,7 +19424,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "5376a5b4",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19435,7 +19435,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "aec637e0",
    "metadata": {},
    "source": [
     "# Tokenizer"
@@ -19444,7 +19444,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "781094bc",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19456,7 +19456,7 @@
   {
    "cell_type": "code",
    "execution_count": 14,
-   "id": "3a3eb52f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19468,7 +19468,7 @@
   {
    "cell_type": "code",
    "execution_count": 26,
-   "id": "2711ed79",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19485,7 +19485,7 @@
   {
    "cell_type": "code",
    "execution_count": 27,
-   "id": "2772b591",
    "metadata": {},
    "outputs": [
     {
@@ -19525,7 +19525,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "db2af48f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19536,7 +19536,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "id": "b7f42c6a",
    "metadata": {},
    "outputs": [
     {
@@ -19561,7 +19561,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "id": "42b525d0",
    "metadata": {},
    "outputs": [
     {
@@ -19608,7 +19608,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "1db1a77c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19630,7 +19630,7 @@
   {
    "cell_type": "code",
    "execution_count": 19,
-   "id": "b0a33568",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -19669,7 +19669,7 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "id": "ca8be265",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19681,7 +19681,7 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "id": "53a815bf",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19741,7 +19741,7 @@
   {
    "cell_type": "code",
    "execution_count": 22,
-   "id": "1d0cbdf6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19751,7 +19751,7 @@
   {
    "cell_type": "code",
    "execution_count": 23,
-   "id": "e26e68a2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19762,7 +19762,7 @@
   {
    "cell_type": "code",
    "execution_count": 24,
-   "id": "f347bb3e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19783,7 +19783,7 @@
   {
    "cell_type": "code",
    "execution_count": 25,
-   "id": "aff51ef4",
    "metadata": {},
    "outputs": [
     {
@@ -19819,7 +19819,7 @@
   {
    "cell_type": "code",
    "execution_count": 26,
-   "id": "6e363fc8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19829,7 +19829,7 @@
   {
    "cell_type": "code",
    "execution_count": 27,
-   "id": "447dfc3e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19857,7 +19857,7 @@
   {
    "cell_type": "code",
    "execution_count": 29,
-   "id": "eeda7f6e",
    "metadata": {},
    "outputs": [
     {
@@ -19885,7 +19885,7 @@
   {
    "cell_type": "code",
    "execution_count": 30,
-   "id": "af09f9f9",
    "metadata": {},
    "outputs": [
     {
@@ -20232,7 +20232,7 @@
   {
    "cell_type": "code",
    "execution_count": 31,
-   "id": "e9563734",
    "metadata": {},
    "outputs": [
     {
@@ -20253,7 +20253,7 @@
   {
    "cell_type": "code",
    "execution_count": 32,
-   "id": "4c8fe67e",
    "metadata": {},
    "outputs": [
     {
@@ -20286,7 +20286,7 @@
   {
    "cell_type": "code",
    "execution_count": 34,
-   "id": "dc64c376",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20303,7 +20303,7 @@
   {
    "cell_type": "code",
    "execution_count": 35,
-   "id": "9f9d87c3",
    "metadata": {},
    "outputs": [
     {
@@ -20322,7 +20322,7 @@
   {
    "cell_type": "code",
    "execution_count": 36,
-   "id": "4b50cbfe",
    "metadata": {},
    "outputs": [
     {
@@ -20373,8 +20373,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "id": "33a99751",
    "metadata": {},
    "outputs": [
     {
@@ -20395,7 +20395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b9482eed",
    "metadata": {},
    "outputs": [],
    "source": []

   {
    "cell_type": "code",
    "execution_count": 1,
+   "id": "0ee7433e",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "90323ec7",
    "metadata": {
     "collapsed": true,
     "jupyter": {
   },
   {
    "cell_type": "markdown",
+   "id": "eda834f4",
    "metadata": {},
    "source": [
     "### Load KH Data"
   {
    "cell_type": "code",
    "execution_count": 6,
+   "id": "e8b86dab",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 2,
+   "id": "0b17a0e1",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 3,
+   "id": "21239531",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "markdown",
+   "id": "68736f61",
    "metadata": {},
    "source": [
     "### Clean Up the Text"
   {
    "cell_type": "code",
    "execution_count": 4,
+   "id": "fcba882e",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 5,
+   "id": "9ef37613",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 6,
+   "id": "8e4fdc71",
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "markdown",
+   "id": "1fcdf7d8",
    "metadata": {},
    "source": [
     "### Build Character"
   {
    "cell_type": "code",
    "execution_count": 7,
+   "id": "7b7da87a",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 8,
+   "id": "eb6f0804",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 9,
+   "id": "9189ac57",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 10,
+   "id": "c5fb8a71",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 11,
+   "id": "10043978",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 12,
+   "id": "42f02a78",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "markdown",
+   "id": "95b09010",
    "metadata": {},
    "source": [
     "# Tokenizer"
   {
    "cell_type": "code",
    "execution_count": 13,
+   "id": "c4d0f5a6",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 14,
+   "id": "825623c4",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 26,
+   "id": "cfb44de0",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 27,
+   "id": "05ab24c0",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 15,
+   "id": "0cfd158b",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 16,
+   "id": "10d224fa",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 17,
+   "id": "132efaa8",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 18,
+   "id": "c39872d6",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 19,
+   "id": "fef54a48",
    "metadata": {
     "collapsed": true,
     "jupyter": {
   {
    "cell_type": "code",
    "execution_count": 20,
+   "id": "2f280b0d",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 21,
+   "id": "c9dec52e",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 22,
+   "id": "639dcc23",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 23,
+   "id": "3bb04288",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 24,
+   "id": "9ba8858b",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 25,
+   "id": "434869f9",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 26,
+   "id": "9ffb97fd",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 27,
+   "id": "c83b8d4e",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 29,
+   "id": "7352a29a",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 30,
+   "id": "5a73ff08",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 31,
+   "id": "967962d1",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 32,
+   "id": "da40a75c",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 34,
+   "id": "24166e72",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": 35,
+   "id": "95d69b2e",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 36,
+   "id": "d60a731d",
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 38,
+   "id": "beca9a8c",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "20063dbc",
    "metadata": {},
    "outputs": [],
    "source": []

train_kh_lm.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json CHANGED Viewed

@@ -1 +1 @@

- {"~~\u1780~~": 1, "~~\u1781~~": 2, "~~\u1782~~": 3, "~~\u1783~~": 4, "~~\u1784~~": 5, "~~\u1785~~": 6, "~~\u1786~~": 7, "~~\u1787~~": 8, "~~\u1788~~": 9, "~~\u1789~~": 10, "~~\u178a~~": 11, "~~\u178b~~": 12, "~~\u178c~~": 13, "~~\u178d~~": 14, "~~\u178e~~": 15, "~~\u178f~~": 16, "~~\u1790~~": 17, "~~\u1791~~": 18, "~~\u1792~~": 19, "~~\u1793~~": 20, "~~\u1794~~": 21, "~~\u1795~~": 22, "~~\u1796~~": 23, "~~\u1797~~": 24, "~~\u1798~~": 25, "~~\u1799~~": 26, "~~\u179a~~": 27, "~~\u179b~~": 28, "~~\u179c~~": 29, "~~\u179f~~": 30, "~~\u17a0~~": 31, "~~\u17a1~~": 32, "~~\u17a2~~": 33, "~~\u17a5~~": 34, "~~\u17a7~~": 35, "~~\u17aa~~": 36, "~~\u17ab~~": 37, "~~\u17ac~~": 38, "~~\u17ad~~": 39, "~~\u17ae~~": 40, "~~\u17af~~": 41, "~~\u17b1~~": 42, "~~\u17b6~~": 43, "~~\u17b7~~": 44, "~~\u17b8~~": 45, "~~\u17b9~~": 46, "~~\u17ba~~": 47, "~~\u17bb~~": 48, "~~\u17bc~~": 49, "~~\u17bd~~": 50, "~~\u17be~~": 51, "~~\u17bf~~": 52, "~~\u17c0~~": 53, "~~\u17c1~~": 54, "~~\u17c2~~": 55, "~~\u17c3~~": 56, "~~\u17c4~~": 57, "~~\u17c5~~": 58, "~~\u17c6~~": 59, "~~\u17c7~~": 60, "~~\u17c8~~": 61, "~~\u17c9~~": 62, "~~\u17ca~~": 63, "~~\u17cb~~": 64, "~~\u17cc~~": 65, "~~\u17cd~~": 66, "~~\u17ce~~": 67, "~~\u17cf~~": 68, "~~\u17d0~~": 69, "~~\u17d2~~": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}

+ {"ក": 1, "ខ": 2, "គ": 3, "ឃ": 4, "ង": 5, "ច": 6, "ឆ": 7, "ជ": 8, "ឈ": 9, "ញ": 10, "ដ": 11, "ឋ": 12, "ឌ": 13, "ឍ": 14, "ណ": 15, "ត": 16, "ថ": 17, "ទ": 18, "ធ": 19, "ន": 20, "ប": 21, "ផ": 22, "ព": 23, "ភ": 24, "ម": 25, "យ": 26, "រ": 27, "ល": 28, "វ": 29, "ស": 30, "ហ": 31, "ឡ": 32, "អ": 33, "ឥ": 34, "ឧ": 35, "ឪ": 36, "ឫ": 37, "ឬ": 38, "ឭ": 39, "ឮ": 40, "ឯ": 41, "ឱ": 42, "ា": 43, "ិ": 44, "ី": 45, "ឹ": 46, "ឺ": 47, "ុ": 48, "ូ": 49, "ួ": 50, "ើ": 51, "ឿ": 52, "ៀ": 53, "េ": 54, "ែ": 55, "ៃ": 56, "ោ": 57, "ៅ": 58, "ំ": 59, "ះ": 60, "ៈ": 61, "៉": 62, "៊": 63, "់": 64, "៌": 65, "៍": 66, "៎": 67, "៏": 68, "័": 69, "្": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}