diff --git "a/train_kh.ipynb" "b/train_kh.ipynb" --- "a/train_kh.ipynb" +++ "b/train_kh.ipynb" @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "bff05704", + "id": "a88514f8", "metadata": {}, "outputs": [], "source": [ @@ -16,7 +16,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9637cdfd", + "id": "2d955148", "metadata": { "collapsed": true, "jupyter": { @@ -19160,14 +19160,14 @@ } ], "source": [ - "%%bash \n", - "wget https://www.openslr.org/resources/42/km_kh_male.zip\n", - "unzip km_kh_male.zip" + "# %%bash \n", + "# wget https://www.openslr.org/resources/42/km_kh_male.zip\n", + "# unzip km_kh_male.zip" ] }, { "cell_type": "markdown", - "id": "b11b1d53", + "id": "54b0e493", "metadata": {}, "source": [ "### Load KH Data" @@ -19175,153 +19175,67 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "f35b6d68", + "execution_count": 6, + "id": "1f31e61b", "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", - "import pandas as pd\n", - "from datasets import load_dataset\n", + "# from sklearn.model_selection import train_test_split\n", + "# import pandas as pd\n", + "# from datasets import load_dataset\n", " \n", - "colnames=['path','drop','text'] \n", - "df = pd.read_csv('km_kh_male/line_index.tsv',sep='\\t',header=None,names=colnames)\n", - "df['path'] = '/workspace/xls-r-300m-km/km_kh_male/wavs/'+df['path'] +'.wav'\n", + "# colnames=['path','drop','text'] \n", + "# df = pd.read_csv('km_kh_male/line_index.tsv',sep='\\t',header=None,names=colnames)\n", + "# df['path'] = '/workspace/xls-r-300m-km/km_kh_male/wavs/'+df['path'] +'.wav'\n", "\n", - "train, test = train_test_split(df, test_size=0.1)\n", + "# train_valid, test = train_test_split(df, test_size=0.1)\n", + "# train, valid = train_test_split(train_valid, test_size=0.1)\n", "\n", - "train.to_csv('./km_kh_male/line_index_train.csv')\n", - "test.to_csv('./km_kh_male/line_index_test.csv')" + "# train.to_csv('./km_kh_male/line_index_train.csv')\n", + "# valid.to_csv('./km_kh_male/line_index_valid.csv')\n", + "# test.to_csv('./km_kh_male/line_index_test.csv')" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "a0b561cb", + "execution_count": 2, + "id": "63b2d9b0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Using custom data configuration default-9437ac7a59e13b5b\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading and preparing dataset csv/default to /workspace/.cache/huggingface/datasets/csv/default-9437ac7a59e13b5b/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cb75a7b737324ce98def82472b0823f3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 to the vocabulary\n", - "Adding to the vocabulary\n" - ] - } - ], + "outputs": [], "source": [ "tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(\"./\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\") # './' load vocab.json in the current directory\n", "feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True) \n", @@ -19613,7 +19468,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "f971580d", + "id": "2711ed79", "metadata": {}, "outputs": [], "source": [ @@ -19630,7 +19485,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "d0368c7a", + "id": "2772b591", "metadata": {}, "outputs": [ { @@ -19669,32 +19524,32 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "62e9d0c6", + "execution_count": 15, + "id": "db2af48f", "metadata": {}, "outputs": [], "source": [ "common_voice_train = common_voice_train.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')\n", - "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')" + "common_voice_valid = common_voice_valid.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "f642a861", + "execution_count": 16, + "id": "b7f42c6a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_6753_2783635929.wav',\n", - " 'array': array([ 3.3284457e-05, 5.5497538e-05, 2.6061889e-05, ...,\n", - " 2.0593125e-06, -5.3043197e-05, 0.0000000e+00], dtype=float32),\n", + "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_4015603856.wav',\n", + " 'array': array([-1.3359112e-06, 1.5759380e-06, -2.0205737e-06, ...,\n", + " -7.6091878e-06, 9.0511895e-07, 0.0000000e+00], dtype=float32),\n", " 'sampling_rate': 16000},\n", - " 'sentence': 'កុងដ្វាណា មាន មូលដ្ឋាន នៅ ប្រទេស ស៊ីរី'}" + " 'sentence': 'ទេសចរណ៍ នៅ ខេត្ត ព្រះ សីហនុ នា រដូវ បុណ្យ ភ្ជុំ បិណ្ឌ នេះ មាន ការ កើន ឡើង យ៉ាង ខ្លាំង'}" ] }, - "execution_count": 18, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -19705,16 +19560,16 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "0c756a07", + "execution_count": 17, + "id": "42b525d0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Target text: ផ្សារ អូឡាំពិក មាន ក្រណាត់ និង សម្លៀកបំពាក់ បោះ ដុំ និង លក់ រាយ\n", - "Input array shape: (81920,)\n", + "Target text: បញ្ជី ឈ្មោះ បោះឆ្នោត ខេត្ត កំពង់ចាម\n", + "Input array shape: (65536,)\n", "Sampling rate: 16000\n" ] }, @@ -19722,8 +19577,8 @@ "data": { "text/html": [ "\n", - "