diff --git "a/.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb" "b/.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb" --- "a/.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb" +++ "b/.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb" @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "id": "065a8cf7-e54f-4ac3-900e-609c80714fca", "metadata": {}, "outputs": [], @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 37, "id": "a2787582-554f-44ce-9f38-4180a5ed6b44", "metadata": {}, "outputs": [], @@ -151,12 +151,12 @@ "\n", "raw_datasets = IterableDatasetDict()\n", "\n", - "raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-CN\", split=\"train\", use_auth_token=True)\n", - "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-CN\", split=\"test\", use_auth_token=True)\n", + "# raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-CN\", split=\"train\", use_auth_token=True)\n", + "# raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-CN\", split=\"test\", use_auth_token=True)\n", "\n", "# zh-TW is low resource\n", - "# raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"train+validation\", use_auth_token=True)\n", - "# raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"test\", use_auth_token=True)" + "raw_datasets[\"train\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"train+validation\", use_auth_token=True)\n", + "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"zh-TW\", split=\"test\", use_auth_token=True)" ] }, { @@ -189,109 +189,16395 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 38, "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "48ef23eaa9fb4d6ca621fd252befca48", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/185k [00:00 to the vocabulary\n", + "Adding <|startoftranscript|> to the vocabulary\n", + "Adding <|en|> to the vocabulary\n", + "Adding <|zh|> to the vocabulary\n", + "Adding <|de|> to the vocabulary\n", + "Adding <|es|> to the vocabulary\n", + "Adding <|ru|> to the vocabulary\n", + "Adding <|ko|> to the vocabulary\n", + "Adding <|fr|> to the vocabulary\n", + "Adding <|ja|> to the vocabulary\n", + "Adding <|pt|> to the vocabulary\n", + "Adding <|tr|> to the vocabulary\n", + "Adding <|pl|> to the vocabulary\n", + "Adding <|ca|> to the vocabulary\n", + "Adding <|nl|> to the vocabulary\n", + "Adding <|ar|> to the vocabulary\n", + "Adding <|sv|> to the vocabulary\n", + "Adding <|it|> to the vocabulary\n", + "Adding <|id|> to the vocabulary\n", + "Adding <|hi|> to the vocabulary\n", + "Adding <|fi|> to the vocabulary\n", + "Adding <|vi|> to the vocabulary\n", + "Adding <|iw|> to the vocabulary\n", + "Adding <|uk|> to the vocabulary\n", + "Adding <|el|> to the vocabulary\n", + "Adding <|ms|> to the vocabulary\n", + "Adding <|cs|> to the vocabulary\n", + "Adding <|ro|> to the vocabulary\n", + "Adding <|da|> to the vocabulary\n", + "Adding <|hu|> to the vocabulary\n", + "Adding <|ta|> to the 
vocabulary\n", + "Adding <|no|> to the vocabulary\n", + "Adding <|th|> to the vocabulary\n", + "Adding <|ur|> to the vocabulary\n", + "Adding <|hr|> to the vocabulary\n", + "Adding <|bg|> to the vocabulary\n", + "Adding <|lt|> to the vocabulary\n", + "Adding <|la|> to the vocabulary\n", + "Adding <|mi|> to the vocabulary\n", + "Adding <|ml|> to the vocabulary\n", + "Adding <|cy|> to the vocabulary\n", + "Adding <|sk|> to the vocabulary\n", + "Adding <|te|> to the vocabulary\n", + "Adding <|fa|> to the vocabulary\n", + "Adding <|lv|> to the vocabulary\n", + "Adding <|bn|> to the vocabulary\n", + "Adding <|sr|> to the vocabulary\n", + "Adding <|az|> to the vocabulary\n", + "Adding <|sl|> to the vocabulary\n", + "Adding <|kn|> to the vocabulary\n", + "Adding <|et|> to the vocabulary\n", + "Adding <|mk|> to the vocabulary\n", + "Adding <|br|> to the vocabulary\n", + "Adding <|eu|> to the vocabulary\n", + "Adding <|is|> to the vocabulary\n", + "Adding <|hy|> to the vocabulary\n", + "Adding <|ne|> to the vocabulary\n", + "Adding <|mn|> to the vocabulary\n", + "Adding <|bs|> to the vocabulary\n", + "Adding <|kk|> to the vocabulary\n", + "Adding <|sq|> to the vocabulary\n", + "Adding <|sw|> to the vocabulary\n", + "Adding <|gl|> to the vocabulary\n", + "Adding <|mr|> to the vocabulary\n", + "Adding <|pa|> to the vocabulary\n", + "Adding <|si|> to the vocabulary\n", + "Adding <|km|> to the vocabulary\n", + "Adding <|sn|> to the vocabulary\n", + "Adding <|yo|> to the vocabulary\n", + "Adding <|so|> to the vocabulary\n", + "Adding <|af|> to the vocabulary\n", + "Adding <|oc|> to the vocabulary\n", + "Adding <|ka|> to the vocabulary\n", + "Adding <|be|> to the vocabulary\n", + "Adding <|tg|> to the vocabulary\n", + "Adding <|sd|> to the vocabulary\n", + "Adding <|gu|> to the vocabulary\n", + "Adding <|am|> to the vocabulary\n", + "Adding <|yi|> to the vocabulary\n", + "Adding <|lo|> to the vocabulary\n", + "Adding <|uz|> to the vocabulary\n", + "Adding <|fo|> to the vocabulary\n", + "Adding <|ht|> to the vocabulary\n", + "Adding <|ps|> to the vocabulary\n", + "Adding <|tk|> to the vocabulary\n", + "Adding <|nn|> to the vocabulary\n", + "Adding <|mt|> to the vocabulary\n", + "Adding <|sa|> to the vocabulary\n", + "Adding <|lb|> to the vocabulary\n", + "Adding <|my|> to the vocabulary\n", + "Adding <|bo|> to the vocabulary\n", + "Adding <|tl|> to the vocabulary\n", + "Adding <|mg|> to the vocabulary\n", + "Adding <|as|> to the vocabulary\n", + "Adding <|tt|> to the vocabulary\n", + "Adding <|haw|> to the vocabulary\n", + "Adding <|ln|> to the vocabulary\n", + "Adding <|ha|> to the vocabulary\n", + "Adding <|ba|> to the vocabulary\n", + "Adding <|jw|> to the vocabulary\n", + "Adding <|su|> to the vocabulary\n", + "Adding <|translate|> to the vocabulary\n", + "Adding <|transcribe|> to the vocabulary\n", + "Adding <|startoflm|> to the vocabulary\n", + "Adding <|startofprev|> to the vocabulary\n", + "Adding <|nocaptions|> to the vocabulary\n", + "Adding <|notimestamps|> to the vocabulary\n" + ] } ], "source": [ @@ -318,7 +16604,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 39, "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988", "metadata": {}, "outputs": [ @@ -338,7 +16624,7 @@ " 'segment': Value(dtype='string', id=None)}" ] }, - "execution_count": 5, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -364,7 +16650,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 40, "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39", "metadata": {}, 
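The hunk above swaps the commented-out zh-CN loaders for the low-resource zh-TW config and concatenates its train and validation splits into a single streaming training set. The `load_streaming_dataset` helper itself is defined outside this hunk; the sketch below is an assumed implementation that wraps `datasets.load_dataset` with `streaming=True` and interleaves any splits joined with "+".

    from datasets import interleave_datasets, load_dataset

    def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
        # Assumed helper: stream a single split, or interleave several
        # '+'-joined splits (e.g. "train+validation") without downloading them.
        if "+" in split:
            parts = [
                load_dataset(dataset_name, dataset_config_name, split=s, streaming=True, **kwargs)
                for s in split.split("+")
            ]
            return interleave_datasets(parts)
        return load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)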
"outputs": [], @@ -384,7 +16670,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 41, "id": "d041650e-1c48-4439-87b3-5b6f4a514107", "metadata": {}, "outputs": [], @@ -411,7 +16697,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 42, "id": "c085911c-a10a-41ef-8874-306e0503e9bb", "metadata": {}, "outputs": [], @@ -447,7 +16733,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 43, "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684", "metadata": {}, "outputs": [], @@ -465,7 +16751,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 44, "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c", "metadata": {}, "outputs": [], @@ -486,7 +16772,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 45, "id": "01cb25ef-4bb0-4325-9461-f59198acadf6", "metadata": {}, "outputs": [], @@ -507,7 +16793,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 46, "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac", "metadata": {}, "outputs": [], @@ -577,7 +16863,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 47, "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5", "metadata": {}, "outputs": [], @@ -625,7 +16911,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 48, "id": "fc834702-c0d3-4a96-b101-7b87be32bf42", "metadata": {}, "outputs": [], @@ -652,25 +16938,10 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 49, "id": "b22b4011-f31f-4b57-b684-c52332f92890", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "037e658a8e194212a068ba9eea85cf11", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading builder script: 0%| | 0.00/4.49k [00:00\n", " \n", - " \n", - " [ 201/2000 22:53 < 3:26:59, 0.14 it/s, Epoch 0.10/9223372036854775807]\n", + " \n", + " [ 714/1000 2:15:14 < 54:19, 0.09 it/s, Epoch 4.12/9223372036854775807]\n", " \n", " \n", " \n", @@ -1026,9 +17433,28 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
Step | Training Loss | Validation Loss | Wer
200  | 0.143800      | 0.182196        | 42.436029
400  | 0.031500      | 0.186944        | 42.128966
600  | 0.011300      | 0.195321        | 40.634596
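The Wer column comes from the notebook's evaluation metric (the removed "Downloading builder script ... 4.49k" widget earlier in this hunk is consistent with `evaluate.load("wer")`). The actual `compute_metrics` cell is not part of this diff; below is a minimal sketch of the usual Whisper fine-tuning version, assuming `tokenizer` is the WhisperTokenizer loaded earlier in the notebook.

    import evaluate

    metric = evaluate.load("wer")

    def compute_metrics(pred):
        pred_ids = pred.predictions
        label_ids = pred.label_ids
        # the data collator masks label padding with -100; restore the pad token id
        # so the tokenizer can decode the reference transcriptions
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        # report WER as a percentage, matching the 40-42 range in the table above
        return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}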

" ], @@ -1043,11 +17469,62 @@ "name": "stderr", "output_type": "stream", "text": [ + "Reading metadata...: 6568it [00:00, 36204.27it/s]\n", + "Reading metadata...: 4709it [00:00, 51489.81it/s]\n", "***** Running Evaluation *****\n", " Num examples: Unknown\n", " Batch size = 8\n", - "Reading metadata...: 10581it [00:00, 28979.41it/s]\n", - "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n" + "Reading metadata...: 4709it [00:00, 28976.18it/s]\n", + "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n", + "Saving model checkpoint to ./checkpoint-200\n", + "Configuration saved in ./checkpoint-200/config.json\n", + "Model weights saved in ./checkpoint-200/pytorch_model.bin\n", + "Feature extractor saved in ./checkpoint-200/preprocessor_config.json\n", + "tokenizer config file saved in ./checkpoint-200/tokenizer_config.json\n", + "Special tokens file saved in ./checkpoint-200/special_tokens_map.json\n", + "added tokens file saved in ./checkpoint-200/added_tokens.json\n", + "Feature extractor saved in ./preprocessor_config.json\n", + "tokenizer config file saved in ./tokenizer_config.json\n", + "Special tokens file saved in ./special_tokens_map.json\n", + "added tokens file saved in ./added_tokens.json\n", + "Reading metadata...: 6568it [00:00, 38299.83it/s]\n", + "Reading metadata...: 4709it [00:00, 30779.86it/s]\n", + "***** Running Evaluation *****\n", + " Num examples: Unknown\n", + " Batch size = 8\n", + "Reading metadata...: 4709it [00:00, 26095.46it/s]\n", + "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n", + "Saving model checkpoint to ./checkpoint-400\n", + "Configuration saved in ./checkpoint-400/config.json\n", + "Model weights saved in ./checkpoint-400/pytorch_model.bin\n", + "Feature extractor saved in ./checkpoint-400/preprocessor_config.json\n", + "tokenizer config file saved in ./checkpoint-400/tokenizer_config.json\n", + "Special tokens file saved in ./checkpoint-400/special_tokens_map.json\n", + "added tokens file saved in ./checkpoint-400/added_tokens.json\n", + "Feature extractor saved in ./preprocessor_config.json\n", + "tokenizer config file saved in ./tokenizer_config.json\n", + "Special tokens file saved in ./special_tokens_map.json\n", + "added tokens file saved in ./added_tokens.json\n", + "Reading metadata...: 6568it [00:00, 41768.54it/s]\n", + "Reading metadata...: 4709it [00:00, 23263.16it/s]\n", + "Reading metadata...: 6568it [00:00, 38529.47it/s]\n", + "Reading metadata...: 4709it [00:00, 70740.82it/s]\n", + "***** Running Evaluation *****\n", + " Num examples: Unknown\n", + " Batch size = 8\n", + "Reading metadata...: 4709it [00:00, 77474.87it/s]\n", + "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. 
If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n", + "Saving model checkpoint to ./checkpoint-600\n", + "Configuration saved in ./checkpoint-600/config.json\n", + "Model weights saved in ./checkpoint-600/pytorch_model.bin\n", + "Feature extractor saved in ./checkpoint-600/preprocessor_config.json\n", + "tokenizer config file saved in ./checkpoint-600/tokenizer_config.json\n", + "Special tokens file saved in ./checkpoint-600/special_tokens_map.json\n", + "added tokens file saved in ./checkpoint-600/added_tokens.json\n", + "Feature extractor saved in ./preprocessor_config.json\n", + "tokenizer config file saved in ./tokenizer_config.json\n", + "Special tokens file saved in ./special_tokens_map.json\n", + "added tokens file saved in ./added_tokens.json\n" ] } ],
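The trainer logs above show an evaluation pass and a checkpoint save every 200 steps, an evaluation batch size of 8, and a run capped at 1,000 steps; the Epoch column reads 4.12/9223372036854775807 because a streaming IterableDataset has no known length, so the Trainer counts epochs up to sys.maxsize and stops only at max_steps. The concrete Seq2SeqTrainingArguments cell sits outside this hunk; the sketch below is consistent with those logs, and every value not visible in the logs is an assumption.

    from transformers import Seq2SeqTrainingArguments

    training_args = Seq2SeqTrainingArguments(
        output_dir="./",                 # checkpoints appear as ./checkpoint-200, -400, -600
        max_steps=1000,                  # matches the 714/1000 progress bar
        evaluation_strategy="steps",
        eval_steps=200,                  # matches the 200/400/600 rows in the results table
        save_steps=200,
        per_device_eval_batch_size=8,    # matches "Batch size = 8" in the eval logs
        per_device_train_batch_size=32,  # assumption: not visible in this hunk
        gradient_accumulation_steps=1,   # assumption: not visible in this hunk
        learning_rate=1e-5,              # assumption: not visible in this hunk
        predict_with_generate=True,
        generation_max_length=225,       # assumption: not visible in this hunk
        metric_for_best_model="wer",
        greater_is_better=False,
        load_best_model_at_end=True,
    )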