{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, DatasetDict" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n", " num_rows: 563\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minds14_train = load_dataset(\n", " \"PolyAI/minds14\", \n", " \"en-US\",\n", " split=\"train\"\n", ")\n", "minds14_train" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n", " num_rows: 450\n", " })\n", " test: Dataset({\n", " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n", " num_rows: 113\n", " })\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minds14 = DatasetDict()\n", "\n", "minds14[\"train\"] = minds14_train.select(range(450))\n", "minds14[\"test\"] = minds14_train.select(range(450, 563))\n", "\n", "minds14" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['audio', 'transcription'],\n", " num_rows: 450\n", " })\n", " test: Dataset({\n", " features: ['audio', 'transcription'],\n", " num_rows: 113\n", " })\n", "})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minds14 = minds14.select_columns(['audio', 'transcription'])\n", "minds14" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from transformers import WhisperProcessor\n", "\n", "processor = WhisperProcessor.from_pretrained(\n", " \"openai/whisper-tiny\", language=\"english\", task=\"transcribe\"\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),\n", " 'transcription': Value(dtype='string', id=None)}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minds14[\"train\"].features" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from datasets import Audio\n", "\n", "sampling_rate = processor.feature_extractor.sampling_rate\n", "minds14 = minds14.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def prepare_dataset(example):\n", " audio = example[\"audio\"]\n", "\n", " example = processor(\n", " audio=audio[\"array\"],\n", " sampling_rate=audio[\"sampling_rate\"],\n", " text=example[\"transcription\"],\n", " )\n", "\n", " # compute input length of audio sample in seconds\n", " example[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n", "\n", " return example" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "356d0ccec48f41b9ad10504ae0ca4813", 
"version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/450 [00:00 Dict[str, torch.Tensor]:\n", " # split inputs and labels since they have to be of different lengths and need different padding methods\n", " # first treat the audio inputs by simply returning torch tensors\n", " input_features = [\n", " {\"input_features\": feature[\"input_features\"][0]} for feature in features\n", " ]\n", " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n", "\n", " # get the tokenized label sequences\n", " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", " # pad the labels to max length\n", " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n", "\n", " # replace padding with -100 to ignore loss correctly\n", " labels = labels_batch[\"input_ids\"].masked_fill(\n", " labels_batch.attention_mask.ne(1), -100\n", " )\n", "\n", " # if bos token is appended in previous tokenization step,\n", " # cut bos token here as it's append later anyways\n", " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n", " labels = labels[:, 1:]\n", "\n", " batch[\"labels\"] = labels\n", "\n", " return batch" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "\n", "metric = evaluate.load(\"wer\")\n", "normalizer = BasicTextNormalizer()\n", "\n", "def compute_metrics(pred):\n", " pred_ids = pred.predictions\n", " label_ids = pred.label_ids\n", "\n", " # replace -100 with the pad_token_id\n", " label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n", "\n", " # we do not want to group tokens when computing the metrics\n", " pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)\n", " label_str = processor.batch_decode(label_ids, skip_special_tokens=True)\n", "\n", " # compute orthographic wer\n", " wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)\n", "\n", " # compute normalised WER\n", " pred_str_norm = [normalizer(pred) for pred in pred_str]\n", " label_str_norm = [normalizer(label) for label in label_str]\n", " # filtering step to only evaluate the samples that correspond to non-zero references:\n", " pred_str_norm = [\n", " pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0\n", " ]\n", " label_str_norm = [\n", " label_str_norm[i]\n", " for i in range(len(label_str_norm))\n", " if len(label_str_norm[i]) > 0\n", " ]\n", "\n", " wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm)\n", "\n", " return {\"wer_ortho\": wer_ortho, \"wer\": wer}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from transformers import WhisperForConditionalGeneration\n", "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", "\n", "# disable cache during training since it's incompatible with gradient checkpointing\n", "model.config.use_cache = False\n", "\n", "# set language and task for generation and re-enable cache\n", "model.generate = partial(\n", " model.generate, 
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", "\n", "# disable cache during training since it's incompatible with gradient checkpointing\n", "model.config.use_cache = False\n", "\n", "# set language and task for generation and re-enable cache\n", "model.generate = partial(\n", "    model.generate, language=\"english\", task=\"transcribe\", use_cache=True\n", ")" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "from transformers import Seq2SeqTrainingArguments\n", "\n", "training_args = Seq2SeqTrainingArguments(\n", "    output_dir=\"./whisper-tiny-en-us-minds14\",  # name on the HF Hub\n", "    per_device_train_batch_size=16,\n", "    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n", "    learning_rate=1e-5,\n", "    lr_scheduler_type=\"constant_with_warmup\",\n", "    warmup_steps=50,\n", "    max_steps=4000,  # reduce to 500 if you don't have your own GPU or a Colab paid plan\n", "    gradient_checkpointing=True,\n", "    # fp16=True,\n", "    # fp16_full_eval=True,\n", "    evaluation_strategy=\"steps\",\n", "    per_device_eval_batch_size=16,\n", "    predict_with_generate=True,\n", "    generation_max_length=225,\n", "    save_steps=500,\n", "    eval_steps=500,\n", "    logging_steps=25,\n", "    report_to=[\"tensorboard\"],\n", "    load_best_model_at_end=True,\n", "    metric_for_best_model=\"wer\",\n", "    greater_is_better=False,\n", "    # push_to_hub=False,\n", ")" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from transformers import Seq2SeqTrainer\n", "\n", "trainer = Seq2SeqTrainer(\n", "    args=training_args,\n", "    model=model,\n", "    train_dataset=minds14[\"train\"],\n", "    eval_dataset=minds14[\"test\"],\n", "    data_collator=data_collator,\n", "    compute_metrics=compute_metrics,\n", "    tokenizer=processor,\n", ")" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9dcf642e434e48468854ec1cbaa6120c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "  0%|          | 0/4000 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)", "Cell \u001b[0;32mIn[20], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m trainer\u001b[39m.\u001b[39;49mtrain()\n", "File \u001b[0;32m~/Projects/mml/audio-course/venv/lib/python3.8/site-packages/transformers/trainer.py:1555\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1553\u001b[0m hf_hub_utils\u001b[39m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1554\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 1555\u001b[0m \u001b[39mreturn\u001b[39;00m inner_training_loop(\n\u001b[1;32m 1556\u001b[0m args\u001b[39m=\u001b[39;49margs,\n\u001b[1;32m 1557\u001b[0m resume_from_checkpoint\u001b[39m=\u001b[39;49mresume_from_checkpoint,\n\u001b[1;32m 1558\u001b[0m trial\u001b[39m=\u001b[39;49mtrial,\n\u001b[1;32m 1559\u001b[0m ignore_keys_for_eval\u001b[39m=\u001b[39;49mignore_keys_for_eval,\n\u001b[1;32m 1560\u001b[0m )\n", "File \u001b[0;32m~/Projects/mml/audio-course/venv/lib/python3.8/site-packages/transformers/trainer.py:1862\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1859\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39maccelerator\u001b[39m.\u001b[39maccumulate(model):\n\u001b[1;32m 1860\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m-> 1862\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 1863\u001b[0m args\u001b[39m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1864\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1865\u001b[0m \u001b[39mand\u001b[39;00m (torch\u001b[39m.\u001b[39misnan(tr_loss_step) 
\u001b[39mor\u001b[39;00m torch\u001b[39m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1866\u001b[0m ):\n\u001b[1;32m 1867\u001b[0m \u001b[39m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1868\u001b[0m tr_loss \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m tr_loss \u001b[39m/\u001b[39m (\u001b[39m1\u001b[39m \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mglobal_step \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_globalstep_last_logged)\n\u001b[1;32m 1869\u001b[0m \u001b[39melse\u001b[39;00m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# from transformers import GenerationConfig\n", "# generation_config = GenerationConfig.from_pretrained(\"openai/whisper-tiny.en\")\n", "# generation_config.push_to_hub('mirodil/whisper-tiny-en-us-minds14')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5cb7500ba08c4c98b821669c3207517d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "events.out.tfevents.1700719599.L67DDV9G7R.91939.0: 0%| | 0.00/29.3k [00:00