{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "63594f228ab14d9796bbf24112269a52", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
Dict[str, torch.Tensor]:\n", " # split inputs and labels since they have to be of different lengths and need different padding methods\n", " # first treat the audio inputs by simply returning torch tensors\n", " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n", " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n", "\n", " # get the tokenized label sequences\n", " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", "\n", " # pad the labels to max length\n", " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n", "\n", " # replace padding with -100 to ignore loss correctly\n", " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", "\n", " # if bos token is appended in previous tokenization step,\n", " # cut bos token here as it's append later anyways\n", " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n", " labels = labels[:, 1:]\n", "\n", " batch[\"labels\"] = labels\n", "\n", " return batch\n", "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "from transformers import WhisperForConditionalGeneration\n", "\n", "\n", "model = WhisperForConditionalGeneration.from_pretrained(\n", " model_name_or_path, device_map=\"auto\"\n", ")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " test: Dataset({\n", " features: ['input_features', 'labels'],\n", " num_rows: 857\n", " })\n", "})" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "evaluation_dataset" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 108/108 [09:19<00:00, 5.18s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer=24.938214396045723\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from torch.utils.data import DataLoader\n", "from tqdm import tqdm\n", "import numpy as np\n", "import gc\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=8, collate_fn=data_collator)\n", "\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", " decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " metric.add_batch(\n", " predictions=decoded_preds,\n", " references=decoded_labels,\n", " )\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "wer = 100 * metric.compute()\n", "print(f\"{wer=}\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 54/54 [07:20<00:00, 8.15s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer=24.934352795798578 and normalized_wer=13.639508070714834\n", "{'eval/wer': 24.934352795798578, 'eval/normalized_wer': 13.639508070714834}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import gc\n", "import numpy as np\n", "from tqdm import tqdm\n", "from torch.utils.data import DataLoader\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "\n", "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=16, collate_fn=data_collator)\n", "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task='transcribe')\n", "normalizer = BasicTextNormalizer()\n", "\n", "predictions = []\n", "references = []\n", "normalized_predictions = []\n", "normalized_references = []\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " forced_decoder_ids=forced_decoder_ids,\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n", " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " predictions.extend(decoded_preds)\n", " references.extend(decoded_labels)\n", " normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])\n", " normalized_references.extend([normalizer(label).strip() for label in decoded_labels])\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "wer = 100 * metric.compute(predictions=predictions, references=references)\n", "normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n", "eval_metrics = {\"eval/wer\": wer, \"eval/normalized_wer\": normalized_wer}\n", "\n", "print(f\"{wer=} and {normalized_wer=}\")\n", "print(eval_metrics)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lora\n" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "from dataclasses import dataclass\n", "from typing import Any, Dict, List, Union\n", "\n", "@dataclass\n", "class DataCollatorSpeechSeq2SeqWithPadding:\n", " processor: Any\n", "\n", " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", " # split inputs and labels since they have to be of different lengths and need different padding methods\n", " # first treat the audio inputs by simply returning torch tensors\n", " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n", " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n", "\n", " # get the tokenized label sequences\n", " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", "\n", " # pad the labels to max length\n", " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n", "\n", " # replace padding with -100 to ignore loss correctly\n", " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", "\n", " # if bos token is appended in previous tokenization step,\n", " # cut bos token here as it's append later anyways\n", " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n", " labels = labels[:, 1:]\n", "\n", " batch[\"labels\"] = labels\n", "\n", " return batch" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import (\n", " AutomaticSpeechRecognitionPipeline,\n", " WhisperForConditionalGeneration,\n", " WhisperTokenizer,\n", " WhisperProcessor,\n", ")\n", "from peft import PeftModel, PeftConfig\n", "\n", "peft_model_id = \"DuyTa/vi-whisper-medium-Lora\"\n", "\n", "language = \"Vietnamese\"\n", "task = \"transcribe\"\n", "\n", "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", "model = WhisperForConditionalGeneration.from_pretrained(\n", " peft_config.base_model_name_or_path,\n", ")\n", "model = PeftModel.from_pretrained(model, peft_model_id)\n", "model.to(\"cuda\").half()\n", "\n", "processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)\n", "\n" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 108/108 [12:31<00:00, 6.96s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer_lora=24.934352795798578\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from torch.utils.data import DataLoader\n", "from tqdm import tqdm\n", "import numpy as np\n", "import gc\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=8, collate_fn=data_collator)\n", "\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " decoder_input_ids=batch[\"labels\"][:, :4].to(\"cuda\"),\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n", " decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " metric.add_batch(\n", " predictions=decoded_preds,\n", " references=decoded_labels,\n", " )\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "wer_lora = 100 * metric.compute()\n", "print(f\"{wer_lora=}\")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 54/54 [09:20<00:00, 10.39s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer=24.934352795798578 and normalized_wer=13.624135280553421\n", "{'eval/wer': 24.934352795798578, 'eval/normalized_wer': 13.624135280553421}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import gc\n", "import numpy as np\n", "from tqdm import tqdm\n", "from torch.utils.data import DataLoader\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "\n", "eval_dataloader = DataLoader(evaluation_dataset['test'], batch_size=16, collate_fn=data_collator)\n", "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task='transcribe')\n", "normalizer = BasicTextNormalizer()\n", "\n", "predictions = []\n", "references = []\n", "normalized_predictions = []\n", "normalized_references = []\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " forced_decoder_ids=forced_decoder_ids,\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n", " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " predictions.extend(decoded_preds)\n", " references.extend(decoded_labels)\n", " normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])\n", " normalized_references.extend([normalizer(label).strip() for label in decoded_labels])\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "wer = 100 * metric.compute(predictions=predictions, references=references)\n", "normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n", "eval_metrics = {\"eval/wer\": wer, \"eval/normalized_wer\": normalized_wer}\n", "\n", "print(f\"{wer=} and {normalized_wer=}\")\n", "print(eval_metrics)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Quantization Whisper Lora" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from whisper_quant import WhisperModel\n", "\n", "model_size = \"medium\"\n", "\n", "# Run on GPU with FP16\n", "model = WhisperModel(model_size, device=\"cuda\", compute_type=\"float16\")\n", "\n", "# or run on GPU with INT8\n", "# model = WhisperModel(model_size, device=\"cuda\", compute_type=\"int8_float16\")\n", "# or run on CPU with INT8\n", "# model = WhisperModel(model_size, device=\"cpu\", compute_type=\"int8\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "segments, info = model.transcribe(\"audio.wav\", beam_size=1, language ='vi', temperature= 0)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.00s -> 7.10s] Hai, đây tức là một kẻ ăn mày vậy, anh ta chưa kịp quay đi thì đã thấy mấy con chó vàng chạy sồng sộc ra cứ nhảy sổ vào chân anh.\n" ] } ], "source": [ "for segment in segments:\n", " print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "normalizer = BasicTextNormalizer()\n", "norm = normalizer(segment.text)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' hai đây tức là một kẻ ăn mầy vậy anh ta chưa kịp quay đi thì đã thấy mấy con chó vàng chạy sồng sộc ra cứ nhảy sổ vào chân anh '" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "norm" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from dataclasses import dataclass\n", "\n", "@dataclass\n", "class DataCollatorSpeechSeq2SeqWithPadding:\n", " processor: Any\n", "\n", " def __call__(self, features):\n", " audios = []\n", " for feature in features:\n", " audios.append(feature[\"audio\"])\n", " batch = {\n", " \"audio\": [feature[\"audio\"]['array'] for feature in features],\n", " \"transcription\": [feature[\"transcription\"] for feature in features]\n", " }\n", " return batch\n", "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor='No')" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "from torch.utils.data import DataLoader\n", "from tqdm import tqdm\n", "import numpy as np\n", "import gc\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "eval_dataloader = DataLoader(fleurs['test'], batch_size=16, collate_fn=data_collator)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "for data in eval_dataloader:\n", " audios = data['audio']\n", " transcriptions = data['transcription']\n", " final = []\n", " for audio in data['audio']:\n", " print(\"-\" * 20)\n", " segments, info = model.transcribe(audio, beam_size=1, language='vi')\n", " out = [out.text for out in segments]\n", " pred = ''.join(out)\n", " norm_pred = normalizer(pred)\n", " final.append(norm_pred)\n", "cleaned_text_list = [re.sub(r'\\s+', ' ', text.strip()) for text in final]\n", " \n", "\n", " print(cleaned_text_list)\n", " print(transcriptions)\n", " break\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", "\n", " labels = batch[\"transcription\"]\n", " print(labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " final = []\n", " labels = batch[\"transcription\"]\n", " for audio in batch[\"audio\"]:\n", " \n", " segments, _ = model.transcribe(audio, beam_size=1, language='vi')\n", " out = [out.text for out in segments]\n", " pred = ''.join(out)\n", " norm_pred = normalizer(pred)\n", " final.append(norm_pred)\n", " cleaned_text_list = [re.sub(r'\\s+', ' ', text.strip()) for text in final]\n", " print(cleaned_text_list)\n", " print(labels)\n", " metric.add_batch(\n", " predictions=cleaned_text_list,\n", " references=labels,\n", " )\n", " del labels, batch, final\n", " gc.collect()\n", "wer_lora = 100 * metric.compute()\n", "print(f\"{wer_lora=}\")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "\n", "model_names = [\"whisper-medium\", \"whisper-lora\",\"quantz-whisper-lora\"]\n", "wer_values = [13.639508070714834,13.624135280553421,14.044640098856966]\n", "wer_values = [round(value, 2) for value in wer_values]\n", "\n", "bar_colors = ['blue', 'green',\"red\"]\n", "# Create a bar chart\n", "plt.bar(model_names, wer_values,color=bar_colors)\n", "plt.xlabel(\"Fleurs\")\n", "plt.ylabel(\"WER (%)\")\n", "plt.title(\"Word Error Rate (WER) of Models\")\n", "plt.ylim(0, 15) # Set y-axis range to 0-100\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vin100h" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## medium" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n", " num_rows: 44927\n", " })\n", " test: Dataset({\n", " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n", " num_rows: 11245\n", " })\n", "})\n" ] } ], "source": [ " # Load Dataset\n", "from datasets import load_dataset, DatasetDict, load_from_disk\n", "processed_dataset = DatasetDict()\n", "processed_dataset = load_from_disk(\"./vin_clean\")\n", "\n", "\n", "print(processed_dataset)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py:992: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", " warnings.warn(\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/models/auto/feature_extraction_auto.py:322: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", " warnings.warn(\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py:628: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", " warnings.warn(\n" ] } ], "source": [ "\n", "\n", "import datasets\n", "from datasets import DatasetDict, load_dataset, concatenate_datasets\n", "from tqdm import tqdm\n", "from transformers import (\n", " AutoConfig,\n", " AutoFeatureExtractor,\n", " AutoModelForSpeechSeq2Seq,\n", " AutoTokenizer,\n", " set_seed,\n", ")\n", "from tqdm import tqdm\n", "\n", "\n", "\n", "from datasets import Dataset, DatasetDict\n", "import torchaudio\n", "from torchaudio import transforms as at\n", "import pandas as pd\n", "import torch\n", "from pathlib import Path\n", "\n", "set_seed(42)\n", "\n", "\n", "config = AutoConfig.from_pretrained(\n", " \"openai/whisper-medium\", revision=\"main\", use_auth_token=True\n", " )\n", "\n", "config.update({\"forced_decoder_ids\": None, \"suppress_tokens\": None})\n", "\n", "\n", "\n", "feature_extractor = AutoFeatureExtractor.from_pretrained(\n", " \"openai/whisper-medium\",\n", " revision=\"main\",\n", " use_auth_token=True,\n", " )\n", "tokenizer = AutoTokenizer.from_pretrained(\n", " \"openai/whisper-medium\",\n", " use_fast=True,\n", " revision=\"main\",\n", " use_auth_token=True,\n", " )\n", "\n", "tokenizer.set_prefix_tokens(language=\"vi\", task=\"transcribe\")\n", "\n", "audio_column_name = \"audio\"\n", "text_column_name = \"transcription\"\n", "model_input_name = feature_extractor.model_input_names[0]\n", "\n", "\n", "forward_attention_mask = True\n", "\n", "\n", "\n", "def prepare_dataset(batch):\n", " # load and resample audio data from 48 to 16kHz\n", " audio = batch[\"audio\"]\n", "\n", " # compute log-Mel input features from input audio array \n", " batch[\"input_features\"] = feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n", "\n", " # encode target text to label ids \n", " batch[\"labels\"] = tokenizer(batch[text_column_name]).input_ids\n", " return batch\n", "\n", "\n", "clean_data = processed_dataset['test']\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "from dataclasses import dataclass\n", "from typing import Any, Dict, List, Union\n", "\n", "from transformers import WhisperProcessor\n", "\n", "processor = WhisperProcessor.from_pretrained(\"openai/whisper-medium\", language=\"vi\", task=\"transcribe\")\n", "@dataclass\n", "class DataCollatorSpeechSeq2SeqWithPadding:\n", " processor: Any\n", "\n", " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", " # split inputs and labels since they have to be of different lengths and need different padding methods\n", " # first treat the audio inputs by simply returning torch tensors\n", " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n", " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n", "\n", " # get the tokenized label sequences\n", " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", "\n", " # pad the labels to max length\n", " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n", "\n", " # replace padding with -100 to ignore loss correctly\n", " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", "\n", " # if bos token is appended in previous tokenization step,\n", " # cut bos token here as it's append later anyways\n", " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n", " labels = labels[:, 1:]\n", "\n", " batch[\"labels\"] = labels\n", "\n", " return batch\n", "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['input_features', 'input_length', 'attention_mask', 'labels'],\n", " num_rows: 11245\n", "})" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_data" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "torch.cuda.empty_cache()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from transformers import WhisperForConditionalGeneration\n", "\n", "\n", "model = WhisperForConditionalGeneration.from_pretrained(\n", " 'openai/whisper-medium', device_map=\"auto\"\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 703/703 [1:42:04<00:00, 8.71s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer=35.96574964967958 and normalized_wer=22.798881834138733\n", "{'eval/wer': 35.96574964967958, 'eval/normalized_wer': 22.798881834138733}\n" ] } ], "source": [ "import gc\n", "import numpy as np\n", "from tqdm import tqdm\n", "from torch.utils.data import DataLoader\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "\n", "eval_dataloader = DataLoader(clean_data, batch_size=16, collate_fn=data_collator)\n", "forced_decoder_ids = processor.get_decoder_prompt_ids(language=\"vi\", task='transcribe')\n", "normalizer = BasicTextNormalizer()\n", "\n", "predictions = []\n", "references = []\n", "normalized_predictions = []\n", "normalized_references = []\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " forced_decoder_ids=forced_decoder_ids,\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n", " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " predictions.extend(decoded_preds)\n", " references.extend(decoded_labels)\n", " normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])\n", " normalized_references.extend([normalizer(label).strip() for label in decoded_labels])\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "wer = 100 * metric.compute(predictions=predictions, references=references)\n", "normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n", "eval_metrics = {\"eval/wer\": wer, \"eval/normalized_wer\": normalized_wer}\n", "\n", "print(f\"{wer=} and {normalized_wer=}\")\n", "print(eval_metrics)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lora" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import (\n", " AutomaticSpeechRecognitionPipeline,\n", " WhisperForConditionalGeneration,\n", " WhisperTokenizer,\n", " WhisperProcessor,\n", ")\n", "from peft import PeftModel, PeftConfig\n", "\n", "\n", "peft_model_id = \"DuyTa/vi-whisper-medium-Lora\"\n", "language = \"Vietnamese\"\n", "task = \"transcribe\"\n", "\n", "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", "model = WhisperForConditionalGeneration.from_pretrained(\n", " peft_config.base_model_name_or_path\n", ")\n", "model = PeftModel.from_pretrained(model, peft_model_id)\n", "model.to(\"cuda:0\").half()\n", "\n", "processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 703/703 [2:13:50<00:00, 11.42s/it] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer=35.96574964967958 and normalized_wer=22.798881834138733\n", "{'eval/wer': 35.95314171676819, 'eval/normalized_wer': 22.793825528564515}\n" ] } ], "source": [ "import gc\n", "import numpy as np\n", "from tqdm import tqdm\n", "from torch.utils.data import DataLoader\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "\n", "eval_dataloader = DataLoader(clean_data, batch_size=16, collate_fn=data_collator)\n", "forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task='transcribe')\n", "normalizer = BasicTextNormalizer()\n", "\n", "predictions = []\n", "references = []\n", "normalized_predictions = []\n", "normalized_references = []\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "model.eval()\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " generated_tokens = (\n", " model.generate(\n", " input_features=batch[\"input_features\"].to(\"cuda\"),\n", " forced_decoder_ids=forced_decoder_ids,\n", " max_new_tokens=255,\n", " )\n", " .cpu()\n", " .numpy()\n", " )\n", " labels = batch[\"labels\"].cpu().numpy()\n", " labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)\n", " decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n", " decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)\n", " predictions.extend(decoded_preds)\n", " references.extend(decoded_labels)\n", " normalized_predictions.extend([normalizer(pred).strip() for pred in decoded_preds])\n", " normalized_references.extend([normalizer(label).strip() for label in decoded_labels])\n", " del generated_tokens, labels, batch\n", " gc.collect()\n", "lora_wer = 100 * metric.compute(predictions=predictions, references=references)\n", "lora_normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)\n", "eval_metrics = {\"eval/wer\": lora_wer, \"eval/normalized_wer\": lora_normalized_wer}\n", "\n", "print(f\"{wer=} and {normalized_wer=}\")\n", "print(eval_metrics)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Quantized model" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from whisper_quant import WhisperModel\n", "\n", "model_size = \"medium\"\n", "\n", "# Run on GPU with FP16\n", "model = WhisperModel(model_size, device=\"cuda\", compute_type=\"float16\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### format dataset for quantized model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import logging\n", "import numpy as np\n", "import datasets\n", "from datasets import DatasetDict, load_dataset, concatenate_datasets\n", "from tqdm import tqdm\n", "from transformers import (\n", " AutoConfig,\n", " AutoFeatureExtractor,\n", " AutoModelForSpeechSeq2Seq,\n", " AutoTokenizer,\n", " set_seed,\n", ")\n", "from transformers.utils.versions import require_version\n", "from transformers.utils import check_min_version\n", "from tqdm import tqdm\n", "\n", "\n", "\n", "\n", "logger = logging.getLogger(__name__)\n", "from datasets import Dataset, DatasetDict\n", "import torchaudio\n", "from torchaudio import transforms as at\n", "import pandas as pd\n", "import torch\n", "from pathlib import Path\n", "\n", "def main():\n", " set_seed(42)\n", "\n", "\n", " max_input_length = 30.0 * 16000\n", " min_input_length = 0.0 * 16000\n", " audio_column_name = \"audio\"\n", " text_column_name = \"text\"\n", "\n", "\n", "\n", "\n", "\n", " def load_wave(wave_path, sample_rate:int=16000) -> np.ndarray:\n", " waveform, sr = torchaudio.load(wave_path, normalize=True)\n", " if sample_rate != sr:\n", " waveform = at.Resample(sr, sample_rate)(waveform)\n", " return np.asarray(waveform)\n", " \n", "\n", " def get_list_files_vin100h(phase, sample_rate=16000, audio_max_sample_length=480000):\n", " audio_list = []\n", " text_list = []\n", " path_list = []\n", " if phase == 'train':\n", " csv_file = 'vin_train.csv'\n", " else:\n", " csv_file = 'vin_test.csv'\n", " df = pd.read_csv(csv_file)\n", " for index, row in tqdm(df.iterrows()):\n", " path = row['path']\n", " new_path = Path(row['path'])\n", " audio_id = index\n", " text = row['sentence']\n", " if new_path.exists():\n", " audio = load_wave(new_path, sample_rate=sample_rate)[0]\n", " if len(audio) > audio_max_sample_length or len(audio) < 0:\n", " print('skip file:', new_path,'with len audio', len(audio))\n", " del new_path\n", " continue\n", " text_list.append(text)\n", " path_list.append(path) \n", "\n", " return path_list, text_list\n", "\n", "\n", "\n", "\n", " # Get the testing dataset\n", " test_audio, test_text = get_list_files_vin100h(phase='test')\n", " #print(test_audio[0])\n", " test_dataset = Dataset.from_dict({\"audio\": test_audio, \"text\": test_text})\n", " vin_100h = DatasetDict({\"test\": test_dataset})\n", " #print(clean_data)\n", "\n", "\n", " return vin_100h, test_dataset\n", "\n", "\n", "if __name__ == \"__main__\":\n", " clean_data, test_dataset = main()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " test: Dataset({\n", " features: ['audio', 'text'],\n", " num_rows: 11245\n", " })\n", "})" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_data" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "\n", "segments, _ = model.transcribe(clean_data['test']['audio'][0], beam_size=1, language='vi', temperature= 0)\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Cây cam canh là loại cây ăn quả dễ chồng, dễ chăm sót và cho năng suốt cao nên hiện nay được chồng ở nhiều địa phương trong nước ta.\n", " Cam canh có đặc điểm phỏ mỏng, ăn rất ngọt.\n" ] } ], "source": [ "for segment in segments :\n", " print(segment.text)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from dataclasses import dataclass\n", "\n", "@dataclass\n", "class DataCollatorSpeechSeq2SeqWithPadding:\n", "\n", " def __call__(self, features):\n", " batch = {\n", " \"audio\": [feature[\"audio\"] for feature in features],\n", " \"transcription\": [feature[\"text\"] for feature in features]\n", " }\n", " return batch\n", "data_collator = DataCollatorSpeechSeq2SeqWithPadding()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 703/703 [1:48:04<00:00, 9.22s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "wer_quantz=21.69356959398854\n" ] } ], "source": [ "from torch.utils.data import DataLoader\n", "import re\n", "from tqdm import tqdm\n", "import numpy as np\n", "import gc\n", "import evaluate\n", "metric = evaluate.load(\"wer\")\n", "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n", "normalizer = BasicTextNormalizer()\n", "eval_dataloader = DataLoader(clean_data['test'], batch_size=16, collate_fn=data_collator)\n", "for step, batch in enumerate(tqdm(eval_dataloader)):\n", " with torch.cuda.amp.autocast():\n", " with torch.no_grad():\n", " final = []\n", " labels = batch[\"transcription\"]\n", " for audio in batch[\"audio\"]:\n", " \n", " segments, _ = model.transcribe(audio, beam_size=1, language='vi',temperature=0)\n", " out = [out.text for out in segments]\n", " pred = ''.join(out)\n", " norm_pred = normalizer(pred)\n", " final.append(norm_pred)\n", " cleaned_text_list = [re.sub(r'\\s+', ' ', text.strip()) for text in final]\n", " #print(cleaned_text_list)\n", " #print(labels)\n", " metric.add_batch(\n", " predictions=cleaned_text_list,\n", " references=labels,\n", " )\n", " del labels, batch, final\n", " gc.collect()\n", "wer_quantz = 100 * metric.compute()\n", "print(f\"{wer_quantz=}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "normalized_wer=22.798881834138733\n", "lora_wer = 22.793825528564515" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "\n", "model_names = [\"whisper-medium\", \"whisper-lora\",\"quantz-whisper-lora\"]\n", "wer_values = [normalized_wer,lora_wer,wer_quantz]\n", "\n", "\n", "bar_colors = ['blue', 'green',\"red\"]\n", "# Create a bar chart\n", "plt.bar(model_names, wer_values,color=bar_colors)\n", "plt.xlabel(\"Vin100h\")\n", "plt.ylabel(\"WER (%)\")\n", "plt.title(\"Word Error Rate (WER) of Models\")\n", "plt.ylim(0, 25) # Set y-axis range to 0-100\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The model 'PeftModel' is not supported for . Supported models are ['SpeechEncoderDecoderModel', 'Speech2TextForConditionalGeneration', 'SpeechT5ForSpeechToText', 'WhisperForConditionalGeneration', 'Data2VecAudioForCTC', 'HubertForCTC', 'MCTCTForCTC', 'SEWForCTC', 'SEWDForCTC', 'UniSpeechForCTC', 'UniSpeechSatForCTC', 'Wav2Vec2ForCTC', 'Wav2Vec2ConformerForCTC', 'WavLMForCTC'].\n" ] } ], "source": [ "import torch\n", "from transformers import (\n", " AutomaticSpeechRecognitionPipeline,\n", " WhisperForConditionalGeneration,\n", " WhisperTokenizer,\n", " WhisperProcessor,\n", ")\n", "from peft import PeftModel, PeftConfig\n", "\n", "\n", "peft_model_id = \"DuyTa/vi-whisper-medium-Lora\"\n", "language = \"Vietnamese\"\n", "task = \"transcribe\"\n", "\n", "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", "model = WhisperForConditionalGeneration.from_pretrained(\n", " peft_config.base_model_name_or_path\n", ")\n", "peft_model = PeftModel.from_pretrained(model, peft_model_id)\n", "peft_model.to(\"cuda:0\").half()\n", "\n", "processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)\n", "\n", "pipe = AutomaticSpeechRecognitionPipeline(model=peft_model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, batch_size=8, torch_dtype=torch.float16, device=\"cuda:0\")\n", "\n", "def transcribe(audio, return_timestamps=False):\n", " text = pipe(audio, chunk_length_s=30, return_timestamps=return_timestamps, generate_kwargs={\"language\": language, \"task\": task},)[\"text\"]\n", " return text" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import pipeline\n", "from datasets import load_dataset\n", "\n", "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "pipe = pipeline(\n", " \"automatic-speech-recognition\",\n", " model=\"openai/whisper-medium\",\n", " chunk_length_s=30,\n", " device=device,\n", ")\n", "\n", "def transcribe_hf(audio, return_timestamps=False):\n", " text = pipe(audio, chunk_length_s=30, return_timestamps=return_timestamps, generate_kwargs={\"language\": language, \"task\": task},max_new_tokens =448)[\"text\"]\n", " return text\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def transcribe(audio, return_timestamps=False):\n", " text = pipe(audio, chunk_length_s=30, return_timestamps=return_timestamps, generate_kwargs={\"language\": language, \"task\": task}, max_new_tokens =448)[\"text\"]\n", " return text" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "faster-whisper: 0.643276 seconds\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "lora-whisper: 2.030503 seconds\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/generation/utils.py:1396: UserWarning: Using the model-agnostic default `max_length` (=448) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", " warnings.warn(\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/generation/utils.py:1396: UserWarning: Using the model-agnostic default `max_length` (=448) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", " warnings.warn(\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/generation/utils.py:1396: UserWarning: Using the model-agnostic default `max_length` (=448) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", " warnings.warn(\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/pipelines/base.py:1090: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", " warnings.warn(\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libncursesw.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "ffmpeg: /home/tesla/miniconda3/lib/libtinfo.so.6: no version information available (required by /lib/x86_64-linux-gnu/libcaca.so.0)\n", "/home/tesla/miniconda3/envs/DUY/lib/python3.9/site-packages/transformers/generation/utils.py:1396: UserWarning: Using the model-agnostic default `max_length` (=448) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "hf-whisper: 1.959464 seconds\n" ] } ], "source": [ "import os\n", "import time\n", "\n", "\n", "import whisper\n", "from whisper_quant import WhisperModel\n", "\n", "#SSopenai_whisper_model = whisper.load_model(\"medium\", device=\"cuda\")\n", "faster_whisper_model = WhisperModel(\"medium\", device=\"cuda\", compute_type = \"float16\")\n", "\n", "input_file = \"audio.wav\"\n", "samples = 3\n", "\n", "\n", "# def run_openai_whisper():\n", "# result = openai_whisper_model.transcribe(input_file, beam_size=1)\n", "\n", "\n", "def run_lora_whisper():\n", " text = transcribe(input_file)\n", "\n", "def run_hf_whisper():\n", " text = transcribe_hf(input_file)\n", "\n", "def run_faster_whisper():\n", " segments, _ = faster_whisper_model.transcribe(input_file, beam_size=1, best_of=1)\n", " segments = list(segments)\n", "\n", "\n", "def measure_execution_time(name, func, samples=3):\n", " func()\n", "\n", " start = time.time()\n", " for _ in range(samples):\n", " func()\n", " end = time.time()\n", "\n", " print(\"%s: %f seconds\" % (name, (end - start) / samples))\n", "\n", "\n", "# measure_execution_time(\"openai-whisper\", run_openai_whisper)\n", "measure_execution_time(\"faster-whisper\", run_faster_whisper)\n", "measure_execution_time(\"lora-whisper\", run_lora_whisper)\n", "measure_execution_time(\"hf-whisper\", run_hf_whisper)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# MITI quantized" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "420it [00:02, 153.21it/s]\n" ] } ], "source": [ "import logging\n", "import numpy as np\n", "import datasets\n", "from datasets import DatasetDict, load_dataset, concatenate_datasets\n", "from tqdm import tqdm\n", "from transformers import (\n", " AutoConfig,\n", " AutoFeatureExtractor,\n", " AutoModelForSpeechSeq2Seq,\n", " AutoTokenizer,\n", " set_seed,\n", ")\n", "from transformers.utils.versions import require_version\n", "from transformers.utils import check_min_version\n", "from tqdm import tqdm\n", "\n", "\n", "\n", "\n", "logger = logging.getLogger(__name__)\n", "from datasets import Dataset, DatasetDict\n", "import torchaudio\n", "from torchaudio import transforms as at\n", "import pandas as pd\n", "import torch\n", "from pathlib import Path\n", "\n", "def main():\n", " set_seed(42)\n", "\n", "\n", " max_input_length = 30.0 * 16000\n", " min_input_length = 0.0 * 16000\n", " audio_column_name = \"audio\"\n", " text_column_name = \"text\"\n", "\n", "\n", "\n", "\n", "\n", " def load_wave(wave_path, sample_rate:int=16000) -> np.ndarray:\n", " waveform, sr = torchaudio.load(wave_path, normalize=True)\n", " if sample_rate != sr:\n", " waveform = at.Resample(sr, sample_rate)(waveform)\n", " return np.asarray(waveform)\n", " \n", "\n", " def get_list_files_MITI(phase, sample_rate=16000, audio_max_sample_length=480000):\n", " text_list = []\n", " path_list = []\n", " if phase == 'train':\n", " csv_file = 'MITI_train.csv'\n", " else:\n", " csv_file = 'MITI_test.csv'\n", " df = pd.read_csv(csv_file)\n", " for index, row in tqdm(df.iterrows()):\n", " path = row['path']\n", " new_path = Path(row['path'])\n", " audio_id = index\n", " text = row['sentence']\n", " if new_path.exists():\n", " audio = load_wave(new_path, sample_rate=sample_rate)[0]\n", " if len(audio) > audio_max_sample_length or len(audio) < 0:\n", " print('skip file:', new_path,'with len audio', len(audio))\n", " del new_path\n", " continue\n", " text_list.append(text)\n", " path_list.append(path) \n", "\n", " return path_list, text_list\n", "\n", "\n", "\n", "\n", " # Get the testing dataset\n", " test_audio, test_text = get_list_files_MITI(phase='test')\n", " #print(test_audio[0])\n", " test_dataset = Dataset.from_dict({\"audio\": test_audio, \"text\": test_text})\n", " MITI = DatasetDict({\"test\": test_dataset})\n", " #print(clean_data)\n", "\n", "\n", " return MITI, test_dataset\n", "\n", "\n", "if __name__ == \"__main__\":\n", " clean_data, test_dataset = main()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from dataclasses import dataclass\n", "\n", "@dataclass\n", "class DataCollatorSpeechSeq2SeqWithPadding:\n", "\n", " def __call__(self, features):\n", " batch = {\n", " \"audio\": [feature[\"audio\"] for feature in features],\n", " \"transcription\": [feature[\"text\"] for feature in features]\n", " }\n", " return batch\n", "data_collator = DataCollatorSpeechSeq2SeqWithPadding()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " test: Dataset({\n", " features: ['audio', 'text'],\n", " num_rows: 420\n", " })\n", "})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean_data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from whisper_quant import WhisperModel\n", "\n", "model_size = \"medium\"\n", "\n", "# Run on GPU with FP16\n", "model = WhisperModel(model_size, device=\"cuda\", compute_type=\"float16\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/27 [00:00