First submit
- README.md +4 -0
- requirements.txt +116 -0
- run_whisper.py +187 -0
README.md CHANGED
@@ -1,3 +1,7 @@
 ---
 license: apache-2.0
 ---
+# Whisper Finetuning
+Whisper finetuning example script.
+
+
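
The script added in this commit loads Common Voice with use_auth_token=True and pushes checkpoints with push_to_hub=True, so a Hugging Face access token has to be configured before it can run. A minimal sketch, assuming the login() helper exported by huggingface_hub is available (the pinned 0.10.1 release may instead require running huggingface-cli login once in a terminal):

# Not part of the commit: store a Hub token so that dataset loading and
# checkpoint uploads in run_whisper.py can authenticate.
# The token value below is a placeholder.
from huggingface_hub import login

login(token="hf_xxx")
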
requirements.txt ADDED
@@ -0,0 +1,116 @@
absl-py==1.3.0
aiohttp==3.8.3
aiosignal==1.2.0
anyio==3.6.2
appdirs==1.4.4
async-timeout==4.0.2
attrs==22.1.0
audioread==3.0.0
autopep8==2.0.0
bcrypt==4.0.1
cachetools==5.2.0
certifi==2022.9.24
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
contourpy==1.0.6
cryptography==38.0.3
cycler==0.11.0
datasets==2.6.1
decorator==5.1.1
dill==0.3.5.1
evaluate==0.3.0
fastapi==0.86.0
ffmpy==0.3.0
filelock==3.8.0
fonttools==4.38.0
frozenlist==1.3.1
fsspec==2022.10.0
google-auth==2.14.0
google-auth-oauthlib==0.4.6
gradio==3.9
grpcio==1.50.0
h11==0.12.0
httpcore==0.15.0
httpx==0.23.0
huggingface-hub==0.10.1
idna==3.4
importlib-metadata==5.0.0
Jinja2==3.1.2
jiwer==2.5.1
joblib==1.2.0
kiwisolver==1.4.4
Levenshtein==0.20.2
librosa==0.9.2
linkify-it-py==1.0.3
llvmlite==0.39.1
Markdown==3.4.1
markdown-it-py==2.1.0
MarkupSafe==2.1.1
matplotlib==3.6.2
mdit-py-plugins==0.3.1
mdurl==0.1.2
multidict==6.0.2
multiprocess==0.70.13
numba==0.56.4
numpy==1.23.4
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
oauthlib==3.2.2
orjson==3.8.1
packaging==21.3
pandas==1.5.1
paramiko==2.12.0
Pillow==9.3.0
pooch==1.6.0
protobuf==3.19.6
pyarrow==10.0.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycodestyle==2.9.1
pycparser==2.21
pycryptodome==3.15.0
pydantic==1.10.2
pydub==0.25.1
PyNaCl==1.5.0
pyparsing==3.0.9
python-dateutil==2.8.2
python-multipart==0.0.5
pytz==2022.6
PyYAML==6.0
rapidfuzz==2.13.2
regex==2022.10.31
requests==2.28.1
requests-oauthlib==1.3.1
resampy==0.4.2
responses==0.18.0
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.1.3
scipy==1.9.3
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
soundfile==0.11.0
starlette==0.20.4
tensorboard==2.10.1
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.1.0
tokenizers==0.13.1
tomli==2.0.1
torch==1.12.1
torchaudio==0.12.1
tqdm==4.64.1
transformers @ git+https://github.com/huggingface/transformers@504db92e7da010070c36e185332420a1d52c12b2
typing_extensions==4.4.0
uc-micro-py==1.0.1
urllib3==1.26.12
uvicorn==0.19.0
websockets==10.4
Werkzeug==2.2.2
xxhash==3.1.0
yarl==1.8.1
zipp==3.10.0
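
Note that transformers is pinned to a specific git commit rather than a PyPI release. A quick, optional sanity check after installing the requirements, to confirm that the pinned builds are the ones actually imported (the expected version strings below are taken from requirements.txt):

# Not part of the commit: verify the pinned libraries after `pip install -r requirements.txt`.
import torch
import datasets
import transformers

print("torch:", torch.__version__)                  # expected 1.12.1
print("datasets:", datasets.__version__)            # expected 2.6.1
print("transformers:", transformers.__version__)    # development build from the pinned commit
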
run_whisper.py ADDED
@@ -0,0 +1,187 @@
import torch
from datasets import load_dataset, DatasetDict
from datasets import Audio

from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration

from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate


# Functions
# Define a data collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths
        # and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]}
                          for feature in features]
        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]}
                          for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(
            label_features, return_tensors="pt")

        # replace padding with -100 so padded positions are ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100)

        # if the bos token was appended in the previous tokenization step,
        # cut it here since it is appended again later anyway
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


# Metrics
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


# Prepare dataset
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16 kHz
    audio = batch["audio"]

    # compute log-Mel input features from the input audio array
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch


# Whisper training script

# Map the source and target columns
# Whisper expects these to be "audio" and "sentence". Change if they are named anything else in the dataset
source = "audio"
target = "sentence"


# Load a sample dataset
speech_data = DatasetDict()

# Examples
# speech_data["train"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="train", use_auth_token=True)
# speech_data["test"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="test", use_auth_token=True)
# speech_data["train"] = load_dataset("NbAiLab/LIA_speech", split="train", use_auth_token=True)
# speech_data["test"] = load_dataset("NbAiLab/LIA_speech", split="test", use_auth_token=True)

# The smallest dataset I found
speech_data["train"] = load_dataset(
    "mozilla-foundation/common_voice_11_0", "nn-NO", split="train", use_auth_token=True)
speech_data["test"] = load_dataset(
    "mozilla-foundation/common_voice_11_0", "nn-NO", split="test", use_auth_token=True)


# Rename columns
if "audio" not in speech_data.column_names["train"]:
    speech_data = speech_data.rename_column(source, "audio")

if "sentence" not in speech_data.column_names["train"]:
    speech_data = speech_data.rename_column(target, "sentence")

# Remove columns that are not needed - not really sure if this is necessary
remove_list = [i for i in speech_data.column_names["train"]
               if i not in ["audio", "sentence"]]

speech_data = speech_data.remove_columns(remove_list)

# Initialise
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", language="Norwegian", task="transcribe")
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="Norwegian", task="transcribe")
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Prepare data
speech_data = speech_data.cast_column("audio", Audio(sampling_rate=16000))
speech_data = speech_data.map(
    prepare_dataset, remove_columns=speech_data.column_names["train"], num_proc=1)

# Metrics
metric = evaluate.load("wer")

# Initialise a pretrained model
# use_cache must be set to False here because gradient checkpointing is enabled below
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-small", use_cache=False)

# Override generation arguments: no tokens are forced as decoder outputs (see
# https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)
# and no tokens are suppressed during generation (see
# https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-no-test",  # change to a repo name of your choice
    # A batch size of at least 16 is reasonable; 4 is just for the test run on Ficino
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,  # changed from 4000
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=speech_data["train"],
    eval_dataset=speech_data["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


# Start training
trainer.train()
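
run_whisper.py stops right after trainer.train(). A hedged sketch of a post-training smoke test that could follow in the same session, using the transformers ASR pipeline (the audio path is a placeholder, and saving the processor next to the model is an extra step the committed script does not perform):

# Not part of the committed script: transcribe one clip with the fine-tuned model.
from transformers import pipeline

trainer.save_model("./whisper-small-no-test")           # same directory as output_dir above
processor.save_pretrained("./whisper-small-no-test")    # so the folder loads as a complete pipeline target

asr = pipeline("automatic-speech-recognition", model="./whisper-small-no-test")
print(asr("example.wav"))  # placeholder path to a local audio clip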