pere committed on
Commit b3ebd4c
1 Parent(s): e9502fe
Files changed (4)
  1. requirements.txt +2 -2
  2. run.sh +4 -0
  3. run_whisper.py +12 -13
  4. xla_spawn.py +83 -0
requirements.txt CHANGED
@@ -101,8 +101,8 @@ tensorboard-plugin-wit==1.8.1
 threadpoolctl==3.1.0
 tokenizers==0.13.1
 tomli==2.0.1
-torch==1.12.1
-torchaudio==0.12.1
+torch>=1.12.1
+torchaudio>=0.12.1
 tqdm==4.64.1
 transformers @ git+https://github.com/huggingface/transformers@504db92e7da010070c36e185332420a1d52c12b2
 typing_extensions==4.4.0
run.sh ADDED
@@ -0,0 +1,4 @@
+
+python xla_spawn.py --num_cores=4 run_whisper.py
+
+
run_whisper.py CHANGED
@@ -88,23 +88,23 @@ def main():
     # Map the source and target columns
     # Whisper expects these to be "audio" and "sentence". Change if anything else in the dataset
     source = "audio"
-    target = "sentence"
+    target = "sentence_text"


     # Load a sample dataset
     speech_data = DatasetDict()

     # Examples
-    # speech_data["train"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="train", use_auth_token=True)
-    # speech_data["test"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="test", use_auth_token=True)
+    speech_data["train"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="train", use_auth_token=True)
+    speech_data["test"] = load_dataset("NbAiLab/NPSC", "16K_mp3_bokmaal", split="test", use_auth_token=True)
     # speech_data["train"] = load_dataset("NbAiLab/LIA_speech", split="train", use_auth_token=True)
     #speech_data["test"] = load_dataset("NbAiLab/LIA_speech", split="test", use_auth_token=True)

     # The smallest dataset I found
-    speech_data["train"] = load_dataset(
-        "mozilla-foundation/common_voice_11_0", "nn-NO", split="train", use_auth_token=True)
-    speech_data["test"] = load_dataset(
-        "mozilla-foundation/common_voice_11_0", "nn-NO", split="test", use_auth_token=True)
+    #speech_data["train"] = load_dataset(
+    #    "mozilla-foundation/common_voice_11_0", "nn-NO", split="train", use_auth_token=True)
+    #speech_data["test"] = load_dataset(
+    #    "mozilla-foundation/common_voice_11_0", "nn-NO", split="test", use_auth_token=True)


     # Rename columns
@@ -148,15 +148,13 @@ def main():

     # Training arguments
     training_args = Seq2SeqTrainingArguments(
-        output_dir="../whisper-test", # change to a repo name of your choice
-        # Use at least 16 is reasonable. This is just for the test on Ficino
-        per_device_train_batch_size=4,
+        output_dir="./first-whisper-test2", # change to a repo name of your choice
+        per_device_train_batch_size=64,
         gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
-        learning_rate=1e-5,
+        learning_rate=2e-5,
         warmup_steps=500,
-        max_steps=1000, # Changed from 4000
+        max_steps=5000, # Changed from 4000
         gradient_checkpointing=True,
-        fp16=True,
         group_by_length=True,
         evaluation_strategy="steps",
         per_device_eval_batch_size=8,
@@ -189,6 +187,7 @@ def main():

 def _mp_fn(index):
     # For xla_spawn (TPUs)
+    print("The XLA is initiated")
     main()

xla_spawn.py ADDED
@@ -0,0 +1,83 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A simple launcher script for TPU training
+
+Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py
+
+::
+    >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE
+               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
+               arguments of your training script)
+
+"""
+
+
+import importlib
+import sys
+from argparse import REMAINDER, ArgumentParser
+from pathlib import Path
+
+import torch_xla.distributed.xla_multiprocessing as xmp
+
+
+def parse_args():
+    """
+    Helper function parsing the command line options
+    @retval ArgumentParser
+    """
+    parser = ArgumentParser(
+        description=(
+            "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
+        )
+    )
+
+    # Optional arguments for the launch helper
+    parser.add_argument("--num_cores", type=int, default=1, help="Number of TPU cores to use. 1 or 8 on v3-8. 1 or 4 on v4-8")
+
+    # positional
+    parser.add_argument(
+        "training_script",
+        type=str,
+        help=(
+            "The full path to the single TPU training "
+            "program/script to be launched in parallel, "
+            "followed by all the arguments for the "
+            "training script"
+        ),
+    )
+
+    # rest from the training program
+    parser.add_argument("training_script_args", nargs=REMAINDER)
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    # Import training_script as a module.
+    script_fpath = Path(args.training_script)
+    sys.path.append(str(script_fpath.parent.resolve()))
+    mod_name = script_fpath.stem
+    mod = importlib.import_module(mod_name)
+
+    # Patch sys.argv
+    sys.argv = [args.training_script] + args.training_script_args + ["--tpu_num_cores", str(args.num_cores)]
+
+    xmp.spawn(mod._mp_fn, args=(), nprocs=args.num_cores)
+
+
+if __name__ == "__main__":
+    main()