marinone94 committed
Commit 4e5c598 (1 parent: fb5ea5a)

fix training script

Files changed (4):
  1. added_tokens.json +0 -1
  2. run.sh +2 -2
  3. run_speech_recognition_ctc.py +13 -5
  4. vocab.json +1 -1
added_tokens.json DELETED
@@ -1 +0,0 @@
-{"<s>": 33, "</s>": 34}
run.sh CHANGED
@@ -1,8 +1,8 @@
 python run_speech_recognition_ctc.py \
-    --dataset_name="mozilla-foundation/common_voice_7_0,marinone94/nst_sv" \
+    --dataset_name="mozilla-foundation/common_voice_8_0,marinone94/nst_sv" \
     --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
     --dataset_config_name="sv-SE,distant_channel" \
-    --train_split_name="None,train" \
+    --train_split_name="train+validation,train" \
     --eval_split_name="test,None" \
     --output_dir="./" \
     --overwrite_output_dir \
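The comma-separated values above suggest the script pairs dataset names, configs, and splits positionally, with the literal string "None" marking a dataset that contributes nothing to that split (so common_voice_8_0 now supplies train+validation for training, while nst_sv supplies its train split). A minimal sketch of that pairing, assuming the standard datasets API; the names and control flow here are illustrative, not the script's exact code:

from datasets import concatenate_datasets, load_dataset

names = "mozilla-foundation/common_voice_8_0,marinone94/nst_sv".split(",")
configs = "sv-SE,distant_channel".split(",")
train_splits = "train+validation,train".split(",")

parts = []
for name, config, split in zip(names, configs, train_splits):
    if split != "None":  # "None" means this dataset adds no data to the split
        # "train+validation" is the datasets split syntax for merging two splits
        parts.append(load_dataset(name, config, split=split, use_auth_token=True))

# Concatenation assumes a shared schema; the script itself notes it assumes
# all datasets follow the Common Voice format.
train_dataset = concatenate_datasets(parts)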
run_speech_recognition_ctc.py CHANGED
@@ -28,6 +28,7 @@ from typing import Dict, List, Optional, Union
 
 import datasets
 import numpy as np
+import pandas as pd
 import torch
 import wandb
 from datasets import DatasetDict, concatenate_datasets, load_dataset, load_metric
@@ -376,7 +377,7 @@ def main():
         wandb.login()
         training_args.report_to = ["wandb"]
         training_args.run_name = run_name
-        wandb.init()
+        # wandb.init()
     except:
         pass
 
@@ -480,6 +481,11 @@ def main():
         other_columns_train = [col for col in raw_datasets["train"].column_names if col not in min_columns_train]
         raw_datasets["train"].remove_columns(other_columns_train)
 
+        # pd_train_head = raw_datasets["train"].select(range(10)).to_pandas()
+        # pd_train_tail = raw_datasets["train"].select(range(raw_datasets["train"].num_rows-10, raw_datasets["train"].num_rows)).to_pandas()
+        # pd_train = pd.concat([pd_train_head, pd_train_tail])
+        # print(pd_train["audio"])
+
     if training_args.do_eval:
         # Multiple datasets might need to be loaded from HF
         # It assumes they all follow the common voice format
@@ -520,6 +526,11 @@ def main():
         other_columns_eval = [col for col in raw_datasets["eval"].column_names if col not in min_columns_eval]
         raw_datasets["eval"].remove_columns(other_columns_eval)
 
+        # pd_eval_head = raw_datasets["eval"].select(range(10)).to_pandas()
+        # pd_eval_tail = raw_datasets["eval"].select(range(raw_datasets["eval"].num_rows-10, raw_datasets["eval"].num_rows)).to_pandas()
+        # pd_eval = pd.concat([pd_eval_head, pd_eval_tail])
+        # print(pd_eval["audio"])
+
     # 2. We remove some special characters from the datasets
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
@@ -755,15 +766,12 @@ def main():
     if data_args.dataset_seed is not None:
         vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
 
-    # Log sample of datasets
+    # TODO: Log sample of datasets in the right way (see wandb docs)
    pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
     pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
     # wandb.log({"train_sample": pd_train})
     # wandb.log({"eval_sample": pd_eval})
 
-    print(pd_train)
-    print(pd_eval)
-
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will mostly likely
     # be a timeout when running the script in distributed mode.
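Two notes on the wandb changes. Commenting out wandb.init() is consistent with letting the Trainer's WandbCallback initialize the run itself once training_args.report_to includes "wandb", so an explicit early init is redundant. For the new TODO about logging dataset samples "in the right way", wandb Tables accept a pandas DataFrame directly; a possible sketch (my suggestion, not the repo's code; the column filtering assumes the vectorized rows carry a long input_values array):

import wandb

# wandb.Table wraps a DataFrame and renders as an inspectable table in the UI;
# this would replace the commented-out wandb.log({...: pd_train}) calls.
# Long float arrays such as input_values are dropped first, since Tables are
# meant for small, human-readable samples.
cols = [c for c in pd_train.columns if c != "input_values"]
wandb.log({"train_sample": wandb.Table(dataframe=pd_train[cols])})
wandb.log({"eval_sample": wandb.Table(dataframe=pd_eval[cols])})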
vocab.json CHANGED
@@ -1 +1 @@
-{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ä": 27, "å": 28, "ô": 29, "ö": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}
+{"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "\u00e4": 27, "\u00e5": 28, "\u00f6": 29, "|": 0, "[UNK]": 30, "[PAD]": 31}