marinone94
committed on
Merge branch 'main' of https://huggingface.co/marinone94/whisper-tiny-sv into main
Browse files
run_speech_recognition_seq2seq_streaming.py
CHANGED
@@ -838,9 +838,10 @@ def main():
|
|
838 |
kwargs["dataset_tags"] = dataset_names
|
839 |
if data_args.dataset_train_config_name is not None:
|
840 |
dataset_config_names = list(data_args.dataset_train_config_name.split(","))
|
841 |
-
|
842 |
else:
|
843 |
-
|
|
|
844 |
# if "common_voice" in data_args.dataset_name:
|
845 |
# kwargs["language"] = data_args.dataset_config_name[:2]
|
846 |
if data_args.language_train is not None:
|
|
|
838 |
kwargs["dataset_tags"] = dataset_names
|
839 |
if data_args.dataset_train_config_name is not None:
|
840 |
dataset_config_names = list(data_args.dataset_train_config_name.split(","))
|
841 |
+
dataset_config_names_list = [f"{ds_name} {ds_cfg_name}" for ds_name, ds_cfg_name in zip(dataset_names, dataset_config_names)]
|
842 |
else:
|
843 |
+
dataset_config_names_list = dataset_names
|
844 |
+
kwargs["dataset"] = "\n".join(dataset_config_names_list)
|
845 |
# if "common_voice" in data_args.dataset_name:
|
846 |
# kwargs["language"] = data_args.dataset_config_name[:2]
|
847 |
if data_args.language_train is not None:
|
test_run_nordic.sh
CHANGED
@@ -2,11 +2,11 @@ python $1run_speech_recognition_seq2seq_streaming.py \
|
|
2 |
--model_name_or_path="openai/whisper-tiny" \
|
3 |
--dataset_train_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,NbAiLab/NST,NbAiLab/NPSC,google/fleurs,google/fleurs,google/fleurs" \
|
4 |
--dataset_train_config_name="sv-SE,da,nn-NO,nst,no-distant,16K_mp3_nynorsk,sv_se,da_dk,nb_no" \
|
5 |
-
--language_train="
|
6 |
--train_split_name="train+validation,train+validation,train+validation,train,train+test,train+validation,train+validation,train+validation,train+validation" \
|
7 |
--dataset_eval_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
|
8 |
--dataset_eval_config_name="sv-SE,da,nn-NO" \
|
9 |
-
--language_eval="
|
10 |
--eval_split_name="test" \
|
11 |
--model_index_name="Whisper Tiny Nordic" \
|
12 |
--max_train_samples="64" \
|
|
|
2 |
--model_name_or_path="openai/whisper-tiny" \
|
3 |
--dataset_train_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,babelbox/babelbox_voice,NbAiLab/NST,NbAiLab/NPSC,google/fleurs,google/fleurs,google/fleurs" \
|
4 |
--dataset_train_config_name="sv-SE,da,nn-NO,nst,no-distant,16K_mp3_nynorsk,sv_se,da_dk,nb_no" \
|
5 |
+
--language_train="sv,da,no,sv,no,no,sv,da,no" \
|
6 |
--train_split_name="train+validation,train+validation,train+validation,train,train+test,train+validation,train+validation,train+validation,train+validation" \
|
7 |
--dataset_eval_name="mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0,mozilla-foundation/common_voice_11_0" \
|
8 |
--dataset_eval_config_name="sv-SE,da,nn-NO" \
|
9 |
+
--language_eval="sv,da,no" \
|
10 |
--eval_split_name="test" \
|
11 |
--model_index_name="Whisper Tiny Nordic" \
|
12 |
--max_train_samples="64" \
|