freddy committed on
Commit
5ef4794
1 Parent(s): 24973b0
Files changed (2) hide show
  1. run.sh +6 -5
  2. run_speech_recognition_ctc.py +7 -6
run.sh CHANGED
@@ -1,15 +1,15 @@
1
- WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py \
2
  --dataset_name="NbAiLab/NPSC" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
5
  --dataset_config_name="16K_mp3" \
6
  --output_dir="./" \
7
  --overwrite_output_dir \
8
- --num_train_epochs="15" \
9
  --per_device_train_batch_size="16" \
10
  --per_device_eval_batch_size="16" \
11
  --gradient_accumulation_steps="2" \
12
- --learning_rate="1e-4" \
13
  --warmup_steps="2000" \
14
  --length_column_name="input_length" \
15
  --evaluation_strategy="steps" \
@@ -29,7 +29,7 @@ WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py
29
  --mask_feature_prob="0.25" \
30
  --mask_feature_length="64" \
31
  --gradient_checkpointing \
32
- --min_duration_in_seconds="0.5" \
33
  --max_duration_in_seconds="30.0" \
34
  --use_auth_token \
35
  --seed="42" \
@@ -37,4 +37,5 @@ WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py
37
  --group_by_length \
38
  --do_train --do_eval \
39
  --push_to_hub \
40
- --preprocessing_num_workers="32"
 
 
1
+ python run_speech_recognition_ctc.py \
2
  --dataset_name="NbAiLab/NPSC" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
5
  --dataset_config_name="16K_mp3" \
6
  --output_dir="./" \
7
  --overwrite_output_dir \
8
+ --num_train_epochs="40" \
9
  --per_device_train_batch_size="16" \
10
  --per_device_eval_batch_size="16" \
11
  --gradient_accumulation_steps="2" \
12
+ --learning_rate="7.5e-5" \
13
  --warmup_steps="2000" \
14
  --length_column_name="input_length" \
15
  --evaluation_strategy="steps" \
 
29
  --mask_feature_prob="0.25" \
30
  --mask_feature_length="64" \
31
  --gradient_checkpointing \
32
+ --min_duration_in_seconds="0.8" \
33
  --max_duration_in_seconds="30.0" \
34
  --use_auth_token \
35
  --seed="42" \
 
37
  --group_by_length \
38
  --do_train --do_eval \
39
  --push_to_hub \
40
+ --preprocessing_num_workers="32" \
41
+ --ctc_zero_infinity=True
run_speech_recognition_ctc.py CHANGED
@@ -409,11 +409,11 @@ def main():
409
  and "9" not in entry["text"]
410
  )
411
 
412
- def filter_inaudible(entry):
413
- return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
414
-
415
  def filter_nynorsk(entry):
416
- return re.search("nb-no", entry["sentence_language_code"], flags=re.IGNORECASE)
417
 
418
  def filter_tooshort(entry):
419
  #print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
@@ -433,6 +433,7 @@ def main():
433
  batch["text"] = re.sub('<ee>', 'eee', batch["text"])
434
  batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
435
  batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
 
436
  # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
437
  if "<" in batch["text"]:
438
  raise ValueError(batch["text"])
@@ -448,7 +449,7 @@ def main():
448
  split=data_args.train_split_name,
449
  use_auth_token=data_args.use_auth_token,
450
  ).shuffle()
451
- raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
452
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
453
 
454
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -475,7 +476,7 @@ def main():
475
  split=data_args.eval_split_name,
476
  use_auth_token=data_args.use_auth_token,
477
  ).shuffle()
478
- raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_inaudible).filter(filter_nynorsk).filter(filter_tooshort)
479
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
480
 
481
  if data_args.max_eval_samples is not None:
 
409
  and "9" not in entry["text"]
410
  )
411
 
412
+ #def filter_inaudible(entry):
413
+ # return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
414
+ #
415
  def filter_nynorsk(entry):
416
+ return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
417
 
418
  def filter_tooshort(entry):
419
  #print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
 
433
  batch["text"] = re.sub('<ee>', 'eee', batch["text"])
434
  batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
435
  batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
436
+ batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
437
  # batch["text"] = re.sub('<inaudible>', '?', batch["text"])
438
  if "<" in batch["text"]:
439
  raise ValueError(batch["text"])
 
449
  split=data_args.train_split_name,
450
  use_auth_token=data_args.use_auth_token,
451
  ).shuffle()
452
+ raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
453
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
454
 
455
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
 
476
  split=data_args.eval_split_name,
477
  use_auth_token=data_args.use_auth_token,
478
  ).shuffle()
479
+ raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
480
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
481
 
482
  if data_args.max_eval_samples is not None: