marinone94 committed on
Commit
09dc80f
1 Parent(s): 044dff6

train only on nst

Browse files
Files changed (2) hide show
  1. run.sh +3 -3
  2. run_speech_recognition_ctc.py +10 -3
run.sh CHANGED
@@ -2,10 +2,10 @@ python run_speech_recognition_ctc.py \
2
  --dataset_name="mozilla-foundation/common_voice_7_0,marinone94/nst_sv" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --dataset_config_name="sv-SE,distant_channel" \
5
- --train_split_name="train+validation,train" \
6
  --eval_split_name="test,None" \
7
  --output_dir="./" \
8
- --preprocessing_only \
9
  --num_train_epochs="3" \
10
  --per_device_train_batch_size="32" \
11
  --per_device_eval_batch_size="32" \
@@ -19,7 +19,7 @@ python run_speech_recognition_ctc.py \
19
  --save_steps="100" \
20
  --text_column_name="sentence" \
21
  --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
22
- --logging_steps="10" \
23
  --dataset_seed="42" \
24
  --layerdrop="0.0" \
25
  --activation_dropout="0.1" \
2
  --dataset_name="mozilla-foundation/common_voice_7_0,marinone94/nst_sv" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --dataset_config_name="sv-SE,distant_channel" \
5
+ --train_split_name="None,train" \
6
  --eval_split_name="test,None" \
7
  --output_dir="./" \
8
+ --overwrite_output_dir \
9
  --num_train_epochs="3" \
10
  --per_device_train_batch_size="32" \
11
  --per_device_eval_batch_size="32" \
19
  --save_steps="100" \
20
  --text_column_name="sentence" \
21
  --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
22
+ --logging_steps="20" \
23
  --dataset_seed="42" \
24
  --layerdrop="0.0" \
25
  --activation_dropout="0.1" \
run_speech_recognition_ctc.py CHANGED
@@ -371,10 +371,12 @@ def main():
371
  # TODO: Replace with check of wandb env vars
372
  try:
373
  repo_name = os.getcwd().split("/")[-1]
 
374
  os.environ["WANDB_PROJECT"] = repo_name
375
  wandb.login()
376
  training_args.report_to = ["wandb"]
377
- training_args.run_name = f"{datetime.datetime.utcnow()}".replace(" ", "T")
 
378
  except:
379
  pass
380
 
@@ -544,6 +546,7 @@ def main():
544
  .replace("î", "i") \
545
  .replace("ü", "u") \
546
  .replace("ÿ", "y") \
 
547
  .replace("\\", "") \
548
  .replace("/", "") \
549
  .replace("|", "") \
@@ -557,6 +560,7 @@ def main():
557
  .replace("î", "i") \
558
  .replace("ü", "u") \
559
  .replace("ÿ", "y") \
 
560
  .replace("\\", "") \
561
  .replace("/", "") \
562
  .replace("|", "") \
@@ -754,8 +758,11 @@ def main():
754
  # Log sample of datasets
755
  pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
756
  pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
757
- wandb.log({"train_sample": pd_train})
758
- wandb.log("eval_sample": pd_eval)
 
 
 
759
 
760
  # for large datasets it is advised to run the preprocessing on a
761
  # single machine first with ``args.preprocessing_only`` since there will mostly likely
371
  # TODO: Replace with check of wandb env vars
372
  try:
373
  repo_name = os.getcwd().split("/")[-1]
374
+ run_name = f"{datetime.datetime.utcnow()}".replace(" ", "T")
375
  os.environ["WANDB_PROJECT"] = repo_name
376
  wandb.login()
377
  training_args.report_to = ["wandb"]
378
+ training_args.run_name = run_name
379
+ wandb.init()
380
  except:
381
  pass
382
 
546
  .replace("î", "i") \
547
  .replace("ü", "u") \
548
  .replace("ÿ", "y") \
549
+ .replace("ô", "o") \
550
  .replace("\\", "") \
551
  .replace("/", "") \
552
  .replace("|", "") \
560
  .replace("î", "i") \
561
  .replace("ü", "u") \
562
  .replace("ÿ", "y") \
563
+ .replace("ô", "o") \
564
  .replace("\\", "") \
565
  .replace("/", "") \
566
  .replace("|", "") \
758
  # Log sample of datasets
759
  pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
760
  pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
761
+ # wandb.log({"train_sample": pd_train})
762
+ # wandb.log({"eval_sample": pd_eval})
763
+
764
+ print(pd_train)
765
+ print(pd_eval)
766
 
767
  # for large datasets it is advised to run the preprocessing on a
768
  # single machine first with ``args.preprocessing_only`` since there will mostly likely