Shiry committed on
Commit
0dd5737
1 Parent(s): 1586b29

Training in progress, step 150

Browse files
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2058472cc77971ab50e1ba117bbf18120c67a402a61522cdd51ffd6c354dc3e
3
  size 6173655480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ca9bec437c0cd91a8eeda2217dd1c4db24cd28a139cc75502b23adc9965093
3
  size 6173655480
run_speech_recognition_seq2seq_streaming.py CHANGED
@@ -50,6 +50,7 @@ from transformers.trainer_pt_utils import IterableDatasetShard
50
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
51
  from transformers.utils import check_min_version, send_example_telemetry
52
  from transformers.utils.versions import require_version
 
53
 
54
 
55
  # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -386,6 +387,26 @@ def main():
386
  f"{', '.join(raw_datasets_features)}."
387
  )
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  # 5. Load pretrained model, tokenizer, and feature extractor
390
  #
391
  # Distributed training:
 
50
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
51
  from transformers.utils import check_min_version, send_example_telemetry
52
  from transformers.utils.versions import require_version
53
+ from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
54
 
55
 
56
  # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 
387
  f"{', '.join(raw_datasets_features)}."
388
  )
389
 
390
+ augment_waveform = Compose([
391
+ AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.2),
392
+ TimeStretch(min_rate=0.8, max_rate=1.25, p=0.2, leave_length_unchanged=False),
393
+ PitchShift(min_semitones=-4, max_semitones=4, p=0.2)
394
+ ,])
395
+
396
+ def augment_dataset(batch):
397
+
398
+ audio = batch["audio"]["array"]
399
+ # apply augmentation
400
+ augmented_audio = augment_waveform(samples=audio, sample_rate=16000)
401
+
402
+ batch["audio"]["array"] = augmented_audio
403
+
404
+ return batch
405
+
406
+
407
+ # call augment dataset on the training set
408
+ raw_datasets["train"] = raw_datasets["train"].map(augment_dataset)
409
+
410
  # 5. Load pretrained model, tokenizer, and feature extractor
411
  #
412
  # Distributed training:
runs/Dec20_13-35-03_0393d32b0779/events.out.tfevents.1671536148.0393d32b0779.2738.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33f0be47432bebad940f83bf1336d1178f196cdd1cb3dfc136682151cf9c1071
3
- size 5503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08bcbacbcb00c4f7158b2e49a194b441dc14d855720e61e1f56bcb64ed52b1e5
3
+ size 6132
train.sh CHANGED
@@ -16,7 +16,7 @@ python -m torch.distributed.launch --nproc_per_node 2 run_speech_recognition_seq
16
  --per_device_eval_batch_size="16" \
17
  --logging_steps="25" \
18
  --learning_rate="1e-6" \
19
- --warmup_steps="10" \
20
  --evaluation_strategy="steps" \
21
  --eval_steps="50" \
22
  --save_strategy="steps" \
 
16
  --per_device_eval_batch_size="16" \
17
  --logging_steps="25" \
18
  --learning_rate="1e-6" \
19
+ --warmup_steps="40" \
20
  --evaluation_strategy="steps" \
21
  --eval_steps="50" \
22
  --save_strategy="steps" \