patrickvonplaten committed
Commit e64919d
1 Parent(s): 109e077
.gitattributes CHANGED
File without changes
config.json CHANGED
File without changes
german-1st/events.out.tfevents.1626192156.instance-3.21320.3.v2 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:eb69ed2d3a263b3e1161b25ce2cb012898003f1f56e7d8b6114511f85cc161b8
- size 40
german-1st/{events.out.tfevents.1625643205.t1v-n-3abeb69a-w-0.838585.3.v2 → events.out.tfevents.1626262981.t1v-n-3abeb69a-w-0.12272.3.v2} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6160222312f88827d6178c3853ae198991de9b0a3cab6bdd6e9b257c77909892
+ oid sha256:73239178ccdb9423fbc79388ce2a835996b379734f7c7d160bf4ee2822d34e0a
  size 40
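
Note: these TensorBoard event files are tracked with Git LFS, so the hunks above show only the three-line pointer file (version, oid, size), not the binary log itself. The oid is simply the SHA-256 digest of the stored blob; the sketch below checks a local file against its pointer (the path is illustrative, not part of the commit).

import hashlib

# The oid recorded in a Git LFS pointer is the SHA-256 of the blob's raw bytes.
path = "german-1st/events.out.tfevents.1626262981.t1v-n-3abeb69a-w-0.12272.3.v2"  # illustrative local path
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(f"oid sha256:{digest}")  # should match the oid line in the pointer
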
preprocessor_config.json CHANGED
File without changes
requirements.txt CHANGED
File without changes
run_german.sh CHANGED
@@ -16,7 +16,6 @@
  --max_duration_in_seconds="10.0" \
  --adam_beta1="0.9" \
  --adam_beta2="0.98" \
- --dtype="bfloat16" \
- --cache_dir="./wav2vec2/" \
+ --adafactor \
  --pad_to_multiple_of="16384" \
  --push_to_hub
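
The launcher now passes a bare `--adafactor` switch (and drops `--dtype="bfloat16"` and `--cache_dir`), which the Python change below reads as `training_args.adafactor`. This is presumably the boolean `adafactor` field on 🤗 Transformers' TrainingArguments, parsed via HfArgumentParser; a minimal sketch of that flag-to-field plumbing under those assumptions:

from dataclasses import dataclass, field
from transformers import HfArgumentParser

# Sketch: a pared-down stand-in for the script's TrainingArguments;
# the real class has many more fields.
@dataclass
class TrainingArguments:
    adafactor: bool = field(default=False, metadata={"help": "Use Adafactor instead of AdamW."})

parser = HfArgumentParser(TrainingArguments)
# HfArgumentParser turns a bool field defaulting to False into a flag:
# passing --adafactor on the command line sets training_args.adafactor = True.
(training_args,) = parser.parse_args_into_dataclasses(["--adafactor"])
print(training_args.adafactor)  # True
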
run_wav2vec2_pretrain_flax.py CHANGED
@@ -275,7 +275,7 @@ def main():
  )

  # save vectorized dataset once
- vectorized_datasets = datasets.load_from_disk("/home/german-common-voice-processed/normalized")
+ vectorized_datasets = datasets.load_from_disk("/home/wav2vec2-experiments/datasets/german-common-voice-processed/normalized/")

  # pretraining is only supported for "newer" stable layer norm architecture
  # apply_spec_augment has to be True, mask_feature_prob has to be 0.0
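
The hard-coded path points at a Common Voice dataset that was vectorized once and reused across runs; the commit only updates its location on disk. A minimal sketch of that save-once/load-later pattern with 🤗 Datasets (the dataset config and the mapping step are illustrative placeholders for the script's real preprocessing):

import datasets

# One-off job: build the vectorized dataset and persist it to disk.
raw = datasets.load_dataset("common_voice", "de", split="train")
vectorized = raw.map(lambda batch: batch)  # placeholder for the real feature-extraction/normalization map
vectorized.save_to_disk("/home/wav2vec2-experiments/datasets/german-common-voice-processed/normalized/")

# Every later training run reloads the cached result instead of re-processing:
vectorized_datasets = datasets.load_from_disk(
    "/home/wav2vec2-experiments/datasets/german-common-voice-processed/normalized/"
)
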
@@ -353,17 +353,24 @@ def main():
  return traverse_util.unflatten_dict(flat_mask)

  # create adam optimizer
- adamw = optax.adamw(
-     learning_rate=linear_decay_lr_schedule_fn,
-     b1=training_args.adam_beta1,
-     b2=training_args.adam_beta2,
-     eps=training_args.adam_epsilon,
-     weight_decay=training_args.weight_decay,
-     mask=decay_mask_fn,
- )
+ if training_args.adafactor:
+     # We use the default parameters here to initialize adafactor,
+     # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+     optimizer = optax.adafactor(
+         learning_rate=linear_decay_lr_schedule_fn,
+     )
+ else:
+     optimizer = optax.adamw(
+         learning_rate=linear_decay_lr_schedule_fn,
+         b1=training_args.adam_beta1,
+         b2=training_args.adam_beta2,
+         eps=training_args.adam_epsilon,
+         weight_decay=training_args.weight_decay,
+         mask=decay_mask_fn,
+     )

  # Setup train state and define training hyper-parameters
- state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
+ state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
  num_negatives = model.config.num_negatives
  contrastive_logits_temperature = model.config.contrastive_logits_temperature
  num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
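
Either branch produces an optax GradientTransformation, so the TrainState creation only needs the renamed `tx=optimizer`. Adafactor keeps factored second-moment estimates, which cuts optimizer state memory substantially compared to AdamW. A standalone sketch of the same switch, runnable outside the training script (the flat learning rate, parameter shapes, and `use_adafactor` variable are illustrative stand-ins):

import jax.numpy as jnp
import optax

use_adafactor = True  # stands in for training_args.adafactor

if use_adafactor:
    # Default Adafactor settings, as in the commit; a constant rate
    # stands in for linear_decay_lr_schedule_fn.
    tx = optax.adafactor(learning_rate=1e-3)
else:
    tx = optax.adamw(learning_rate=1e-3, b1=0.9, b2=0.98, eps=1e-8, weight_decay=0.01)

params = {"w": jnp.ones((4, 4))}  # illustrative parameters
opt_state = tx.init(params)
grads = {"w": jnp.ones((4, 4))}   # illustrative gradients
updates, opt_state = tx.update(grads, opt_state, params)
params = optax.apply_updates(params, updates)
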
 