mariagrandury committed on
Commit
062f3fc
1 Parent(s): 8bb6457

Update training script

Browse files
Files changed (2) hide show
  1. run_wav2vec2_pretrain_flax.py +3 -0
  2. train.sh +6 -7
run_wav2vec2_pretrain_flax.py CHANGED
@@ -174,6 +174,7 @@ class FlaxDataCollatorForWav2Vec2Pretraining:
174
 
175
  batch_size = batch["input_values"].shape[0]
176
 
 
177
  if batch["attention_mask"] is not None:
178
  output_lengths = self.model._get_feat_extract_output_lengths(batch["attention_mask"].sum(-1))
179
  attention_mask = np.zeros((batch_size, mask_indices_seq_length), dtype=np.int8)
@@ -196,6 +197,7 @@ class FlaxDataCollatorForWav2Vec2Pretraining:
196
  batch["sampled_negative_indices"] = _sample_negative_indices(
197
  (batch["mask_time_indices"].shape + (self.model.config.proj_codevector_dim,)),
198
  self.model.config.num_negatives,
 
199
  )
200
 
201
  return batch
@@ -342,6 +344,7 @@ def main():
342
  def normalize(batch):
343
  return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate)
344
 
 
345
  # normalize and transform to `BatchFeatures`
346
  vectorized_datasets = vectorized_datasets.map(
347
  normalize,
174
 
175
  batch_size = batch["input_values"].shape[0]
176
 
177
+ attention_mask = None
178
  if batch["attention_mask"] is not None:
179
  output_lengths = self.model._get_feat_extract_output_lengths(batch["attention_mask"].sum(-1))
180
  attention_mask = np.zeros((batch_size, mask_indices_seq_length), dtype=np.int8)
197
  batch["sampled_negative_indices"] = _sample_negative_indices(
198
  (batch["mask_time_indices"].shape + (self.model.config.proj_codevector_dim,)),
199
  self.model.config.num_negatives,
200
+ attention_mask=attention_mask,
201
  )
202
 
203
  return batch
344
  def normalize(batch):
345
  return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate)
346
 
347
+ batch_size = 64
348
  # normalize and transform to `BatchFeatures`
349
  vectorized_datasets = vectorized_datasets.map(
350
  normalize,
train.sh CHANGED
@@ -1,22 +1,21 @@
1
  #!/usr/bin/env bash
2
- ./preprocess_dataset.py \
3
- --output_dir="./output" \
4
  --num_train_epochs="5" \
5
- --per_device_train_batch_size="32" \
6
- --per_device_eval_batch_size="32" \
7
  --learning_rate="5e-4" \
8
  --weight_decay="0.01" \
9
- --warmup_steps="2000" \
10
  --model_name_or_path="./" \
11
  --dataset_name="common_voice" \
12
  --dataset_config_name="es" \
13
- --preprocessing_num_workers="64" \
14
  --max_duration_in_seconds="10.0" \
15
  --adam_beta1="0.9" \
16
  --adam_beta2="0.98" \
17
  --pad_to_multiple_of="16384" \
18
  --validation_split_percentage="5" \
19
  --speech_file_column="path" \
20
- --dtype="bfloat16" \
21
  --cache_dir="./data_cache" \
22
  --push_to_hub
1
  #!/usr/bin/env bash
2
+ ./run_wav2vec2_pretrain_flax.py \
3
+ --output_dir="./wav2vec2-spanish" \
4
  --num_train_epochs="5" \
5
+ --per_device_train_batch_size="16" \
6
+ --per_device_eval_batch_size="16" \
7
  --learning_rate="5e-4" \
8
  --weight_decay="0.01" \
9
+ --warmup_steps="1000" \
10
  --model_name_or_path="./" \
11
  --dataset_name="common_voice" \
12
  --dataset_config_name="es" \
13
+ --preprocessing_num_workers="32" \
14
  --max_duration_in_seconds="10.0" \
15
  --adam_beta1="0.9" \
16
  --adam_beta2="0.98" \
17
  --pad_to_multiple_of="16384" \
18
  --validation_split_percentage="5" \
19
  --speech_file_column="path" \
 
20
  --cache_dir="./data_cache" \
21
  --push_to_hub