marinone94 committed on
Commit
044dff6
1 Parent(s): fcf680c

log df of train and test data

Browse files
Files changed (2) hide show
  1. run.sh +1 -1
  2. run_speech_recognition_ctc.py +6 -0
run.sh CHANGED
@@ -5,7 +5,7 @@ python run_speech_recognition_ctc.py \
5
  --train_split_name="train+validation,train" \
6
  --eval_split_name="test,None" \
7
  --output_dir="./" \
8
- --overwrite_output_dir \
9
  --num_train_epochs="3" \
10
  --per_device_train_batch_size="32" \
11
  --per_device_eval_batch_size="32" \
 
5
  --train_split_name="train+validation,train" \
6
  --eval_split_name="test,None" \
7
  --output_dir="./" \
8
+ --preprocessing_only \
9
  --num_train_epochs="3" \
10
  --per_device_train_batch_size="32" \
11
  --per_device_eval_batch_size="32" \
run_speech_recognition_ctc.py CHANGED
@@ -750,6 +750,12 @@ def main():
750
  # If dataset_seed is set, shuffle train
751
  if data_args.dataset_seed is not None:
752
  vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
 
 
 
 
 
 
753
 
754
  # for large datasets it is advised to run the preprocessing on a
755
  # single machine first with ``args.preprocessing_only`` since there will most likely
 
750
  # If dataset_seed is set, shuffle train
751
  if data_args.dataset_seed is not None:
752
  vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(seed=data_args.dataset_seed)
753
+
754
+ # Log sample of datasets
755
+ pd_train = vectorized_datasets["train"].select(range(10)).to_pandas()
756
+ pd_eval = vectorized_datasets["eval"].select(range(10)).to_pandas()
757
+ wandb.log({"train_sample": pd_train})
758
+ wandb.log({"eval_sample": pd_eval})
759
 
760
  # for large datasets it is advised to run the preprocessing on a
761
  # single machine first with ``args.preprocessing_only`` since there will most likely