command:
  - python3
  - ${program}
  - --do_train
  - --use_scan
  - --gradient_checkpointing
  - --overwrite_output_dir
  - --predict_with_generate
  - --freeze_encoder
  - --streaming
  - --use_auth_token
  - --compilation_cache
  - ${args}
method: grid
metric:
  goal: minimize
  name: train/loss
parameters:
  model_name_or_path:
    value: distil-whisper/large-32-2
  teacher_model_name_or_path:
    value: openai/whisper-large-v2
  train_dataset_name:
    value: librispeech_asr
  train_dataset_config_name:
    value: all
  train_split_name:
    value: train.other.500
  train_dataset_samples:
    value: 100
  cache_dir:
    value: /fsx/sanchitgandhi/cache
  dataset_cache_dir:
    value: /fsx/sanchitgandhi/cache
  output_dir:
    value: ./
  per_device_train_batch_size:
    values:
      - 128
      - 256
      - 512
  precision:
    values:
      - "full_mixed"
      - "half_mixed"
  dtype:
    value: bfloat16
  do_eval:
    value: false
  learning_rate:
    value: 3e-4
  lr_scheduler_type:
    value: constant_with_warmup
  warmup_steps:
    value: 30
  max_steps:
    value: 30
  save_steps:
    value: 51 # don't save checkpoints during sweep
  dataloader_num_workers:
    value: 48
  logging_steps:
    value: 5
  wer_threshold:
    value: 100
program: run_distillation.py
project: distil-whisper-sweeps
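
# Usage sketch: this is a Weights & Biases grid sweep over per_device_train_batch_size and
# precision; the config filename below is assumed for illustration and is not part of this file.
#   wandb sweep batch_size_precision_sweep.yaml   # registers the sweep and prints a sweep ID
#   wandb agent <entity>/distil-whisper-sweeps/<sweep-id>   # launches run_distillation.py for each grid point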