#!/usr/bin/env bash
#
# Runs run_pseudo_labelling.py once per dataset -- first
# "distil-whisper/librispeech_asr", then "distil-whisper/peoples_speech-clean" --
# with the checkpoint named by MODEL_NAME. Each run is launched with
# --streaming True, --report_to "wandb" and --push_to_hub.
#
# Every setting below may be overridden from the environment, e.g.
#   CACHE_DIR=/data/cache BATCH_SIZE=32 bash <this-script>
# When a variable is unset or empty the default shown here is used.
#
# Exit status: 0 if both runs succeed, otherwise the status of the last run
# that failed.

# -u: catch typos in variable names; pipefail: do not mask pipeline failures.
# -e is deliberately NOT set, so that a failure on the first dataset does not
# prevent the second dataset from being attempted.
set -uo pipefail

MODEL_NAME="${MODEL_NAME:-openai/whisper-large-v3}"
CACHE_DIR="${CACHE_DIR:-/home/sanchitgandhi/.cache}"
OUTPUT_DIR="${OUTPUT_DIR:-./transcriptions-streaming}"
WANDB_DIR="${WANDB_DIR:-/home/sanchitgandhi/.cache}"
WANDB_PROJECT="${WANDB_PROJECT:-distil-whisper-label}"
BATCH_SIZE="${BATCH_SIZE:-64}"
NUM_BEAMS="${NUM_BEAMS:-1}"
MAX_LABEL_LENGTH="${MAX_LABEL_LENGTH:-256}"
LOGGING_STEPS="${LOGGING_STEPS:-500}"
NUM_WORKERS="${NUM_WORKERS:-64}"
RETURN_TIMESTAMPS="${RETURN_TIMESTAMPS:-False}"

#######################################
# Launch one run_pseudo_labelling.py job for a single dataset.
# Globals (read):
#   MODEL_NAME, CACHE_DIR, OUTPUT_DIR, WANDB_DIR, WANDB_PROJECT, BATCH_SIZE,
#   NUM_BEAMS, MAX_LABEL_LENGTH, LOGGING_STEPS, NUM_WORKERS, RETURN_TIMESTAMPS
# Arguments:
#   $1 - value for --dataset_name
#   $2 - value for --dataset_config_name
#   $3 - value for --data_split_name
#   $4 - value for --wandb_name
# Returns:
#   The exit status of the python process.
#######################################
run_pseudo_labelling() {
  local dataset_name=$1
  local dataset_config_name=$2
  local data_split_name=$3
  local wandb_name=$4

  # Built as an array so every value reaches python as exactly one argument,
  # even if it contains spaces or glob characters.
  local -a args=(
    --model_name_or_path "$MODEL_NAME"
    --dataset_name "$dataset_name"
    --dataset_config_name "$dataset_config_name"
    --data_split_name "$data_split_name"
    --wandb_name "$wandb_name"
    --cache_dir "$CACHE_DIR"
    --dataset_cache_dir "$CACHE_DIR"
    --output_dir "$OUTPUT_DIR"
    --wandb_dir "$WANDB_DIR"
    --wandb_project "$WANDB_PROJECT"
    --per_device_eval_batch_size "$BATCH_SIZE"
    --generation_num_beams "$NUM_BEAMS"
    --max_label_length "$MAX_LABEL_LENGTH"
    --logging_steps "$LOGGING_STEPS"
    --dataloader_num_workers "$NUM_WORKERS"
    --dtype "bfloat16"
    --report_to "wandb"
    --streaming True
    --push_to_hub
    --return_timestamps "$RETURN_TIMESTAMPS"
    --compilation_cache "$CACHE_DIR"
  )

  python run_pseudo_labelling.py "${args[@]}"
}

#######################################
# Run pseudo-labelling on each dataset in turn.
# Returns:
#   0 if every run succeeded, otherwise the status of the last failed run.
#######################################
main() {
  local status=0

  # NOTE(review): both W&B run names say "whisper-large-v2" while the default
  # MODEL_NAME is whisper-large-v3. The names are kept unchanged here so that
  # existing run naming is preserved -- confirm whether they should be updated.
  run_pseudo_labelling \
    "distil-whisper/librispeech_asr" \
    "all" \
    "train.other.500+validation.clean+validation.other+test.clean+test.other" \
    "whisper-large-v2-librispeech_asr" \
    || status=$?

  run_pseudo_labelling \
    "distil-whisper/peoples_speech-clean" \
    "clean" \
    "train+validation+test" \
    "whisper-large-v2-peoples_speech-clean" \
    || status=$?

  return "$status"
}

main "$@"