#!/usr/bin/env bash
# Pseudo-label LibriSpeech and People's Speech with a Whisper checkpoint in
# streaming mode, logging runs to Weights & Biases and pushing transcriptions
# to the Hub. Requires: run_pseudo_labelling.py in the working directory.
set -euo pipefail

# Model checkpoint used for pseudo-labelling.
# NOTE(review): the wandb run names below say "large-v2" while this is
# large-v3 — confirm which label is intended.
readonly MODEL_NAME="openai/whisper-large-v3"
readonly CACHE_DIR="/home/sanchitgandhi/.cache"      # HF datasets + compilation cache
readonly OUTPUT_DIR="./transcriptions-streaming"     # where transcriptions are written
readonly WANDB_DIR="/home/sanchitgandhi/.cache"
readonly WANDB_PROJECT="distil-whisper-label"
readonly BATCH_SIZE=16          # per-device eval batch size
readonly NUM_BEAMS=1            # 1 => greedy decoding
readonly MAX_LABEL_LENGTH=256   # max generated label length in tokens
readonly LOGGING_STEPS=500
readonly NUM_WORKERS=64         # dataloader worker processes
readonly RETURN_TIMESTAMPS=False
readonly DECODE_TOKEN_IDS=False
# Pass 1: pseudo-label LibriSpeech (train.other.500 plus all dev/test splits).
# All expansions are quoted (ShellCheck SC2086) so paths with spaces or empty
# values cannot silently split into multiple arguments.
python run_pseudo_labelling.py \
  --model_name_or_path "$MODEL_NAME" \
  --dataset_name "distil-whisper/librispeech_asr" \
  --dataset_config_name "all" \
  --data_split_name "train.other.500+validation.clean+validation.other+test.clean+test.other" \
  --wandb_name "whisper-large-v2-librispeech_asr-token-ids" \
  --cache_dir "$CACHE_DIR" \
  --dataset_cache_dir "$CACHE_DIR" \
  --output_dir "$OUTPUT_DIR" \
  --wandb_dir "$WANDB_DIR" \
  --wandb_project "$WANDB_PROJECT" \
  --per_device_eval_batch_size "$BATCH_SIZE" \
  --generation_num_beams "$NUM_BEAMS" \
  --max_label_length "$MAX_LABEL_LENGTH" \
  --logging_steps "$LOGGING_STEPS" \
  --dataloader_num_workers "$NUM_WORKERS" \
  --dtype "bfloat16" \
  --report_to "wandb" \
  --streaming True \
  --push_to_hub \
  --return_timestamps "$RETURN_TIMESTAMPS" \
  --compilation_cache "$CACHE_DIR" \
  --decode_token_ids "$DECODE_TOKEN_IDS"
# Pass 2: pseudo-label People's Speech (clean config, all splits), with the
# same decoding settings as the LibriSpeech pass. All expansions are quoted
# (ShellCheck SC2086) to prevent word-splitting on empty/space-containing values.
python run_pseudo_labelling.py \
  --model_name_or_path "$MODEL_NAME" \
  --dataset_name "distil-whisper/peoples_speech-clean" \
  --dataset_config_name "clean" \
  --data_split_name "train+validation+test" \
  --wandb_name "whisper-large-v2-peoples_speech-clean-token-ids" \
  --cache_dir "$CACHE_DIR" \
  --dataset_cache_dir "$CACHE_DIR" \
  --output_dir "$OUTPUT_DIR" \
  --wandb_dir "$WANDB_DIR" \
  --wandb_project "$WANDB_PROJECT" \
  --per_device_eval_batch_size "$BATCH_SIZE" \
  --generation_num_beams "$NUM_BEAMS" \
  --max_label_length "$MAX_LABEL_LENGTH" \
  --logging_steps "$LOGGING_STEPS" \
  --dataloader_num_workers "$NUM_WORKERS" \
  --dtype "bfloat16" \
  --report_to "wandb" \
  --streaming True \
  --push_to_hub \
  --return_timestamps "$RETURN_TIMESTAMPS" \
  --compilation_cache "$CACHE_DIR" \
  --decode_token_ids "$DECODE_TOKEN_IDS"