#!/usr/bin/env bash
# Pseudo-label a set of speech datasets with Whisper large-v3 in streaming mode,
# logging progress to Weights & Biases and pushing the transcriptions to the Hub.

MODEL_NAME="openai/whisper-large-v3"
CACHE_DIR="/home/sanchitgandhi/.cache"
OUTPUT_DIR="./transcriptions-streaming"
WANDB_DIR="/home/sanchitgandhi/.cache"
WANDB_PROJECT="distil-whisper-label"
SPLITS="train+validation+test"
BATCH_SIZE=16
NUM_BEAMS=1
MAX_LABEL_LENGTH=256
LOGGING_STEPS=500
NUM_WORKERS=64
RETURN_TIMESTAMPS=False
DECODE_TOKEN_IDS=False

# Datasets to pseudo-label, index-aligned with their config names below.
DATASET_NAMES=(
  "distil-whisper/common_voice_13_0"
  "distil-whisper/voxpopuli"
  "distil-whisper/tedlium"
  "distil-whisper/ami-ihm"
  "distil-whisper/ami-sdm"
  "distil-whisper/spgispeech"
  "distil-whisper/gigaspeech-l"
)
CONFIGS=("en" "en" "release3" "ihm" "sdm" "L" "l")

# Run the pseudo-labelling script once per dataset/config pair.
for i in "${!DATASET_NAMES[@]}"; do
  python run_pseudo_labelling.py \
    --model_name_or_path "$MODEL_NAME" \
    --dataset_name "${DATASET_NAMES[i]}" \
    --dataset_config_name "${CONFIGS[i]}" \
    --data_split_name "$SPLITS" \
    --wandb_name "whisper-large-v3-${DATASET_NAMES[i]}-token-ids" \
    --cache_dir "$CACHE_DIR" \
    --dataset_cache_dir "$CACHE_DIR" \
    --output_dir "$OUTPUT_DIR" \
    --wandb_dir "$WANDB_DIR" \
    --wandb_project "$WANDB_PROJECT" \
    --per_device_eval_batch_size "$BATCH_SIZE" \
    --generation_num_beams "$NUM_BEAMS" \
    --max_label_length "$MAX_LABEL_LENGTH" \
    --logging_steps "$LOGGING_STEPS" \
    --dataloader_num_workers "$NUM_WORKERS" \
    --dtype "bfloat16" \
    --report_to "wandb" \
    --streaming True \
    --push_to_hub \
    --return_timestamps "$RETURN_TIMESTAMPS" \
    --compilation_cache "$CACHE_DIR" \
    --decode_token_ids "$DECODE_TOKEN_IDS"
done
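
# Usage sketch (the file name is hypothetical; assumes this script is saved
# alongside run_pseudo_labelling.py in the training directory, with CACHE_DIR
# and WANDB_DIR adjusted to local paths):
#   chmod +x run_pseudo_labelling.sh
#   ./run_pseudo_labelling.sh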