#!/usr/bin/env bash
#
# Runs speech-recognition (CTC adapter) fine-tuning of an MMS/wav2vec2 base
# model, with single-node DeepSpeed-style environment variables.
#
# CHANGE THESE AS PER YOUR REQUIREMENTS

set -euo pipefail

# --- Language selection -------------------------------------------------
# LANG as it is referred in the dataset.
# NOTE(review): LANG shadows the POSIX locale variable. It is not exported
# here, so child processes are unaffected, but consider renaming to
# DATASET_LANG if locale-sensitive tools in this shell misbehave.
#LANG=te                   # 2 letter ISO code for the language
LANG=kn_in                 # 2 letter ISO code with locale (datasets like Google/Fleurs require this)
LANG_ISO_3=kan             # 3 letter ISO code for the language
LANGUAGE=Kannada           # Full language name as per Whisper convention

# --- Dataset selection --------------------------------------------------
# For Mozilla Commonvoice datasets, uncomment the following
#DATASET="mozilla-foundation/common_voice_16_0"
#TEXT_COLUMN="sentence"

# For Google Fleurs datasets, uncomment the following
DATASET="google/fleurs"
TEXT_COLUMN="transcription"

# Custom datasets
#DATASET="parambharat/kannada_asr_corpus"
#TEXT_COLUMN=${TEXT_COLUMN:-"sentence"}

#
# Main
#
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(dirname "${SCRIPT_PATH}")   # SCRIPT_PATH is already canonical

# --- Distributed (single-node) environment ------------------------------
# Port to use; override by exporting MASTER_PORT before invoking this script.
export MASTER_PORT="${MASTER_PORT:-29500}"
echo "Using master_port for deepspeech: ${MASTER_PORT}"
export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"

# --- Model configuration ------------------------------------------------
# Base model variant
MODEL=w2v2

# Model names and other stuff
#BASE_MODEL="facebook/mms-1b-all"
BASE_MODEL="facebook/mms-1b-fl102"

JUST_LANG=${LANG%%_*}   # strip locale suffix: "kn_in" -> "kn"
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}-GF"

OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"

# --- Training parameters you can tweak ----------------------------------
# Feel free to directly change any of the parameters below.
MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
EVAL_BATCH_SIZE=4
LEARNING_RATE="1e-3"
EVAL_STEPS="1000"
SAVE_STEPS="1000"

# Create dir (quoted; fails loudly under set -e if the path is not writable)
mkdir -p "${OUTDIR}"

# --overwrite_output_dir \
# If you want to resume from an existing checkpoint, include the following
# argument in the training command as well. Modify the checkpoint directory.
# --resume_from_checkpoint="${MY_MODEL}/checkpoint-400" \

echo "================ TRAINING: START ================"

# Ensure the pipeline's status reflects the trainer, not tee: without
# pipefail, `python ... | tee` always reports tee's (successful) status
# and a failed training run would go unnoticed.
set -o pipefail

# NOTE: --per_device_eval_batch_size was previously missing, leaving the
# configured EVAL_BATCH_SIZE unused (the trainer fell back to its default).
# The --chars_to_ignore values are quoted so '?' and '.' are never glob-
# expanded against files in the current directory.
if ! python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" \
    --dataset_name="${DATASET}" \
    --model_name_or_path="${BASE_MODEL}" \
    --dataset_config_name="${LANG}" \
    --target_language="${LANG_ISO_3}" \
    --output_dir="${OUTDIR}" \
    --num_train_epochs="${MAX_EPOCHS}" \
    --per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
    --per_device_eval_batch_size="${EVAL_BATCH_SIZE}" \
    --gradient_accumulation_steps="16" \
    --learning_rate="${LEARNING_RATE}" \
    --warmup_steps="100" \
    --evaluation_strategy="steps" \
    --text_column_name="${TEXT_COLUMN}" \
    --length_column_name="input_length" \
    --save_steps="${SAVE_STEPS}" \
    --eval_steps="${EVAL_STEPS}" \
    --save_total_limit="3" \
    --optim="adamw_bnb_8bit" \
    --hub_model_id "simpragma/${MY_MODEL}" \
    --gradient_checkpointing \
    --chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�' \
    --fp16 \
    --group_by_length \
    --do_train \
    --do_eval \
    --push_to_hub \
    --overwrite_output_dir \
    | tee "${OUTDIR}/${MY_MODEL}.log"; then
  echo "ERROR: training failed; see ${OUTDIR}/${MY_MODEL}.log" >&2
  exit 1
fi

# Copy the script to the output directory so that we can recreate the model
cp -- "${SCRIPT_PATH}" "${OUTDIR}/"

echo "================ TRAINING: DONE ================"
exit 0