|
#!/usr/bin/env bash
#
# Fine-tune a wav2vec2/MMS CTC adapter for Malayalam on Common Voice 16.0.
#
# -u: fail on unset variables; pipefail: a pipeline reports the first failing
# stage. set -e is intentionally NOT enabled so the script's original
# "continue after non-critical failures" behavior is preserved.
set -uo pipefail

# Target language code. NOTE: assigning LANG also shadows the shell's locale
# variable; kept as-is because the dataset config name below reads ${LANG}.
LANG=ml
# ISO 639-3 code, used as the adapter's target_language.
LANG_ISO_3=mal
# Human-readable language name (informational only in this script).
LANGUAGE=Malayalam

# Hugging Face dataset to train on and the column holding transcriptions.
DATASET="mozilla-foundation/common_voice_16_0"
TEXT_COLUMN="sentence"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#######################################
# Map a Whisper-style model size to its recommended fine-tuning learning rate.
# Arguments: $1 - model size (tiny|base|small|medium|large|large-v2)
# Outputs:   learning rate string on stdout
# Exits:     1 on an unknown size (error text goes to stderr so it is not
#            captured when the function is called via $(...)).
#######################################
get_fine_tuning_lr() {
  local model_size=$1
  local lr

  case "$model_size" in
    tiny)     lr="3.75e-5" ;;
    base)     lr="2.5e-5" ;;
    small)    lr="1.25e-5" ;;
    medium)   lr="6.25e-6" ;;
    large)    lr="4.375e-6" ;;
    large-v2) lr="5e-6" ;;
    *)
      # Diagnostics belong on stderr, not in the captured stdout value.
      echo "Invalid model size" >&2
      exit 1
      ;;
  esac

  echo "$lr"
}
|
|
|
# Absolute path of this script and its directory (used to locate the
# sibling training script and to archive a copy of this file later).
SCRIPT_PATH=$(realpath "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")")

# Rendezvous settings for a single-node, single-process run.
# Honor a caller-provided MASTER_PORT; default to 29500 otherwise.
export MASTER_PORT="${MASTER_PORT:-29500}"
echo "Using master_port for deepspeed: ${MASTER_PORT}"

export MASTER_ADDR="localhost"
export RANK="0"
export LOCAL_RANK="0"
export WORLD_SIZE="1"
|
|
|
|
|
# Model family tag used in the output model name.
MODEL=w2v2

# Base checkpoint to fine-tune an adapter on.
BASE_MODEL="facebook/mms-1b-all"

# Strip any region suffix (e.g. "pt_BR" -> "pt") before building the name.
JUST_LANG=${LANG%%_*}
MY_MODEL="breeze-listen-${MODEL}-${JUST_LANG}"

OUTDIR="/cosmos/home/sp-operator/ai/training/models/simpragma/${MY_MODEL}"
echo "OUTDIR: ${OUTDIR}"

# Training hyperparameters.
MAX_EPOCHS=4
TRAIN_BATCH_SIZE=4
# Per-device eval batch size, passed as --per_device_eval_batch_size below.
EVAL_BATCH_SIZE=4
LEARNING_RATE="1e-3"

EVAL_STEPS="200"
SAVE_STEPS="200"

mkdir -p "${OUTDIR}"
|
|
|
|
|
|
|
|
|
|
|
|
|
echo "================ TRAINING: START ================"

# Launch CTC adapter fine-tuning; mirror all output into a log in OUTDIR.
# Glob-sensitive punctuation in --chars_to_ignore is quoted so the shell
# cannot expand '?' or '.' against files in the current directory.
python "${SCRIPT_DIR}/run_speech_recognition_ctc_adapter.py" \
    --dataset_name="${DATASET}" \
    --model_name_or_path="${BASE_MODEL}" \
    --dataset_config_name="${LANG}" \
    --target_language="${LANG_ISO_3}" \
    --output_dir="${OUTDIR}" \
    --num_train_epochs="${MAX_EPOCHS}" \
    --per_device_train_batch_size="${TRAIN_BATCH_SIZE}" \
    --per_device_eval_batch_size="${EVAL_BATCH_SIZE}" \
    --learning_rate="${LEARNING_RATE}" \
    --warmup_steps="100" \
    --evaluation_strategy="steps" \
    --text_column_name="${TEXT_COLUMN}" \
    --length_column_name="input_length" \
    --save_steps="${SAVE_STEPS}" \
    --eval_steps="${EVAL_STEPS}" \
    --save_total_limit="3" \
    --optim="adamw_bnb_8bit" \
    --hub_model_id "simpragma/${MY_MODEL}" \
    --gradient_checkpointing \
    --chars_to_ignore ',' '?' '.' '!' '-' ';' ':' '"' '“' '%' '‘' '”' '�' \
    --fp16 \
    --group_by_length \
    --do_train \
    --do_eval \
    --push_to_hub \
    | tee "${OUTDIR}/${MY_MODEL}.log"
# Capture the training process's exit status, not tee's (the pipeline's
# overall status would otherwise be tee's, which almost always succeeds).
TRAIN_STATUS=${PIPESTATUS[0]}

# Archive this script next to the model for reproducibility.
cp "${SCRIPT_PATH}" "${OUTDIR}"

echo "================ TRAINING: DONE ================"

# Propagate training failure instead of unconditionally exiting 0.
exit "${TRAIN_STATUS}"
|
|