#!/bin/bash
# author: laurie
# script to sample OpenLID-v2 prior to training
# usage: bash prepare_opelid_v2_for_model_training.sh PATH_TO_OPENLID-V2
#
# Steps:
#   1. Count how often each distinct value of tab-separated column 2 occurs
#      in the input file and write the counts to
#      stats/openlid-v2-unsampled.counts (in `uniq -c` format).
#      NOTE(review): column 2 is presumably the language label -- confirm
#      against the dataset layout.
#   2. Run scripts/sample_with_temperature.py on the input file and the
#      counts file, writing its stdout to openlid-v2-sampled.tsv.
#
# All relative paths (stats/, scripts/, openlid-v2-sampled.tsv) are resolved
# against the current working directory.

set -euo pipefail

# Fail early with a usage message instead of letting `cut` silently read stdin
# when the dataset path is missing.
if [[ $# -lt 1 || -z "${1}" ]]; then
  printf 'usage: bash %s PATH_TO_OPENLID-V2\n' "${0##*/}" >&2
  exit 2
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"

INPUT_DATA=$1  # should be openlid-v2 dataset
if [[ ! -r "${INPUT_DATA}" ]]; then
  printf 'error: cannot read input file: %s\n' "${INPUT_DATA}" >&2
  exit 1
fi
echo "using openlid-v2 data from ${1}"

echo "generating counts in stats/"
mkdir -p stats
# `uniq -c` only merges *adjacent* duplicates, so the column must be sorted
# first; otherwise a value that appears in several separate runs of the file
# gets several partial counts. LC_ALL=C gives byte-wise, locale-independent
# ordering and comparison.
cut -f2 -d$'\t' -- "${INPUT_DATA}" \
  | LC_ALL=C sort \
  | LC_ALL=C uniq -c \
  > stats/openlid-v2-unsampled.counts

echo "applying temperature sampling..."
python scripts/sample_with_temperature.py \
  "${INPUT_DATA}" \
  stats/openlid-v2-unsampled.counts \
  > openlid-v2-sampled.tsv