# author: laurie
# script to sample OpenLID-v2 prior to training
# usage: bash prepare_opelid_v2_for_model_training.sh PATH_TO_OPENLID-V2
# Strict mode: abort on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# The dataset path is the single required argument. Fail early with a usage
# message rather than letting cut wait on stdin with an empty path.
if [[ -z "${1:-}" ]]; then
  echo "usage: bash ${0##*/} PATH_TO_OPENLID-V2" >&2
  exit 2
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"

INPUT_DATA=$1 # should be openlid-v2 dataset
if [[ ! -r "${INPUT_DATA}" ]]; then
  echo "error: cannot read input data: ${INPUT_DATA}" >&2
  exit 1
fi
echo "using openlid-v2 data from ${INPUT_DATA}"

# Tally how many rows carry each distinct value of tab-separated column 2
# (presumably the language label -- confirm against the dataset layout).
# uniq -c only merges *adjacent* identical lines, so the column is sorted
# first; without the sort, a value that appears in several non-contiguous
# runs would be emitted several times with partial counts. LC_ALL=C keeps the
# comparison byte-wise and locale-independent.
# NOTE(review): the counts file is now ordered by label rather than by order
# of appearance -- confirm sample_with_temperature.py does not rely on order.
echo "generating counts in stats/"
mkdir -p stats
cut -f2 -d$'\t' -- "${INPUT_DATA}" \
  | LC_ALL=C sort \
  | LC_ALL=C uniq -c \
  > stats/openlid-v2-unsampled.counts

# The sampler is given the dataset and the counts file; whatever it prints to
# stdout becomes openlid-v2-sampled.tsv. All paths here are relative to the
# current working directory.
echo "applying temperature sampling..."
python scripts/sample_with_temperature.py \
  "${INPUT_DATA}" \
  stats/openlid-v2-unsampled.counts \
  > openlid-v2-sampled.tsv