OpenLID-v2 / scripts /prepare_openlid_v2_for_model_training.sh
laurievb's picture
Upload scripts/prepare_openlid_v2_for_model_training.sh with huggingface_hub
664c399 verified
raw
history blame
584 Bytes
#!/bin/bash
# author: laurie
# script to sample OpenLID-v2 prior to training
# usage: bash prepare_openlid_v2_for_model_training.sh PATH_TO_OPENLID-V2
#
# Must be run from a directory that contains scripts/sample_with_temperature.py,
# because that helper is invoked via a relative path.
#
# Argument:
#   $1 - path to the OpenLID-v2 dataset, a tab-separated file; the input is
#        read twice (once by cut, once by the sampler), so it must be a real file.
# Outputs (relative to the current directory):
#   stats/openlid-v2-unsampled.counts - `uniq -c` output over column 2 of the input
#   openlid-v2-sampled.tsv            - stdout of scripts/sample_with_temperature.py
# Exit status: 2 on a usage error, 1 on a missing input/helper file, otherwise
# the status of the first failing command (set -e / pipefail).
set -euo pipefail

readonly SAMPLER=scripts/sample_with_temperature.py
readonly COUNTS_FILE=stats/openlid-v2-unsampled.counts

# Without this guard an absent argument would make `cut` read from stdin.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "usage: bash ${0##*/} PATH_TO_OPENLID-V2" >&2
  exit 2
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"
INPUT_DATA=$1 # should be openlid-v2 dataset

# Fail fast, before the (potentially long) counting pass over the dataset.
if [[ ! -f "${INPUT_DATA}" || ! -r "${INPUT_DATA}" ]]; then
  echo "error: input file '${INPUT_DATA}' does not exist or is not readable" >&2
  exit 1
fi
if [[ ! -f "${SAMPLER}" ]]; then
  echo "error: ${SAMPLER} not found relative to ${START_DIR}" >&2
  exit 1
fi

echo "using openlid-v2 data from ${INPUT_DATA}"
echo "generating counts in stats/"
mkdir -p stats
# Column 2 is presumably the language label - TODO confirm against the dataset.
# NOTE(review): `uniq -c` only merges *adjacent* identical lines, so this yields
# one count per label only if rows with the same label are contiguous in the
# input. Confirm the dataset is grouped that way (or insert a `sort` here if the
# sampler does not depend on the order of the counts file).
cut -f2 -d$'\t' -- "${INPUT_DATA}" | uniq -c > "${COUNTS_FILE}"
echo "applying temperature sampling..."
python "${SAMPLER}" "${INPUT_DATA}" "${COUNTS_FILE}" > openlid-v2-sampled.tsv