#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# The RACE data should be downloaded and processed with preprocess_RACE.py
# before running this script.
# BPE-encode and binarize the RACE dataset for RoBERTa.
#
# Usage:
#   ./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>
#
# Arguments:
#   $1  folder holding the raw text files "<split>.<input_type>" and the
#       "<split>.label" files; the ".bpe" encodings are written next to them.
#   $2  folder that receives one binarized sub-folder per input type plus a
#       "label" sub-folder.
#
# Side effects: downloads encoder.json, vocab.bpe and dict.txt into the
# current working directory.
# NOTE(review): the encoder is invoked as the module
# examples.roberta.multiprocessing_bpe_encoder, so this presumably has to be
# run from the fairseq repository root -- confirm.

# Abort on the first failing command, unset variable or failed pipeline stage,
# so a broken download or encoding step cannot silently feed missing or
# partial files into the later steps.
set -euo pipefail

if [[ $# -ne 2 ]]; then
  echo "Run as following:" >&2
  echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>" >&2
  exit 1
fi

# ':?' rejects empty arguments; an empty output folder would otherwise make
# the 'rm -rf' below operate on "/label".
RACE_DATA_FOLDER=${1:?race_data_folder must not be empty}
OUT_DATA_FOLDER=${2:?output_folder must not be empty}

# Download bpe encoder.json, vocabulary and fairseq dictionary.
# 'wget -N' only re-fetches when the remote copy is newer. If the download
# fails but a non-empty local copy exists, keep going with that copy.
for ASSET in encoder.json vocab.bpe dict.txt; do
  if ! wget -N "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/$ASSET"; then
    if [[ ! -s "$ASSET" ]]; then
      echo "error: could not download $ASSET and no local copy exists" >&2
      exit 1
    fi
    echo "warning: could not refresh $ASSET, using the existing local copy" >&2
  fi
done

SPLITS=(train dev test-middle test-high)
INPUT_TYPES=(input0 input1 input2 input3 input4)

# Step 1: BPE-encode every (input type, split) text file.
for INPUT_TYPE in "${INPUT_TYPES[@]}"; do
  for SPLIT in "${SPLITS[@]}"; do
    echo "BPE encoding $SPLIT/$INPUT_TYPE"
    python -m examples.roberta.multiprocessing_bpe_encoder \
      --encoder-json encoder.json \
      --vocab-bpe vocab.bpe \
      --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
      --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
      --workers 10 \
      --keep-empty
  done
done

# Step 2: binarize each input type into its own destination folder.
# Two comma-separated test prefixes are passed (test-middle, then test-high).
# The locale variable LANG is intentionally NOT assigned here: the value was
# never read, and overriding it would alter the locale seen by child processes.
for INPUT_TYPE in "${INPUT_TYPES[@]}"; do
  fairseq-preprocess \
    --only-source \
    --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
    --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
    --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
    --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
    --workers 10 \
    --srcdict dict.txt
done

# Step 3: copy the label files, renamed so that dev -> valid,
# test-middle -> test and test-high -> test1 (same order as --testpref above).
rm -rf -- "${OUT_DATA_FOLDER:?}/label"
mkdir -p -- "$OUT_DATA_FOLDER/label"
cp -- "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
cp -- "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
cp -- "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
cp -- "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"