#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# BPE-encodes and binarizes the RACE dataset for fairseq.
#
# The data should already be downloaded and processed with preprocess_RACE.py,
# so that the data folder holds "<split>.input0" .. "<split>.input4" and
# "<split>.label" for the splits train, dev, test-middle and test-high.
#
# Usage:
#   ./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>
#
# Side effects:
#   - downloads encoder.json, vocab.bpe and dict.txt into the current directory
#   - writes "<split>.<input>.bpe" files next to the inputs in <race_data_folder>
#   - writes binarized data to <output_folder>/input0 .. input4
#   - recreates <output_folder>/label and copies the label files into it

# Stop at the first failing step instead of carrying on with missing or
# partial intermediate files.
set -euo pipefail

if [[ $# -ne 2 ]]; then
  echo "Run as following:" >&2
  echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>" >&2
  exit 1
fi

RACE_DATA_FOLDER=$1
OUT_DATA_FOLDER=$2

if [[ ! -d "$RACE_DATA_FOLDER" ]]; then
  echo "error: RACE data folder not found: $RACE_DATA_FOLDER" >&2
  exit 1
fi

#######################################
# Download a file into the current directory with `wget -N`.
# If the download fails but a local copy is already present, that copy is
# reused with a warning (keeps offline re-runs working); otherwise the
# script aborts.
# Arguments: $1 - URL to fetch
#######################################
fetch() {
  local url=$1
  local file=${url##*/}
  if ! wget -N "$url"; then
    if [[ -f "$file" ]]; then
      echo "warning: could not refresh $file; using existing local copy" >&2
    else
      echo "error: failed to download $url" >&2
      exit 1
    fi
  fi
}

# download bpe encoder.json, vocabulary and fairseq dictionary
fetch 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
fetch 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
fetch 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

SPLITS=(train dev test-middle test-high)
INPUT_TYPES=(input0 input1 input2 input3 input4)

# Step 1: BPE-encode every (input type, split) file.
for INPUT_TYPE in "${INPUT_TYPES[@]}"; do
  for SPLIT in "${SPLITS[@]}"; do
    echo "BPE encoding $SPLIT/$INPUT_TYPE"
    python -m examples.roberta.multiprocessing_bpe_encoder \
      --encoder-json encoder.json \
      --vocab-bpe vocab.bpe \
      --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
      --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
      --workers 10 \
      --keep-empty
  done
done

# Step 2: binarize each input type with the downloaded dictionary.
# NOTE: an earlier version assigned LANG="input$INPUT_TYPE" here. The value
# was never read, and LANG is the locale environment variable, so the
# assignment could hand an invalid locale to fairseq-preprocess. It has been
# removed on purpose.
for INPUT_TYPE in "${INPUT_TYPES[@]}"; do
  fairseq-preprocess \
    --only-source \
    --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
    --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
    --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
    --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
    --workers 10 \
    --srcdict dict.txt
done

# Step 3: rebuild the label directory from scratch.
# ${VAR:?} aborts if the output folder is empty, so this can never
# expand to `rm -rf /label`.
rm -rf -- "${OUT_DATA_FOLDER:?}/label"
mkdir -p -- "$OUT_DATA_FOLDER/label"
cp -- "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
cp -- "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
# The two test splits are stored as test.label and test1.label, in the same
# order as the comma-separated --testpref list above.
cp -- "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
cp -- "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"