Spaces:

koajoel
/

PolyFormer

Running

PolyFormer / fairseq /examples /roberta /preprocess_RACE.sh

jiang

init commit

650c5f6 over 1 year ago

2.07 kB

	#!/bin/bash
	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.


	# data should be downloaded and processed with preprocess_RACE.py
	# Require exactly two positional arguments: the RACE data folder and the
	# output folder. Any other argument count prints the usage text and aborts
	# with exit status 1.
	if [[ $# -ne 2 ]]; then
		# Usage on the error path goes to stderr so it never ends up in
		# captured or piped stdout.
		{
			echo "Run as following:"
			echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>"
		} >&2
		exit 1
	fi

	# Bind the positional arguments to the names used by the rest of the script:
	# the first is the folder the RACE files are read from, the second is the
	# folder the results are written to.
	RACE_DATA_FOLDER="${1}"
	OUT_DATA_FOLDER="${2}"

	# Download the GPT-2 BPE encoder.json, vocab.bpe and the fairseq dict.txt
	# into the current directory. `wget -N` turns on time-stamping, so a file
	# is not fetched again when the local copy is already up to date.
	for bpe_asset in encoder.json vocab.bpe dict.txt; do
		wget -N "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/$bpe_asset"
		# A failed transfer is tolerated when a local copy already exists (for
		# example on an offline re-run). A missing or empty file would only make
		# every later step fail, so stop here with a clear message.
		if [[ ! -s "$bpe_asset" ]]; then
			echo "Failed to obtain $bpe_asset" >&2
			exit 1
		fi
	done

	# Dataset splits and per-example input fields to process. Both are kept as
	# space-separated strings because the loops expand them unquoted and rely
	# on word splitting.
	SPLITS="train dev test-middle test-high"
	INPUT_TYPES="input0 input1 input2 input3 input4"

	# BPE-encode every <split>.<input type> file under $RACE_DATA_FOLDER and
	# write the result next to it with a ".bpe" suffix. Input types form the
	# outer loop, splits the inner one.
	for input_type in $INPUT_TYPES; do
		for split in $SPLITS; do
			raw_file="$RACE_DATA_FOLDER/$split.$input_type"
			echo "BPE encoding $split/$input_type"
			python -m examples.roberta.multiprocessing_bpe_encoder \
				--encoder-json encoder.json \
				--vocab-bpe vocab.bpe \
				--inputs "$raw_file" \
				--outputs "$raw_file.bpe" \
				--workers 10 \
				--keep-empty
		done
	done

	# Run fairseq-preprocess once per input type on the BPE-encoded files,
	# writing each result to $OUT_DATA_FOLDER/<input type>. The dev split is
	# passed as the validation prefix, and the two test sets are passed
	# together as a comma-separated pair to --testpref.
	# NOTE: no `LANG=...` assignment here. The value was never read, and
	# assigning to LANG overrides the locale variable that child processes
	# inherit whenever LANG is exported.
	for INPUT_TYPE in $INPUT_TYPES; do
		fairseq-preprocess \
			--only-source \
			--trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
			--validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
			--testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
			--destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
			--workers 10 \
			--srcdict dict.txt
	done

	# Recreate the label directory from scratch and copy the label files into
	# it: train keeps its name, dev becomes valid, and the two test sets become
	# test and test1.
	label_dir="$OUT_DATA_FOLDER/label"
	rm -rf "$label_dir"
	mkdir -p "$label_dir"
	cp "$RACE_DATA_FOLDER/train.label" "$label_dir/"
	# Each entry is <source split>:<destination name>.
	for label_map in dev:valid test-middle:test test-high:test1; do
		cp "$RACE_DATA_FOLDER/${label_map%%:*}.label" "$label_dir/${label_map##*:}.label"
	done