Spaces:

koajoel
/

PolyFormer

Running

PolyFormer / fairseq /examples /roberta /preprocess_RACE.py

jiang

init commit

650c5f6 over 1 year ago

3.43 kB

	#!/usr/bin/env python
	# Copyright (c) Facebook, Inc. and its affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	import argparse
	import contextlib
	import json
	import os
	import re


	class InputExample:
	    """One multiple-choice question together with its article.

	    Attributes are stored exactly as passed in:
	        paragraph: the article text.
	        qa_list: the candidate question+option strings.
	        label: index of the correct entry in ``qa_list``.
	    """

	    def __init__(self, paragraph: str, qa_list: list, label: int) -> None:
	        self.paragraph = paragraph
	        self.qa_list = qa_list
	        self.label = label

	    def __repr__(self) -> str:
	        # Readable representation to ease debugging of extracted examples.
	        return "{}(paragraph={!r}, qa_list={!r}, label={!r})".format(
	            type(self).__name__, self.paragraph, self.qa_list, self.label
	        )


	def get_examples(data_dir, set_type):
	    """
	    Extract paragraph and question-answer list from each json file.

	    Args:
	        data_dir: root directory of the dataset; files are read from
	            ``<data_dir>/<split>/<level>/``.
	        set_type: split name, optionally written as ``"<split>-<level>"``
	            (e.g. ``"test-high"``) to restrict reading to a single level.
	            Without a level suffix both ``"middle"`` and ``"high"`` are read.

	    Returns:
	        A list with one ``InputExample`` per question. Each example holds
	        the whitespace-normalized article, four question+option strings and
	        the 0-based index of the correct option (``"A"`` -> 0).
	    """
	    examples = []
	    whitespace = re.compile(r"\s+")

	    levels = ["middle", "high"]
	    set_type_c = set_type.split("-")
	    if len(set_type_c) == 2:
	        levels = [set_type_c[1]]
	        set_type = set_type_c[0]

	    for level in levels:
	        cur_dir = os.path.join(data_dir, set_type, level)
	        # os.listdir returns entries in arbitrary order; sort them so the
	        # generated files are reproducible across runs and filesystems.
	        for filename in sorted(os.listdir(cur_dir)):
	            cur_path = os.path.join(cur_dir, filename)
	            # Decode explicitly as UTF-8 rather than relying on the
	            # platform's default encoding.
	            with open(cur_path, "r", encoding="utf-8") as f:
	                cur_data = json.load(f)

	            answers = cur_data["answers"]
	            options = cur_data["options"]
	            questions = cur_data["questions"]
	            # Collapsing every whitespace run (newlines included) keeps the
	            # article on a single line.
	            context = whitespace.sub(" ", cur_data["article"])

	            for i, answer in enumerate(answers):
	                label = ord(answer) - ord("A")
	                question = questions[i]
	                qa_list = []
	                for j in range(4):
	                    option = options[i][j]
	                    if "_" in question:
	                        # Cloze-style question: put the option in the blank.
	                        qa_cat = question.replace("_", option)
	                    else:
	                        qa_cat = " ".join([question, option])
	                    qa_list.append(whitespace.sub(" ", qa_cat))
	                examples.append(InputExample(context, qa_list, label))

	    return examples


	def main():
	    """
	    Helper script to extract paragraphs questions and answers from RACE datasets.

	    For each of the splits ``train``, ``dev``, ``test-middle`` and
	    ``test-high`` the following line-aligned files are written to the output
	    directory:

	    * ``<split>.input0``: the article of each example,
	    * ``<split>.input1`` .. ``<split>.input4``: the four question+option strings,
	    * ``<split>.label``: the 0-based index of the correct option.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--input-dir",
	        required=True,
	        help="input directory for downloaded RACE dataset",
	    )
	    parser.add_argument(
	        "--output-dir",
	        required=True,
	        help="output directory for extracted data",
	    )
	    args = parser.parse_args()

	    os.makedirs(args.output_dir, exist_ok=True)

	    for set_type in ["train", "dev", "test-middle", "test-high"]:
	        # Read the examples before creating any output file, so a failure
	        # while reading does not leave empty files behind.
	        examples = get_examples(args.input_dir, set_type)
	        out_prefix = os.path.join(args.output_dir, set_type)

	        # ExitStack closes every output file even if a write raises.
	        with contextlib.ExitStack() as stack:
	            qa_files = [
	                stack.enter_context(
	                    open(out_prefix + ".input" + str(i + 1), "w", encoding="utf-8")
	                )
	                for i in range(4)
	            ]
	            outf_context = stack.enter_context(
	                open(out_prefix + ".input0", "w", encoding="utf-8")
	            )
	            outf_label = stack.enter_context(
	                open(out_prefix + ".label", "w", encoding="utf-8")
	            )

	            for example in examples:
	                outf_context.write(example.paragraph + "\n")
	                for i in range(4):
	                    qa_files[i].write(example.qa_list[i] + "\n")
	                outf_label.write(str(example.label) + "\n")


	# Run the extraction only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()