File size: 2,124 Bytes
a70e840 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import json

import datasets
from datasets import DatasetBuilder, DatasetInfo, DownloadManager, SplitGenerator
from datasets.features import ClassLabel, Features, Sequence, Value
# Raw-content URL of the data file. The "/blob/" form of a GitHub URL serves
# the HTML viewer page, which json.load() cannot parse; "/raw/" serves the
# file itself.
_DATA_URL = (
    "https://github.com/ying-hui-he/Hi-ToM_dataset/raw/main/Hi-ToM_data.json"
)

# Placeholder citation so that _info() does not fail on an undefined name.
# TODO: replace with the dataset's official BibTeX entry.
_CITATION = """\
Hi-ToM dataset: https://github.com/ying-hui-he/Hi-ToM_dataset
"""

# Single source of truth for the example schema: field name -> `Value` dtype.
# Used both to declare the features and to pick fields out of each record.
# NOTE(review): "choices" is declared as a plain string here, as in the
# original script -- confirm against the actual JSON layout.
_FEATURE_DTYPES = {
    "prompting_type": "string",
    "deception": "bool",
    "story_length": "int32",
    "question_order": "int32",
    "sample_id": "int32",
    "story": "string",
    "question": "string",
    "choices": "string",
    "answer": "string",
}


class MyCustomDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that loads the Hi-ToM JSON file as a single train split.

    ``GeneratorBasedBuilder`` is used as the base class because it is the
    builder that consumes ``_generate_examples``; the bare ``DatasetBuilder``
    is abstract and never calls that hook.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self) -> DatasetInfo:
        """Return the dataset metadata and the per-example feature schema."""
        return DatasetInfo(
            description="My custom dataset for tracking objects.",
            features=Features(
                {
                    name: Value(dtype)
                    for name, dtype in _FEATURE_DTYPES.items()
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/ying-hui-he/Hi-ToM_dataset",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager):
        """Download the data file and declare the single TRAIN split.

        Args:
            dl_manager: Download manager supplied by the ``datasets`` library.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``; its
            ``gen_kwargs`` carry the local path of the downloaded file to
            ``_generate_examples``.
        """
        downloaded_files = dl_manager.download_and_extract(
            {"data_file": _DATA_URL}
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["data_file"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs read from the JSON file.

        Args:
            filepath: Local path of the downloaded JSON file.

        Yields:
            Tuples of the record's position in the file and a dict holding
            exactly the fields declared in ``_FEATURE_DTYPES``.

        Raises:
            KeyError: If the top-level ``"data"`` key or any declared field
                is missing from a record.
        """
        with open(filepath, encoding="utf-8") as f:
            payload = json.load(f)

        # Records are expected under the top-level "data" key.
        for idx, record in enumerate(payload["data"]):
            yield idx, {name: record[name] for name in _FEATURE_DTYPES}
|