File size: 2,124 Bytes

a70e840

from datasets import DatasetBuilder, DatasetInfo, SplitGenerators, DownloadManager
from datasets.features import Features, ClassLabel, Sequence, Value

class MyCustomDataset(DatasetBuilder):
    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return DatasetInfo(
            description="My custom dataset for tracking objects.",
            features=Features({
                "prompting_type": Value("string"),
                "deception": Value("bool"),
                "story_length": Value("int32"),
                "question_order": Value("int32"),
                "sample_id": Value("int32"),
                "story": Value("string"),
                "question": Value("string"),
                "choices": Value("string"),
                "answer": Value("string"),
            }),
            supervised_keys=None,
            homepage="https://github.com/ying-hui-he/Hi-ToM_dataset",
            citation=CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager):
        downloaded_files = dl_manager.download_and_extract({
            "data_file": "https://github.com/ying-hui-he/Hi-ToM_dataset/blob/main/Hi-ToM_data.json"
        })

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": downloaded_files["data_file"],
                },
            ),
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            for id, item in enumerate(data["data"]):
                yield id, {
                    "prompting_type": item["prompting_type"],
                    "deception": item["deception"],
                    "story_length": item["story_length"],
                    "question_order": item["question_order"],
                    "sample_id": item["sample_id"],
                    "story": item["story"],
                    "question": item["question"],
                    "choices": item["choices"],
                    "answer": item["answer"],
                }