import csv
import json
import os

import datasets
import numpy as np
import pandas as pd


# TODO: Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {A great new dataset},
author={huggingface, Inc.
},
year={2020}
}
"""

# TODO: Add description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
"""

# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# TODO: Add link to the official dataset URLs here
# The HuggingFace datasets library doesn't host the datasets but only points to the original files.
# This can be an arbitrarily nested dict/list of URLs (see `_split_generators` below).
_URLs = {}


class COCODataset(datasets.GeneratorBasedBuilder):
    """Builder for caption records with English and French text plus an image path.

    The data is read from a local directory (``self.config.data_dir``) that is
    expected to contain one ``<split>.json`` annotation file per split
    (``train``, ``test``, ``val``). Each JSON file holds a list of records with
    the keys ``id``, ``caption``, ``fr`` and ``image_id``. Image paths are built
    as ``<data_dir>/<split>2014/COCO_<split>2014_<image_id zero-padded to 12>.jpg``;
    the images themselves are not opened by this builder.
    """

    VERSION = datasets.Version("1.1.0")

    DEFAULT_CONFIG_NAME = "en"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata."""
        features = datasets.Features(
            {
                "id": datasets.Value("int64"),
                "en": datasets.Value("string"),
                "fr": datasets.Value("string"),
                "image_id": datasets.Value("int64"),
                # Path to the image file on disk (a string, not decoded image data).
                "image_file": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types.
            features=features,
            # No common (input, target) tuple is declared for this dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation.
            homepage=_HOMEPAGE,
            # License for the dataset if available.
            license=_LICENSE,
            # Citation for the dataset.
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the train, test and validation ``SplitGenerator`` objects.

        Nothing is downloaded: ``dl_manager`` is unused and every split reads
        from the directory given by ``self.config.data_dir``.
        """
        data_dir = self.config.data_dir
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples.
                gen_kwargs={
                    "data_dir": data_dir,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": data_dir,
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": data_dir,
                    "split": "val",
                },
            ),
        ]

    def _generate_examples(self, data_dir, split):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            data_dir: Directory containing ``<split>.json`` and the
                ``<split>2014`` image folder.
            split: Split name as given in ``gen_kwargs`` by
                ``_split_generators``; ``"dev"`` is accepted as an alias
                for ``"val"``.

        Yields:
            Tuples of the record's position in the JSON list and a dict with
            the keys ``id``, ``en``, ``fr``, ``image_id`` and ``image_file``.
        """
        # The files on disk use "val"; map the "dev" alias onto it.
        # (The original wrote `split == 'val'`, a no-op comparison.)
        if split == "dev":
            split = "val"

        annotation_path = os.path.join(data_dir, f"{split}.json")
        # Explicit UTF-8 so non-ASCII (French) captions do not depend on the
        # platform's default locale encoding.
        with open(annotation_path, encoding="utf-8") as fp:
            examples = json.load(fp)

        image_dir = os.path.join(data_dir, f"{split}2014")
        for id_, ex in enumerate(examples):
            image_id = ex["image_id"]
            fn = f"COCO_{split}2014_{str(image_id).zfill(12)}.jpg"
            yield id_, {
                "id": ex["id"],
                "en": ex["caption"],
                "fr": ex["fr"],
                "image_id": image_id,
                "image_file": os.path.join(image_dir, fn),
            }