import csv
import json
import os

import datasets
import numpy as np
import pandas as pd


# TODO: Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {A great new dataset},
author={huggingface, Inc.
},
year={2020}
}
"""

# TODO: Add description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
"""

# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# TODO: Add links to the official dataset URLs here.
# Currently empty and not referenced anywhere in this script: the data is read
# from the local directory given as `data_dir` instead of being downloaded.
_URLs = {
}


class WITDataset(datasets.GeneratorBasedBuilder):
    """Builder that reads pre-split TSV files from a local data directory.

    The directory passed as ``data_dir`` is expected to look like::

        <data_dir>/train/train.tsv
        <data_dir>/test/test.tsv
        <data_dir>/dev/dev.tsv

    Each example also carries the paths ``<split dir>/images/<id>.jpg`` and
    ``<split dir>/numpy/<id>.npy``; these paths are only constructed here,
    the files themselves are never opened or checked for existence.

    NOTE(review): "WIT" presumably refers to the Wikipedia-based Image Text
    dataset -- confirm and fill in the citation/homepage/license TODOs above.
    """

    VERSION = datasets.Version("1.1.0")

    # NOTE(review): no BUILDER_CONFIGS are declared in this file; confirm that
    # this default config name is actually honoured by the library.
    DEFAULT_CONFIG_NAME = "en"

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the example schema."""
        features = datasets.Features(
            {
                "id": datasets.Value("int64"),
                "lang": datasets.Value("string"),
                "caption": datasets.Value("string"),
                "context": datasets.Value("string"),
                "image_url": datasets.Value("string"),
                "page_url": datasets.Value("string"),
                "image_file": datasets.Value("string"),
                "pixels_file": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # Columns of the dataset and their types.
            features=features,
            # No common (input, target) pair is declared for this dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the train, test and validation ``SplitGenerator`` objects.

        Nothing is downloaded: ``dl_manager`` is unused and every split is read
        from a sub-directory of ``self.config.data_dir``. That value must be
        set (``os.path.join`` raises ``TypeError`` when it is ``None``).

        The validation split lives in the ``dev`` sub-directory.
        """
        data_dir = self.config.data_dir
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": os.path.join(data_dir, "train"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": os.path.join(data_dir, "test"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "data_dir": os.path.join(data_dir, "dev"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, data_dir, split):
        """Yield ``(key, example)`` tuples read from ``<data_dir>/<split>.tsv``.

        Args:
            data_dir: Directory of one split, as built in ``_split_generators``.
            split: Split name; also the stem of the TSV file that is read.

        Yields:
            ``(key, example)`` where ``key`` is the row's index label in the
            parsed frame and ``example`` matches the features from ``_info``.

        Columns are addressed by position, not by header name:
        0 = id, 1 = lang, 2 = image_url, 3 = page_url, 4 = caption, 5 = context.
        Rows whose caption or context is not a string are skipped.
        """
        table = pd.read_csv(os.path.join(data_dir, f"{split}.tsv"), sep="\t")
        images_dir = os.path.join(data_dir, "images")
        pixels_dir = os.path.join(data_dir, "numpy")

        for key, row in table.iterrows():
            # Use .iloc for positional access: plain `row[4]` on a Series whose
            # index holds column names is deprecated in pandas and stops
            # working once integer keys are always treated as labels.
            caption = row.iloc[4]
            context = row.iloc[5]

            # Empty cells are parsed as float NaN rather than str, so this
            # drops rows with a missing caption or context.
            if not isinstance(caption, str) or not isinstance(context, str):
                continue

            example_id = row.iloc[0]
            yield key, {
                "id": example_id,
                "lang": row.iloc[1],
                "caption": caption,
                "context": context,
                "image_url": row.iloc[2],
                "page_url": row.iloc[3],
                "image_file": os.path.join(images_dir, f"{example_id}.jpg"),
                "pixels_file": os.path.join(pixels_dir, f"{example_id}.npy"),
            }