vit-gpt2 / coco_dataset_script.py
ydshieh
update model.py and add coco files
a99072f
raw
history blame
4.94 kB
import csv
import json
import os
import datasets
import pandas as pd
import numpy as np
# TODO: Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@InProceedings{huggingface:dataset,
title = {A great new dataset},
author={huggingface, Inc.
},
year={2020}
}
"""
# TODO: Add description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
"""
# TODO: Add a link to an official homepage for the dataset here
_HOMEPAGE = ""
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
# TODO: Add link to the official dataset URLs here
# The HuggingFace dataset library don't host the datasets but only point to the original files
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
_URLs = {
}
# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
class COCODataset(datasets.GeneratorBasedBuilder):
"""TODO: Short description of my dataset."""
VERSION = datasets.Version("1.1.0")
DEFAULT_CONFIG_NAME = "en"
def _info(self):
# TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
features = datasets.Features(
{
"id": datasets.Value("int64"),
"en": datasets.Value("string"),
"fr": datasets.Value("string"),
"image_id": datasets.Value("int64"),
"image_file": datasets.Value("string")
# These are the features of your dataset like images, labels ...
}
)
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# This defines the different columns of the dataset and their types
features=features, # Here we define them above because they are different between the two configurations
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage=_HOMEPAGE,
# License for the dataset if available
license=_LICENSE,
# Citation for the dataset
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
# If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
data_dir = self.config.data_dir
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"data_dir": data_dir,
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"data_dir": data_dir,
"split": "test"
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"data_dir": data_dir,
"split": "val",
},
),
]
def _generate_examples(
self, data_dir, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
):
""" Yields examples as (key, example) tuples. """
# This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
# The `key` is here for legacy reason (tfds) and is not important in itself.
# /home/33611/caption/
# train2014
if split == 'dev':
split == 'val'
with open(os.path.join(data_dir, f'{split}.json')) as fp:
examples = json.load(fp)
for id_, ex in enumerate(examples):
image_id = ex["image_id"]
fn = f'COCO_{split}2014_{str(image_id).zfill(12)}.jpg'
image_file = os.path.join(data_dir, f'{split}2014', fn)
yield id_, {
"id": ex["id"],
"en": ex["caption"],
"fr": ex["fr"],
"image_id": ex["image_id"],
"image_file": image_file
}