# vit-gpt2 / image_caption_dataset.py
# ydshieh
# Add dataset script
# 9628bd8
import csv
import json
import os
import datasets
import pandas as pd
import numpy as np
class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """Builder configuration for one image-caption corpus.

    On top of the standard ``datasets.BuilderConfig`` fields, the instance
    stores:

    * ``splits`` -- the split names requested for this corpus.
    * ``langs`` -- the language codes; each one becomes a string column.
    * ``prefix_before_image_fn`` -- whether image file names carry a
      ``<config name>_<split>_`` prefix.
    * ``zfill`` -- the width to which the image id is zero-padded when the
      image file name is built.
    """

    def __init__(self, name, splits, langs, prefix_before_image_fn=False, zfill=1, **kwargs):
        # Everything not handled here (version, data_dir, description, ...) is
        # forwarded untouched to the base configuration.
        super().__init__(name, **kwargs)

        # Which data to expose.
        self.splits, self.langs = splits, langs

        # How image file names are derived from an image id.
        self.zfill = zfill
        self.prefix_before_image_fn = prefix_before_image_fn
# TODO: Add a proper BibTeX citation.
# The citation can usually be found on arXiv or on the dataset's repository/website.
_CITATION = """\
@InProceedings{None,
title = {Generic images to captions dataset},
author={Yih-Dar SHIEH},
year={2020}
}
"""
# TODO: Add a description of the dataset here.
# An official description can be copied. The value is currently an empty string.
_DESCRIPTION = """\
"""
# TODO: Add a link to the official homepage of the dataset here.
_HOMEPAGE = ""
# TODO: Add the license of the dataset here, if one can be found.
_LICENSE = ""
# TODO: Add links to the official dataset URLs here.
# The HuggingFace datasets library doesn't host the datasets; it only points to the original files.
# This can be an arbitrarily nested dict/list of URLs.
# NOTE(review): `_URLs` is not referenced anywhere else in the visible script; the
# builder reads its files from `config.data_dir` instead -- confirm before relying on it.
_URLs = {}
# TODO: The name of the dataset class usually matches the script name, in CamelCase instead of snake_case.
class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
    """Image-caption dataset read from local JSON Lines files and image folders.

    Nothing is downloaded. For a configuration named ``<name>`` the builder
    expects the following layout under ``config.data_dir``::

        <name>_jsonls/<split>/*.jsonl   one JSON object per line
        <name>_images/<split>/*.jpg     one image per ``image_id``

    where ``<split>`` is ``train``, ``valid`` or ``test``. Every JSON object
    must provide ``image_id``, ``id``, ``caption`` and one entry per language
    listed in ``config.langs``; ``image_url`` is optional.
    """

    VERSION = datasets.Version("0.0.0")

    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
    BUILDER_CONFIGS = [
        ImageCaptionBuilderConfig(
            name='coco_2017',
            splits=['train', 'valid'],
            prefix_before_image_fn=False,
            zfill=12,
            langs=['en', 'fr'],
        ),
        ImageCaptionBuilderConfig(
            name='cc3m',
            splits=['train', 'valid'],
            prefix_before_image_fn=True,
            zfill=8,
            langs=['en', 'fr'],
        ),
        ImageCaptionBuilderConfig(
            name='cc12m',
            splits=['train', 'valid'],
            prefix_before_image_fn=True,
            zfill=8,
            langs=['en', 'fr'],
        ),
    ]
    DEFAULT_CONFIG_NAME = "coco_2017"

    # Every accepted spelling of a split, mapped to the on-disk directory name
    # and the `datasets` split it is exposed as. Shared by `_split_generators`
    # and `_generate_examples` so both normalise aliases the same way.
    _SPLIT_ALIASES = {
        'train': ('train', datasets.Split.TRAIN),
        'val': ('valid', datasets.Split.VALIDATION),
        'valid': ('valid', datasets.Split.VALIDATION),
        'validation': ('valid', datasets.Split.VALIDATION),
        'dev': ('valid', datasets.Split.VALIDATION),
        'test': ('test', datasets.Split.TEST),
    }

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing the columns and metadata."""
        feature_dict = {
            "image_id": datasets.Value("int64"),
            "id": datasets.Value("int64"),
            "caption": datasets.Value("string"),
        }
        # One string column per language of the selected configuration.
        for lang in self.config.langs:
            feature_dict[lang] = datasets.Value("string")
        feature_dict["image_url"] = datasets.Value("string")
        feature_dict["image_file"] = datasets.Value("string")

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # Columns of the dataset and their types; the language columns
            # depend on the selected configuration.
            features=datasets.Features(feature_dict),
            # No default (input, target) pair is declared.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one ``SplitGenerator`` per recognised entry of ``config.splits``.

        ``dl_manager`` is unused: all files are read from ``config.data_dir``.

        Raises:
            ValueError: if no ``data_dir`` was supplied for the configuration.
        """
        data_dir = self.config.data_dir
        if data_dir is None:
            # Fail with an actionable message instead of the opaque TypeError
            # that os.path.join(None, ...) would raise.
            raise ValueError(
                f"The '{self.config.name}' configuration reads local files: "
                "please provide `data_dir`."
            )

        jsonl_root = os.path.join(data_dir, f'{self.config.name}_jsonls')
        image_root = os.path.join(data_dir, f'{self.config.name}_images')

        generators = []
        for requested in self.config.splits:
            # Unrecognised split names are silently ignored.
            if requested not in self._SPLIT_ALIASES:
                continue
            dir_name, split_name = self._SPLIT_ALIASES[requested]
            generators.append(
                datasets.SplitGenerator(
                    name=split_name,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        "jsonl_dir": os.path.join(jsonl_root, dir_name),
                        "image_dir": os.path.join(image_root, dir_name),
                        "split": dir_name,
                    },
                )
            )
        return generators

    def _generate_examples(
        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
        self, jsonl_dir, image_dir, split
    ):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            jsonl_dir: directory holding the ``*.jsonl`` annotation files.
            image_dir: directory holding the ``.jpg`` images of the split.
            split: split name, used in the image file name prefix when
                ``config.prefix_before_image_fn`` is set.

        Records whose image file does not exist in ``image_dir`` are skipped.
        """
        # Map aliases ('dev', 'val', ...) to the canonical directory name.
        if split in self._SPLIT_ALIASES:
            split = self._SPLIT_ALIASES[split][0]

        # Sorted, because os.listdir returns entries in arbitrary order and the
        # example order should not depend on the file system.
        jsonl_files = sorted(
            os.path.join(jsonl_dir, fn)
            for fn in os.listdir(jsonl_dir)
            if fn.endswith(".jsonl") and os.path.isfile(os.path.join(jsonl_dir, fn))
        )

        # Keys must be unique within a split, so the counter runs across all
        # files rather than restarting at 0 for each one. It counts every line
        # read, which keeps the keys of a single-file split unchanged.
        key = -1
        for jsonl_file in jsonl_files:
            with open(jsonl_file, 'r', encoding='UTF-8') as fp:
                for line in fp:
                    key += 1
                    if not line.strip():
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    ex = json.loads(line)

                    example = {
                        "image_id": ex['image_id'],
                        "id": ex["id"],
                        "caption": ex["caption"],
                    }
                    for lang in self.config.langs:
                        example[lang] = ex[lang]
                    # The URL is optional in the annotation files.
                    example['image_url'] = ex.get('image_url', '')

                    # File name: zero-padded image id, optionally prefixed
                    # with "<config name>_<split>_".
                    fn = f'{str(ex["image_id"]).zfill(self.config.zfill)}.jpg'
                    if self.config.prefix_before_image_fn:
                        fn = f'{self.config.name}_{split}_' + fn
                    image_file = os.path.join(image_dir, fn)
                    example['image_file'] = image_file

                    if not os.path.isfile(image_file):
                        continue
                    yield key, example