vit-gpt2 / image_caption_dataset.py

ydshieh

Add dataset script

9628bd8 over 2 years ago

No virus

7.28 kB

	import csv
	import json
	import os

	import datasets
	import pandas as pd
	import numpy as np


	class ImageCaptionBuilderConfig(datasets.BuilderConfig):
	    """Builder configuration describing one image-caption corpus.

	    On top of the standard ``datasets.BuilderConfig`` fields it stores:

	    * ``splits`` -- split names the builder iterates over when creating
	      split generators.
	    * ``langs`` -- keys copied from every JSON record into the example; each
	      one also becomes a string column of the dataset.
	    * ``prefix_before_image_fn`` -- when true, image file names are built as
	      ``<config name>_<split>_<padded image id>.jpg`` instead of just
	      ``<padded image id>.jpg``.
	    * ``zfill`` -- width the image id is zero-padded to when building the
	      image file name.
	    """

	    def __init__(self, name, splits, langs, prefix_before_image_fn=False, zfill=1, **kwargs):
	        """Store the corpus-specific options and forward the rest to the base class."""
	        super().__init__(name=name, **kwargs)

	        # File-naming options, used when locating an example's image.
	        self.zfill = zfill
	        self.prefix_before_image_fn = prefix_before_image_fn

	        # Which splits and which caption languages this corpus provides.
	        self.langs = langs
	        self.splits = splits


	# BibTeX citation handed to `datasets.DatasetInfo(citation=...)` in `_info`.
	# TODO: Replace this placeholder entry with the real citation
	# (for instance from arXiv or from the dataset repo/website).
	_CITATION = """\
	@InProceedings{None,
	title = {Generic images to captions dataset},
	author={Yih-Dar SHIEH},
	year={2020}
	}
	"""

	# Description handed to `datasets.DatasetInfo(description=...)` in `_info`.
	# TODO: Add a description of the dataset here (an official description can be copied).
	_DESCRIPTION = """\

	"""

	# Homepage handed to `datasets.DatasetInfo(homepage=...)` in `_info`.
	# TODO: Add a link to an official homepage for the dataset here.
	_HOMEPAGE = ""

	# License handed to `datasets.DatasetInfo(license=...)` in `_info`.
	# TODO: Add the license for the dataset here if you can find it.
	_LICENSE = ""

	# TODO: Add links to the official dataset URLs here.
	# The Hugging Face `datasets` library does not host the datasets; it only points to the original files.
	# This can be an arbitrarily nested dict/list of URLs.
	# NOTE: currently empty and not referenced by the builder code visible in this file,
	# which reads everything from the local `config.data_dir` instead.
	_URLs = {}


	# TODO: The name of the dataset class usually matches the script name, in CamelCase instead of snake_case
	class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
	    """Image-caption dataset builder backed by local JSONL files and images.

	    Everything is read from ``config.data_dir``; nothing is downloaded.
	    With ``<name>`` being the configuration name, the builder looks for::

	        <data_dir>/<name>_jsonls/<split>/   files ending in "jsonl", one JSON object per line
	        <data_dir>/<name>_images/<split>/   the image files

	    Each JSON record must provide ``image_id``, ``id``, ``caption`` and one
	    entry per language listed in ``config.langs``; ``image_url`` is optional.
	    Records whose image file does not exist on disk are skipped.
	    """

	    VERSION = datasets.Version("0.0.0")

	    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
	    BUILDER_CONFIGS = [
	        ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], prefix_before_image_fn=False, zfill=12, langs=['en', 'fr']),
	        ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr']),
	        ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr']),
	    ]
	    DEFAULT_CONFIG_NAME = "coco_2017"

	    def _info(self):
	        """Return the ``datasets.DatasetInfo`` for the selected configuration.

	        The columns are ``image_id``, ``id``, ``caption``, one string column
	        per language in ``config.langs``, then ``image_url`` and ``image_file``.
	        """
	        feature_dict = {
	            "image_id": datasets.Value("int64"),
	            "id": datasets.Value("int64"),
	            "caption": datasets.Value("string"),
	        }
	        # The language columns depend on the configuration, so they are added dynamically.
	        for lang in self.config.langs:
	            feature_dict[lang] = datasets.Value("string")
	        feature_dict["image_url"] = datasets.Value("string")
	        feature_dict["image_file"] = datasets.Value("string")

	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(feature_dict),
	            # No canonical (input, target) pair is declared.
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Return one ``SplitGenerator`` per recognised name in ``config.splits``.

	        ``dl_manager`` is unused: the data is expected to already be present
	        under ``config.data_dir``. Recognised names are ``train``, ``test`` and
	        the validation aliases ``val`` / ``valid`` / ``validation`` / ``dev``
	        (all of which read the ``valid`` sub-directory). Any other name is
	        silently ignored.
	        """
	        data_dir = self.config.data_dir

	        # Requested split name -> (datasets split, sub-directory name on disk).
	        split_table = {
	            'train': (datasets.Split.TRAIN, 'train'),
	            'val': (datasets.Split.VALIDATION, 'valid'),
	            'valid': (datasets.Split.VALIDATION, 'valid'),
	            'validation': (datasets.Split.VALIDATION, 'valid'),
	            'dev': (datasets.Split.VALIDATION, 'valid'),
	            'test': (datasets.Split.TEST, 'test'),
	        }

	        generators = []
	        for requested in self.config.splits:
	            resolved = split_table.get(requested)
	            if resolved is None:
	                # Unknown split names are skipped rather than treated as errors.
	                continue

	            split_name, subdir = resolved
	            generators.append(
	                datasets.SplitGenerator(
	                    name=split_name,
	                    # These kwargs are passed to `_generate_examples`.
	                    gen_kwargs={
	                        "jsonl_dir": os.path.join(data_dir, f'{self.config.name}_jsonls', subdir),
	                        "image_dir": os.path.join(data_dir, f'{self.config.name}_images', subdir),
	                        "split": subdir,
	                    },
	                )
	            )

	        return generators

	    def _generate_examples(
	        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
	        self, jsonl_dir, image_dir, split
	    ):
	        """Yield ``(key, example)`` tuples for one split.

	        Args:
	            jsonl_dir: Directory scanned (non-recursively) for regular files
	                whose name ends in ``jsonl``.
	            image_dir: Directory expected to contain the image files.
	            split: Split name used in the image file name prefix; ``dev`` is
	                treated as ``valid``.

	        Keys are integers that keep increasing across all JSONL files of the
	        split, so they stay unique even when the directory holds several
	        files. Files are processed in sorted path order so that the example
	        order does not depend on the order returned by ``os.listdir``.
	        """
	        if split == 'dev':
	            split = 'valid'

	        jsonl_files = sorted(
	            os.path.join(jsonl_dir, fn)
	            for fn in os.listdir(jsonl_dir)
	            if os.path.isfile(os.path.join(jsonl_dir, fn)) and fn.endswith("jsonl")
	        )

	        # Running key shared by all files. It advances for every line read
	        # (also for skipped ones), so with a single file without blank lines
	        # the key of an example is still its line index.
	        next_key = 0

	        for jsonl_file in jsonl_files:

	            with open(jsonl_file, 'r', encoding='UTF-8') as fp:

	                for line in fp:
	                    key = next_key
	                    next_key += 1

	                    # A blank line is not a JSON document; skip it instead of crashing.
	                    if not line.strip():
	                        continue

	                    ex = json.loads(line)

	                    example = {
	                        "image_id": ex['image_id'],
	                        "id": ex["id"],
	                        "caption": ex["caption"],
	                    }

	                    for lang in self.config.langs:
	                        example[lang] = ex[lang]

	                    # `image_url` is optional in the records; default to an empty string.
	                    example['image_url'] = ex['image_url'] if 'image_url' in ex else ''

	                    # Image file name: zero-padded image id, optionally prefixed
	                    # with "<config name>_<split>_".
	                    fn = f'{str(ex["image_id"]).zfill(self.config.zfill)}.jpg'
	                    if self.config.prefix_before_image_fn:
	                        fn = f'{self.config.name}_{split}_' + fn

	                    image_file = os.path.join(image_dir, fn)
	                    example['image_file'] = image_file

	                    # Captions without a matching image on disk are dropped.
	                    if not os.path.isfile(image_file):
	                        continue

	                    yield key, example