vit-gpt2 / wit_dataset_script.py

ydshieh

add codes

a244e91 almost 3 years ago

No virus

5.26 kB

	import csv
	import json
	import os

	import datasets
	import pandas as pd
	import numpy as np


	# TODO: replace this template entry with the dataset's real BibTeX citation
	# (for instance the one published on arXiv or on the dataset's repo/website).
	_CITATION = """\
	@InProceedings{huggingface:dataset,
	title = {A great new dataset},
	author={huggingface, Inc.
	},
	year={2020}
	}
	"""

	# TODO: replace this template text with a real description of the dataset
	# (copying the official description is fine).
	_DESCRIPTION = """\
	This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
	"""

	# TODO: link to the dataset's official homepage (still empty).
	_HOMEPAGE = ""

	# TODO: licence of the dataset, if one can be found (still empty).
	_LICENSE = ""

	# TODO: official download URLs for the dataset (none are registered yet).
	# The HuggingFace datasets library doesn't host the data itself; it only points
	# to the original files. This may be an arbitrarily nested dict/list of URLs.
	_URLs = {}


	# TODO: The dataset class name usually matches the script name, in CamelCase instead of snake_case
	class WITDataset(datasets.GeneratorBasedBuilder):
	    """Builder for a WIT image/caption dataset stored on the local disk.

	    NOTE(review): "WIT" presumably refers to the Wikipedia-based Image Text
	    dataset -- confirm and fill in the module-level citation/description.

	    Nothing is downloaded. The directory given as ``data_dir`` must contain
	    one sub-directory per split (``train``, ``test`` and ``dev``), each with a
	    tab-separated ``<split>.tsv`` file. Every example also carries the paths
	    ``images/<id>.jpg`` and ``numpy/<id>.npy`` inside the split directory;
	    the builder only builds these paths and never opens the files.
	    """

	    VERSION = datasets.Version("1.1.0")

	    # NOTE(review): no BUILDER_CONFIGS are declared in this class, so check
	    # that "en" is actually the intended configuration name.
	    DEFAULT_CONFIG_NAME = "en"

	    # (split identifier, name shared by the split's directory and its TSV stem),
	    # in the order the splits are returned by `_split_generators`.
	    _SPLITS = (
	        (datasets.Split.TRAIN, "train"),
	        (datasets.Split.TEST, "test"),
	        (datasets.Split.VALIDATION, "dev"),
	    )

	    def _info(self):
	        """Return the ``datasets.DatasetInfo`` holding the example schema."""
	        features = datasets.Features(
	            {
	                "id": datasets.Value("int64"),
	                "lang": datasets.Value("string"),
	                "caption": datasets.Value("string"),
	                "context": datasets.Value("string"),
	                "image_url": datasets.Value("string"),
	                "page_url": datasets.Value("string"),
	                "image_file": datasets.Value("string"),
	                "pixels_file": datasets.Value("string"),
	            }
	        )

	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=features,
	            # No (input, target) pair is declared for this dataset.
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Return one ``SplitGenerator`` per split located under ``data_dir``.

	        Args:
	            dl_manager: unused; the data is read from ``self.config.data_dir``.

	        Raises:
	            ValueError: if no ``data_dir`` was configured.
	        """
	        data_dir = self.config.data_dir
	        if data_dir is None:
	            # Fail with an actionable message instead of the opaque TypeError
	            # that os.path.join(None, ...) would raise.
	            raise ValueError(
	                "WITDataset reads local files only: pass `data_dir=...` "
	                "pointing at the directory that contains the split folders."
	            )

	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                # These kwargs are passed to `_generate_examples`.
	                gen_kwargs={
	                    "data_dir": os.path.join(data_dir, folder),
	                    "split": folder,
	                },
	            )
	            for split_name, folder in self._SPLITS
	        ]

	    def _generate_examples(self, data_dir, split):
	        """Yield ``(key, example)`` pairs read from ``<data_dir>/<split>.tsv``.

	        Columns are taken by position: 0 = id, 1 = language, 2 = image URL,
	        3 = page URL, 4 = caption, 5 = context. Rows whose caption or context
	        is not a string (for instance an empty cell parsed as NaN) are
	        skipped. The key is the row's index in the parsed file, so keys stay
	        unique even when rows are skipped.

	        Args:
	            data_dir: directory of one split, as built in `_split_generators`.
	            split: stem of the TSV file inside ``data_dir``.
	        """
	        df = pd.read_csv(os.path.join(data_dir, f"{split}.tsv"), sep="\t")

	        # Plain tuples give true positional access; integer indexing on the
	        # label-indexed rows produced by iterrows() is deprecated in pandas.
	        for key, *fields in df.itertuples(index=True, name=None):
	            example_id, lang, image_url, page_url, caption, context = fields[:6]

	            # Null caption or context: nothing to learn from this row.
	            if not (isinstance(caption, str) and isinstance(context, str)):
	                continue

	            yield key, {
	                "id": example_id,
	                "lang": lang,
	                "caption": caption,
	                "context": context,
	                "image_url": image_url,
	                "page_url": page_url,
	                "image_file": os.path.join(data_dir, "images", f"{example_id}.jpg"),
	                "pixels_file": os.path.join(data_dir, "numpy", f"{example_id}.npy"),
	            }