ydshieh committed on
Commit
0bf3646
1 Parent(s): 64afcd5

Update dataset script

Browse files
Files changed (1) hide show
  1. coco_dataset_script.py +101 -65
coco_dataset_script.py CHANGED
@@ -7,21 +7,31 @@ import pandas as pd
7
  import numpy as np
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
10
  # TODO: Add BibTeX citation
11
  # Find for instance the citation on arxiv or on the dataset repo/website
12
  _CITATION = """\
13
- @InProceedings{huggingface:dataset,
14
- title = {A great new dataset},
15
- author={huggingface, Inc.
16
- },
17
- year={2020}
18
  }
19
  """
20
 
21
  # TODO: Add description of the dataset here
22
  # You can copy an official description
23
  _DESCRIPTION = """\
24
- This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
25
  """
26
 
27
  # TODO: Add a link to an official homepage for the dataset here
@@ -33,31 +43,37 @@ _LICENSE = ""
33
  # TODO: Add link to the official dataset URLs here
34
  # The HuggingFace dataset library don't host the datasets but only point to the original files
35
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
36
- _URLs = {
37
- }
38
 
39
 
40
  # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
41
- class COCODataset(datasets.GeneratorBasedBuilder):
42
  """TODO: Short description of my dataset."""
43
 
44
- VERSION = datasets.Version("1.1.0")
45
 
46
- DEFAULT_CONFIG_NAME = "en"
 
 
 
 
 
 
47
 
48
  def _info(self):
49
  # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
50
 
51
- features = datasets.Features(
52
- {
53
- "id": datasets.Value("int64"),
54
- "en": datasets.Value("string"),
55
- "fr": datasets.Value("string"),
56
- "image_id": datasets.Value("int64"),
57
- "image_file": datasets.Value("string")
58
- # These are the features of your dataset like images, labels ...
59
- }
60
- )
 
61
 
62
  return datasets.DatasetInfo(
63
  # This is the description that will appear on the datasets page.
@@ -83,60 +99,80 @@ class COCODataset(datasets.GeneratorBasedBuilder):
83
 
84
  data_dir = self.config.data_dir
85
 
86
- return [
87
- datasets.SplitGenerator(
88
- name=datasets.Split.TRAIN,
89
- # These kwargs will be passed to _generate_examples
90
- gen_kwargs={
91
- "data_dir": data_dir,
92
- "split": "train",
93
- },
94
- ),
95
- datasets.SplitGenerator(
96
- name=datasets.Split.TEST,
97
- # These kwargs will be passed to _generate_examples
98
- gen_kwargs={
99
- "data_dir": data_dir,
100
- "split": "test"
101
- },
102
- ),
103
- datasets.SplitGenerator(
104
- name=datasets.Split.VALIDATION,
105
- # These kwargs will be passed to _generate_examples
106
- gen_kwargs={
107
- "data_dir": data_dir,
108
- "split": "val",
109
- },
110
- ),
111
- ]
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _generate_examples(
114
- self, data_dir, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
 
115
  ):
116
  """ Yields examples as (key, example) tuples. """
117
  # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
118
  # The `key` is here for legacy reason (tfds) and is not important in itself.
119
 
120
- # /home/33611/caption/
121
- # train2014
122
-
123
  if split == 'dev':
124
- split == 'val'
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- with open(os.path.join(data_dir, f'{split}.json')) as fp:
127
- examples = json.load(fp)
128
 
129
- for id_, ex in enumerate(examples):
 
 
 
130
 
131
- image_id = ex["image_id"]
132
- fn = f'COCO_{split}2014_{str(image_id).zfill(12)}.jpg'
133
 
134
- image_file = os.path.join(data_dir, f'{split}2014', fn)
 
135
 
136
- yield id_, {
137
- "id": ex["id"],
138
- "en": ex["caption"],
139
- "fr": ex["fr"],
140
- "image_id": ex["image_id"],
141
- "image_file": image_file
142
- }
 
7
  import numpy as np
8
 
9
 
10
class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """BuilderConfig for image-caption datasets (coco_2017, cc3m, cc12m).

    Extends ``datasets.BuilderConfig`` with the per-dataset settings this
    script needs to locate data files and declare features.
    """

    def __init__(self, name, splits, zfill, langs, **kwargs):
        """
        Args:
            name: config name; also used as the prefix of the JSONL/image paths.
            splits: split names to generate (e.g. ``['train', 'valid']``).
            zfill: zero-pad width used when deriving image file names from
                ``image_id``.
            langs: language keys expected in every JSONL example; one string
                feature is declared per language.
            **kwargs: forwarded unchanged to ``datasets.BuilderConfig``.
        """

        super().__init__(name, **kwargs)

        # Stored for later use by _info, _split_generators and _generate_examples.
        self.splits = splits
        self.zfill = zfill
        self.langs = langs
+
21
  # TODO: Add BibTeX citation
22
  # Find for instance the citation on arxiv or on the dataset repo/website
23
  _CITATION = """\
24
+ @InProceedings{None,
25
+ title = {Generic images to captions dataset},
26
+ author={Yih-Dar SHIEH},
27
+ year={2020}
 
28
  }
29
  """
30
 
31
  # TODO: Add description of the dataset here
32
  # You can copy an official description
33
  _DESCRIPTION = """\
34
+
35
  """
36
 
37
  # TODO: Add a link to an official homepage for the dataset here
 
43
  # TODO: Add link to the official dataset URLs here
44
  # The HuggingFace dataset library don't host the datasets but only point to the original files
45
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
46
+ _URLs = {}
 
47
 
48
 
49
  # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
50
+ class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
51
  """TODO: Short description of my dataset."""
52
 
53
+ VERSION = datasets.Version("0.0.0")
54
 
55
+ BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
56
+ BUILDER_CONFIGS = [
57
+ ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
58
+ ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
59
+ ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr'])
60
+ ]
61
+ DEFAULT_CONFIG_NAME = "coco_2017"
62
 
63
  def _info(self):
64
  # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
65
 
66
+ feature_dict = {
67
+ "image_id": datasets.Value("int64"),
68
+ "id": datasets.Value("int64"),
69
+ "caption": datasets.Value("string"),
70
+ }
71
+ for lang in self.config.langs:
72
+ feature_dict[lang] = datasets.Value("string")
73
+ feature_dict["image_url"] = datasets.Value("string")
74
+ feature_dict["image_file"] = datasets.Value("string")
75
+
76
+ features = datasets.Features(feature_dict)
77
 
78
  return datasets.DatasetInfo(
79
  # This is the description that will appear on the datasets page.
 
99
 
100
  data_dir = self.config.data_dir
101
 
102
+ splits = []
103
+ for split in self.config.splits:
104
+ if split == 'train':
105
+ dataset = datasets.SplitGenerator(
106
+ name=datasets.Split.TRAIN,
107
+ # These kwargs will be passed to _generate_examples
108
+ gen_kwargs={
109
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_train.jsonl'),
110
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_train'),
111
+ "split": "train",
112
+ }
113
+ )
114
+ elif split in ['val', 'valid', 'validation', 'dev']:
115
+ dataset = datasets.SplitGenerator(
116
+ name=datasets.Split.VALIDATION,
117
+ # These kwargs will be passed to _generate_examples
118
+ gen_kwargs={
119
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_valid.jsonl'),
120
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_valid'),
121
+ "split": "valid",
122
+ },
123
+ )
124
+ elif split == 'test':
125
+ dataset = datasets.SplitGenerator(
126
+ name=datasets.Split.TEST,
127
+ # These kwargs will be passed to _generate_examples
128
+ gen_kwargs={
129
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_test.jsonl'),
130
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_test'),
131
+ "split": "test",
132
+ },
133
+ )
134
+ else:
135
+ continue
136
+
137
+ splits.append(dataset)
138
+
139
+ return splits
140
 
141
  def _generate_examples(
142
+ # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
143
+ self, jsonl_file, image_dir, split
144
  ):
145
  """ Yields examples as (key, example) tuples. """
146
  # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
147
  # The `key` is here for legacy reason (tfds) and is not important in itself.
148
 
 
 
 
149
  if split == 'dev':
150
+ split = 'valid'
151
+
152
+ with open(jsonl_file, 'r', encoding='UTF-8') as fp:
153
+
154
+ for id_, line in enumerate(fp):
155
+
156
+ ex = json.loads(line)
157
+
158
+ example = {
159
+ "image_id": ex['image_id'],
160
+ "id": ex["id"],
161
+ "caption": ex["caption"],
162
+ }
163
 
164
+ for lang in self.config.langs:
165
+ example[lang] = ex[lang]
166
 
167
+ if 'image_url' in ex:
168
+ example['image_url'] = ex['image_url']
169
+ else:
170
+ example['image_url'] = ''
171
 
172
+ # fn = f'{self.config.name}_{split}_{str(image_id).zfill(self.config.zfill)}.jpg'
173
+ fn = f'{str(ex["image_id"]).zfill(self.config.zfill)}.jpg'
174
 
175
+ image_file = os.path.join(image_dir, fn)
176
+ example['image_file'] = image_file
177
 
178
+ yield id_, example