ydshieh
/

flax-vision-encoder-decoder-vit-gpt2-coco-en

Model card | Files and versions | Community

ydshieh committed on Dec 19, 2021

Commit

bf29ca1

•

1 Parent(s): a651689

improve

Browse files

Files changed (1) hide show

coco_dataset/coco_dataset.py +15 -6

coco_dataset/coco_dataset.py CHANGED Viewed

@@ -113,6 +113,15 @@ class COCODataset(datasets.GeneratorBasedBuilder):
                 "This script is supposed to work with local (downloaded) COCO dataset. The argument `data_dir` in `load_dataset()` is required."
             )
         splits = []
         for split in self.config.splits:
             if split == 'train':
@@ -120,8 +129,8 @@ class COCODataset(datasets.GeneratorBasedBuilder):
                     name=datasets.Split.TRAIN,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
-                        "json_path": os.path.join(data_dir, f"captions_train{self.config.name}.json"),
-                        "image_dir": os.path.join(data_dir, f'train{self.config.name}'),
                         "split": "train",
                     }
                 )
@@ -130,8 +139,8 @@ class COCODataset(datasets.GeneratorBasedBuilder):
                     name=datasets.Split.VALIDATION,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
-                        "json_path": os.path.join(data_dir, f"captions_val{self.config.name}.json"),
-                        "image_dir": os.path.join(data_dir, f'val{self.config.name}'),
                         "split": "valid",
                     },
                 )
@@ -140,8 +149,8 @@ class COCODataset(datasets.GeneratorBasedBuilder):
                     name=datasets.Split.TEST,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
-                        "json_path": os.path.join(data_dir, f'image_info_test{self.config.name}.json'),
-                        "image_dir": os.path.join(data_dir, f'test{self.config.name}'),
                         "split": "test",
                     },
                 )

                 "This script is supposed to work with local (downloaded) COCO dataset. The argument `data_dir` in `load_dataset()` is required."
             )
+        _DL_URLS = {
+            "train": os.path.join(data_dir, "train2017.zip"),
+            "val": os.path.join(data_dir, "val2017.zip"),
+            "test": os.path.join(data_dir, "test2017.zip"),
+            "annotations_trainval": os.path.join(data_dir, "annotations_trainval2017.zip"),
+            "image_info_test": os.path.join(data_dir, "image_info_test2017.zip"),
+        }
+        archive_path = dl_manager.download_and_extract(_DL_URLS)
         splits = []
         for split in self.config.splits:
             if split == 'train':
                     name=datasets.Split.TRAIN,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
+                        "json_path": os.path.join(archive_path["annotations_trainval"], "annotations", "captions_train2017.json"),
+                        "image_dir": os.path.join(archive_path["train"], "train2017"),
                         "split": "train",
                     }
                 )
                     name=datasets.Split.VALIDATION,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
+                        "json_path": os.path.join(archive_path["annotations_trainval"], "annotations", "captions_val2017.json"),
+                        "image_dir": os.path.join(archive_path["val"], "val2017"),
                         "split": "valid",
                     },
                 )
                     name=datasets.Split.TEST,
                     # These kwargs will be passed to _generate_examples
                     gen_kwargs={
+                        "json_path": os.path.join(archive_path["image_info_test"], "annotations", "image_info_test2017.json"),
+                        "image_dir": os.path.join(archive_path["test"], "test2017"),
                         "split": "test",
                     },
                 )