ydshieh committed on
Commit
0bf3646
1 Parent(s): 64afcd5

Update dataset script

Browse files
Files changed (1) hide show
  1. coco_dataset_script.py +101 -65
coco_dataset_script.py CHANGED
@@ -7,21 +7,31 @@ import pandas as pd
7
  import numpy as np
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
10
  # TODO: Add BibTeX citation
11
  # Find for instance the citation on arxiv or on the dataset repo/website
12
  _CITATION = """\
13
- @InProceedings{huggingface:dataset,
14
- title = {A great new dataset},
15
- author={huggingface, Inc.
16
- },
17
- year={2020}
18
  }
19
  """
20
 
21
  # TODO: Add description of the dataset here
22
  # You can copy an official description
23
  _DESCRIPTION = """\
24
- This new dataset is designed to solve this great NLP task and is crafted with a lot of care.
25
  """
26
 
27
  # TODO: Add a link to an official homepage for the dataset here
@@ -33,31 +43,37 @@ _LICENSE = ""
33
  # TODO: Add link to the official dataset URLs here
34
  # The HuggingFace dataset library don't host the datasets but only point to the original files
35
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
36
- _URLs = {
37
- }
38
 
39
 
40
  # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
41
- class COCODataset(datasets.GeneratorBasedBuilder):
42
  """TODO: Short description of my dataset."""
43
 
44
- VERSION = datasets.Version("1.1.0")
45
 
46
- DEFAULT_CONFIG_NAME = "en"
 
 
 
 
 
 
47
 
48
  def _info(self):
49
  # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
50
 
51
- features = datasets.Features(
52
- {
53
- "id": datasets.Value("int64"),
54
- "en": datasets.Value("string"),
55
- "fr": datasets.Value("string"),
56
- "image_id": datasets.Value("int64"),
57
- "image_file": datasets.Value("string")
58
- # These are the features of your dataset like images, labels ...
59
- }
60
- )
 
61
 
62
  return datasets.DatasetInfo(
63
  # This is the description that will appear on the datasets page.
@@ -83,60 +99,80 @@ class COCODataset(datasets.GeneratorBasedBuilder):
83
 
84
  data_dir = self.config.data_dir
85
 
86
- return [
87
- datasets.SplitGenerator(
88
- name=datasets.Split.TRAIN,
89
- # These kwargs will be passed to _generate_examples
90
- gen_kwargs={
91
- "data_dir": data_dir,
92
- "split": "train",
93
- },
94
- ),
95
- datasets.SplitGenerator(
96
- name=datasets.Split.TEST,
97
- # These kwargs will be passed to _generate_examples
98
- gen_kwargs={
99
- "data_dir": data_dir,
100
- "split": "test"
101
- },
102
- ),
103
- datasets.SplitGenerator(
104
- name=datasets.Split.VALIDATION,
105
- # These kwargs will be passed to _generate_examples
106
- gen_kwargs={
107
- "data_dir": data_dir,
108
- "split": "val",
109
- },
110
- ),
111
- ]
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _generate_examples(
114
- self, data_dir, split # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
 
115
  ):
116
  """ Yields examples as (key, example) tuples. """
117
  # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
118
  # The `key` is here for legacy reason (tfds) and is not important in itself.
119
 
120
- # /home/33611/caption/
121
- # train2014
122
-
123
  if split == 'dev':
124
- split == 'val'
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- with open(os.path.join(data_dir, f'{split}.json')) as fp:
127
- examples = json.load(fp)
128
 
129
- for id_, ex in enumerate(examples):
 
 
 
130
 
131
- image_id = ex["image_id"]
132
- fn = f'COCO_{split}2014_{str(image_id).zfill(12)}.jpg'
133
 
134
- image_file = os.path.join(data_dir, f'{split}2014', fn)
 
135
 
136
- yield id_, {
137
- "id": ex["id"],
138
- "en": ex["caption"],
139
- "fr": ex["fr"],
140
- "image_id": ex["image_id"],
141
- "image_file": image_file
142
- }
 
7
  import numpy as np
8
 
9
 
10
class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """BuilderConfig for image-caption datasets (coco_2017, cc3m, cc12m).

    Extends ``datasets.BuilderConfig`` with the per-dataset settings this
    script needs to locate data files and declare features.
    """

    def __init__(self, name, splits, zfill, langs, **kwargs):
        """
        Args:
            name: config name; also used as the prefix of the JSONL/image paths.
            splits: split names to generate (e.g. ``['train', 'valid']``).
            zfill: zero-pad width used when deriving image file names from
                ``image_id``.
            langs: language keys expected in every JSONL example; one string
                feature is declared per language.
            **kwargs: forwarded unchanged to ``datasets.BuilderConfig``.
        """

        super().__init__(name, **kwargs)

        # Stored for later use by _info, _split_generators and _generate_examples.
        self.splits = splits
        self.zfill = zfill
        self.langs = langs
+
21
  # TODO: Add BibTeX citation
22
  # Find for instance the citation on arxiv or on the dataset repo/website
23
  _CITATION = """\
24
+ @InProceedings{None,
25
+ title = {Generic images to captions dataset},
26
+ author={Yih-Dar SHIEH},
27
+ year={2020}
 
28
  }
29
  """
30
 
31
  # TODO: Add description of the dataset here
32
  # You can copy an official description
33
  _DESCRIPTION = """\
34
+
35
  """
36
 
37
  # TODO: Add a link to an official homepage for the dataset here
 
43
  # TODO: Add link to the official dataset URLs here
44
  # The HuggingFace dataset library don't host the datasets but only point to the original files
45
  # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
46
+ _URLs = {}
 
47
 
48
 
49
  # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
50
+ class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
51
  """TODO: Short description of my dataset."""
52
 
53
+ VERSION = datasets.Version("0.0.0")
54
 
55
+ BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
56
+ BUILDER_CONFIGS = [
57
+ ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
58
+ ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
59
+ ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr'])
60
+ ]
61
+ DEFAULT_CONFIG_NAME = "coco_2017"
62
 
63
  def _info(self):
64
  # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
65
 
66
+ feature_dict = {
67
+ "image_id": datasets.Value("int64"),
68
+ "id": datasets.Value("int64"),
69
+ "caption": datasets.Value("string"),
70
+ }
71
+ for lang in self.config.langs:
72
+ feature_dict[lang] = datasets.Value("string")
73
+ feature_dict["image_url"] = datasets.Value("string")
74
+ feature_dict["image_file"] = datasets.Value("string")
75
+
76
+ features = datasets.Features(feature_dict)
77
 
78
  return datasets.DatasetInfo(
79
  # This is the description that will appear on the datasets page.
 
99
 
100
  data_dir = self.config.data_dir
101
 
102
+ splits = []
103
+ for split in self.config.splits:
104
+ if split == 'train':
105
+ dataset = datasets.SplitGenerator(
106
+ name=datasets.Split.TRAIN,
107
+ # These kwargs will be passed to _generate_examples
108
+ gen_kwargs={
109
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_train.jsonl'),
110
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_train'),
111
+ "split": "train",
112
+ }
113
+ )
114
+ elif split in ['val', 'valid', 'validation', 'dev']:
115
+ dataset = datasets.SplitGenerator(
116
+ name=datasets.Split.VALIDATION,
117
+ # These kwargs will be passed to _generate_examples
118
+ gen_kwargs={
119
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_valid.jsonl'),
120
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_valid'),
121
+ "split": "valid",
122
+ },
123
+ )
124
+ elif split == 'test':
125
+ dataset = datasets.SplitGenerator(
126
+ name=datasets.Split.TEST,
127
+ # These kwargs will be passed to _generate_examples
128
+ gen_kwargs={
129
+ "jsonl_file": os.path.join(data_dir, f'{self.config.name}_translated_test.jsonl'),
130
+ "image_dir": os.path.join(data_dir, f'{self.config.name}_images', f'{self.config.name}_test'),
131
+ "split": "test",
132
+ },
133
+ )
134
+ else:
135
+ continue
136
+
137
+ splits.append(dataset)
138
+
139
+ return splits
140
 
141
  def _generate_examples(
142
+ # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
143
+ self, jsonl_file, image_dir, split
144
  ):
145
  """ Yields examples as (key, example) tuples. """
146
  # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
147
  # The `key` is here for legacy reason (tfds) and is not important in itself.
148
 
 
 
 
149
  if split == 'dev':
150
+ split = 'valid'
151
+
152
+ with open(jsonl_file, 'r', encoding='UTF-8') as fp:
153
+
154
+ for id_, line in enumerate(fp):
155
+
156
+ ex = json.loads(line)
157
+
158
+ example = {
159
+ "image_id": ex['image_id'],
160
+ "id": ex["id"],
161
+ "caption": ex["caption"],
162
+ }
163
 
164
+ for lang in self.config.langs:
165
+ example[lang] = ex[lang]
166
 
167
+ if 'image_url' in ex:
168
+ example['image_url'] = ex['image_url']
169
+ else:
170
+ example['image_url'] = ''
171
 
172
+ # fn = f'{self.config.name}_{split}_{str(image_id).zfill(self.config.zfill)}.jpg'
173
+ fn = f'{str(ex["image_id"]).zfill(self.config.zfill)}.jpg'
174
 
175
+ image_file = os.path.join(image_dir, fn)
176
+ example['image_file'] = image_file
177
 
178
+ yield id_, example