ydshieh committed on
Commit
ae50e80
1 Parent(s): 0bf3646

remove dataset script

Browse files
Files changed (1) hide show
  1. coco_dataset_script.py +0 -178
coco_dataset_script.py DELETED
@@ -1,178 +0,0 @@
1
- import csv
2
- import json
3
- import os
4
-
5
- import datasets
6
- import pandas as pd
7
- import numpy as np
8
-
9
-
10
class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """Builder configuration describing one image-captioning corpus.

    Besides the fields handled by ``datasets.BuilderConfig``, it keeps three
    values that the dataset builder in this file reads back from
    ``self.config``: ``splits``, ``zfill`` and ``langs``.
    """

    def __init__(self, name, splits, zfill, langs, **kwargs):
        """Store the corpus-specific settings.

        Args:
            name: Configuration name; forwarded to ``datasets.BuilderConfig``.
                The builder also uses it as a prefix in file/directory names.
            splits: Iterable of split names the builder should generate.
            zfill: Width to which ``image_id`` is zero-padded when the builder
                forms an image file name.
            langs: Iterable of language codes; the builder creates one string
                column per code.
            **kwargs: Any remaining options, forwarded unchanged to the base
                configuration class.
        """
        # The base class takes care of `name` and every generic option.
        super().__init__(name, **kwargs)

        self.splits, self.zfill, self.langs = splits, zfill, langs
19
-
20
-
21
- # TODO: Add BibTeX citation
22
- # Find for instance the citation on arxiv or on the dataset repo/website
23
- _CITATION = """\
24
- @InProceedings{None,
25
- title = {Generic images to captions dataset},
26
- author={Yih-Dar SHIEH},
27
- year={2020}
28
- }
29
- """
30
-
31
- # TODO: Add description of the dataset here
32
- # You can copy an official description
33
- _DESCRIPTION = """\
34
-
35
- """
36
-
37
- # TODO: Add a link to an official homepage for the dataset here
38
- _HOMEPAGE = ""
39
-
40
- # TODO: Add the licence for the dataset here if you can find it
41
- _LICENSE = ""
42
-
43
- # TODO: Add link to the official dataset URLs here
44
- # The HuggingFace dataset library don't host the datasets but only point to the original files
45
- # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
46
- _URLs = {}
47
-
48
-
49
- # TODO: The name of the dataset class usually matches the script name, written in CamelCase instead of snake_case
50
class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads image captions from local JSON Lines files.

    For a configuration named ``{name}`` and a split suffix ``{split}``
    (``train``, ``valid`` or ``test``), records are read from
    ``{data_dir}/{name}_translated_{split}.jsonl`` and every record is given an
    ``image_file`` path of the form
    ``{data_dir}/{name}_images/{name}_{split}/<zero-padded image_id>.jpg``.
    The image files themselves are never opened by this builder.
    """

    VERSION = datasets.Version("0.0.0")

    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
    BUILDER_CONFIGS = [
        ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], zfill=12, langs=['en', 'fr']),
    ]
    DEFAULT_CONFIG_NAME = "coco_2017"

    # Every accepted spelling of a split name, mapped to the `datasets` split
    # it produces and the suffix used in the file and directory names.
    _SPLIT_ALIASES = {
        'train': (datasets.Split.TRAIN, 'train'),
        'val': (datasets.Split.VALIDATION, 'valid'),
        'valid': (datasets.Split.VALIDATION, 'valid'),
        'validation': (datasets.Split.VALIDATION, 'valid'),
        'dev': (datasets.Split.VALIDATION, 'valid'),
        'test': (datasets.Split.TEST, 'test'),
    }

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata.

        The columns are ``image_id`` and ``id`` (int64), ``caption`` (string),
        one string column per language code in ``self.config.langs``, and the
        string columns ``image_url`` and ``image_file``.
        """
        feature_dict = {
            "image_id": datasets.Value("int64"),
            "id": datasets.Value("int64"),
            "caption": datasets.Value("string"),
        }
        # One text column per configured language code.
        for lang in self.config.langs:
            feature_dict[lang] = datasets.Value("string")
        feature_dict["image_url"] = datasets.Value("string")
        feature_dict["image_file"] = datasets.Value("string")

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # The columns of the dataset and their types.
            features=datasets.Features(feature_dict),
            # No common (input, target) pair is declared for this dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation.
            homepage=_HOMEPAGE,
            # License for the dataset if available.
            license=_LICENSE,
            # Citation for the dataset.
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one ``SplitGenerator`` per recognised entry of ``config.splits``.

        Nothing is downloaded: ``dl_manager`` is not used and all paths are
        built under ``self.config.data_dir``. Split names that are not listed
        in ``_SPLIT_ALIASES`` are skipped silently.

        Raises:
            ValueError: If ``self.config.data_dir`` is ``None``.
        """
        data_dir = self.config.data_dir
        if data_dir is None:
            # Without this check os.path.join fails with an opaque TypeError.
            raise ValueError(
                "This dataset reads local files only: `data_dir` must point to the directory "
                "that contains the '<config_name>_translated_<split>.jsonl' files."
            )

        name = self.config.name
        generators = []
        for requested in self.config.splits:
            target = self._SPLIT_ALIASES.get(requested)
            if target is None:
                # Unknown split names are ignored rather than treated as errors.
                continue
            split_name, suffix = target
            generators.append(
                datasets.SplitGenerator(
                    name=split_name,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        "jsonl_file": os.path.join(data_dir, f'{name}_translated_{suffix}.jsonl'),
                        "image_dir": os.path.join(data_dir, f'{name}_images', f'{name}_{suffix}'),
                        "split": suffix,
                    },
                )
            )

        return generators

    def _generate_examples(
        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
        self, jsonl_file, image_dir, split
    ):
        """Yield ``(key, example)`` tuples read from a JSON Lines file.

        Args:
            jsonl_file: Path of the UTF-8 file holding one JSON object per line.
                Each object must provide ``image_id``, ``id``, ``caption`` and
                one entry per language code in ``self.config.langs``;
                ``image_url`` is optional and defaults to an empty string.
            image_dir: Directory that is prefixed to the generated image file name.
            split: Split suffix supplied by ``_split_generators``. It is accepted
                for interface compatibility but not used to build the examples.

        Yields:
            Tuples ``(line_index, example)`` where ``line_index`` is the
            zero-based position of the line in ``jsonl_file``.
        """
        langs = self.config.langs
        width = self.config.zfill

        with open(jsonl_file, 'r', encoding='UTF-8') as fp:
            for id_, line in enumerate(fp):
                # Tolerate blank lines (e.g. a trailing newline at the end of
                # the file) instead of failing inside json.loads.
                if not line.strip():
                    continue

                ex = json.loads(line)

                example = {
                    "image_id": ex['image_id'],
                    "id": ex["id"],
                    "caption": ex["caption"],
                }
                for lang in langs:
                    example[lang] = ex[lang]

                example['image_url'] = ex.get('image_url', '')

                # The image file name is the zero-padded image id.
                fn = f'{str(ex["image_id"]).zfill(width)}.jpg'
                example['image_file'] = os.path.join(image_dir, fn)

                yield id_, example