# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # TODO: Address all TODOs and remove all explanatory comments """TODO: Add a description here.""" import zipfile import os import datasets from PIL import Image from io import BytesIO # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case class sdbias(datasets.GeneratorBasedBuilder): """TODO: Short description of my dataset.""" VERSION = datasets.Version("1.1.0") # This is an example of a dataset with multiple configurations. # If you don't want/need to define several sub-sets in your dataset, # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. # If you need to make complex sub-parts in the datasets with configurable options # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig # BUILDER_CONFIG_CLASS = MyBuilderConfig # You will be able to load one or the other configurations in the following list with # data = datasets.load_dataset('my_dataset', 'first_domain') # data = datasets.load_dataset('my_dataset', 'second_domain') BUILDER_CONFIGS = [ datasets.BuilderConfig(name="first_domain", version=VERSION, description="This part of my dataset covers a first domain"), ] DEFAULT_CONFIG_NAME = "first_domain" # It's not mandatory to have a default configuration. Just use one if it make sense. def _info(self): if self.config.name == "first_domain": # This is the name of the configuration selected in BUILDER_CONFIGS above features = datasets.Features( { "adjective": datasets.Value("string"), "profession": datasets.Value("string"), "seed": datasets.Value("int32"), "image": datasets.Image() # These are the features of your dataset like images, labels ... } ) return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description="bla", # This defines the different columns of the dataset and their types features=features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and # specify them. They'll be used if as_supervised=True in builder.as_dataset. # supervised_keys=("sentence", "label"), # Homepage of the dataset for documentation homepage="bla", # License for the dataset if available license="bla", # Citation for the dataset citation="bli", ) def _split_generators(self, dl_manager): # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLS # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive data_dir = "/mnt/1da05489-3812-4f15-a6e5-c8d3c57df39e/StableDiffusionBiasExplorer/zipped_images" return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ "filepath":data_dir, "split": "train", }, ), ] # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` def _generate_examples(self, filepath, split): zip_files = os.listdir(filepath) key = 0 for zip_file in zip_files: with zipfile.ZipFile(filepath + "/" + zip_file, "r") as zf: for f in zf.filelist: if ".jpg" in f.filename: jpg_content = BytesIO(zf.read(f)) with Image.open(jpg_content) as image: yield key, { "adjective": zip_file.split("_", 1)[0], "profession": zip_file.split("_", 1)[-1].replace(".zip",""), "seed": int(f.filename.split("Seed_")[-1].split("/")[0]), "image": image, } key+=1