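"""Preprocess a subset of COYO-700M into (image, conditioning image, caption) triplets.

The script filters examples using the quality metadata that ships with the
dataset, downloads the remaining images, generates Canny edge maps as
conditioning images, and records the file paths and captions in `meta.jsonl`.
"""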
import argparse
import logging
import os
import random

import cv2
import jsonlines
import numpy as np
import requests
from datasets import load_dataset
from PIL import Image

logger = logging.getLogger(__name__)
def parse_args():
    parser = argparse.ArgumentParser(
        description="Example of a data preprocessing script."
    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=True,
        help="The directory in which to store the preprocessed dataset.",
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        required=True,
        help="The directory in which to store the `datasets` download cache.",
    )
    parser.add_argument(
        "--max_train_samples",
        type=int,
        required=True,
        help="The target number of filtered examples to keep in the dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=1,
        help="The number of processes to use in `dataset.map()`.",
    )
    args = parser.parse_args()
    return args
# quality filters based on the metadata columns that ship with COYO-700M,
# applied before keeping `max_train_samples` examples
def filter_function(example):
    # keep only image-text pairs with a reasonable CLIP similarity
    if example["clip_similarity_vitb32"] < 0.3:
        return False
    # drop images that are likely to contain a watermark
    if example["watermark_score"] > 0.4:
        return False
    # keep only images with a high LAION aesthetic score
    if example["aesthetic_score_laion_v2"] < 6.0:
        return False
    return True
def filter_dataset(dataset, max_train_samples):
    # take the first `max_train_samples` examples and apply the quality filters
    small_dataset = dataset.select(range(max_train_samples)).filter(filter_function)
    return small_dataset
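# For reference, each COYO-700M example consumed above and below looks roughly
# like the following (field names are the ones used in this script; the values
# are made up for illustration):
#   {"id": 12345, "url": "https://...", "text": "a caption",
#    "clip_similarity_vitb32": 0.32, "watermark_score": 0.1,
#    "aesthetic_score_laion_v2": 6.5, ...}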
if __name__ == "__main__":
    args = parse_args()

    # make sure the output directories exist before images are written to them
    os.makedirs(f"{args.train_data_dir}/images", exist_ok=True)
    os.makedirs(f"{args.train_data_dir}/processed_images", exist_ok=True)

    # load the COYO-700M dataset
    dataset = load_dataset(
        "kakaobrain/coyo-700m",
        cache_dir=args.cache_dir,
        split="train",
    )

    # estimate the fraction of examples that pass the quality filters,
    # using the first 20k examples as a sample
    filter_ratio = len(filter_dataset(dataset, 20000)) / 20000

    # estimate how many raw examples to select, based on
    # (1) the filter_ratio calculated on the 20k sample
    # (2) the assumption that only ~80% of the image URLs are still valid
    max_train_samples = int(args.max_train_samples / filter_ratio / 0.8)
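    # Illustrative arithmetic (numbers are made up, not measured): with
    # filter_ratio = 0.05 and --max_train_samples 1000000, this selects
    # int(1_000_000 / 0.05 / 0.8) = 25_000_000 raw examples.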
    # select and filter enough raw examples so that roughly
    # `args.max_train_samples` usable images remain after filtering and download failures
    small_dataset = filter_dataset(dataset, max_train_samples)
    def preprocess_and_save(example):
        image_url = example["url"]
        try:
            # download the original image
            image = Image.open(requests.get(image_url, stream=True, timeout=5).raw)
            # normalize to RGB so that PNG saving and Canny work for all inputs
            image = image.convert("RGB")
            image_path = f"{args.train_data_dir}/images/{example['id']}.png"
            image.save(image_path)

            # generate and save the canny edge image
            processed_image = np.array(image)

            # apply random thresholds.
            # note that randomizing the thresholds is normally done on the fly
            # during training, but precomputing them is fine for a dataset of
            # this size.
            thresholds = (
                random.randint(0, 255),
                random.randint(0, 255),
            )
            # use the smaller value as the low threshold and the larger as the high threshold
            processed_image = cv2.Canny(processed_image, min(thresholds), max(thresholds))

            # stack the single-channel edge map into a 3-channel image
            processed_image = processed_image[:, :, None]
            processed_image = np.concatenate(
                [processed_image, processed_image, processed_image], axis=2
            )
            processed_image = Image.fromarray(processed_image)
            processed_image_path = (
                f"{args.train_data_dir}/processed_images/{example['id']}.png"
            )
            processed_image.save(processed_image_path)

            # append the (image, conditioning image, caption) entry to meta.jsonl
            meta = {
                "image": image_path,
                "conditioning_image": processed_image_path,
                "caption": example["text"],
            }
            with jsonlines.open(f"{args.train_data_dir}/meta.jsonl", "a") as writer:
                writer.write(meta)
        except Exception as e:
            logger.error(f"Failed to process image {image_url}: {str(e)}")
    # preprocess -> original image, canny conditioning image, and meta.jsonl entries
    small_dataset.map(preprocess_and_save, num_proc=args.num_proc)
    print(f"created data folder at: {args.train_data_dir}")