File size: 3,993 Bytes
878dbce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import argparse
import logging
import random

import cv2
import jsonlines
import numpy as np
import requests
from datasets import load_dataset
from PIL import Image

logger = logging.getLogger(__name__)


def parse_args():
    """Parse the command-line options of the preprocessing script.

    Returns:
        argparse.Namespace: holds ``train_data_dir`` and ``cache_dir``
        (both required strings), ``max_train_samples`` (int or ``None``)
        and ``num_proc`` (int, defaults to 1).
    """
    # (flag, add_argument keyword arguments), registered in this order so
    # the generated --help output keeps the same layout.
    option_specs = (
        (
            "--train_data_dir",
            {
                "type": str,
                "required": True,
                "help": "The directory to store the dataset",
            },
        ),
        (
            "--cache_dir",
            {
                "type": str,
                "required": True,
                "help": "The directory to store cache",
            },
        ),
        (
            "--max_train_samples",
            {
                "type": int,
                "default": None,
                "help": "number of examples in the dataset",
            },
        ),
        (
            "--num_proc",
            {
                "type": int,
                "default": 1,
                "help": "number of processors to use in `dataset.map()`",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Example of a data preprocessing script."
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


# filter applied to the first `max_train_samples` examples (see `filter_dataset`)
def filter_function(example):
    """Tell whether ``example`` passes all three score thresholds.

    An example is rejected when its ``clip_similarity_vitb32`` is below
    0.3, its ``watermark_score`` is above 0.4, or its
    ``aesthetic_score_laion_v2`` is below 6.0.

    Args:
        example: mapping that provides the three score fields above.

    Returns:
        bool: ``True`` to keep the example, ``False`` to drop it.
    """
    # Short-circuit `or` reads the fields in the same order as a chain of
    # guard clauses would, and stops at the first failing threshold.
    rejected = (
        example["clip_similarity_vitb32"] < 0.3
        or example["watermark_score"] > 0.4
        or example["aesthetic_score_laion_v2"] < 6.0
    )
    return not rejected


def filter_dataset(dataset, max_train_samples):
    """Keep the rows among the first ``max_train_samples`` that pass the filter.

    Args:
        dataset: object exposing ``__len__``, ``select(indices)`` and
            ``filter(fn)`` (the script passes the loaded training split).
        max_train_samples: number of leading rows to inspect. Values
            larger than ``len(dataset)`` are clamped to the dataset size,
            and ``None`` means "inspect every row".

    Returns:
        The result of ``dataset.select(...).filter(filter_function)``.
    """
    total_rows = len(dataset)
    if max_train_samples is None:
        row_limit = total_rows
    else:
        # Clamp so that asking for more rows than exist does not produce
        # out-of-range indices for `select`.
        row_limit = min(max_train_samples, total_rows)

    return dataset.select(range(row_limit)).filter(filter_function)


if __name__ == "__main__":
    # Local import: only the script entry point needs it.
    import os

    args = parse_args()

    # Neither the image saves nor the metadata append below create missing
    # parent directories, so make sure the output layout exists up front.
    os.makedirs(f"{args.train_data_dir}/images", exist_ok=True)
    os.makedirs(f"{args.train_data_dir}/processed_images", exist_ok=True)

    # load coyo-700m
    dataset = load_dataset(
        "kakaobrain/coyo-700m",
        cache_dir=args.cache_dir,
        split="train",
    )
    dataset_size = len(dataset)

    # Estimate the fraction of examples that survive the quality filter,
    # using (at most) the first 20k rows as a probe.
    probe_size = min(20000, dataset_size)
    probe_kept = len(filter_dataset(dataset, probe_size)) if probe_size else 0
    if probe_kept == 0:
        raise SystemExit(
            f"No examples passed the filter in the first {probe_size} rows; "
            "cannot estimate how many rows to scan."
        )
    filter_ratio = probe_kept / probe_size

    # Estimate how many raw rows must be scanned to end up with
    # `--max_train_samples` usable examples, based on
    #   (1) the filter_ratio measured on the probe
    #   (2) the assumption that only 80% of the URLs are still valid
    if args.max_train_samples is None:
        # No target given: scan the whole split.
        max_train_samples = dataset_size
    else:
        max_train_samples = min(
            int(args.max_train_samples / filter_ratio / 0.8),
            dataset_size,
        )

    # filter the leading `max_train_samples` rows of the dataset
    small_dataset = filter_dataset(dataset, max_train_samples)

    def preprocess_and_save(example):
        """Download one image, save it with its Canny edge map, log metadata.

        Any failure (download, decode, save) is logged and the example is
        skipped, so a single bad URL does not abort the whole run.
        """
        image_url = example["url"]
        try:
            # download original image; the context manager releases the
            # connection even when decoding fails
            with requests.get(image_url, stream=True, timeout=5) as response:
                response.raise_for_status()
                # convert() decodes the full image while the stream is
                # still open and normalizes palette / CMYK / 1-bit / 16-bit
                # inputs to 8-bit RGB, which both the PNG save and
                # cv2.Canny below can handle.
                image = Image.open(response.raw).convert("RGB")
            image_path = f"{args.train_data_dir}/images/{example['id']}.png"
            image.save(image_path)

            # generate and save canny image
            processed_image = np.array(image)

            # apply random thresholds
            #   note that this should normally be applied on the fly during
            #   training, but baking them in is acceptable for a dataset of
            #   this size.
            thresholds = (
                random.randint(0, 255),
                random.randint(0, 255),
            )
            processed_image = cv2.Canny(
                processed_image, min(thresholds), max(thresholds)
            )
            # replicate the single-channel edge map into three channels
            processed_image = processed_image[:, :, None]
            processed_image = np.concatenate(
                [processed_image, processed_image, processed_image], axis=2
            )
            processed_image = Image.fromarray(processed_image)
            processed_image_path = (
                f"{args.train_data_dir}/processed_images/{example['id']}.png"
            )
            processed_image.save(processed_image_path)

            # append one record to meta.jsonl
            meta = {
                "image": image_path,
                "conditioning_image": processed_image_path,
                "caption": example["text"],
            }
            with jsonlines.open(
                f"{args.train_data_dir}/meta.jsonl", "a"
            ) as writer:
                writer.write(meta)

        except Exception as e:
            # Deliberate best-effort boundary: dead links and undecodable
            # files are expected, so log and move on to the next example.
            logger.error("Failed to process image %s: %s", image_url, e)

    # preprocess -> image, processed image and meta.jsonl
    small_dataset.map(preprocess_and_save, num_proc=args.num_proc)

    print(f"created data folder at: {args.train_data_dir}")