Spaces:

byeongjun-park
/

HarmonyView

Sleeping

App Files Files Community

HarmonyView / ldm /data /laion.py

byeongjun-park

error resolve

fe3e74d over 1 year ago

raw

history blame contribute delete

19.1 kB

	import webdataset as wds
	import kornia
	from PIL import Image
	import io
	import os
	import torchvision
	from PIL import Image
	import glob
	import random
	import numpy as np
	import pytorch_lightning as pl
	from tqdm import tqdm
	from omegaconf import OmegaConf
	from einops import rearrange
	import torch
	from webdataset.handlers import warn_and_continue


	from ldm.util import instantiate_from_config
	from ldm.data.inpainting.synthetic_mask import gen_large_mask, MASK_MODES
	from ldm.data.base import PRNGMixin


	class DataWithWings(torch.utils.data.IterableDataset):
	def __init__(self, min_size, transform=None, target_transform=None):
	self.min_size = min_size
	self.transform = transform if transform is not None else nn.Identity()
	self.target_transform = target_transform if target_transform is not None else nn.Identity()
	self.kv = OnDiskKV(file='/home/ubuntu/laion5B-watermark-safety-ordered', key_format='q', value_format='ee')
	self.kv_aesthetic = OnDiskKV(file='/home/ubuntu/laion5B-aesthetic-tags-kv', key_format='q', value_format='e')
	self.pwatermark_threshold = 0.8
	self.punsafe_threshold = 0.5
	self.aesthetic_threshold = 5.
	self.total_samples = 0
	self.samples = 0
	location = 'pipe:aws s3 cp --quiet s3://s-datasets/laion5b/laion2B-data/{000000..231349}.tar -'

	self.inner_dataset = wds.DataPipeline(
	wds.ResampledShards(location),
	wds.tarfile_to_samples(handler=wds.warn_and_continue),
	wds.shuffle(1000, handler=wds.warn_and_continue),
	wds.decode('pilrgb', handler=wds.warn_and_continue),
	wds.map(self._add_tags, handler=wds.ignore_and_continue),
	wds.select(self._filter_predicate),
	wds.map_dict(jpg=self.transform, txt=self.target_transform, punsafe=self._punsafe_to_class, handler=wds.warn_and_continue),
	wds.to_tuple('jpg', 'txt', 'punsafe', handler=wds.warn_and_continue),
	)

	@staticmethod
	def _compute_hash(url, text):
	if url is None:
	url = ''
	if text is None:
	text = ''
	total = (url + text).encode('utf-8')
	return mmh3.hash64(total)[0]

	def _add_tags(self, x):
	hsh = self._compute_hash(x['json']['url'], x['txt'])
	pwatermark, punsafe = self.kv[hsh]
	aesthetic = self.kv_aesthetic[hsh][0]
	return {**x, 'pwatermark': pwatermark, 'punsafe': punsafe, 'aesthetic': aesthetic}

	def _punsafe_to_class(self, punsafe):
	return torch.tensor(punsafe >= self.punsafe_threshold).long()

	def _filter_predicate(self, x):
	try:
	return x['pwatermark'] < self.pwatermark_threshold and x['aesthetic'] >= self.aesthetic_threshold and x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size
	except:
	return False

	def __iter__(self):
	return iter(self.inner_dataset)


	def dict_collation_fn(samples, combine_tensors=True, combine_scalars=True):
	"""Take a list of samples (as dictionary) and create a batch, preserving the keys.
	If `tensors` is True, `ndarray` objects are combined into
	tensor batches.
	:param dict samples: list of samples
	:param bool tensors: whether to turn lists of ndarrays into a single ndarray
	:returns: single sample consisting of a batch
	:rtype: dict
	"""
	keys = set.intersection(*[set(sample.keys()) for sample in samples])
	batched = {key: [] for key in keys}

	for s in samples:
	[batched[key].append(s[key]) for key in batched]

	result = {}
	for key in batched:
	if isinstance(batched[key][0], (int, float)):
	if combine_scalars:
	result[key] = np.array(list(batched[key]))
	elif isinstance(batched[key][0], torch.Tensor):
	if combine_tensors:
	result[key] = torch.stack(list(batched[key]))
	elif isinstance(batched[key][0], np.ndarray):
	if combine_tensors:
	result[key] = np.array(list(batched[key]))
	else:
	result[key] = list(batched[key])
	return result


	class WebDataModuleFromConfig(pl.LightningDataModule):
	def __init__(self, tar_base, batch_size, train=None, validation=None,
	test=None, num_workers=4, multinode=True, min_size=None,
	max_pwatermark=1.0,
	**kwargs):
	super().__init__(self)
	print(f'Setting tar base to {tar_base}')
	self.tar_base = tar_base
	self.batch_size = batch_size
	self.num_workers = num_workers
	self.train = train
	self.validation = validation
	self.test = test
	self.multinode = multinode
	self.min_size = min_size # filter out very small images
	self.max_pwatermark = max_pwatermark # filter out watermarked images

	def make_loader(self, dataset_config, train=True):
	if 'image_transforms' in dataset_config:
	image_transforms = [instantiate_from_config(tt) for tt in dataset_config.image_transforms]
	else:
	image_transforms = []

	image_transforms.extend([torchvision.transforms.ToTensor(),
	torchvision.transforms.Lambda(lambda x: rearrange(x * 2. - 1., 'c h w -> h w c'))])
	image_transforms = torchvision.transforms.Compose(image_transforms)

	if 'transforms' in dataset_config:
	transforms_config = OmegaConf.to_container(dataset_config.transforms)
	else:
	transforms_config = dict()

	transform_dict = {dkey: load_partial_from_config(transforms_config[dkey])
	if transforms_config[dkey] != 'identity' else identity
	for dkey in transforms_config}
	img_key = dataset_config.get('image_key', 'jpeg')
	transform_dict.update({img_key: image_transforms})

	if 'postprocess' in dataset_config:
	postprocess = instantiate_from_config(dataset_config['postprocess'])
	else:
	postprocess = None

	shuffle = dataset_config.get('shuffle', 0)
	shardshuffle = shuffle > 0

	nodesplitter = wds.shardlists.split_by_node if self.multinode else wds.shardlists.single_node_only

	if self.tar_base == "__improvedaesthetic__":
	print("## Warning, loading the same improved aesthetic dataset "
	"for all splits and ignoring shards parameter.")
	tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{000000..060207}.tar -"
	else:
	tars = os.path.join(self.tar_base, dataset_config.shards)

	dset = wds.WebDataset(
	tars,
	nodesplitter=nodesplitter,
	shardshuffle=shardshuffle,
	handler=wds.warn_and_continue).repeat().shuffle(shuffle)
	print(f'Loading webdataset with {len(dset.pipeline[0].urls)} shards.')

	dset = (dset
	.select(self.filter_keys)
	.decode('pil', handler=wds.warn_and_continue)
	.select(self.filter_size)
	.map_dict(**transform_dict, handler=wds.warn_and_continue)
	)
	if postprocess is not None:
	dset = dset.map(postprocess)
	dset = (dset
	.batched(self.batch_size, partial=False,
	collation_fn=dict_collation_fn)
	)

	loader = wds.WebLoader(dset, batch_size=None, shuffle=False,
	num_workers=self.num_workers)

	return loader

	def filter_size(self, x):
	try:
	valid = True
	if self.min_size is not None and self.min_size > 1:
	try:
	valid = valid and x['json']['original_width'] >= self.min_size and x['json']['original_height'] >= self.min_size
	except Exception:
	valid = False
	if self.max_pwatermark is not None and self.max_pwatermark < 1.0:
	try:
	valid = valid and x['json']['pwatermark'] <= self.max_pwatermark
	except Exception:
	valid = False
	return valid
	except Exception:
	return False

	def filter_keys(self, x):
	try:
	return ("jpg" in x) and ("txt" in x)
	except Exception:
	return False

	def train_dataloader(self):
	return self.make_loader(self.train)

	def val_dataloader(self):
	return self.make_loader(self.validation, train=False)

	def test_dataloader(self):
	return self.make_loader(self.test, train=False)


	from ldm.modules.image_degradation import degradation_fn_bsr_light
	import cv2

	class AddLR(object):
	def __init__(self, factor, output_size, initial_size=None, image_key="jpg"):
	self.factor = factor
	self.output_size = output_size
	self.image_key = image_key
	self.initial_size = initial_size

	def pt2np(self, x):
	x = ((x+1.0)*127.5).clamp(0, 255).to(dtype=torch.uint8).detach().cpu().numpy()
	return x

	def np2pt(self, x):
	x = torch.from_numpy(x)/127.5-1.0
	return x

	def __call__(self, sample):
	# sample['jpg'] is tensor hwc in [-1, 1] at this point
	x = self.pt2np(sample[self.image_key])
	if self.initial_size is not None:
	x = cv2.resize(x, (self.initial_size, self.initial_size), interpolation=2)
	x = degradation_fn_bsr_light(x, sf=self.factor)['image']
	x = cv2.resize(x, (self.output_size, self.output_size), interpolation=2)
	x = self.np2pt(x)
	sample['lr'] = x
	return sample

	class AddBW(object):
	def __init__(self, image_key="jpg"):
	self.image_key = image_key

	def pt2np(self, x):
	x = ((x+1.0)*127.5).clamp(0, 255).to(dtype=torch.uint8).detach().cpu().numpy()
	return x

	def np2pt(self, x):
	x = torch.from_numpy(x)/127.5-1.0
	return x

	def __call__(self, sample):
	# sample['jpg'] is tensor hwc in [-1, 1] at this point
	x = sample[self.image_key]
	w = torch.rand(3, device=x.device)
	w /= w.sum()
	out = torch.einsum('hwc,c->hw', x, w)

	# Keep as 3ch so we can pass to encoder, also we might want to add hints
	sample['lr'] = out.unsqueeze(-1).tile(1,1,3)
	return sample

	class AddMask(PRNGMixin):
	def __init__(self, mode="512train", p_drop=0.):
	super().__init__()
	assert mode in list(MASK_MODES.keys()), f'unknown mask generation mode "{mode}"'
	self.make_mask = MASK_MODES[mode]
	self.p_drop = p_drop

	def __call__(self, sample):
	# sample['jpg'] is tensor hwc in [-1, 1] at this point
	x = sample['jpg']
	mask = self.make_mask(self.prng, x.shape[0], x.shape[1])
	if self.prng.choice(2, p=[1 - self.p_drop, self.p_drop]):
	mask = np.ones_like(mask)
	mask[mask < 0.5] = 0
	mask[mask > 0.5] = 1
	mask = torch.from_numpy(mask[..., None])
	sample['mask'] = mask
	sample['masked_image'] = x * (mask < 0.5)
	return sample


	class AddEdge(PRNGMixin):
	def __init__(self, mode="512train", mask_edges=True):
	super().__init__()
	assert mode in list(MASK_MODES.keys()), f'unknown mask generation mode "{mode}"'
	self.make_mask = MASK_MODES[mode]
	self.n_down_choices = [0]
	self.sigma_choices = [1, 2]
	self.mask_edges = mask_edges

	@torch.no_grad()
	def __call__(self, sample):
	# sample['jpg'] is tensor hwc in [-1, 1] at this point
	x = sample['jpg']

	mask = self.make_mask(self.prng, x.shape[0], x.shape[1])
	mask[mask < 0.5] = 0
	mask[mask > 0.5] = 1
	mask = torch.from_numpy(mask[..., None])
	sample['mask'] = mask

	n_down_idx = self.prng.choice(len(self.n_down_choices))
	sigma_idx = self.prng.choice(len(self.sigma_choices))

	n_choices = len(self.n_down_choices)*len(self.sigma_choices)
	raveled_idx = np.ravel_multi_index((n_down_idx, sigma_idx),
	(len(self.n_down_choices), len(self.sigma_choices)))
	normalized_idx = raveled_idx/max(1, n_choices-1)

	n_down = self.n_down_choices[n_down_idx]
	sigma = self.sigma_choices[sigma_idx]

	kernel_size = 4*sigma+1
	kernel_size = (kernel_size, kernel_size)
	sigma = (sigma, sigma)
	canny = kornia.filters.Canny(
	low_threshold=0.1,
	high_threshold=0.2,
	kernel_size=kernel_size,
	sigma=sigma,
	hysteresis=True,
	)
	y = (x+1.0)/2.0 # in 01
	y = y.unsqueeze(0).permute(0, 3, 1, 2).contiguous()

	# down
	for i_down in range(n_down):
	size = min(y.shape[-2], y.shape[-1])//2
	y = kornia.geometry.transform.resize(y, size, antialias=True)

	# edge
	_, y = canny(y)

	if n_down > 0:
	size = x.shape[0], x.shape[1]
	y = kornia.geometry.transform.resize(y, size, interpolation="nearest")

	y = y.permute(0, 2, 3, 1)[0].expand(-1, -1, 3).contiguous()
	y = y*2.0-1.0

	if self.mask_edges:
	sample['masked_image'] = y * (mask < 0.5)
	else:
	sample['masked_image'] = y
	sample['mask'] = torch.zeros_like(sample['mask'])

	# concat normalized idx
	sample['smoothing_strength'] = torch.ones_like(sample['mask'])*normalized_idx

	return sample


	def example00():
	url = "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/000000.tar -"
	dataset = wds.WebDataset(url)
	example = next(iter(dataset))
	for k in example:
	print(k, type(example[k]))

	print(example["__key__"])
	for k in ["json", "txt"]:
	print(example[k].decode())

	image = Image.open(io.BytesIO(example["jpg"]))
	outdir = "tmp"
	os.makedirs(outdir, exist_ok=True)
	image.save(os.path.join(outdir, example["__key__"] + ".png"))


	def load_example(example):
	return {
	"key": example["__key__"],
	"image": Image.open(io.BytesIO(example["jpg"])),
	"text": example["txt"].decode(),
	}


	for i, example in tqdm(enumerate(dataset)):
	ex = load_example(example)
	print(ex["image"].size, ex["text"])
	if i >= 100:
	break


	def example01():
	# the first laion shards contain ~10k examples each
	url = "pipe:aws s3 cp s3://s-datasets/laion5b/laion2B-data/{000000..000002}.tar -"

	batch_size = 3
	shuffle_buffer = 10000
	dset = wds.WebDataset(
	url,
	nodesplitter=wds.shardlists.split_by_node,
	shardshuffle=True,
	)
	dset = (dset
	.shuffle(shuffle_buffer, initial=shuffle_buffer)
	.decode('pil', handler=warn_and_continue)
	.batched(batch_size, partial=False,
	collation_fn=dict_collation_fn)
	)

	num_workers = 2
	loader = wds.WebLoader(dset, batch_size=None, shuffle=False, num_workers=num_workers)

	batch_sizes = list()
	keys_per_epoch = list()
	for epoch in range(5):
	keys = list()
	for batch in tqdm(loader):
	batch_sizes.append(len(batch["__key__"]))
	keys.append(batch["__key__"])

	for bs in batch_sizes:
	assert bs==batch_size
	print(f"{len(batch_sizes)} batches of size {batch_size}.")
	batch_sizes = list()

	keys_per_epoch.append(keys)
	for i_batch in [0, 1, -1]:
	print(f"Batch {i_batch} of epoch {epoch}:")
	print(keys[i_batch])
	print("next epoch.")


	def example02():
	from omegaconf import OmegaConf
	from torch.utils.data.distributed import DistributedSampler
	from torch.utils.data import IterableDataset
	from torch.utils.data import DataLoader, RandomSampler, Sampler, SequentialSampler
	from pytorch_lightning.trainer.supporters import CombinedLoader, CycleIterator

	#config = OmegaConf.load("configs/stable-diffusion/txt2img-1p4B-multinode-clip-encoder-high-res-512.yaml")
	#config = OmegaConf.load("configs/stable-diffusion/txt2img-upscale-clip-encoder-f16-1024.yaml")
	config = OmegaConf.load("configs/stable-diffusion/txt2img-v2-clip-encoder-improved_aesthetics-256.yaml")
	datamod = WebDataModuleFromConfig(**config["data"]["params"])
	dataloader = datamod.train_dataloader()

	for batch in dataloader:
	print(batch.keys())
	print(batch["jpg"].shape)
	break


	def example03():
	# improved aesthetics
	tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{000000..060207}.tar -"
	dataset = wds.WebDataset(tars)

	def filter_keys(x):
	try:
	return ("jpg" in x) and ("txt" in x)
	except Exception:
	return False

	def filter_size(x):
	try:
	return x['json']['original_width'] >= 512 and x['json']['original_height'] >= 512
	except Exception:
	return False

	def filter_watermark(x):
	try:
	return x['json']['pwatermark'] < 0.5
	except Exception:
	return False

	dataset = (dataset
	.select(filter_keys)
	.decode('pil', handler=wds.warn_and_continue))
	n_save = 20
	n_total = 0
	n_large = 0
	n_large_nowm = 0
	for i, example in enumerate(dataset):
	n_total += 1
	if filter_size(example):
	n_large += 1
	if filter_watermark(example):
	n_large_nowm += 1
	if n_large_nowm < n_save+1:
	image = example["jpg"]
	image.save(os.path.join("tmp", f"{n_large_nowm-1:06}.png"))

	if i%500 == 0:
	print(i)
	print(f"Large: {n_large}/{n_total} \| {n_large/n_total*100:.2f}%")
	if n_large > 0:
	print(f"No Watermark: {n_large_nowm}/{n_large} \| {n_large_nowm/n_large*100:.2f}%")



	def example04():
	# improved aesthetics
	for i_shard in range(60208)[::-1]:
	print(i_shard)
	tars = "pipe:aws s3 cp s3://s-laion/improved-aesthetics-laion-2B-en-subsets/aesthetics_tars/{:06}.tar -".format(i_shard)
	dataset = wds.WebDataset(tars)

	def filter_keys(x):
	try:
	return ("jpg" in x) and ("txt" in x)
	except Exception:
	return False

	def filter_size(x):
	try:
	return x['json']['original_width'] >= 512 and x['json']['original_height'] >= 512
	except Exception:
	return False

	dataset = (dataset
	.select(filter_keys)
	.decode('pil', handler=wds.warn_and_continue))
	try:
	example = next(iter(dataset))
	except Exception:
	print(f"Error @ {i_shard}")


	if __name__ == "__main__":
	#example01()
	#example02()
	example03()
	#example04()