yue-here committed
Commit 5edc0a2
Parent(s): d5851a1
first commit

Files changed:
- app.py +17 -0
- glyffuser_utils.py +174 -0
- t5.py +119 -0
app.py
ADDED
@@ -0,0 +1,17 @@
import gradio as gr
from glyffuser_utils import GlyffuserPipeline

pipeline = GlyffuserPipeline.from_pretrained("yuewu/glyffuser")

def infer(text):
    generated_images = pipeline(
        [text],  # the pipeline expects a list of prompts
        batch_size=1,  # Generate one image at a time for each step
        # generator=torch.Generator(device='cuda').manual_seed(config.seed), # Generator can be on GPU here
        num_inference_steps=50,
    ).images
    return generated_images[0]

demo = gr.Interface(fn=infer, inputs="text", outputs="image")
demo.launch()
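
For reference, a minimal sketch of calling the pipeline directly rather than through the Gradio interface; the prompt text and seed below are placeholders, not part of this commit:

import torch
from glyffuser_utils import GlyffuserPipeline

pipe = GlyffuserPipeline.from_pretrained("yuewu/glyffuser")
images = pipe(
    ["mountain; hill"],                          # placeholder glyph definition
    generator=torch.Generator().manual_seed(0),  # placeholder seed for reproducibility
    num_inference_steps=50,
).images
images[0].save("sample.png")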
glyffuser_utils.py
ADDED
@@ -0,0 +1,174 @@
import os

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T
import t5
from torch.nn.utils.rnn import pad_sequence

from PIL import Image

from datasets import load_dataset

from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from typing import List, Optional, Tuple, Union
from diffusers.utils.torch_utils import randn_tensor


# Collator adjusted for local dataset
class Collator:
    def __init__(self, image_size, text_label, image_label, name, channels):
        self.text_label = text_label
        self.image_label = image_label
        self.name = name
        self.channels = channels
        self.transform = T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),
        ])

    def __call__(self, batch):
        texts = []
        masks = []
        images = []
        for item in batch:
            image_path = 'data/' + item[self.image_label]  # Assuming this is a path to the image file
            try:
                # Load image from local file
                with Image.open(image_path) as img:
                    image = self.transform(img.convert(self.channels))
            except Exception as e:
                print(f"Failed to process image {image_path}: {e}")
                continue

            # Encode the text
            text, mask = t5.t5_encode_text(
                [item[self.text_label]],
                name=self.name,
                return_attn_mask=True
            )
            texts.append(torch.squeeze(text))
            masks.append(torch.squeeze(mask))
            images.append(image)

        if len(texts) == 0:
            return None

        # Are these strictly necessary? Pads embeddings and masks to a common length
        texts = pad_sequence(texts, True)
        masks = pad_sequence(masks, True)

        newbatch = []
        for i in range(len(texts)):
            newbatch.append((images[i], texts[i], masks[i]))

        return torch.utils.data.dataloader.default_collate(newbatch)


class GlyffuserPipeline(DiffusionPipeline):
    r'''
    Pipeline for text-to-image generation from the glyffuser model.

    Parameters:
        unet (`UNet2DConditionModel`)
        scheduler (`SchedulerMixin`)
        text_encoder (`TextEncoder`) - T5 small
    '''
    def __init__(self, unet, scheduler):
        super().__init__()
        self.register_modules(
            unet=unet,
            scheduler=scheduler,
        )

    @torch.no_grad()
    def __call__(
        self,
        texts: List[str],
        text_encoder: str = "google-t5/t5-small",
        batch_size: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        num_inference_steps: int = 1000,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ) -> Union[ImagePipelineOutput, Tuple]:
        '''
        Denoise from Gaussian noise to glyph images, conditioned on T5 embeddings of `texts`.
        '''
        # Get text embeddings
        # Encode the text
        # text_embeddings = []
        # for text in texts:
        #     embedding = t5.t5_encode_text(text, name=text_encoder)
        #     text_embeddings.append(torch.squeeze(embedding))
        # text_embeddings = pad_sequence(text_embeddings, True)

        batch_size = len(texts)

        text_embeddings, masks = t5.t5_encode_text(texts, name=text_encoder, return_attn_mask=True)

        # Sample gaussian noise to begin loop
        if isinstance(self.unet.config.sample_size, int):
            image_shape = (
                batch_size,
                self.unet.config.in_channels,
                self.unet.config.sample_size,
                self.unet.config.sample_size,
            )
        else:
            image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size)

        # if self.device.type == "mps": # MPS is apple silicon
        #     # randn does not work reproducibly on mps
        #     image = randn_tensor(image_shape, generator=generator)
        #     image = image.to(self.device)
        # else:
        image = randn_tensor(image_shape, generator=generator, device=self.device)

        # set step values
        self.scheduler.set_timesteps(num_inference_steps)

        for t in self.progress_bar(self.scheduler.timesteps):
            # 1. predict noise model_output
            model_output = self.unet(
                image,
                t,
                encoder_hidden_states=text_embeddings,  # Add text encoding input
                encoder_attention_mask=masks,  # Add attention mask
                return_dict=False
            )[0]  # `sample` is an attribute of the BaseOutput class, of type torch.FloatTensor

            # 2. compute previous image: x_t -> x_t-1
            image = self.scheduler.step(model_output, t, image, generator=generator, return_dict=False)[0]

        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)


def make_grid(images, rows, cols):
    w, h = images[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, image in enumerate(images):
        grid.paste(image, box=(i % cols * w, i // cols * h))
    return grid


def evaluate(config, epoch, texts, pipeline):
    images = pipeline(
        texts,
        batch_size=config.eval_batch_size,
        generator=torch.Generator(device='cpu').manual_seed(config.seed),  # Generator must be on CPU for sampling during training
    ).images

    # Make a grid out of the images
    image_grid = make_grid(images, rows=4, cols=4)

    # Save the images
    test_dir = os.path.join(config.output_dir, "samples")
    os.makedirs(test_dir, exist_ok=True)
    image_grid.save(f"{test_dir}/{epoch:04d}.png")
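
As context for how these pieces fit together, a minimal sketch of wiring the Collator into a training DataLoader; the dataset file, column names, image size, and batch size are assumptions for illustration, not part of this commit:

from datasets import load_dataset
from torch.utils.data import DataLoader
from glyffuser_utils import Collator

# Hypothetical metadata file with a caption column and a relative image-path column
dataset = load_dataset("csv", data_files="data/metadata.csv", split="train")

collate_fn = Collator(
    image_size=128,             # assumed training resolution
    text_label="text",          # assumed caption column
    image_label="image",        # assumed image-path column (relative to data/)
    name="google-t5/t5-small",  # matches the pipeline's default text encoder
    channels="L",               # assumed single-channel glyph images
)

train_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
# Each batch collates to (images, text_embeddings, attention_masks)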
t5.py
ADDED
@@ -0,0 +1,119 @@
import torch
import transformers
from typing import List
from transformers import T5Tokenizer, T5EncoderModel, T5Config
from einops import rearrange

transformers.logging.set_verbosity_error()

def exists(val):
    return val is not None

def default(val, d):
    if exists(val):
        return val
    return d() if callable(d) else d

# config

MAX_LENGTH = 256

DEFAULT_T5_NAME = 'google/t5-v1_1-base'

T5_CONFIGS = {}

# singleton globals

def get_tokenizer(name):
    tokenizer = T5Tokenizer.from_pretrained(name, model_max_length=MAX_LENGTH)
    return tokenizer

def get_model(name):
    model = T5EncoderModel.from_pretrained(name)
    return model

def get_model_and_tokenizer(name):
    global T5_CONFIGS

    if name not in T5_CONFIGS:
        T5_CONFIGS[name] = dict()
    if "model" not in T5_CONFIGS[name]:
        T5_CONFIGS[name]["model"] = get_model(name)
    if "tokenizer" not in T5_CONFIGS[name]:
        T5_CONFIGS[name]["tokenizer"] = get_tokenizer(name)

    return T5_CONFIGS[name]['model'], T5_CONFIGS[name]['tokenizer']

def get_encoded_dim(name):
    if name not in T5_CONFIGS:
        # avoids loading the model if we only want to get the dim
        config = T5Config.from_pretrained(name)
        T5_CONFIGS[name] = dict(config=config)
    elif "config" in T5_CONFIGS[name]:
        config = T5_CONFIGS[name]["config"]
    elif "model" in T5_CONFIGS[name]:
        config = T5_CONFIGS[name]["model"].config
    else:
        assert False
    return config.d_model

# encoding text

def t5_tokenize(
    texts: List[str],
    name = DEFAULT_T5_NAME
):
    t5, tokenizer = get_model_and_tokenizer(name)

    if torch.cuda.is_available():
        t5 = t5.cuda()

    device = next(t5.parameters()).device

    encoded = tokenizer.batch_encode_plus(
        texts,
        return_tensors = "pt",
        padding = 'longest',
        max_length = MAX_LENGTH,
        truncation = True
    )

    input_ids = encoded.input_ids.to(device)
    attn_mask = encoded.attention_mask.to(device)
    return input_ids, attn_mask

def t5_encode_tokenized_text(
    token_ids,
    attn_mask = None,
    pad_id = None,
    name = DEFAULT_T5_NAME
):
    assert exists(attn_mask) or exists(pad_id)
    t5, _ = get_model_and_tokenizer(name)

    attn_mask = default(attn_mask, lambda: (token_ids != pad_id).long())

    t5.eval()

    with torch.no_grad():
        output = t5(input_ids = token_ids, attention_mask = attn_mask)
        encoded_text = output.last_hidden_state.detach()

    attn_mask = attn_mask.bool()

    # force all embeddings at padding positions to be zero
    encoded_text = encoded_text.masked_fill(~rearrange(attn_mask, '... -> ... 1'), 0.)
    return encoded_text

def t5_encode_text(
    texts: List[str],
    name = DEFAULT_T5_NAME,
    return_attn_mask = False
):
    token_ids, attn_mask = t5_tokenize(texts, name = name)
    encoded_text = t5_encode_tokenized_text(token_ids, attn_mask = attn_mask, name = name)

    if return_attn_mask:
        attn_mask = attn_mask.bool()
        return encoded_text, attn_mask

    return encoded_text
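
For reference, a minimal sketch of using the encoding helpers on their own; the prompt strings are placeholders, and the first call downloads the chosen T5 checkpoint:

import t5

prompts = ["mountain; hill", "a small bird"]  # placeholder definitions
embeddings, mask = t5.t5_encode_text(prompts, name="google-t5/t5-small", return_attn_mask=True)

print(embeddings.shape)  # (batch, seq_len, d_model); d_model == t5.get_encoded_dim("google-t5/t5-small")
print(mask.shape)        # (batch, seq_len); True at non-padding positions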