Spaces:

yizhezhu
/

MoMA_zeroGPU

Running on Zero

App Files Files Community

Kunpeng Song committed on Jun 6

Commit

b5f6f82

•

1 Parent(s): 7c69fc1

fix zero

Browse files

Files changed (3) hide show

.DS_Store +0 -0
app.py +0 -5
dataset_lib/dataset_eval_MoMA.py +153 -2

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ import numpy as np
 import torch
 from pytorch_lightning import seed_everything
 from model_lib.utils import parse_args
-# from llava.mm_utils import process_image
 os.environ["CUDA_VISIBLE_DEVICES"]="0"
@@ -18,10 +17,6 @@ args = parse_args()
 model = None
-def my_process_image(a, b, c):
-    # return process_image(a, b, c)
-    return (a, b, c)
 @spaces.GPU
 def inference(rgb, subject, prompt, strength, seed):
     seed = int(seed) if seed else 0

 import torch
 from pytorch_lightning import seed_everything
 from model_lib.utils import parse_args
 os.environ["CUDA_VISIBLE_DEVICES"]="0"
 model = None
 @spaces.GPU
 def inference(rgb, subject, prompt, strength, seed):
     seed = int(seed) if seed else 0

dataset_lib/dataset_eval_MoMA.py CHANGED Viewed

@@ -2,8 +2,159 @@ from PIL import Image
 import numpy as np
 import torch
 from torchvision import transforms
-from ..app import my_process_image
 from rembg import remove
 def create_binary_mask(image):
     grayscale = image.convert("L")
@@ -38,7 +189,7 @@ def Dataset_evaluate_MoMA(image_pil, prompt,subject, moMA_main_modal):
     image_wb = image * mask + torch.ones_like(image)* (1-mask)*255
     image_pil = Image.fromarray(image_wb.permute(1,2,0).numpy().astype(np.uint8))
-    res['llava_processed'] = my_process_image([image_pil], LLaVa_processor, llava_config)
     res['label'] = [subject]
     return res

 import numpy as np
 import torch
 from torchvision import transforms
 from rembg import remove
+import ast
+import math
+def select_best_resolution(original_size, possible_resolutions):
+    """
+    Pick the candidate resolution that best accommodates an image of the given size.
+    A candidate wins when it keeps more effective pixels after an aspect-preserving
+    fit (capped at the original pixel count); ties are broken by the smaller amount
+    of unused candidate area, and the earliest candidate wins an exact tie.
+    Args:
+        original_size (tuple): Source image size as (width, height).
+        possible_resolutions (list): Candidate resolutions as [(width, height), ...].
+    Returns:
+        tuple: The selected (width, height), or None when no candidates are given.
+    """
+    src_w, src_h = original_size
+    chosen = None
+    best_effective = 0
+    least_waste = float('inf')
+    for cand_w, cand_h in possible_resolutions:
+        # Largest uniform scale at which the source still fits inside the candidate.
+        ratio = min(cand_w / src_w, cand_h / src_h)
+        fitted_area = int(src_w * ratio) * int(src_h * ratio)
+        # Upscaling adds no information, so never count more than the source area.
+        effective = min(fitted_area, src_w * src_h)
+        waste = (cand_w * cand_h) - effective
+        strictly_better = effective > best_effective
+        tie_with_less_waste = effective == best_effective and waste < least_waste
+        if not (strictly_better or tie_with_less_waste):
+            continue
+        chosen = (cand_w, cand_h)
+        best_effective = effective
+        least_waste = waste
+    return chosen
+def resize_and_pad_image(image, target_resolution):
+    """
+    Fit an image inside a target resolution, preserving aspect ratio, and centre it
+    on a black RGB canvas of exactly that resolution.
+    Args:
+        image (PIL.Image.Image): The input image.
+        target_resolution (tuple): The canvas size as (width, height).
+    Returns:
+        PIL.Image.Image: A new RGB image of the target size containing the resized input.
+    """
+    src_w, src_h = image.size
+    dst_w, dst_h = target_resolution
+    ratio_w = dst_w / src_w
+    ratio_h = dst_h / src_h
+    # The smaller ratio is the limiting axis; the other axis is rounded up and
+    # clamped so it can never exceed the canvas.
+    if ratio_w < ratio_h:
+        fit_w, fit_h = dst_w, min(math.ceil(src_h * ratio_w), dst_h)
+    else:
+        fit_w, fit_h = min(math.ceil(src_w * ratio_h), dst_w), dst_h
+    scaled = image.resize((fit_w, fit_h))
+    canvas = Image.new('RGB', (dst_w, dst_h), (0, 0, 0))
+    # Centre the scaled image; any leftover border stays black.
+    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
+    canvas.paste(scaled, offset)
+    return canvas
+def divide_to_patches(image, patch_size):
+    """
+    Cut an image into square patches, walking rows top-to-bottom and, within each
+    row, columns left-to-right.
+    Args:
+        image (PIL.Image.Image): The input image.
+        patch_size (int): The side length of each patch, also used as the step.
+    Returns:
+        list: The cropped patches in row-major order.
+    """
+    width, height = image.size
+    # Crop boxes are (left, upper, right, lower); the outer loop is over rows.
+    return [
+        image.crop((left, top, left + patch_size, top + patch_size))
+        for top in range(0, height, patch_size)
+        for left in range(0, width, patch_size)
+    ]
+def process_anyres_image(image, processor, grid_pinpoints):
+    """
+    Process an image with variable resolutions.
+    Args:
+        image (PIL.Image.Image): The input image to be processed.
+        processor: The image processor object; this function reads
+            ``crop_size['height']`` and ``size['shortest_edge']`` and calls ``preprocess``.
+        grid_pinpoints (str | list | tuple): The candidate resolutions, either as a
+            sequence of (width, height) pairs or as the string representation of one.
+    Returns:
+        torch.Tensor: The preprocessed patches stacked along dim 0, with the resized
+        full image first, followed by the patches of the padded image.
+    """
+    # Only a string needs parsing; ast.literal_eval raises on an already-parsed
+    # sequence, so lists, tuples and list subclasses are used as given.
+    if isinstance(grid_pinpoints, str):
+        possible_resolutions = ast.literal_eval(grid_pinpoints)
+    else:
+        possible_resolutions = grid_pinpoints
+    best_resolution = select_best_resolution(image.size, possible_resolutions)
+    image_padded = resize_and_pad_image(image, best_resolution)
+    patches = divide_to_patches(image_padded, processor.crop_size['height'])
+    # A square, low-resolution view of the whole image goes in front of the patches.
+    image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
+    image_patches = [image_original_resize] + patches
+    image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
+                     for image_patch in image_patches]
+    return torch.stack(image_patches, dim=0)
+def expand2square(pil_img, background_color):
+    """
+    Pad an image to a square by centring it on a canvas of the background colour.
+    Args:
+        pil_img (PIL.Image.Image): The input image.
+        background_color: The fill colour for the added border, in the image's mode.
+    Returns:
+        PIL.Image.Image: The input itself when it is already square; otherwise a new
+        image whose side equals the longer edge of the input.
+    """
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    side = max(width, height)
+    canvas = Image.new(pil_img.mode, (side, side), background_color)
+    # Landscape images are centred vertically, portrait images horizontally.
+    if width > height:
+        offset = (0, (width - height) // 2)
+    else:
+        offset = ((height - width) // 2, 0)
+    canvas.paste(pil_img, offset)
+    return canvas
+def process_images(images, image_processor, model_cfg):
+    """
+    Preprocess a batch of images according to ``model_cfg.image_aspect_ratio``.
+    Args:
+        images (list): The input images.
+        image_processor: The image processor object; "pad" mode reads ``image_mean``
+            and calls ``preprocess``, the fallback mode calls the processor directly.
+        model_cfg: The model config. ``image_aspect_ratio`` selects the mode ("pad",
+            "anyres", or anything else for the fallback); "anyres" also reads
+            ``image_grid_pinpoints``.
+    Returns:
+        For "pad" and "anyres": one stacked tensor when every processed image has the
+        same shape, otherwise a list of per-image tensors (an empty list for empty
+        input). For any other mode: ``image_processor(images, return_tensors='pt')['pixel_values']``.
+    """
+    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+    new_images = []
+    if image_aspect_ratio == 'pad':
+        for image in images:
+            # Pad to a square using the processor's mean colour scaled to 0-255.
+            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
+            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            new_images.append(image)
+    elif image_aspect_ratio == "anyres":
+        for image in images:
+            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
+            new_images.append(image)
+    else:
+        return image_processor(images, return_tensors='pt')['pixel_values']
+    # all() is vacuously true for an empty list and torch.stack rejects an empty
+    # sequence, so only stack when at least one image was processed.
+    if new_images and all(x.shape == new_images[0].shape for x in new_images):
+        new_images = torch.stack(new_images, dim=0)
+    return new_images
 def create_binary_mask(image):
     grayscale = image.convert("L")
     image_wb = image * mask + torch.ones_like(image)* (1-mask)*255
     image_pil = Image.fromarray(image_wb.permute(1,2,0).numpy().astype(np.uint8))
+    res['llava_processed'] = process_images([image_pil], LLaVa_processor, llava_config)
     res['label'] = [subject]
     return res