# Hugging Face Space: JingyeChen22/TextDiffuser (status: Sleeping) | File size: 12,041 Bytes


# ------------------------------------------
# TextDiffuser: Diffusion Models as Text Painters
# Paper Link: https://arxiv.org/abs/2305.10855
# Code Link: https://github.com/microsoft/unilm/tree/master/textdiffuser
# Copyright (c) Microsoft Corporation.
# This file defines a set of commonly used utility functions.
# ------------------------------------------

import os
import re
import cv2
import math
import shutil
import string
import textwrap
import numpy as np
from PIL import Image, ImageFont, ImageDraw, ImageOps

from typing import *

# Character vocabulary: digits, lowercase letters, uppercase letters, punctuation and a space.
alphabet = string.digits + string.ascii_lowercase + string.ascii_uppercase + string.punctuation + ' ' # len(alphabet) = 95
# Map each character to its 1-based position; the index 0 stands for non-character.
alphabet_dic = {char: position for position, char in enumerate(alphabet, start=1)}
    


def transform_mask_pil(mask_root, size):
    """
    This function turns an in-memory mask image into a binary float mask.
    
    Args:
        mask_root: The mask image. It is passed through ``np.array`` and then
            handled as a colour image (presumably a PIL image, judging by the
            function name -- TODO confirm against the callers).
        size (int): The side length of the square output.
        
    Returns:
        A float32 array of shape (size, size) holding 0.0 where the gray value
        exceeds 250 (near-white pixels) and 1.0 everywhere else.
    """
    pixels = cv2.resize(np.array(mask_root), (size, size), interpolation=cv2.INTER_NEAREST)
    # NOTE(review): the conversion assumes BGR channel order; if the input is an RGB PIL image
    # the red/blue weights are swapped -- confirm whether this matters for the masks in use.
    grayscale = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY: values above 250 become 255, everything else becomes 0
    _, near_white = cv2.threshold(grayscale, 250, 255, cv2.THRESH_BINARY)
    return 1 - (near_white.astype(np.float32) / 255)
    

def transform_mask(mask_root, size):
    """
    This function loads a mask image from disk and turns it into a binary float mask.
    
    Args:
        mask_root (str): The path of mask image.
        size (int): The side length of the square output.
        
    Returns:
        A float32 array of shape (size, size) holding 0.0 where the gray value
        exceeds 250 (near-white pixels) and 1.0 everywhere else.
        
    Raises:
        OSError: If the image at ``mask_root`` cannot be read.
    """
    img = cv2.imread(mask_root)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an opaque error inside cv2.resize
        raise OSError('Failed to read mask image: {}'.format(mask_root))
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_NEAREST)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY: values above 250 become 255, everything else becomes 0
    _, binary = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY)
    return 1 - (binary.astype(np.float32) / 255)


def segmentation_mask_visualization(font_path: str, segmentation_mask: np.array):
    """
    This function visualizes the segmentation mask by drawing the character of each cell.
    
    The mask is resized to a 64x64 grid (nearest neighbour) and every labelled
    cell is rendered as green text on a black 512x512 canvas, one 8-pixel step per cell.
    
    Args:
        font_path (str): The path of font. We recommend to use Arial.ttf
        segmentation_mask (np.array): The character-level segmentation mask.
        
    Returns:
        The 512x512 RGB PIL image holding the drawn characters.
    """
    grid = cv2.resize(segmentation_mask, (64, 64), interpolation=cv2.INTER_NEAREST)
    glyph_font = ImageFont.truetype(font_path, 8)
    canvas = Image.new('RGB', (512,512), (0,0,0))
    painter = ImageDraw.Draw(canvas)
    for row in range(64):
        for col in range(64):
            label = int(grid[row][col])
            # label 0 is the non-character class; labels past the alphabet cannot be drawn
            if label != 0 and label - 1 < len(alphabet):
                painter.text((col*8, row*8), alphabet[label-1], font=glyph_font, fill=(0, 255, 0))
    return canvas


def make_caption_pil(font_path: str, captions: List[str]):
    """
    This function converts captions into pil images.
    
    Each caption is wrapped at 40 characters and drawn onto a white 512x48
    image (including a 2-pixel gray border), one wrapped line below the other.
    
    Args:
        font_path (str): The path of font. We recommend to use Arial.ttf
        captions (List[str]): List of captions.
        
    Returns:
        List of PIL images, one per caption and in the same order.
    """
    font = ImageFont.truetype(font_path, 18)

    # ImageFont.getsize was removed in Pillow 10. The bottom edge reported by getbbox
    # is used because, like the old getsize height, it includes the vertical offset of
    # the glyph; getsize is kept as a fallback for Pillow versions without getbbox.
    if hasattr(font, 'getbbox'):
        glyph_height = font.getbbox('A')[3]
    else:
        glyph_height = font.getsize('A')[1]
    line_height = glyph_height + 4
    border_size = 2

    caption_pil_list = []
    for caption in captions:
        # the inner canvas is shrunk so that the bordered result is exactly 512x48
        img = Image.new('RGB', (512 - 2 * border_size, 48 - 2 * border_size), (255,255,255))
        img = ImageOps.expand(img, border=(border_size, border_size, border_size, border_size), fill=(127, 127, 127))
        draw = ImageDraw.Draw(img)

        x, y = 4, 4
        for line in textwrap.wrap(caption, width=40):
            draw.text((x, y), line, font=font, fill=(200, 127, 0))
            y += line_height

        caption_pil_list.append(img)
    return caption_pil_list


def filter_segmentation_mask(segmentation_mask: np.array):
    """
    This function removes some noisy predictions of segmentation masks.
    
    Cells labelled as '-' or as the space character are reset to 0 (non-character).
    The mask is modified in place and also returned.
    
    Args:
        segmentation_mask (np.array): The character-level segmentation mask.
    """
    noisy_labels = (alphabet_dic['-'], alphabet_dic[' '])
    for label in noisy_labels:
        segmentation_mask[segmentation_mask == label] = 0
    return segmentation_mask
    
    

def combine_image(args, resolution, sub_output_dir: str, pred_image_list: List, image_pil: "Image.Image", character_mask_pil: "Image.Image", character_mask_highlight_pil: "Image.Image", caption_pil_list: List):
    """
    This function combines the predicted images into a single image.
    
    A single prediction is returned as is. Two or three predictions are placed
    side by side in one row, four predictions form a 2x2 grid, and larger
    counts are laid out row by row on a near-square grid. Unused cells stay black.
    
    Args:
        args (argparse.ArgumentParser): The arguments. Not used by this function.
        resolution (int): The side length of one tile in the combined image.
        sub_output_dir (str): Not used by this function.
        pred_image_list (List): List of predicted images.
        image_pil (Image): The original image. Not used by this function.
        character_mask_pil (Image): The character-level segmentation mask. Not used by this function.
        character_mask_highlight_pil (Image): The character-level segmentation mask highlighting character regions with green color. Not used by this function.
        caption_pil_list (List): List of captions. Not used by this function.
        
    Returns:
        The single predicted image, or a new RGB image holding all predictions.
        
    Raises:
        ValueError: If pred_image_list is empty.
    """
    count = len(pred_image_list)
    if count == 0:
        raise ValueError('pred_image_list must contain at least one image')
    if count == 1:
        return pred_image_list[0]

    # up to three images share one row; beyond that use a near-square grid (4 -> 2x2)
    columns = count if count <= 3 else math.ceil(math.sqrt(count))
    rows = math.ceil(count / columns)

    blank = Image.new('RGB', (resolution * columns, resolution * rows), (0,0,0))
    for index, pred_image in enumerate(pred_image_list):
        row, column = divmod(index, columns)
        blank.paste(pred_image, (resolution * column, resolution * row))
    return blank
    
    
def combine_image_gradio(args, size, sub_output_dir: str, pred_image_list: List, image_pil: "Image.Image", character_mask_pil: "Image.Image", character_mask_highlight_pil: "Image.Image", caption_pil_list: List):
    """
    This function combines the predicted images into a single image.
    
    A single prediction is returned as is. Two or three predictions are placed
    side by side in one row, four predictions form a 2x2 grid, and larger
    counts are laid out row by row on a near-square grid. Unused cells stay black.
    
    Args:
        args (argparse.ArgumentParser): The arguments. Not used by this function.
        size (int): The side length of one tile in the combined image.
        sub_output_dir (str): Not used by this function.
        pred_image_list (List): List of predicted images.
        image_pil (Image): The original image. Not used by this function.
        character_mask_pil (Image): The character-level segmentation mask. Not used by this function.
        character_mask_highlight_pil (Image): The character-level segmentation mask highlighting character regions with green color. Not used by this function.
        caption_pil_list (List): List of captions. Not used by this function.
        
    Returns:
        The single predicted image, or a new RGB image holding all predictions.
        
    Raises:
        ValueError: If pred_image_list is empty.
    """
    # keep the image count in its own name: reusing `size` here would overwrite
    # the tile size and shrink the canvas to a few pixels
    count = len(pred_image_list)
    if count == 0:
        raise ValueError('pred_image_list must contain at least one image')
    if count == 1:
        return pred_image_list[0]

    # up to three images share one row; beyond that use a near-square grid (4 -> 2x2)
    columns = count if count <= 3 else math.ceil(math.sqrt(count))
    rows = math.ceil(count / columns)

    blank = Image.new('RGB', (size * columns, size * rows), (0,0,0))
    for index, pred_image in enumerate(pred_image_list):
        row, column = divmod(index, columns)
        blank.paste(pred_image, (size * column, size * row))
    return blank
    
def get_width(font_path, text):
    """
    This function calculates the width of the text when rendered at font size 24.
    
    Args:
        font_path (str): The path of font.
        text (str): The text to measure.
        
    Returns:
        The width of the rendered text in pixels.
    """
    font = ImageFont.truetype(font_path, 24)
    # ImageFont.getsize was removed in Pillow 10; measure with getbbox when it exists
    # and keep getsize as a fallback for older Pillow versions.
    if hasattr(font, 'getbbox'):
        left, _, right, _ = font.getbbox(text)
        return right - left
    width, _ = font.getsize(text)
    return width



def get_key_words(text: str):
    """
    This function detects keywords (enclosed by single quotes) in user prompts. The keywords are used to guide the layout generation.
    
    Args:
        text (str): user prompt.
        
    Returns:
        List of the whitespace-separated words found inside the quoted spans,
        or an empty list when eight or more words were found.
    """
    # every span enclosed by '' contributes its individual words
    quoted_spans = re.findall(r"'(.*?)'", text)
    words = [word for span in quoted_spans for word in span.split()]

    # too many keywords are discarded altogether
    return [] if len(words) >= 8 else words


def adjust_overlap_box(box_output, current_index):
    """
    This function moves the current box away from the previous one when they overlap.
    
    Boxes are read as (xmin, ymin, xmax, ymax) from ``box_output[0, index, :]``.
    The current box keeps its width and height; it is only shifted past the
    right or bottom edge of the previous box. ``box_output`` is modified in place.
    
    Args:
        box_output (List): List of predicted boxes.
        current_index (int): the index of current box.
        
    Returns:
        The same ``box_output`` object, possibly with the current box shifted.
    """
    # the first box has no predecessor to collide with
    if current_index == 0:
        return box_output

    prev_x0, prev_y0, prev_x1, prev_y1 = box_output[0, current_index - 1, :]
    cur_x0, cur_y0, cur_x1, cur_y1 = box_output[0, current_index, :]

    left_edge_inside = prev_x0 <= cur_x0 <= prev_x1

    if left_edge_inside and prev_y0 <= cur_y0 <= prev_y1:
        # the top-left corner of the current box lies inside the previous box
        print('adjust overlapping')
        if prev_x1 - cur_x0 <= prev_y1 - cur_y0:
            # the horizontal overlap is the smaller one: shift along x
            axis, low, high, anchor = 0, cur_x0, cur_x1, prev_x1
        else:
            # otherwise shift along y
            axis, low, high, anchor = 1, cur_y0, cur_y1, prev_y1
        # both new coordinates are computed before anything is written back
        shifted_low = anchor + 0.025
        shifted_high = high - low + anchor + 0.025
        box_output[0, current_index, axis] = shifted_low
        box_output[0, current_index, axis + 2] = shifted_high

    elif left_edge_inside and prev_y0 <= cur_y1 <= prev_y1:
        # the bottom-left corner lies inside the previous box: always shift along x
        print('adjust overlapping')
        shifted_low = prev_x1 + 0.05
        shifted_high = cur_x1 - cur_x0 + prev_x1 + 0.05
        box_output[0, current_index, 0] = shifted_low
        box_output[0, current_index, 2] = shifted_high

    return box_output
    
    
def shrink_box(box, scale_factor = 0.9):
    """
    This function shrinks the box around its center.
    
    Args:
        box (List): The box as (x1, y1, x2, y2).
        scale_factor (float): The scale factor of shrinking.
        
    Returns:
        Tuple (x1, y1, x2, y2) of the shrunk box.
    """
    left, top, right, bottom = box
    # each side moves inwards by half of the removed width / height
    inset_x = (right - left) * (1 - scale_factor) / 2
    inset_y = (bottom - top) * (1 - scale_factor) / 2
    return (left + inset_x, top + inset_y, right - inset_x, bottom - inset_y)


def adjust_font_size(args, width, height, draw, text):
    """
    This function searches for a font size at which the text fits into the given width.
    
    Starting from ``height``, the size is lowered by one until the rendered
    text is narrower than ``width``. The size never drops below 1.
    
    Args:
        args (argparse.ArgumentParser): The arguments. Only ``args.font_path`` is read.
        width (int): The available width for the text.
        height (int): The font size to start from.
        draw (ImageDraw): The ImageDraw object.
        text (str): The text.
        
    Returns:
        The first size tried whose rendered text width is below ``width``,
        or 1 when no size above 1 fits.
    """
    # ImageDraw.textsize was removed in Pillow 10; measure with textbbox when it exists
    # and keep textsize as a fallback for older Pillow versions.
    has_textbbox = hasattr(draw, 'textbbox')

    size_start = height
    # stop above 1: a font size of 0 or below cannot be loaded
    while size_start > 1:
        font = ImageFont.truetype(args.font_path, size_start)
        if has_textbbox:
            left, _, right, _ = draw.textbbox((0, 0), text, font=font)
            text_width = right - left
        else:
            text_width, _ = draw.textsize(text, font=font)
        if text_width < width:
            return size_start
        size_start = size_start - 1
    return 1
    
    
def inpainting_merge_image(original_image, mask_image, inpainting_image):
    """
    This function merges the original image, mask image and inpainting image.
    
    All three images are resized to 512x512. Wherever the gray value of the
    mask is below 250 the inpainting image is used, elsewhere the original image.
        
    Args:
        original_image (PIL.Image): The original image.
        mask_image (PIL.Image): The mask images.
        inpainting_image (PIL.Image): The inpainting images.
        
    Returns:
        The merged 512x512 PIL image.
    """
    target_size = (512, 512)
    original_image = original_image.resize(target_size)
    inpainting_image = inpainting_image.resize(target_size)

    # convert() returns a new image, so the result has to be kept; without this
    # a non-grayscale mask would reach point() unconverted
    mask_image = mask_image.resize(target_size).convert('L')

    threshold = 250
    # lookup table for point(): gray values below the threshold select the inpainting image
    table = [1 if level < threshold else 0 for level in range(256)]
    mask_image = mask_image.point(table, "1")

    return Image.composite(inpainting_image, original_image, mask_image)